In [87]:
import PyPDF2
from string import digits, punctuation
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
import random
import json

In [2]:
def AttachAndCleanTexts(NiceClass, BaseDir=r'./Desktop/Nice Classes'):
    """Read one Nice-class PDF and return its text as a cleaned, lower-case string.

    The text of every page is extracted, newlines become spaces, and the pages
    are concatenated (each one preceded by a single space). Slashes, the
    running page header, the words 'Page' and 'Class', all ASCII digits and
    all ASCII punctuation are then removed before lower-casing.

    Parameters
    ----------
    NiceClass : int or str
        Class identifier inserted into the file name
        '20170101-en-class-flat-<NiceClass>.pdf'.
    BaseDir : str, optional
        Directory that contains the PDF files. Defaults to the location the
        notebook originally hard-coded.

    Returns
    -------
    str
        The cleaned text. Whitespace runs are NOT collapsed; callers that need
        tokens should use ``str.split()``.

    Raises
    ------
    OSError
        If the PDF file cannot be opened.
    """
    import os  # local import: keeps this cell independent of the import cell

    PdfPath = os.path.join(BaseDir, '20170101-en-class-flat-' + str(NiceClass) + '.pdf')

    # The context manager guarantees the handle is closed, even if extraction
    # raises. All page text must be pulled out before leaving the block because
    # the reader works on the open file object.
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # legacy PyPDF2 names; confirm they exist in the installed PyPDF2 version.
    with open(PdfPath, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        PageTexts = [
            pdfReader.getPage(i).extractText().replace('\n', ' ')
            for i in range(pdfReader.numPages)
        ]

    # Same result as repeated `TextBody + ' ' + page`, without quadratic copying.
    TextBody = ''.join(' ' + PageText for PageText in PageTexts)

    # Order matters: the full header is removed before the bare word 'Class',
    # otherwise the header would no longer match.
    TextBody = TextBody.replace('/', ' ')
    TextBody = TextBody.replace('NICE CLASSIFICATION - 11th Edition, Version 2017 Class', ' ')
    TextBody = TextBody.replace('Page', '')
    TextBody = TextBody.replace('Class', '')

    # One pass that deletes every ASCII digit and punctuation character.
    remove_digits_and_puncs = str.maketrans('', '', digits + punctuation)

    return TextBody.translate(remove_digits_and_puncs).lower()

In [8]:
# Smoke test: run the loader on a single class (33). The name TextBody is
# rebound inside the dataset-building loop in the next cell.
TextBody = AttachAndCleanTexts(33)

In [83]:
# Build the labelled corpus: for every Nice class, draw several random
# "bags of words" from that class's cleaned PDF text.
NICE_CLASS_RANGE = range(1, 46)   # class numbers 1..45
SAMPLES_PER_CLASS = 100           # synthetic documents generated per class
SAMPLE_FRACTION = 0.5             # share of a class's tokens in each document
SAMPLING_SEED = 42                # fixed seed so a re-run yields the same corpus

# Dedicated generator: reproducible sampling without touching the global
# `random` state that other cells use.
SamplingRng = random.Random(SAMPLING_SEED)

ClassTexts = []   # sampled documents
Classes = []      # class label of the document at the same index

# NOTE(review): all documents of one class are drawn without replacement from
# the same token list, so any two of them share a large part of their
# vocabulary. A later random train/test split therefore does not hold out
# unseen text; treat the resulting scores as optimistic.
for NiceClass in NICE_CLASS_RANGE:

    TextBody = AttachAndCleanTexts(NiceClass)
    TokenizedText = TextBody.split()
    TokenSize = round(SAMPLE_FRACTION * len(TokenizedText))

    for _ in range(SAMPLES_PER_CLASS):

        # Token order inside a document is random; only token counts matter
        # for a bag-of-words model.
        TokenizedSubText = SamplingRng.sample(TokenizedText, TokenSize)
        ClassTexts.append(' '.join(TokenizedSubText))
        Classes.append(NiceClass)

In [84]:
# Features are the sampled documents, targets are their class numbers.
X = ClassTexts
Y = Classes

# Hold out half of the samples; the fixed random_state keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.5, random_state=42)

# Token counts -> TF-IDF weighting -> multinomial Naive Bayes.
subject_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# fit() returns the pipeline itself, so no re-assignment is needed.
subject_clf.fit(x_train, y_train)
predicted = subject_clf.predict(x_test)

In [85]:
# Per-class precision / recall / F1 on the held-out half of the samples.
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          1       1.00      1.00      1.00        46
          2       1.00      1.00      1.00        51
          3       1.00      1.00      1.00        59
          4       1.00      1.00      1.00        46
          5       1.00      1.00      1.00        53
          6       1.00      1.00      1.00        54
          7       1.00      1.00      1.00        50
          8       1.00      1.00      1.00        40
          9       1.00      1.00      1.00        52
         10       1.00      1.00      1.00        44
         11       1.00      1.00      1.00        52
         12       1.00      1.00      1.00        54
         13       1.00      1.00      1.00        48
         14       1.00      1.00      1.00        40
         15       1.00      1.00      1.00        57
         16       1.00      1.00      1.00        53
         17       1.00      1.00      1.00        47
         18       1.00      1.00      1.00   

In [86]:
# Sanity check: classify a one-word query and print the predicted class number.
print('Nice Class: {}'.format(subject_clf.predict(['chemical'])[0]))

Nice Class: 1


In [88]:
# Load the brand registry. Judging by how it is indexed further down, this is
# a dict of holder -> dict of brand -> record, where record[1] is a
# ' | '-separated string of class numbers (TODO confirm against the file).
# The explicit encoding avoids a platform-dependent default (holder names
# contain non-ASCII characters) and the context manager closes the handle.
with open(r'./Desktop/BrandDataDict.json', encoding='utf-8') as BrandsFile:
    BrandsData = json.load(BrandsFile)

In [89]:
def TargetBrandsForNiceClass(Class):
    """Return every (holder, brand) pair whose record lists the given class.

    Reads the module-level ``BrandsData`` mapping (holder -> brand -> record).
    Element 1 of each record is split on ' | ' and the brand is kept when the
    string form of ``Class`` equals one of the resulting pieces exactly.

    Parameters
    ----------
    Class : int or str
        Class identifier; compared via ``str(Class)``.

    Returns
    -------
    list of tuple
        ``(holder, brand)`` pairs in the iteration order of ``BrandsData``.
    """
    wanted = str(Class)

    return [
        (holder, brand)
        for holder, brands in BrandsData.items()
        for brand, record in brands.items()
        # Exact match against the split pieces, so '5' does not match '15'.
        if wanted in record[1].split(' | ')
    ]

In [90]:
def ClassifyAndGetBrands(Text):
    """Predict the class of ``Text`` and return the brands listed for it.

    ``Text`` is passed to the module-level ``subject_clf`` as a one-element
    batch; the first prediction is forwarded to ``TargetBrandsForNiceClass``
    and that function's result is returned unchanged.
    """
    predictions = subject_clf.predict([Text])
    return TargetBrandsForNiceClass(predictions[0])

In [94]:
# Classify a one-word query and collect the (holder, brand) pairs of the
# predicted class.
ClassBrands = ClassifyAndGetBrands('pharmaceutical')

In [95]:
# Number of (holder, brand) pairs returned for the query.
len(ClassBrands)

729412

In [97]:
# Cross-check: look up class 5 directly, bypassing the classifier, and count
# the matches.
len(TargetBrandsForNiceClass(5))

729412

In [96]:
# Show which class the classifier assigns to the query on its own.
subject_clf.predict(['pharmaceutical'])[0]

5

In [101]:
# Show 20 randomly chosen (holder, brand) pairs.
# NOTE(review): Class1Brands is not assigned in any visible cell, so this
# raises NameError on a fresh kernel unless it is defined elsewhere. Confirm
# which list was intended (e.g. ClassBrands or TargetBrandsForNiceClass(1))
# and define it explicitly before this cell.
random.sample(Class1Brands, 20)

[('PT AMAN ASRI', 'CEVAX'),
 ('Chr. Hansen A/S', 'LGG Excellence by Chr. Hansen'),
 ('PBM PRODUCTS ,LLC', 'BRIGHT BEGINNINGS'),
 ('SAMRUDH PHARMACEUTICALS PVT, LTD', 'STRIAXUL'),
 ('Công ty trách nhiệm hữu hạn một thành viên dược phẩm và sinh học y tế',
  'MEBILIVO'),
 ('ESPACE COSMETIC SARL', 'AMOUD'),
 ('Công ty TNHH An Nông', 'FUANNONG'),
 ('GRACURE PHARMACEUTICALS LTD', 'Magycon'),
 ('PANACEA BIOTEC LTD.', 'NUCOVAC'),
 ('Công ty TNHH dược phẩm MEDISUN', 'ZO-ZO'),
 ('Douglas Pharmaceuticals Ltd', 'Fintral'),
 ('KREWI Medical Produkte GmbH', 'KREWI - cast'),
 ('Laboratorios Bagó S.A.', 'TRIFAMOX IBL'),
 ('ROXALL Medizin GmbH', 'CLUSTO-Scan>>'),
 ('Etec Crop Solutions Ltd', 'Z C STOPP'),
 ('Juan Carlos Font i Mas', 'MESOACNE COMPLEX'),
 ('AL ADHAM DETERGENT FACTORIES', 'PUREST'),
 ('Công ty TNHH một thành viên dược Nam Anh', 'Furagon'),
 ('UNION NACIONAL DE GRANDES SUPERFICIES, S.A. U.N.A.G.R.A.S.', 'U'),
 ('BSN medical GmbH', 'LEUKOSILK')]