In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [75]:
df = pd.read_csv('coded.csv', names=['NO.', 'diagnosis', 'indication', 'protocol'], header=0)
df.head(5)

Unnamed: 0,NO.,diagnosis,indication,protocol
0,44446,oichnilcnomragcaato lamisscooubrisl,"oichnilcnomragcaato lamisscooubrisl, inrcutstboos",1
1,50987,barnco fa ualrrierg (*),aaelnpsiep eart ceekh,2
2,26789,mturamaav auisdoctnalstarishilnuncbe of oichni...,gvcioincnn utc itersmoabplaaoipoy; ipaaolsrnav...,1
3,43310,smanaedo of ssiedmdo,begigr emov ssiedmdo ypssrcte,3
4,50548,afiliaccoicnt hcr nwyel (*),afiliaccoicnt hcr nwyel with mturamaav aspemhs...,4


In [76]:
df = df.drop(df.columns[0], axis=1)
df['comb'] = df["diagnosis"].str.cat(df["indication"],sep=" ")
df.head(5)

Unnamed: 0,diagnosis,indication,protocol,comb
0,oichnilcnomragcaato lamisscooubrisl,"oichnilcnomragcaato lamisscooubrisl, inrcutstboos",1,oichnilcnomragcaato lamisscooubrisl oichnilcno...
1,barnco fa ualrrierg (*),aaelnpsiep eart ceekh,2,barnco fa ualrrierg (*) aaelnpsiep eart ceekh
2,mturamaav auisdoctnalstarishilnuncbe of oichni...,gvcioincnn utc itersmoabplaaoipoy; ipaaolsrnav...,1,mturamaav auisdoctnalstarishilnuncbe of oichni...
3,smanaedo of ssiedmdo,begigr emov ssiedmdo ypssrcte,3,smanaedo of ssiedmdo begigr emov ssiedmdo ypss...
4,afiliaccoicnt hcr nwyel (*),afiliaccoicnt hcr nwyel with mturamaav aspemhs...,4,afiliaccoicnt hcr nwyel (*) afiliaccoicnt hcr ...


In [0]:
df['comb'] = df['comb'].astype(str) #int to string
df['indication'] = df['indication'].astype(str) #int to string

In [78]:
#seperate training and test data
X_train, X_test, y_train, y_test = train_test_split(df['indication'], df['protocol'], random_state = 0)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)


(38505,) (38505,) (12835,) (12835,)


In [79]:
X_train[0:2]

38975    lrpvravaalua
39706    ssiedmdo dic
Name: indication, dtype: object

In [80]:
#build CountVectorizer
count_vect = CountVectorizer(ngram_range=(1,2))
X_train_counts = count_vect.fit_transform(X_train)
print(X_train_counts.shape)

(38505, 61260)


In [81]:
#build TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf)

  (0, 33062)	1.0
  (1, 50860)	0.6277130608272921
  (1, 50836)	0.5180485855778103
  (1, 14757)	0.581035262482117
  (2, 40771)	0.25293920314797613
  (2, 40766)	0.21962226857997558
  (2, 30346)	0.14131034250609506
  (2, 30279)	0.12592660038616266
  (2, 28257)	0.2719087675285945
  (2, 28242)	0.23492052403193253
  (2, 25191)	0.20867227925741805
  (2, 24363)	0.2687318039652264
  (2, 24205)	0.07087525840748465
  (2, 22645)	0.16533313600681648
  (2, 22439)	0.09929750385185021
  (2, 22198)	0.2793467865668677
  (2, 22143)	0.18754163410556432
  (2, 12821)	0.3316332855154866
  (2, 12760)	0.16143737611284376
  (2, 4409)	0.3255559814547858
  (2, 4408)	0.31204065835581185
  (2, 2047)	0.2631634088454576
  (2, 2039)	0.20736493961255428
  (3, 56797)	0.25965835720216984
  (3, 56790)	0.23292576565911868
  :	:
  (38502, 23747)	0.19727161615891575
  (38502, 23746)	0.10504111557102254
  (38502, 21404)	0.20491856505995226
  (38502, 21403)	0.1787734740785332
  (38502, 15411)	0.20491856505995226
  (38502, 15407

In [82]:
clf = MultinomialNB(alpha=0.01)
clf.fit(X_train_tfidf, y_train)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [83]:
pred = clf.predict(count_vect.transform(X_test))
actual = y_test.values
print(len(pred), len(actual))

12835 12835


In [84]:
total = len(actual)
correct = 0
for i in range(len(pred)):
    if pred[i] == actual[i]:
        correct += 1
print("For the dominant protocols, the accuracy of 1 to 1 mapping is {:0.2f}%".format((correct / total) * 100))

For the dominant protocols, the accuracy of 1 to 1 mapping is 69.52%
