In [1]:
import pandas as pd
import numpy as np
import re
import os
import glob

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

In [2]:
dataset = pd.read_csv('./DiseaseSymptomKB.csv', encoding='utf-8', index_col=None, header=0)

In [3]:
dataset

Unnamed: 0,Disease,Symptom
0,C0020538,C0008031
1,C0020538,C0392680
2,C0020538,C0012833
3,C0020538,C0004093
4,C0020538,C0085639
5,C0020538,C0039070
6,C0020538,C0042571
7,C0020538,C0038990
8,C0020538,C0030252
9,C0020538,C0027497


In [4]:
symptom_dummies = pd.get_dummies(dataset.Symptom)
diseases = dataset['Disease']
sd_pivoted = pd.concat([diseases,symptom_dummies], axis=1)

In [5]:
sd_pivoted

Unnamed: 0,Disease,C0000727,C0000731,C0000737,C0002416,C0002962,C0003123,C0003126,C0003862,C0003962,...,C1305739,C1313921,C1321756,C1384489,C1384606,C1405524,C1444773,C1511606,C1513183,C1517205
0,C0020538,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,C0020538,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,C0020538,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,C0020538,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,C0020538,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,C0020538,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,C0020538,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,C0020538,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,C0020538,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,C0020538,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
symptoms = sd_pivoted.columns[1:]

In [7]:
symptoms

Index(['C0000727', 'C0000731', 'C0000737', 'C0002416', 'C0002962', 'C0003123',
       'C0003126', 'C0003862', 'C0003962', 'C0004093',
       ...
       'C1305739', 'C1313921', 'C1321756', 'C1384489', 'C1384606', 'C1405524',
       'C1444773', 'C1511606', 'C1513183', 'C1517205'],
      dtype='object', length=397)

In [8]:
diseases

0       C0020538
1       C0020538
2       C0020538
3       C0020538
4       C0020538
5       C0020538
6       C0020538
7       C0020538
8       C0020538
9       C0020538
10      C0020538
11      C0020538
12      C0011847
13      C0011847
14      C0011847
15      C0011847
16      C0011847
17      C0011847
18      C0011847
19      C0011847
20      C0011847
21      C0011847
22      C0011847
23      C0011847
24      C0011847
25      C0011847
26      C0011570
27      C0011570
28      C0011570
29      C0011570
          ...   
1836    C0011253
1837    C0011253
1838    C0011253
1839    C0011253
1840    C0011253
1841    C0011253
1842    C0011253
1843    C0011253
1844    C0011253
1845    C0011253
1846    C0011253
1847    C0011253
1848    C0011253
1849    C0011253
1850    C0011253
1851    C0011253
1852    C0011253
1853    C0011253
1854    C0011253
1855    C0233472
1856    C0233472
1857    C0233472
1858    C0233472
1859    C0233472
1860    C0233472
1861    C0233472
1862    C0233472
1863    C00111

In [9]:
sd_pivoted = sd_pivoted.groupby('Disease').sum()

In [10]:
sd_pivoted

Unnamed: 0_level_0,C0000727,C0000731,C0000737,C0002416,C0002962,C0003123,C0003126,C0003862,C0003962,C0004093,...,C1305739,C1313921,C1321756,C1384489,C1384606,C1405524,C1444773,C1511606,C1513183,C1517205
Disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001175,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0001418,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0001511,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0001973,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0002395,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0002871,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
C0002895,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0003507,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0003537,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0003864,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [11]:
sd_pivoted = sd_pivoted.reset_index()

In [12]:
x = sd_pivoted[symptoms]

In [13]:
y = diseases

In [14]:
y = sd_pivoted['Disease']

In [15]:
mnb = MultinomialNB()
mnb = mnb.fit(x, y)

mnb.score(x, y)

1.0

In [16]:
disease_pred = mnb.predict(x)

In [17]:
disease_pred

array(['C0001175', 'C0001418', 'C0001511', 'C0001973', 'C0002395',
       'C0002871', 'C0002895', 'C0003507', 'C0003537', 'C0003864',
       'C0004096', 'C0004610', 'C0005001', 'C0005586', 'C0006142',
       'C0006266', 'C0006277', 'C0006826', 'C0006840', 'C0007097',
       'C0007102', 'C0007642', 'C0007787', 'C0008325', 'C0008350',
       'C0009319', 'C0009676', 'C0010054', 'C0011127', 'C0011168',
       'C0011175', 'C0011206', 'C0011253', 'C0011570', 'C0011847',
       'C0011880', 'C0012813', 'C0013405', 'C0014118', 'C0014544',
       'C0014549', 'C0015230', 'C0017152', 'C0017160', 'C0017168',
       'C0017601', 'C0018099', 'C0018801', 'C0018802', 'C0018989',
       'C0019112', 'C0019158', 'C0019163', 'C0019196', 'C0019204',
       'C0019270', 'C0019291', 'C0020433', 'C0020443', 'C0020456',
       'C0020473', 'C0020538', 'C0020542', 'C0020615', 'C0020676',
       'C0021167', 'C0021311', 'C0021400', 'C0022116', 'C0022658',
       'C0022660', 'C0022661', 'C0023267', 'C0024117', 'C00242

In [18]:
joblib.dump(mnb,'model.pkl', protocol=2)

['model.pkl']

In [19]:
features = symptoms.tolist()

In [20]:
features

['C0000727',
 'C0000731',
 'C0000737',
 'C0002416',
 'C0002962',
 'C0003123',
 'C0003126',
 'C0003862',
 'C0003962',
 'C0004093',
 'C0004134',
 'C0004604',
 'C0006157',
 'C0006318',
 'C0006625',
 'C0007398',
 'C0007859',
 'C0008031',
 'C0008033',
 'C0008301',
 'C0008767',
 'C0009024',
 'C0009806',
 'C0010200',
 'C0010520',
 'C0011991',
 'C0012833',
 'C0013132',
 'C0013144',
 'C0013362',
 'C0013404',
 'C0013428',
 'C0013491',
 'C0014394',
 'C0015672',
 'C0015967',
 'C0016204',
 'C0016382',
 'C0016512',
 'C0016579',
 'C0016927',
 'C0018681',
 'C0018800',
 'C0018834',
 'C0018862',
 'C0018932',
 'C0018965',
 'C0018991',
 'C0019079',
 'C0019080',
 'C0019209',
 'C0019214',
 'C0019572',
 'C0019825',
 'C0020175',
 'C0020303',
 'C0020440',
 'C0020458',
 'C0020461',
 'C0020578',
 'C0020580',
 'C0020598',
 'C0020621',
 'C0020625',
 'C0020639',
 'C0020649',
 'C0020672',
 'C0022107',
 'C0023222',
 'C0023380',
 'C0024031',
 'C0024103',
 'C0026827',
 'C0026961',
 'C0027066',
 'C0027497',
 'C0027498',

In [21]:
f= open("features.txt","w+")

In [22]:
f.write(','.join(features))

3572

In [23]:
f.close()