# Text Classification - Drug Fact Sheets: Different Sections

# Load Data

In [13]:
import zipfile
import io
import os.path
import pandas as pd

# Read the tab-separated drug fact sheet export; each fact-sheet section is a column.
df = pd.read_csv('drugfactsheets.txt', sep='\t')
print(df.columns)
df

Index(['Category', 'GenericName', 'FormerName', 'EnglishName', 'Pinyin',
       'Traits', 'Pharmacology', 'Pharmacokinetics', 'Indications', 'Dosage',
       'AdverseReaction', 'Contraindications', 'Note', 'PregnancyLactation',
       'Children', 'Elderly', 'DrugInteractions', 'Overdose', 'Storage',
       'Packaging', 'ValidityPeriod', 'Ingredient', 'ChemicalName',
       'IngredientPinyin', 'IngredientEnglishName', 'CAS', 'ChemicalStructure',
       'ChemicalFormula', 'MolecularWeight', 'Specification', 'ATC', 'DDD',
       'DDDFactor'],
      dtype='object')


Unnamed: 0,Category,GenericName,FormerName,EnglishName,Pinyin,Traits,Pharmacology,Pharmacokinetics,Indications,Dosage,...,IngredientPinyin,IngredientEnglishName,CAS,ChemicalStructure,ChemicalFormula,MolecularWeight,Specification,ATC,DDD,DDDFactor
0,,,,,,,,,,,...,,,,,,,,,,
1,,0.9%氯化钠注射液,,,,,,,,,...,,,,,,,,,,
2,,0.9%氯化钠注射液(双管双层无菌包装),,,,本品为无色的澄明液体；味微咸。,,,,,...,,,,,,,10ml：0.09g,,,
3,,1.5%腹膜透析液(乳酸盐),,,,装在双联系统容器中的PD-2腹膜透析液为无菌、无热原的无色或微黄色的澄明溶液，只用于腹腔内给...,,,PD-2腹膜透析液适用于因非透析治疗无效而需要连续不卧床性腹膜透析治疗的慢性肾功能衰竭患者。,,...,,,,,,,含1.5%葡萄糖(2L：2.5L/袋)，其组份见表一,,,
4,,10%氯化钾注射液,,,,无色的澄明液体。,,,,,...,,,,,,,10ml：1.0g \n\n\n10m1：1.5g,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3663,,龙生蛭胶囊,,,,本品为胶囊剂，内容物为棕褐色粉末；气微腥，味微苦。,,,,口服，每次五粒，一日三次。疗程4周。,...,,,,,,,每粒装0.4g。,,,
3664,,龙胆碳酸氢钠片,,CHINESE GENTIAN AND SODIUM BICARBONATE TABLETS,LONGDAN TANSUANQINGNA PIAN,,有健胃、制酸作用。碳酸氢钠为可吸收性抗酸药，口服后中和胃酸作用迅速，\r\n但抗酸作用较弱，...,碳酸氢钠口服易被肠道吸收进入血液，经尿排泄，也可以二氧化碳形式经肺\r\n排出。,用于消化不良、食欲不振及反酸等。,口服，一次1～3片，一日3次，饭前服。,...,TANSUANQINGNA,SODIUM BICARBONATE,,,NaHCO3,84.01,0.15g,,,
3665,,龙胆紫,,,,,,,,,...,,,,,,,,,,
3666,,龙鹿丸,,,,本品为黑色的浓缩丸；气微腥。,,,,口服，一次3-5丸，每盒装2小盒。,...,,,,,,,每10丸重2g,,,


# Create Data Sets

In [33]:
# Fact-sheet columns used as classes, in class-index order. Each one is paired
# positionally with the variable below that receives its non-missing texts.
SECTION_COLUMNS = ['Pharmacology', 'Pharmacokinetics', 'Indications', 'Dosage',
                   'AdverseReaction', 'Contraindications', 'PregnancyLactation',
                   'DrugInteractions', 'Storage', 'Ingredient']

# One list of texts per section, with rows where that section is missing dropped.
(pharmacology, pharmacokinetics, indications, dosage, adverseReaction,
 contraindications, pregnancyLactation, drugInteractions, storage,
 ingredient) = [list(df[col].dropna().values) for col in SECTION_COLUMNS]

# Display names for the classes; position i names class index i.
labels = ['Pharmacology', 'pharmacokinetics', 'indications',
          'dosage', 'adverseReaction', 'contraindications',
          'pregnancyLactation', 'drugInteractions',
          'storage', 'ingredient']

In [5]:
import numpy as np

# Flatten the per-section text lists into one corpus. The section order here
# must stay the same as the order used when the label vector y is built.
X = [
    *pharmacology,
    *pharmacokinetics,
    *indications,
    *dosage,
    *adverseReaction,
    *contraindications,
    *pregnancyLactation,
    *drugInteractions,
    *storage,
    *ingredient,
]
TEXT = np.array(X)

In [6]:
# Number of documents contributed by each section, in class-index order
# (the same order in which the section lists are concatenated into X).
section_sizes = [
    len(pharmacology),
    len(pharmacokinetics),
    len(indications),
    len(dosage),
    len(adverseReaction),
    len(contraindications),
    len(pregnancyLactation),
    len(drugInteractions),
    len(storage),
    len(ingredient),
]

# Class index i is repeated once per document of section i.
y = np.repeat(np.arange(len(section_sizes)), section_sizes)
y.shape

(23768,)

# Stop Words

In computing, stop words are words which are filtered out before or after processing of natural language data (text). Though "stop words" usually refers to the most common words in a language, there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list. Some tools specifically avoid removing these stop words to support phrase search. 

In [7]:
# Execute the helper script in this notebook's namespace so that
# get_stop_words() can be called below.
%run get_stop_words.py
# Stop words to exclude; passed to DTM_CHN and to word_frequency/doc2bow later on.
stpwrdlst = get_stop_words()
# Report how many stop words were loaded.
print(len(stpwrdlst))

2680


# 分词

参考: [Word Segmentation.ipynb](Word%20Segmentation.ipynb)

In [10]:
# Execute the helper script in this notebook's namespace so that DTM_CHN() can be called below.
%run DTM_CHN.py
# NOTE(review): this rebinds X from the raw text list to DTM_CHN's second return
# value, so the cell is not idempotent -- re-running it without first rebuilding X
# passes the previous result back in instead of the texts.
# The second return value is later given to LogisticRegression.fit as the feature
# matrix and the third (vocab) to doc2bow as the vocabulary; presumably a
# document-term matrix and its column terms -- confirm against DTM_CHN.py.
_, X, vocab = DTM_CHN(X, [], stpwrdlst)

  0%|                                                                                        | 0/23768 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\eleve\AppData\Local\Temp\jieba.cache
Loading model cost 0.638 seconds.
Prefix dict has been built successfully.
100%|███████████████████████████████████████████████████████████████████████████| 23768/23768 [00:43<00:00, 552.07it/s]


In [15]:
import jieba

# Get histogram
def word_frequency(s, stpwrdlst):
    """Count how often each non-stop-word token occurs in ``s``.

    The text is segmented with ``jieba.cut(s, cut_all=False)``.

    Parameters
    ----------
    s : str
        Text to segment.
    stpwrdlst : iterable of str
        Tokens to leave out of the result.

    Returns
    -------
    dict
        Maps each remaining token to its number of occurrences in ``s``.
        Empty when ``s`` yields no tokens or every token is a stop word.
    """
    # Hash-based membership: testing against a list rescans every stop word
    # for each token.
    if isinstance(stpwrdlst, (set, frozenset)):
        stop_words = stpwrdlst
    else:
        stop_words = set(stpwrdlst)

    wordfreq = {}
    # Single counting pass; calling list.count() once per distinct token is
    # quadratic in the number of tokens.
    for w in jieba.cut(s, cut_all=False):
        if w not in stop_words:
            wordfreq[w] = wordfreq.get(w, 0) + 1

    return wordfreq

def doc2bow(s, stpwrdlst, vocab):
    """Encode text ``s`` as a count vector aligned with ``vocab``.

    Entry ``i`` is the number of times ``vocab[i]`` occurs in ``s`` according
    to ``word_frequency(s, stpwrdlst)``, or 0.0 when that term is absent.
    Tokens of ``s`` that are not in ``vocab`` do not contribute.

    Returns a 1-D float NumPy array with one entry per element of ``vocab``.
    """
    counts = word_frequency(s, stpwrdlst)
    return np.array([counts.get(term, 0) for term in vocab], dtype=float)

# Logistic Regression (多分类)

## 训练

In [43]:
from sklearn.linear_model import LogisticRegression

# One-vs-rest logistic regression with an L1 penalty.
# 'newton-cg', 'lbfgs' and 'sag' only handle L2 penalty, whereas 'liblinear' and 'saga' handle L1 penalty.
# random_state is fixed because liblinear shuffles the data, so an unseeded fit
# can differ between runs of the notebook.
clf = LogisticRegression(solver='liblinear', multi_class='ovr', penalty='l1',
                         random_state=0).fit(X, y)

# NOTE: this is accuracy on the same data the model was fitted on (training
# accuracy), not an estimate of performance on unseen text.
clf.score(X, y)

0.9216173005721979

## 测试

将病历拆分成单个句子，进行分类预测

In [44]:
# Sample sentences taken from a medical record; each one is classified
# independently by the loop that follows.
text = ['2010年12月16日因右鼻翼外侧疼痛明显，来门诊就诊，得理多加量为0.2，每天3次，加巴喷丁3片，每天3次（原服用得理多0.1，每天3次，加巴喷丁2片，每天3次）。',
    '第二天早上服药后出现头晕，觉头部晕沉感，呈持续性，无天旋地转感，头晕与体位改变无明显关系，觉胸闷，并逐渐出现四肢震颤，双手静止及拿物体时均有震颤，觉四肢乏力，行走需家人搀扶，双上肢能持物，但无法持太重的物体，有恶心、呕吐',
    '出生并长大于原籍。否认疫区、疫水接触史，否认特殊化学品及放射线接触史。无吸烟饮酒等不良嗜好，性病冶游史不详',
    '行电子结肠镜示结肠多发性息肉，并行摘除术，病理示：结肠腺瘤性息肉。鼻内窥镜检查未见异常。;  胸片示：1.右上肺纤维、增殖灶；2.心影增大，请结合临床考虑；3.主动脉硬化；4.骨质疏松；5.胸椎骨质增生。肝胆胰脾B超示：肝囊肿，S5，单发；胆囊息肉，单发；胆管、脾脏、胰腺超声检查未见异常。 ']

# Class names as an array so the predicted class index can be used to look up its name.
label_names = np.array(labels)

for sentence in text:
    print(sentence)
    # Encode the sentence against the training vocabulary; the classifier
    # expects a batch, so the single vector is wrapped in a list.
    batch = [doc2bow(sentence, stpwrdlst, vocab)]
    print(clf.predict_proba(batch))
    print(label_names[clf.predict(batch)])
    print('')

2010年12月16日因右鼻翼外侧疼痛明显，来门诊就诊，得理多加量为0.2，每天3次，加巴喷丁3片，每天3次（原服用得理多0.1，每天3次，加巴喷丁2片，每天3次）。
[[0.00414478 0.00792651 0.07635559 0.04881155 0.3127069  0.05797698
  0.09438418 0.3682393  0.00759648 0.02185775]]
['drugInteractions']

第二天早上服药后出现头晕，觉头部晕沉感，呈持续性，无天旋地转感，头晕与体位改变无明显关系，觉胸闷，并逐渐出现四肢震颤，双手静止及拿物体时均有震颤，觉四肢乏力，行走需家人搀扶，双上肢能持物，但无法持太重的物体，有恶心、呕吐
[[5.30916667e-03 6.31213354e-03 2.06402774e-03 9.22108694e-04
  9.44539180e-01 4.54456995e-06 1.44346879e-02 2.66049825e-03
  1.10357912e-03 2.26500740e-02]]
['adverseReaction']

出生并长大于原籍。否认疫区、疫水接触史，否认特殊化学品及放射线接触史。无吸烟饮酒等不良嗜好，性病冶游史不详
[[0.09800836 0.01109998 0.03030553 0.00883045 0.150945   0.04044392
  0.02509639 0.2896884  0.00258816 0.3429938 ]]
['ingredient']

行电子结肠镜示结肠多发性息肉，并行摘除术，病理示：结肠腺瘤性息肉。鼻内窥镜检查未见异常。;  胸片示：1.右上肺纤维、增殖灶；2.心影增大，请结合临床考虑；3.主动脉硬化；4.骨质疏松；5.胸椎骨质增生。肝胆胰脾B超示：肝囊肿，S5，单发；胆囊息肉，单发；胆管、脾脏、胰腺超声检查未见异常。 
[[0.0175544  0.0096202  0.26911215 0.01025077 0.13530124 0.00151328
  0.12495278 0.18787663 0.00300445 0.24081411]]
['indications']



In [45]:
# For each class, list the 20 vocabulary terms with the largest coefficients
# in that class's row of clf.coef_, followed by those coefficients.
vocab_terms = np.array(vocab)

for k, label_name in enumerate(labels):
    print()
    print(label_name)
    weights = clf.coef_[k]
    # Negating the weights makes argsort return indices in descending weight order.
    top_idx = np.argsort(-weights)[:20]
    print(vocab_terms[top_idx])
    print(weights[top_idx])


Pharmacology
['为类' '药理作用' '毒理学' '相结合' '前期' '抑制作用' '易于' '病灶' '癌细胞' '抑菌作用' '选择性' '血流'
 'rna' '兴奋' '毒理' '结果表明' '有机' '从尿中' 'na' '作用']
[4.25108523 4.07539626 4.00088643 3.86727673 3.73998226 3.39482022
 3.35118657 3.12906869 2.7859804  2.69481149 2.66544868 2.60149977
 2.53695838 2.38894153 2.35841682 2.26239372 2.23521505 2.21623221
 2.09340912 2.06640854]

pharmacokinetics
['生育能力' '服药者' '溶酶体' '参数' '文献' 'ed95' '利用' '吸收' '排出' 't1' '眼后' '峰值' '途径'
 '半衰期' '分布' '起效' '参考文献' 'gt' '小白鼠' '排泄']
[6.92045018 5.43139841 3.75843663 3.6936322  3.20636832 3.16908544
 3.13741057 3.09184706 2.95668001 2.89730544 2.72426554 2.6820205
 2.52027858 2.31285164 2.22967519 2.19390777 2.19027264 2.15851869
 2.01350595 1.99868839]

indications
['用于' '骨关节炎' '骨质疏松症' '寻常' '高能量' '辅助' '十二指肠' '克罗恩' '疟疾' '尖锐湿疣' '老年性' '缺铁性'
 '阴道炎' '原因' '绝经' '慢性病' '预防' '营养' '湿疹' '感冒']
[4.91204543 4.23245798 3.80032537 3.79964492 3.65542755 3.62681426
 3.55095848 3.49245413 3.24821249 3.2257261  3.18519602 3.16889258
 3.10350835 3.097272   3