# 训练逻辑回归(LR)模型，进行医学文本分类

## 1. 获得训练数据，测试数据

In [None]:
import codecs
import pickle
import time

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

try:
    from sklearn.externals import joblib
except ImportError:
    # scikit-learn >= 0.23 no longer ships sklearn.externals.joblib;
    # fall back to the standalone joblib package it depends on.
    import joblib

In [None]:
# Load the corpora: tab-separated files without a header row.
# Training set: one (id, category, sentence) triple per line.
train_file = "../data/train.data"
train_data = pd.read_csv(
    train_file,
    sep="\t",
    header=None,
    names=["id", "category", "sentence"],
)

# Test set: one (id, sentence) pair per line; it has no category column.
test_file = "../data/test.data"
test_data = pd.read_csv(
    test_file,
    sep="\t",
    header=None,
    names=["id", "sentence"],
)

# Report how many rows each frame holds.
print("训练集数据有：{}条".format(train_data.shape[0]))
print("测试集数据有：{}条".format(test_data.shape[0]))

In [None]:
# Preview the first 10 rows of the training data
train_data.head(n=10)

In [None]:
# Count how many training sentences belong to each category.
# size() + reset_index(name=...) produces a two-column frame (category, count).
# The dict-renaming form SeriesGroupBy.agg({'count': 'count'}) used previously
# raises SpecificationError on pandas >= 1.0.
train_data.groupby('category').size().reset_index(name='count')

In [None]:
# Preview the first 10 rows of the test data
test_data.head(n=10)

## 2. 数据格式转换

### 2.1 把文本转换为向量，使用TF-IDF算法

In [None]:
# Convert raw text into TF-IDF feature vectors
def tfidf(raw_text, max_features=200):
    """Segment sentences with jieba and vectorise them with TF-IDF.

    Both fitted objects (the CountVectorizer and the TfidfTransformer) are
    dumped with joblib so that the same vocabulary and the same IDF weights
    can be applied to unseen sentences later.

    Files written to the current working directory:
        LR_count_voca         one "word<TAB>count" line per vocabulary term,
                              where count is the total number of occurrences
                              of that term in ``raw_text``.
        LR_count_vect         the fitted CountVectorizer.
        LR_tfidf_transformer  the fitted TfidfTransformer.

    Args:
        raw_text: iterable of sentence strings.
        max_features: upper bound on the vocabulary size, and therefore on the
            length of each returned vector. Too few terms lose information,
            too many increase training time.

    Returns:
        A list with one 1-D numpy array of TF-IDF weights per input sentence.
    """
    import jieba
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    try:
        from sklearn.externals import joblib
    except ImportError:
        # Newer scikit-learn releases no longer ship sklearn.externals.joblib.
        import joblib

    # Segment each sentence with jieba and join the tokens with spaces so that
    # CountVectorizer can tokenise the result.
    preprocess_sentences = [' '.join(jieba.cut(sentence)) for sentence in raw_text]

    counter = CountVectorizer(max_features=max_features)
    counts = counter.fit_transform(preprocess_sentences)
    print('countvectorizer词表:\n', counter.vocabulary_)

    # vocabulary_ maps each term to its column index, not to a frequency, so
    # the per-term totals are taken from the count matrix itself.
    term_totals = np.asarray(counts.sum(axis=0)).ravel()

    # Dump the vocabulary with its corpus frequencies.
    # NOTE: the file is opened with the platform default encoding; whatever
    # reads it back has to open it the same way.
    with open("LR_count_voca", "w") as f:
        for word, column in counter.vocabulary_.items():
            f.write("{}\t{}\n".format(word, int(term_totals[column])))

    tfidfer = TfidfTransformer()
    weights = tfidfer.fit_transform(counts).toarray()
    print('tfidf向量矩阵：\n', weights)

    # Persist both fitted stages of the pipeline.
    joblib.dump(counter, 'LR_count_vect')
    joblib.dump(tfidfer, 'LR_tfidf_transformer')
    return list(weights)

# Vectorise every training sentence; data_X holds one numpy vector per sentence
train_text = train_data['sentence'].tolist()
result = tfidf(train_text)
data_X = list(map(np.array, result))

In [None]:
# Draw a word cloud of the terms that make up the feature vectors
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Every line of the vocabulary file is "word<TAB>integer"; the word is
# repeated that many times in the text handed to WordCloud.
cloud_words = []
with open("LR_count_voca", "r") as f:
    for line in f:
        fields = line.strip().split("\t")
        cloud_words += [fields[0]] * int(fields[1])

cloud_text = " ".join(cloud_words)
wordcloud_ = WordCloud(
    background_color="white",
    collocations=False,
    max_words=1000,
    font_path="C:\\Windows\\Fonts\\msyh.ttc",
).generate(cloud_text)

# Render the cloud without axes.
plt.figure()
plt.imshow(wordcloud_, interpolation='bilinear')
plt.axis("off")
plt.show()


### 2.2 将类别(15个)转换为数字(0-14)表示

In [None]:
# Map every category name to an integer id: the distinct names are sorted and
# each one receives its position in that sorted list.
raw_category = train_data['category'].tolist()
category_ = sorted(set(raw_category))
category_index = {}
for position, name in enumerate(category_):
    category_index[name] = position
    print(position, name)

# Integer label for every training row, in row order
data_Y = np.array([category_index[c] for c in raw_category])

### 2.3 查看转换后的训练数据的输入与输出格式

In [None]:
print(data_X[1]) # the full data is too large to display, so only the vector at index 1 (the second sample) is printed
print(data_Y) # the integer class labels of all training samples

## 3. 训练逻辑回归模型

### 3.1 使用训练数据训练，并保存模型

In [None]:
from sklearn.linear_model import LogisticRegression
import pickle
import time

# Train an L2-regularised logistic regression model and time the fit
start_time = time.time()
model = LogisticRegression(penalty='l2')
model.fit(data_X, data_Y)
print('training took %fs!' % (time.time() - start_time))

# Save the model; the context manager makes sure the file handle is flushed
# and closed once the pickle has been written.
model_save_file = "LR.mkl"
with open(model_save_file, 'wb') as model_fh:
    pickle.dump(model, model_fh)

## 4. 如何预测新输入？

### 4.1 构建预测函数

In [None]:
import jieba
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
try:
    from sklearn.externals import joblib
except ImportError:
    # scikit-learn >= 0.23 no longer ships sklearn.externals.joblib.
    import joblib

# Category names, indexed by the integer class id returned by the model.
# NOTE(review): the order has to match the category -> id mapping used for the
# training labels -- confirm whenever the label set changes.
tags = (
    "Addictive Behavior",
    "Age",
    "Allergy Intolerance",
    "Compliance with Protocol",
    "Consent",
    "Diagnostic",
    "Disease",
    "Enrollment in other studies",
    "Laboratory Examinations",
    "Life Expectancy",
    "Organ or Tissue Status",
    "Pharmaceutical Substance or Drug",
    "Risk Assessment",
    "Smoking Status",
    "Therapy or Surgery",
)


def predict(sentence):
    """Return the predicted category name for one sentence.

    The sentence is segmented with jieba, counted with the CountVectorizer
    stored in ``LR_count_vect`` and weighted with TF-IDF before being passed
    to the global ``model``.

    If a fitted TfidfTransformer is available in ``LR_tfidf_transformer`` it
    is loaded and applied with ``transform``, so the IDF weights learned from
    the training corpus are reused. When that file does not exist, a new
    transformer is fitted on the single input sentence; in that case the
    corpus IDF weights are not applied.

    Args:
        sentence: the text to classify.

    Returns:
        The entry of ``tags`` selected by the model's predicted class id.
    """
    import os

    # Segment the sentence the same way the training text was segmented.
    preprocess_s = ' '.join(jieba.cut(sentence))

    # Bag-of-words counts over the stored vocabulary.
    count_vect = joblib.load('LR_count_vect')
    X_new_counts = count_vect.transform([preprocess_s])

    tfidf_path = 'LR_tfidf_transformer'
    if os.path.exists(tfidf_path):
        # Reuse the stored IDF weights.
        # NOTE: a stale file fitted on a different vocabulary would not match
        # the columns produced by LR_count_vect.
        tfidfer = joblib.load(tfidf_path)
        weighted = tfidfer.transform(X_new_counts)
    else:
        # No persisted transformer: fit one on this single sentence.
        tfidfer = TfidfTransformer()
        weighted = tfidfer.fit_transform(X_new_counts)
    criteria_X = weighted.toarray()

    # Classify with the trained model and translate the class id to its name.
    predicted_ids = model.predict(criteria_X)
    return tags[predicted_ids[0]]

### 4.2 输入一条句子，进行预测

In [None]:
# Classify a single new sentence
# Alternative example inputs (uncomment one to try it):
# s = "3.有糖尿病的患者。"
# s = "1.术后发生非计划再次手术；"
s = "白细胞计数升高或大于10^9/L"
predict(s)

### 4.3 输入一份文件，进行预测，并保存结果

In [None]:
# Classify every test sentence and write the results to ../data/test.LR.predict
import codecs

test_sentences = list(test_data["sentence"])
total = str(len(test_sentences))
n = 0
with codecs.open("../data/test.LR.predict", "w", encoding="utf-8") as f:
    # n is the 1-based position of the sentence and doubles as its output id.
    for n, s in enumerate(test_sentences, start=1):
        c = predict(s)  # predicted category name
        f.write("s{}\t{}\t{}\n".format(n, c, s))  # one "s<n>, category, sentence" row
        # Report progress after every 10th sentence.
        if n % 10 == 0:
            print("[INFO] Processing: {0}/{1} \t {2:<50s} {3:10s}".format(str(n), total, c, s))

### 4.4 系统评估

In [None]:
import sys
# Make the project-local module in ../code importable.
sys.path.append("../code")
import evaluation
# Build the records from the gold file and the prediction file.
# NOTE(review): presumably this pairs gold and predicted labels per sentence --
# confirm against the evaluation module.
results = evaluation.Record_results('../data/test.gold', '../data/test.LR.predict')
# NOTE(review): this rebinds the name `evaluation` from the imported module to
# the Evaluation object, so the module is no longer reachable under that name
# until it is imported again.
evaluation = evaluation.Evaluation(results.records)