# 训练神经网络模型，进行医学文本分类

## 1. 获得训练数据，测试数据

In [None]:
import codecs

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

# scikit-learn >= 0.23 no longer ships sklearn.externals.joblib; prefer the
# standalone package and keep the legacy location as a fallback.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# Load the tab-separated data sets (neither file has a header row).
# Training set columns: id, category, sentence.
train_file = "../data/train.data"
train_columns = ["id", "category", "sentence"]
train_data = pd.read_csv(train_file, header=None, names=train_columns, sep="\t")

# Test set columns: id, sentence (no category column).
test_file = "../data/test.data"
test_columns = ["id", "sentence"]
test_data = pd.read_csv(test_file, header=None, names=test_columns, sep="\t")

# Report how many rows each set contains.
for message, frame in (("训练集数据有：{}条", train_data), ("测试集数据有：{}条", test_data)):
    print(message.format(len(frame)))

In [None]:
# Preview the first 10 training rows (rendered as the cell's output).
train_data.iloc[:10]

In [None]:
# List every training category together with its number of samples.
# The previous dict-renaming form, SeriesGroupBy.agg({'count': 'count'}), raises
# SpecificationError ("nested renamer is not supported") on pandas >= 1.0.
# size() + reset_index(name=...) yields the same two columns
# ('category', 'count') on old and new pandas alike.
train_data.groupby('category').size().reset_index(name='count')

In [None]:
# Preview the first 10 test rows (rendered as the cell's output).
test_data.iloc[:10]

## 2. 数据格式转换

### 2.1 把文本转换为向量，使用TF-IDF算法

In [None]:
# Convert raw text into TF-IDF feature vectors.
def tfidf(raw_text, max_features=1000):
    """Segment sentences with jieba and turn them into TF-IDF vectors.

    Side effects (all files are written to the current working directory):
      * ``count_voca``: one ``word<TAB>corpus_frequency`` line per vocabulary
        term, written with the platform default encoding (the word-cloud cell
        reads it back the same way).
      * ``count_vect``: joblib dump of the fitted ``CountVectorizer``.
      * ``tfidf_transformer``: joblib dump of the fitted ``TfidfTransformer``,
        so the corpus IDF weights can be re-applied to unseen text.
    The vocabulary and the dense TF-IDF matrix are also printed.

    Args:
        raw_text: iterable of sentence strings.
        max_features: vocabulary size cap, i.e. the vector dimension. Too few
            features lose information, too many slow down training. Other
            cells reshape vectors to (1000, 1), so keep them in sync when
            changing this.

    Returns:
        A list with one 1-D numpy array (TF-IDF row) per input sentence.
    """
    import jieba
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
    # sklearn.externals.joblib was removed in scikit-learn 0.23.
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib

    # CountVectorizer splits on whitespace/punctuation, so join the jieba
    # tokens of each sentence with spaces first.
    segmented = [' '.join(jieba.cut(sentence)) for sentence in raw_text]

    counter = CountVectorizer(max_features=max_features)
    counts = counter.fit_transform(segmented)
    print('countvectorizer词表:\n',counter.vocabulary_)

    # vocabulary_ maps each term to its column index, not to a frequency.
    # Sum every column of the count matrix to get the real corpus frequency
    # that the "word<TAB>freq" file is meant to hold.
    term_totals = np.asarray(counts.sum(axis=0)).ravel()
    with open("count_voca", "w") as f:
        for word, column in counter.vocabulary_.items():
            f.write("{}\t{}\n".format(word, int(term_totals[column])))

    tfidfer = TfidfTransformer()
    weighted = tfidfer.fit_transform(counts)
    # Densify once; the matrix is both printed and returned.
    dense = weighted.toarray()
    print('tfidf向量矩阵：\n',dense)

    joblib.dump(counter, 'count_vect')  # fitted vocabulary / tokenizer
    joblib.dump(tfidfer, 'tfidf_transformer')  # fitted IDF weights
    return list(dense)

# Vectorise every training sentence; data_X holds one numpy array per sentence.
train_text = train_data['sentence'].tolist()
result = tfidf(train_text)
data_X = list(map(np.array, result))

In [None]:
# Draw a word cloud of the terms that make up the feature vector.
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Every "count_voca" line is "<word>\t<integer>"; the word is repeated that
# many times so the integer acts as its weight in the generated text.
cloud_words = []
with open("count_voca", "r") as vocab_file:
    for record in vocab_file:
        fields = record.strip().split("\t")
        cloud_words += [fields[0]] * int(fields[1])

# NOTE(review): font_path is a Windows-only location, presumably chosen so
# Chinese glyphs render; adjust it on other platforms.
cloud_options = dict(
    background_color="white",
    collocations=False,
    max_words=1000,
    font_path="C:\\Windows\\Fonts\\msyh.ttc",
)
wordcloud_ = WordCloud(**cloud_options).generate(" ".join(cloud_words))

plt.figure()
plt.imshow(wordcloud_, interpolation='bilinear')
plt.axis("off")
plt.show()


### 2.2 将类别(15个)转换为数字(0-14)表示

In [None]:
# Map each distinct category label to an integer id (its position in the
# sorted list of labels) and encode every training label with that id.
raw_category = list(train_data['category'])
category_ = sorted(set(raw_category))
category_index = {}
for position, label in enumerate(category_):
    category_index[label] = position
    print(position, label)
data_Y = np.array([category_index[label] for label in raw_category])

### 2.3 查看转换后的训练数据的输入与输出格式

In [None]:
print(data_X[1]) # The data is too large to print in full (8000 rows per the original note), so show a single sample: index 1
print(data_Y) # Encoded category ids of all training samples

## 3. 训练神经网络

### 3.1 将数据分隔为训练集&验证集

In [None]:
# Hold out 25% of the samples as a dev set (3:1 train/dev ratio); the fixed
# random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(data_X, data_Y, test_size=0.25, random_state=0)
x_train, x_dev, y_train, y_dev = split_parts
train, dev = [x_train, y_train], [x_dev, y_dev]
print(*(len(part) for part in (x_train, y_train, x_dev, y_dev)))

### 3.2 转换为神经网络输入层格式

In [None]:
def vectorized_result(j, num_classes=15):
    """Return a one-hot column vector for class index ``j``.

    Args:
        j: index of the class to mark.
        num_classes: total number of classes, i.e. the vector height.
            Defaults to 15, the size of the network's output layer.

    Returns:
        A ``(num_classes, 1)`` float array that is 1.0 at row ``j`` and 0.0
        elsewhere.

    Raises:
        IndexError: if ``j`` is outside the vector's bounds.
    """
    e = np.zeros((num_classes, 1))
    e[j] = 1.0
    return e

# Training pairs: 1000 x 1 input column vectors with 15 x 1 one-hot targets.
training_inputs = [np.reshape(x, (1000, 1)) for x in train[0]]
training_results = [vectorized_result(y) for y in train[1]]
# Materialise the pairs as a list. In Python 3 a bare zip() is a one-shot
# iterator: it is empty after the first pass (e.g. when the training cell is
# re-run) and supports neither len() nor in-place shuffling.
training = list(zip(training_inputs, training_results))

# Dev pairs keep the integer class id as the target (no one-hot encoding).
dev_inputs = [np.reshape(x, (1000, 1)) for x in dev[0]]
dev_data = list(zip(dev_inputs, dev[1]))

### 3.3 导入神经网络函数，设计自己的神经网络，并进行训练

In [None]:
# Make the project's network module importable and load the Network class.
import sys
sys.path.append("../code")
from network import Network

# Three-layer network: 1000 input neurons, 30 hidden neurons, 15 output neurons.
layer_sizes = [1000, 30, 15]
net = Network(layer_sizes)

# Hyper-parameters:
#   epochs          - number of training epochs (30)
#   mini_batch_size - number of samples per mini-batch (10)
#   eta             - learning rate (3)
sgd_options = dict(epochs=30, mini_batch_size=10, eta=3, test_data=dev_data)
net.SGD(training, **sgd_options)

## 4. 如何预测新输入？

### 4.1 构建预测函数

In [None]:
import os

import jieba
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
# sklearn.externals.joblib was removed in scikit-learn 0.23.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib


def predict(sentence):
    """Classify one sentence and return its category label.

    The sentence is segmented with jieba, counted with the vectorizer stored
    in the ``count_vect`` file, weighted with TF-IDF and fed to the trained
    network ``net``; the index of the strongest output selects the label from
    ``category_``.

    If a fitted ``TfidfTransformer`` has been persisted as
    ``tfidf_transformer`` next to ``count_vect``, its corpus IDF weights are
    applied. Otherwise a transformer is fitted on this single sentence (the
    legacy behaviour). With only one document every IDF equals 1, so that
    fallback yields L2-normalised counts rather than corpus TF-IDF.

    Args:
        sentence: the raw text to classify.

    Returns:
        The predicted entry of ``category_``.
    """
    # Segment the sentence and join tokens with spaces for the vectorizer.
    preprocess_s = ' '.join(jieba.cut(sentence))

    # Reuse the vocabulary built at training time.
    count_vect = joblib.load('count_vect')
    X_new_counts = count_vect.transform([preprocess_s])

    if os.path.exists('tfidf_transformer'):
        # Apply the IDF weights learnt from the training corpus.
        tfidf = joblib.load('tfidf_transformer').transform(X_new_counts)
    else:
        # Legacy fallback: no persisted IDF weights are available.
        tfidf = TfidfTransformer().fit_transform(X_new_counts)
    criteria_X = tfidf.toarray()

    # Feed a 1000 x 1 column vector through the trained network.
    predict_inputs = np.reshape(criteria_X, (1000, 1))
    predict_results = np.argmax(net.feedforward(predict_inputs))
    return category_[predict_results]

### 4.2 输入一条句子，进行预测

In [None]:
# Classify a single new sentence; the cell output is the predicted category.
# Alternative sample inputs (uncomment one to try it):
# s = "3.有糖尿病的患者。"
# s = "1.术后发生非计划再次手术；"
s = "白细胞计数升高或大于10^9/L"
predict(s)

### 4.3 输入一份文件，进行预测，并保存结果

In [None]:
# Classify every test sentence and save "s<n>\t<category>\t<sentence>" lines
# to ../data/test.predict.
import codecs

test_sentences = list(test_data["sentence"])
total = str(len(test_sentences))
n = 0  # stays 0 when there are no test sentences
with codecs.open("../data/test.predict", "w", encoding="utf-8") as f:
    for n, s in enumerate(test_sentences, start=1):
        c = predict(s)  # predicted category
        f.write("s{}\t{}\t{}\n".format(n,c,s))
        # Report progress every 10 sentences.
        if n % 10 == 0:
            print("[INFO] Processing: {0}/{1} \t {2:<50s} {3:10s}".format(str(n), total, c, s))

### 4.4 系统评估

In [None]:
# Score the predictions against the gold file using the project's evaluation
# module from ../code.
import sys
sys.path.append("../code")
import evaluation

gold_path, predicted_path = '../data/test.gold', '../data/test.predict'
results = evaluation.Record_results(gold_path, predicted_path)
# NOTE: this rebinds the name `evaluation` from the module to the Evaluation
# instance, exactly as the original cell did.
evaluation = evaluation.Evaluation(results.records)