# 机器学习纳米学位
## 毕业项目: 自然语言处理 文档归类——第二部分：GloVe 实现

## ⚠️注意事项：
- 运行环境为 Python-3，TensorFlow-1.0.1，Keras-2.1.3
- GloVe 6B 大小 822 MB，下载地址：https://nlp.stanford.edu/data/glove.6B.zip
- GloVe 840B 大小 2.03 GB，下载地址：https://nlp.stanford.edu/data/glove.840B.300d.zip
- 如需使用 840B，则需要将“在嵌入向量中创建嵌入索引映射”的 line.split() 修改为 line.split(' ')：840B 中有少量词条本身含有空白字符，不带参数的 split() 会按任意空白切分，把这些词条拆开，导致后面的数值解析失败
- 文本预处理部分默认已注释掉；如需预处理，取消该单元的注释，并在“数据赋值”单元中改用 lemmatizer_newsgroups 的数据和标签
- 文本预处理部分如需手动导入 NLTK 路径，需要将 nltk.data.path.append('nltk_data') 路径修改为本地路径，大小 10 MB，下载地址：[百度云](https://pan.baidu.com/s/1Tp-NsX9vWDgBVp14P3jNxw)
- 使用不同维度的预训练数据，需要修改 EMBEDDING_DIM 到对应的维度

---

## 导入原始数据

In [1]:
import ssl
from pprint import pprint

from sklearn.datasets import fetch_20newsgroups

# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, i.e. certificate verification is switched off for the rest
# of the process. Presumably a workaround so the dataset download succeeds on
# machines without a usable CA bundle -- confirm before reusing elsewhere.
ssl._create_default_https_context = ssl._create_unverified_context

# Load the full 20 Newsgroups corpus (train + test), shuffled with a fixed seed.
newsgroups = fetch_20newsgroups(subset='all', shuffle=True, random_state=233)

# Show the category names.
pprint(list(newsgroups.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [2]:
from __future__ import print_function

import os
import sys
import time
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## 数据预处理（可选）

In [3]:
# Optional preprocessing, disabled by default. To use it, uncomment this whole
# cell and switch the data-assignment cell to `lemmatizer_newsgroups`.
# Note: the loop at the bottom applies clean_text + lemmatizer_text only;
# stemmed_text is defined but not called there.

# import regex as re

# import nltk
# # nltk.data.path.append('nltk_data')
# from nltk.stem.porter import PorterStemmer
# from nltk.stem import WordNetLemmatizer

# #---------------------------------------------------------

# # Basic text cleaning

# def clean_text(text):
#     text = text.lower() # lowercase everything
#     text = re.sub("\d+", " ", text) # replace digits with a space
#     text = re.sub("\p{P}+", " ", text) # replace punctuation with a space
#     text = re.sub("<", " ", text)
#     text = re.sub(">", " ", text)
#     text = re.sub("\|", " ", text)
#     text = re.sub("\`", " ", text)
#     text = re.sub(r'\s+', " ", text) # collapse runs of whitespace into one space
#     return text

# #---------------------------------------------------------

# # Stemming

# def stemmed_word(word):
#     porter_stemmer = PorterStemmer()
#     return porter_stemmer.stem(word)

# def stemmed_text(text):
#     text = [stemmed_word(word) for word in text.split(" ")]
#     text = ' '.join(text)
#     return text

# #---------------------------------------------------------

# # Lemmatization

# def lemmatizer_word(word):
#     lemmatizer = WordNetLemmatizer()
#     return lemmatizer.lemmatize(word)

# def lemmatizer_text(text):
#     text = [lemmatizer_word(word) for word in text.split(" ")]
#     text = ' '.join(text)
#     return text

# #---------------------------------------------------------

# # Clean and lemmatize every document of a fresh copy of the corpus, in place,
# # and report how long it took.

# start_time = time.time()

# lemmatizer_newsgroups = fetch_20newsgroups(subset='all', shuffle=True, random_state=233)

# i = 0
# length = len(lemmatizer_newsgroups.data)

# while i < length:
#     lemmatizer_newsgroups.data[i] = lemmatizer_text(clean_text(lemmatizer_newsgroups.data[i]))
#     i += 1

# end_time = time.time()
# print("\nTime cost: {:.2f} minutes".format((end_time - start_time) / 60))

## 生成数据

In [4]:
# Sequence length per document: passed as `maxlen` to pad_sequences and as
# `input_length` to the Embedding layer / model input shape.
MAX_SEQUENCE_LENGTH = 1000
# Vocabulary cap: passed as `num_words` to the Tokenizer and used as the upper
# bound on the number of rows in the embedding matrix.
MAX_NUM_WORDS = 20000
# Length of one word vector; must match the dimensionality of the GloVe file
# that is loaded (300 for glove.6B.300d.txt).
EMBEDDING_DIM = 300
# Fraction of the samples held out as the validation set.
VALIDATION_SPLIT = 0.2

In [5]:
# Build the word -> vector lookup from the pre-trained GloVe text file.
# Every line holds a token followed by the components of its vector.

embeddings_index = {}
# The GloVe files are UTF-8 encoded. Name the encoding explicitly so the read
# does not depend on the platform's default locale encoding (which is not
# UTF-8 on e.g. many Windows setups and would raise or mis-decode tokens).
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [6]:
# Choose the corpus that the rest of the notebook works on:
# raw documents and their integer class ids.

texts, labels = newsgroups.data, newsgroups.target

# To train on the preprocessed corpus instead, use these two lines:
# texts = lemmatizer_newsgroups.data
# labels = lemmatizer_newsgroups.target

print('Found %s texts.' % len(texts))

Found 18846 texts.


In [7]:
# Vectorize the text samples into a 2D integer tensor
# (one row per document, MAX_SEQUENCE_LENGTH word ids per row).

# Fit the vocabulary on the corpus; the tokenizer is configured with
# num_words=MAX_NUM_WORDS.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Convert documents to id sequences and bring them all to the same length.
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the class ids. This rebinds `labels`, so the cell must not be
# re-run without first re-running the data-assignment cell.
labels = to_categorical(np.asarray(labels))

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 179209 unique tokens.
Shape of data tensor: (18846, 1000)
Shape of label tensor: (18846, 20)


In [8]:
# Split the data into a training set and a validation set

# Shuffle with a dedicated, seeded generator so the split is the same on every
# Restart & Run All; the seed matches the one used when fetching the corpus.
# A local RandomState leaves NumPy's global random state untouched.
indices = np.random.RandomState(233).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice on a positive boundary index. The negative-offset form
# `data[:-num_validation_samples]` yields an EMPTY training set when the
# validation count is 0, because `data[:-0]` is `data[:0]`.
num_training_samples = data.shape[0] - num_validation_samples

x_train = data[:num_training_samples]
y_train = labels[:num_training_samples]
x_val = data[num_training_samples:]
y_val = labels[num_training_samples:]

In [9]:
# Prepare the embedding matrix: row i receives the pre-trained vector of the
# word whose tokenizer index is i.

num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # Only indices below the vocabulary cap have a row in the matrix.
    if i < MAX_NUM_WORDS:
        embedding_vector = embeddings_index.get(word)
        # Words missing from the embedding index keep their all-zero row.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [10]:
# Load the pre-trained word embeddings into an Embedding layer.
# trainable=False keeps the embeddings fixed during training.

embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [11]:
# Build, compile and train a 1D-convolutional classifier on top of the
# embedding layer. The timer covers model construction as well as training.
start_time = time.time()

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Three convolution stages; the first two are followed by local max pooling,
# the last one by global max pooling.
x = Conv1D(filters=128, kernel_size=5, activation='relu')(embedded_sequences)
x = MaxPooling1D(pool_size=5)(x)
x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
x = MaxPooling1D(pool_size=5)(x)
x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(units=128, activation='relu')(x)

# 20 newsgroups has 20 categories, hence a 20-way softmax output.
preds = Dense(units=20, activation='softmax')(x)

model = Model(inputs=sequence_input, outputs=preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_val, y_val),
)

end_time = time.time()
elapsed_minutes = (end_time - start_time) / 60
print("\nTotal time cost: {:.2f} minutes".format(elapsed_minutes))

Train on 15077 samples, validate on 3769 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Total time cost: 2.24 minutes
