README: The scripts below used to build the basic pipeline of classification modeling. More to try include: <br>
 - embedding: try pretrained models
 - add: tf-idf processing
 - modeling: try other modeling methods except for naive bayes; hyperparameter tuning
 

In [73]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import jieba
import jieba.posseg as pseg
import jieba.analyse

import glob
import numpy as np
import time

In [74]:
'''
combine dataset (multiple categories) into one single category;
add a column called 'label'
'''

files= glob.glob('../output_data/*.txt')

df_lst = []
for f in files:
    label = f.split('/')[-1][:2]
    df = pd.read_csv(f,header=None)
    df['label'] = label
    df_lst.append(df)

all_df = pd.concat(df_lst)
print('the whole dataset include %d reviews'%len(all_df))
all_df = all_df.rename(columns = {0:'review_tokens'})
all_df.head(10)

the whole dataset include 1623 reviews


Unnamed: 0,review_tokens,label
0,11 月 15 日 提前 预订 2018 年 11 月 27 日 长沙 飞往 沈阳 cz3...,出发
1,航班 延误 登机口 升舱 活动 以原 航班 起飞时间 为准 办理 理解,出发
2,重庆 乌鲁木齐 南航 航班 天气 原因 延误 和田 乘坐 天津 航班,出发
3,沿途 停靠 理解 延误 小时,出发
4,飞机 无故 延误 小时 脸,出发
5,延误 五个 小时 算上 值机 时间 机场 八个 小时 早上 晚上 解释 解决方案 机长 人影...,出发
6,cz3842 航班 延误 投诉无门 十点 五十 起飞 下午 三点 弄 飞机 两个 小时 告知...,出发
7,南航 航班 延误 发 短信 太 严谨 回复 改 航班 用户名 密码 我要 变更 航班 做 延...,出发
8,行李 延误 重大损失,出发
9,确认 航班 延误 订 票 显示 确认,出发


In [75]:
# get the data size for each label
labels = all_df.label.unique().tolist()
label_size = {}
for label in labels:
    label_size[label] = len(all_df[all_df.label == label])

print(label_size)

{'出发': 352, '到达': 147, '性能': 148, '售后': 166, '设计': 47, '计划': 38, '机上': 299, '预订': 218, '中转': 147, '行程': 61}


In [76]:
# encode text label into numbers
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
targets = le.fit_transform(all_df.label)
all_df['label_encoded'] = targets
all_df.head()

Unnamed: 0,review_tokens,label,label_encoded
0,11 月 15 日 提前 预订 2018 年 11 月 27 日 长沙 飞往 沈阳 cz3...,出发,1
1,航班 延误 登机口 升舱 活动 以原 航班 起飞时间 为准 办理 理解,出发,1
2,重庆 乌鲁木齐 南航 航班 天气 原因 延误 和田 乘坐 天津 航班,出发,1
3,沿途 停靠 理解 延误 小时,出发,1
4,飞机 无故 延误 小时 脸,出发,1


In [77]:
# descriptive analysis: get the average length of user reviews 
lengths = []
for i in range(len(all_df)):
    length = len(all_df['review_tokens'].iloc[i])
    lengths.append(length)

print('average review length: '+ '{:.2f}'.format(np.average(lengths)))
print('maximum review length: '+ '{:.2f}'.format(np.max(lengths)))
print('minimum review length: '+ '{:.2f}'.format(np.min(lengths)))

average review length: 51.01
maximum review length: 648.00
minimum review length: 2.00


In [23]:
# train, test split data
# from sklearn.model_selection import train_test_split

# train, test = train_test_split(all_df, test_size=0.33, random_state=42)
# print('training data has %d examples' %len(train))
# print('test data has %d examples' %len(test))

In [6]:
MAX_SEQUENCE_LENGTH = 100 # 每条新闻最大长度
EMBEDDING_DIM = 200 # 词向量空间维度
VALIDATION_SPLIT = 0.16 # 验证集比例
TEST_SPLIT = 0.2 # 测试集比例

In [20]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

all_texts = all_df['review_tokens']
all_labels = all_df['label_encoded']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_texts)
sequences = tokenizer.texts_to_sequences(all_texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(all_labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 5169 unique tokens.
Shape of data tensor: (1623, 100)
Shape of label tensor: (1623, 10)


In [22]:
# 再将处理后的新闻数据按 6.4：1.6：2 分为训练集，验证集，测试集
p1 = int(len(data)*(1-VALIDATION_SPLIT-TEST_SPLIT))
p2 = int(len(data)*(1-TEST_SPLIT))
x_train = data[:p1]
y_train = labels[:p1]
x_val = data[p1:p2]
y_val = labels[p1:p2]
x_test = data[p2:]
y_test = labels[p2:]
print ('train docs: '+str(len(x_train)))
print ('val docs: '+str(len(x_val)))
print ('test docs: '+str(len(x_test)))

train docs: 1038
val docs: 260
test docs: 325


In [34]:
x_train
y_train

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [25]:
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential

model = Sequential()
model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Dropout(0.2))
model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
model.add(MaxPooling1D(3))
model.add(Flatten())
model.add(Dense(EMBEDDING_DIM, activation='relu'))
model.add(Dense(labels.shape[1], activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 200)          1034000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 200)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 98, 250)           150250    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 32, 250)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 200)               1600200   
_________________________________________________________________
dense_2 (Dense)              (None, 10)                2010      
Total para

In [35]:
import keras
model.compile(loss='categorical_crossentropy',
              optimizer='SGD',
               metrics=['accuracy'])

model.fit(x_train, y_train, epochs= 5, verbose=1, validation_data=(x_val, y_val))

score = model.evaluate(x_test, y_test, verbose=1)
print(score)

Train on 1038 samples, validate on 260 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[10.973212864215558, 0.0]


In [60]:
# train word2vec model using Chinese text resources (2005, by cityu, msr, pku)
# reference word2vec documentation: https://radimrehurek.com/gensim/models/word2vec.html
# code reference: https://github.com/kavgan/nlp-in-practice/blob/master/word2vec/Word2Vec.ipynb

# import modules & set up logging
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

class MySentences(object):
    def __init__(self, path_name):
        self.path_name = path_name
 
    def __iter__(self):
        # notice: can only pass on text data
        files = glob.glob(self.path_name)
        for file in files:
            with open(file) as f:
                lines = f.readlines()
                for line in lines:
                    yield line.split()

# a memory-friendly iterator
sentences = MySentences('../pretrained/icwb2-data/training/utf8_files/*.utf8') 
documents = list (sentences.__iter__())
logging.info ("Done reading data file")

2019-03-17 12:27:36,144 : INFO : Done reading data file


In [61]:
model = gensim.models.Word2Vec (documents, window=5, min_count=2, workers=2)
model.train(documents,total_examples=len(documents),epochs=3)

2019-03-17 12:28:36,010 : INFO : collecting all words and their counts
2019-03-17 12:28:36,014 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-03-17 12:28:36,116 : INFO : PROGRESS: at sentence #10000, processed 258856 words, keeping 26668 word types
2019-03-17 12:28:36,190 : INFO : PROGRESS: at sentence #20000, processed 530513 words, keeping 39226 word types
2019-03-17 12:28:36,306 : INFO : PROGRESS: at sentence #30000, processed 812550 words, keeping 49387 word types
2019-03-17 12:28:36,386 : INFO : PROGRESS: at sentence #40000, processed 1089335 words, keeping 57562 word types
2019-03-17 12:28:36,459 : INFO : PROGRESS: at sentence #50000, processed 1348940 words, keeping 65941 word types
2019-03-17 12:28:36,539 : INFO : PROGRESS: at sentence #60000, processed 1629421 words, keeping 72909 word types
2019-03-17 12:28:36,617 : INFO : PROGRESS: at sentence #70000, processed 1898087 words, keeping 78848 word types
2019-03-17 12:28:36,694 : INFO : PROGRESS:

(24960971, 31150647)

In [78]:
model.save('word2vec.model')

2019-03-17 13:54:27,040 : INFO : saving Word2Vec object under word2vec.model, separately None
2019-03-17 13:54:27,043 : INFO : storing np array 'vectors' to word2vec.model.wv.vectors.npy
2019-03-17 13:54:27,444 : INFO : not storing attribute vectors_norm
2019-03-17 13:54:27,447 : INFO : storing np array 'syn1neg' to word2vec.model.trainables.syn1neg.npy
2019-03-17 13:54:27,754 : INFO : not storing attribute cum_table
2019-03-17 13:54:28,912 : INFO : saved word2vec.model


In [64]:
# take a look at some example
w1 = "改善"
model.wv.most_similar (positive=w1)

[('紓解', 0.6620273590087891),
 ('改進', 0.6593138575553894),
 ('缓解', 0.6353970766067505),
 ('提昇', 0.6322538256645203),
 ('整頓', 0.6136868000030518),
 ('降低', 0.6037946939468384),
 ('用水', 0.601423442363739),
 ('確保', 0.5922541618347168),
 ('完善', 0.5906819105148315),
 ('改进', 0.5863096714019775)]

In [65]:
# similarity between two different words
model.wv.similarity(w1="使用",w2="服务")

0.2764854474595333

In [67]:
# Which one is the odd one out in this list?
model.wv.doesnt_match(["服务","行李","飞行"])

'行李'

In [71]:
# get word vector
model["服务"]

array([ 1.0809084 ,  0.9275192 , -0.29201597, -0.81767106, -1.6600899 ,
        2.2001536 ,  1.0700545 , -0.8595912 ,  2.9435027 ,  0.8626768 ,
       -0.72343904, -0.961094  ,  0.6855952 , -1.0474694 , -3.453282  ,
        1.5999223 ,  2.5249639 ,  3.4341424 ,  1.3286707 ,  2.2232103 ,
       -1.7445086 , -2.2384393 ,  0.28517032,  0.81533754, -0.12209349,
        0.50130093,  1.8650556 ,  2.2093637 ,  1.5667093 ,  1.1401491 ,
        0.6444202 ,  1.30712   , -1.1403345 , -1.706028  ,  1.5846382 ,
        0.9832784 , -2.7226467 , -1.2501054 , -1.4398317 ,  0.2591129 ,
       -2.0718944 ,  0.7041562 , -1.1992522 , -0.05537521,  0.36487073,
       -2.2382112 , -0.14641441,  0.6495357 , -0.31467563, -1.811874  ,
        1.3798348 ,  0.13909039, -2.669851  , -0.49505627, -0.38020018,
        1.3446413 , -0.05372661,  0.21918966, -0.61543125, -2.154251  ,
       -0.36112723,  1.1066729 ,  1.2332584 ,  0.9838205 ,  1.59936   ,
       -1.3186089 , -1.2351164 , -0.7976479 , -1.2930409 ,  1.16

In [76]:
a = np.array(model['服务'])
b = np.array(model['改善'])

# compute cosine similarity between two words
similarity = a.dot(b) / (np.sqrt(np.sum(np.power(a, 2))) * np.sqrt(np.sum(np.power(b, 2))))
print(similarity)

0.33687514


<h3> CNN:

In [78]:
MAX_SEQUENCE_LENGTH = 100 # 每条新闻最大长度
EMBEDDING_DIM = 200 # 词向量空间维度
VALIDATION_SPLIT = 0.16 # 验证集比例
TEST_SPLIT = 0.2 # 测试集比例

In [79]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_texts)
sequences = tokenizer.texts_to_sequences(all_texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(all_labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 5169 unique tokens.
Shape of data tensor: (1623, 100)
Shape of label tensor: (1623, 10)


In [39]:
p1 = int(len(data)*(1-VALIDATION_SPLIT-TEST_SPLIT))
p2 = int(len(data)*(1-TEST_SPLIT))
x_train = data[:p1]
y_train = labels[:p1]
x_val = data[p1:p2]
y_val = labels[p1:p2]
x_test = data[p2:]
y_test = labels[p2:]
print ('train docs: '+str(len(x_train)))
print ('val docs: '+str(len(x_val)))
print ('test docs: '+str(len(x_test)))

train docs: 1038
val docs: 260
test docs: 325


In [80]:
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential

model = Sequential()
model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
model.add(Dropout(0.2))
model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
model.add(MaxPooling1D(3))
model.add(Flatten())
model.add(Dense(EMBEDDING_DIM, activation='relu'))
model.add(Dense(labels.shape[1], activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 200)          1034000   
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 200)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 98, 250)           150250    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 32, 250)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 200)               1600200   
_________________________________________________________________
dense_6 (Dense)              (None, 10)                2010      
Total para

In [47]:
# model fit training data
model.fit(x_train,y_train,epochs = 10,verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a38f76e10>

In [50]:
# evaluate model
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

score, acc = model.evaluate(x_val, y_val,verbose=1)
print('val score:', score)
print('val accuracy:', acc)

val score: 7.592899425213154
val accuracy: 0.25


<h3> 基于预训练的 word2vec 的 CNN:

In [103]:
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 100 # 词向量空间维度
VALIDATION_SPLIT = 0.16 # 验证集比例
TEST_SPLIT = 0.2 # 测试集比例

In [104]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
embedding_matrix.shape

(5170, 100)

In [105]:
embedding_matrix[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [106]:
import six
import gensim

def unicode(unicode_or_str):
    '''convert between python2 and python3'''
    if isinstance(unicode_or_str, str):
        text = unicode_or_str
        decoded = False
    else:
        text = unicode_or_str.decode(encoding)
        decoded = True
    return text

# load pre-trained word2vev model
w2v_model = gensim.models.Word2Vec.load("word2vec.model")
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items(): 
    if unicode(word) in w2v_model:
        embedding_matrix[i] = np.asarray(w2v_model[unicode(word)],
                                         dtype='float32')
        
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [116]:
from keras.layers import Dense, Input, Flatten, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential
from keras.utils import plot_model

model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.2))
model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
model.add(MaxPooling1D(3))
model.add(Flatten())
model.add(Dense(EMBEDDING_DIM, activation='relu'))
model.add(Dense(labels.shape[1], activation='softmax'))
model.summary()
# plot_model(model, to_file='model.png',show_shapes=True)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5)
model.save('word_vector_cnn.h5')
print (model.evaluate(x_test, y_test))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 100)          517000    
_________________________________________________________________
dropout_11 (Dropout)         (None, 100, 100)          0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 98, 250)           75250     
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 32, 250)           0         
_________________________________________________________________
flatten_11 (Flatten)         (None, 8000)              0         
_________________________________________________________________
dense_21 (Dense)             (None, 100)               800100    
_________________________________________________________________
dense_22 (Dense)             (None, 10)                1010      
Total para