#### readme: 
* reference: [在Keras模型中使用预训练的词向量](https://keras-cn-docs.readthedocs.io/zh_CN/latest/blog/word_embedding/)
* summary: 
    - method: use pretrained word2vec 百度百科 + CNN
    - performance: not good, based on metrics of [accuracy] and [confusion matrix]

In [2]:
import os
import numpy as np
import pandas as pd
os.chdir('/Users/liyuan/desktop/CSAir')

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
from keras.layers import Flatten
from keras.layers import Embedding
import tensorflow as tf

Using TensorFlow backend.


In [5]:
class word2vec():
    def __init__(self):
        self.embeddings_index = {}
        self.MAX_SEQUENCE_LENGTH = 1000
        self.MAX_NUM_WORDS = 20000
        self.EMBEDDING_DIM = 300
        self.VALIDATION_SPLIT = 0.2
        self.all_labeled_data = pd.DataFrame()
        self.labels_index = {}
        self.word_index  = {}
        self.texts = np.array([])
        self.labels = np.array([])
        self.data = np.array([])
        self.X_train = np.array([])
        self.y_train = np.array([])
        self.X_val = np.array([])
        self.y_val = np.array([])
        self.embedding_matrix = np.array([])

    def load_pretrained_vectors(self, file_path):
        f = open(file_path)
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[word] = coefs
        f.close()
        print('Found %s word vectors.' % len(self.embeddings_index))
        return self.embeddings_index 

    def prepare_data(self,data_file_path):
        self.all_labeled_data = pd.read_csv(data_file_path)
        self.texts = self.all_labeled_data.review_tokens.astype('str').values
        self.labels = self.all_labeled_data.label_encoded.values
        
        # get a dictionary that map each original label to its encoded label, e.g., {'中转': 0,...}
        for label in self.all_labeled_data.label.unique().tolist():
            self.labels_index[label] = self.all_labeled_data[self.all_labeled_data['label'] == label]['label_encoded'].unique()[0]

        tokenizer = Tokenizer(nb_words=self.MAX_NUM_WORDS)
        tokenizer.fit_on_texts(self.texts)
        sequences = tokenizer.texts_to_sequences(self.texts)

        self.word_index = tokenizer.word_index
        print('Found %s unique tokens.' % len(self.word_index))

        self.data = pad_sequences(sequences, maxlen=self.MAX_SEQUENCE_LENGTH)
        print('Shape of data tensor:', self.data.shape)
        print('Shape of label tensor:', self.labels.shape)
        
        # Converts a class vector (integers) to binary class matrix
        self.labels = to_categorical(np.asarray(self.labels))
        print('Shape of data tensor:', self.data.shape)
        print('Shape of label tensor:', self.labels.shape)

        # split the data into a training set and a validation set
        indices = np.arange(self.data.shape[0])
        np.random.shuffle(indices)
        self.data = self.data[indices]
        self.labels = self.labels[indices]
        nb_validation_samples = int(self.VALIDATION_SPLIT * self.data.shape[0])

        self.X_train = self.data[:-nb_validation_samples]
        self.y_train = self.labels[:-nb_validation_samples]
        self.X_val = self.data[-nb_validation_samples:]
        self.y_val = self.labels[-nb_validation_samples:]
        return  self.X_train, self.y_train, self.X_val, self.y_val
    
    def train_data(self):
        # 据得到的字典生成上文所定义的词向量矩阵
        self.embedding_matrix = np.zeros((len(self.word_index) + 1, self.EMBEDDING_DIM))
        for word, i in self.word_index.items():
            embedding_vector = self.embeddings_index.get(word)
            # updated:
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                self.embedding_matrix[i] = embedding_vector
            
        # 将这个词向量矩阵加载到Embedding层
        embedding_layer = Embedding(len(self.word_index) + 1,
                                    self.EMBEDDING_DIM,
                                    weights=[self.embedding_matrix],
                                    input_length=self.MAX_SEQUENCE_LENGTH,
                                    trainable=False)
        
        # 使用一个小型的1D卷积解决分类问题
        sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded_sequences = embedding_layer(sequence_input)
        x = Conv1D(128, 5, activation='relu')(embedded_sequences)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(35)(x)  # global max pooling
        x = Flatten()(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(len(self.labels_index), activation='softmax')(x)

        model = Model(sequence_input, preds)
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])
        model.fit(self.X_train, self.y_train, validation_data=(self.X_val, self.y_val),
                  nb_epoch=2, batch_size=128)
        return model

In [6]:
w2v = word2vec()
embeddings_index = w2v.load_pretrained_vectors('./Source_Data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5')
# X_train, y_train, X_val, y_val = w2v.prepare_data('./res/all_labeled_data_v3.csv')
X_train, y_train, X_val, y_val = w2v.prepare_data('./res/labeled_data_with_without_tk.csv')
model = w2v.train_data()

Found 635922 word vectors.




Found 4747 unique tokens.
Shape of data tensor: (1551, 1000)
Shape of label tensor: (1551,)
Shape of data tensor: (1551, 1000)
Shape of label tensor: (1551, 10)




Train on 1241 samples, validate on 310 samples
Epoch 1/2
Epoch 2/2


In [7]:
# check label index
print('label encoding dictionary:', w2v.labels_index)

label encoding dictionary: {'计划': 7, '机上': 5, '中转': 0, '售后': 3, '预订': 9, '设计': 8, '出发': 1, '性能': 4, '行程': 6, '到达': 2}


In [8]:
# get predicted class label
output = model.predict(X_val)
predicted_label_list = []
for i in range(len(output)):
    predicted_label = output[i].argmax(axis=-1)
    predicted_label_list.append(predicted_label)
print(predicted_label_list)

[5, 5, 9, 5, 1, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 9, 9, 5, 9, 5, 5, 5, 5, 0, 5, 1, 5, 5, 5, 5, 5, 1, 0, 5, 1, 5, 5, 9, 1, 5, 5, 5, 5, 5, 5, 9, 5, 5, 5, 5, 1, 1, 5, 5, 5, 5, 9, 1, 5, 5, 5, 5, 9, 9, 1, 5, 5, 5, 1, 1, 5, 1, 5, 5, 5, 5, 5, 5, 5, 1, 5, 9, 1, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 5, 1, 9, 5, 1, 5, 5, 9, 5, 1, 5, 5, 5, 5, 5, 5, 1, 1, 5, 5, 9, 5, 1, 9, 5, 5, 5, 1, 1, 5, 9, 9, 0, 5, 1, 0, 5, 0, 5, 1, 5, 1, 5, 5, 5, 5, 5, 9, 1, 5, 5, 5, 9, 5, 1, 5, 0, 9, 5, 5, 1, 5, 5, 5, 5, 5, 5, 9, 1, 5, 5, 1, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5, 5, 1, 5, 5, 5, 1, 5, 5, 5, 5, 5, 1, 5, 5, 9, 9, 9, 5, 5, 5, 1, 5, 1, 9, 5, 5, 9, 5, 5, 5, 5, 5, 5, 5, 9, 5, 9, 5, 5, 0, 5, 5, 1, 1, 0, 5, 5, 1, 5, 5, 5, 5, 5, 5, 9, 1, 5, 5, 9, 5, 5, 1, 9, 5, 5, 5, 0, 9, 1, 5, 5, 5, 5, 5, 5, 9, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 1, 5, 1, 5, 5, 5, 9, 5, 5, 1, 5, 5, 1, 1, 5, 1, 5, 5, 5, 5, 5, 5, 5, 9, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 9, 5, 1, 0, 9, 5]


In [97]:
# evaluate model using model.evaluate()
scores = model.evaluate(X_val, y_val, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

acc: 27.74%


In [98]:
# -----------------------------------------
# task: return prediction results back to df
# -----------------------------------------
# indices is a numpy array, need to convert to a list of indices before feed into df to get sub df
# recreate df based on the shuffled indices
all_labeled_data = w2v.all_labeled_data.iloc[list(indices)]
nb_validation_samples = int(w2v.VALIDATION_SPLIT * w2v.data.shape[0])
print(nb_validation_samples)
# need to get the indices of the validation data
train_val_bound = w2v.data.shape[0] - nb_validation_samples
# get validation dataset
val_df = all_labeled_data[train_val_bound:]
val_df.head()

310


Unnamed: 0,review,review_tokens,label,label_encoded
870,一些未按照常规用户流程思路设计，开发者思维而不是使用者思维，使用体验差；补打电子票麻烦；内容...,未 常规 用户 流程 思路 设计 开发者 思维 使用者 思维 体验 差 补打 电子 票 麻烦...,设计,8
772,App上买机票居然自动分配座位，还是中间位置，不给选座机会，不如第三方软件，体验差！写着有修...,买 机票 自动 分配 座位 位置 选座 机会 第三方 软件 体验 差 写 修改 选座 点 只...,预订,9
1301,明珠会员 每次失败,明珠 会员 每次 失败,性能,4
103,公务舱空乘服务非常好。,公务舱 空乘 服务,机上,5
919,航班延误以后，登机口升舱活动仍以原航班起飞时间为准办理，让人不理解,航班 延误 登机口 升舱 活动 以原 航班 起飞时间 为准 办理 理解,出发,1


In [99]:
# map predicted labels to original class
print(predicted_label_list[:10])
label_dct = w2v.labels_index
print(label_dct)

# get reversed labels_index dictionary
reversed_label_dct = {}
for i in range(len(label_dct)):
    reversed_label_dct[list(label_dct.values())[i]] = list(label_dct.keys())[i]
reversed_label_dct   
# map predicted labels
pred_label = [reversed_label_dct.get(label) for label in predicted_label_list]
val_df['pred_label'] = pred_label
val_df.head()

[5, 5, 9, 5, 1, 1, 5, 5, 5, 5]
{'计划': 7, '机上': 5, '中转': 0, '售后': 3, '预订': 9, '设计': 8, '出发': 1, '性能': 4, '行程': 6, '到达': 2}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Unnamed: 0,review,review_tokens,label,label_encoded,pred_label
870,一些未按照常规用户流程思路设计，开发者思维而不是使用者思维，使用体验差；补打电子票麻烦；内容...,未 常规 用户 流程 思路 设计 开发者 思维 使用者 思维 体验 差 补打 电子 票 麻烦...,设计,8,机上
772,App上买机票居然自动分配座位，还是中间位置，不给选座机会，不如第三方软件，体验差！写着有修...,买 机票 自动 分配 座位 位置 选座 机会 第三方 软件 体验 差 写 修改 选座 点 只...,预订,9,机上
1301,明珠会员 每次失败,明珠 会员 每次 失败,性能,4,预订
103,公务舱空乘服务非常好。,公务舱 空乘 服务,机上,5,机上
919,航班延误以后，登机口升舱活动仍以原航班起飞时间为准办理，让人不理解,航班 延误 登机口 升舱 活动 以原 航班 起飞时间 为准 办理 理解,出发,1,出发


In [100]:
# evaluate performance
from sklearn.metrics import confusion_matrix
from pandas_ml import ConfusionMatrix
# get confusion matrix
def get_confusion_matrix(y_test,y_pred):
    '''get tp,tn,fp,fn for each class'''
    cm = ConfusionMatrix(y_test, y_pred)
    cm.print_stats()
y_val_true = val_df.label.values
y_val_pred = val_df.pred_label.values
get_confusion_matrix(y_val_true,y_val_pred)

Confusion Matrix:

Predicted  中转  出发  到达  售后  性能   机上  行程  计划  设计  预订  __all__
Actual                                                     
中转          0   7   0   0   0   20   0   0   0   5       32
出发          4  17   0   0   0   35   0   0   0   6       62
到达          1   4   0   0   0   16   0   0   0   5       26
售后          1   4   0   0   0   16   0   0   0   2       23
性能          3   5   0   0   0   14   0   0   0   4       26
机上          0  11   0   0   0   48   0   0   0   3       62
行程          0   0   0   0   0    9   0   0   0   3       12
计划          0   2   0   0   0    3   0   0   0   1        6
设计          0   2   0   0   0   11   0   0   0   1       14
预订          1   4   0   0   0   36   0   0   0   6       47
__all__    10  56   0   0   0  208   0   0   0  36      310


Overall Statistics:

Accuracy: 0.22903225806451613
95% CI: (0.1834298397205966, 0.2798963697544362)
No Information Rate: ToDo
P-Value [Acc > NIR]: 0.9999999999999999
Kappa: 0.046706124549665474
Mcnem