#### readme: 
* reference: [在Keras模型中使用预训练的词向量](https://keras-cn-docs.readthedocs.io/zh_CN/latest/blog/word_embedding/)
* summary: 
    - method: use pretrained word2vec 百度百科 + CNN
    - performance: not good, based on metrics of [accuracy] and [confusion matrix]

In [2]:
import os
import numpy as np
import pandas as pd
os.chdir('/Users/liyuan/desktop/CSAir/codes')

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
from keras.layers import Flatten
from keras.layers import Embedding
import tensorflow as tf

import warnings
warnings.simplefilter('ignore')

from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from pandas_ml import ConfusionMatrix

from word2vec_v2 import word2vec

In [94]:
class word2vec():
    def __init__(self):
        self.embeddings_index = {}
        self.MAX_SEQUENCE_LENGTH = 1000
        self.MAX_NUM_WORDS = 20000
        self.EMBEDDING_DIM = 300
        self.VALIDATION_SPLIT = 0.2
        self.all_labeled_data = pd.DataFrame()
        self.labels_index = {}
        self.word_index  = {}
        self.texts = np.array([])
        self.labels = np.array([])
        self.data = np.array([])
        self.X_train = np.array([])
        self.y_train = np.array([])
        self.X_val = np.array([])
        self.y_val = np.array([])
        self.embedding_matrix = np.array([])

    def load_pretrained_vectors(self, file_path):
        f = open(file_path)
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            self.embeddings_index[word] = coefs
        f.close()
        print('Found %s word vectors.' % len(self.embeddings_index))
        return self.embeddings_index 

    def prepare_data(self,data_file_path):
        self.all_labeled_data = pd.read_csv(data_file_path)
        self.texts = self.all_labeled_data.review_tokens.astype('str').values
        self.labels = self.all_labeled_data.label_encoded.values
        
        # get a dictionary that map each original label to its encoded label, e.g., {'中转': 0,...}
        for label in self.all_labeled_data.label.unique().tolist():
            self.labels_index[label] = self.all_labeled_data[self.all_labeled_data['label'] == label]['label_encoded'].unique()[0]

        tokenizer = Tokenizer(nb_words=self.MAX_NUM_WORDS)
        tokenizer.fit_on_texts(self.texts)
        sequences = tokenizer.texts_to_sequences(self.texts)

        self.word_index = tokenizer.word_index
        print('Found %s unique tokens.' % len(self.word_index))

        self.data = pad_sequences(sequences, maxlen=self.MAX_SEQUENCE_LENGTH)
        print('Shape of data tensor:', self.data.shape)
        print('Shape of label tensor:', self.labels.shape)
        
        # Converts a class vector (integers) to binary class matrix
        self.labels = to_categorical(np.asarray(self.labels))
        print('Shape of data tensor:', self.data.shape)
        print('Shape of label tensor:', self.labels.shape)

        # split the data into a training set and a validation set
        self.indices = np.arange(self.data.shape[0])
        np.random.shuffle(self.indices)
        self.data = self.data[self.indices]
        self.labels = self.labels[self.indices]
        nb_validation_samples = int(self.VALIDATION_SPLIT * self.data.shape[0])

        self.X_train = self.data[:-nb_validation_samples]
        self.y_train = self.labels[:-nb_validation_samples]
        self.X_val = self.data[-nb_validation_samples:]
        self.y_val = self.labels[-nb_validation_samples:]
        return  self.X_train, self.y_train, self.X_val, self.y_val
    
    
    def get_embedding_matrix(self):
        # 据得到的字典生成上文所定义的词向量矩阵
        embedding_matrix = np.zeros((len(self.word_index) + 1, self.EMBEDDING_DIM))
        for word, i in self.word_index.items():
            embedding_vector = self.embeddings_index.get(word)
            # updated:
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector
        return embedding_matrix
    
    def setup_neural_net(self):
        # get word embedding matrix
        self.embedding_matrix = self.get_embedding_matrix()

        # 将这个词向量矩阵加载到Embedding层
        embedding_layer = Embedding(len(self.word_index) + 1,
                                    self.EMBEDDING_DIM,
                                    weights=[self.embedding_matrix],
                                    input_length=self.MAX_SEQUENCE_LENGTH,
                                    trainable=False)
        
        # 使用一个小型的1D卷积解决分类问题
        sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded_sequences = embedding_layer(sequence_input)
        x = Conv1D(128, 5, activation='relu')(embedded_sequences)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)
        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(35)(x)  # global max pooling
        x = Flatten()(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(len(self.labels_index), activation='softmax')(x)
        return sequence_input,preds
    
    
    def train_data(self,X_train,y_train,X_val,y_val):
        sequence_input,preds = self.setup_neural_net()
        model = Model(sequence_input, preds)
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])
        # can change the number of epoch accordingly
        model.fit(X_train, y_train, validation_data=(X_val, y_val),
                  nb_epoch=50, batch_size=128)  
        
        # evaluate model using model.evaluate()
        scores = model.evaluate(X_val, y_val, verbose=0)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
        
        # get predicted class label
        output = model.predict(X_val)
        predicted_label_list = self.get_pred_label(output)
        return predicted_label_list
    
    
    def get_pred_label(self,output):
        '''get predicted class label based on prediction output'''
        predicted_label_list = []
        for i in range(len(output)):
            predicted_label = output[i].argmax(axis=-1)
            predicted_label_list.append(predicted_label)        
        return predicted_label_list
    
    
    def incorporate_pred_label(self):
        '''return prediction results back to df'''
        # indices is a numpy array, need to convert to a list of indices before feed into df to get sub df 
        # recreate df based on the shuffled indices
        indices = self.indices
        all_labeled_data = w2v.all_labeled_data.iloc[list(self.indices)]
        nb_validation_samples = int(w2v.VALIDATION_SPLIT * w2v.data.shape[0])
        print(nb_validation_samples)
        # need to get the indices of the validation data
        train_val_bound = w2v.data.shape[0] - nb_validation_samples
        # get validation dataset
        val_df = all_labeled_data[train_val_bound:]
        return val_df

    def map_label(self,df,predicted_label_list):
        '''map predicted labels to original class'''
        # print(predicted_label_list[:10])
        label_dct = self.labels_index
        df['pred_label_encodes'] = predicted_label_list
        # get reversed labels_index dictionary
        reversed_label_dct = {}
        for i in range(len(label_dct)):
            reversed_label_dct[list(label_dct.values())[i]] = list(label_dct.keys())[i]

        # map predicted labels
        pred_label = [reversed_label_dct.get(label) for label in predicted_label_list]
        df['pred_label'] = pred_label
        return df
    
    
    def evaluate_performance(self,val_df):
        # evaluate performance
        y_val_true = val_df.label.values
        y_val_pred = val_df.pred_label.values
        self.get_confusion_matrix(y_val_true,y_val_pred) 
        
    
    def get_confusion_matrix(self,y_test,y_pred):
        '''get tp,tn,fp,fn for each class'''
        cm = ConfusionMatrix(y_test, y_pred)
        cm.print_stats()
        
        
    def over_sampling(self):
        '''modeling after over sampling'''
        smote = SMOTE('minority')
        X_train_sm, y_train_sm = smote.fit_sample(self.X_train,self.y_train)
        print(X_train_sm.shape, y_train_sm.shape)
        
        # fit model based on new data set
        predicted_label_list = self.train_data(X_train_sm,y_train_sm,X_val,y_val)
        return predicted_label_list
        

In [3]:
w2v = word2vec()
embeddings_index = w2v.load_pretrained_vectors('../Source_Data/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5')
# X_train, y_train, X_val, y_val = w2v.prepare_data('./res/all_labeled_data_v3.csv')
X_train, y_train, X_val, y_val = w2v.prepare_data('../res/labeled_data_with_without_tk.csv')
predicted_label_list = w2v.train_data(X_train,y_train,X_val,y_val)

Found 635922 word vectors.
Found 4747 unique tokens.
Shape of data tensor: (1551, 1000)
Shape of label tensor: (1551,)
Shape of data tensor: (1551, 1000)
Shape of label tensor: (1551, 10)
Train on 1241 samples, validate on 310 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
acc: 53.55%


In [5]:
# check label index
print('label encoding dictionary:', w2v.labels_index)

label encoding dictionary: {'计划': 7, '机上': 5, '中转': 0, '售后': 3, '预订': 9, '设计': 8, '出发': 1, '性能': 4, '行程': 6, '到达': 2}


In [10]:
# TODO:
def map_label(w2v,df,predicted_label_list):
    '''map predicted labels to original class'''
    # print(predicted_label_list[:10])
    label_dct = w2v.labels_index
    df['pred_label_encodes'] = predicted_label_list
    
    # get reversed labels_index dictionary
    reversed_label_dct = {}
    for i in range(len(label_dct)):
        reversed_label_dct[list(label_dct.values())[i]] = list(label_dct.keys())[i]

    # map predicted labels
    pred_label = [reversed_label_dct.get(label) for label in predicted_label_list]
    df['pred_label'] = pred_label
    return df



{'计划': 7, '机上': 5, '中转': 0, '售后': 3, '预订': 9, '设计': 8, '出发': 1, '性能': 4, '行程': 6, '到达': 2}
{7: '计划', 5: '机上', 0: '中转', 3: '售后', 9: '预订', 8: '设计', 1: '出发', 4: '性能', 6: '行程', 2: '到达'}


In [97]:
val_df = w2v.incorporate_pred_label()
val_df = w2v.map_label(val_df,predicted_label_list)
val_df.head()

310


Unnamed: 0,review,review_tokens,label,label_encoded
1117,托运行李等待1小时才取到。无语。,托运 行李 等待 小时 取到 语,出发,1
1034,飞机延误时间太长。,飞机 延误 时间 太 长,出发,1
751,注册登陆使用繁琐，辣鸡,注册 登陆 繁琐 辣鸡,预订,9
304,1. 济南机场2舱休息室也就是形同虚设，没有泡面，差评，咖啡机坏了，水桶的水发黄，2.空乘服...,济南 机场 舱 休息室 形同虚设 泡面 差评 咖啡机 坏 水桶 水 发黄 空乘 服务 算 热...,机上,5
918,11月24日从杭州到重庆的CZ8180航班因雾晚点，其他登机口各航空公司都有工作人员服务，南...,月 日 杭州 重庆 航班 因雾 晚点 登机口 航空公司 工作人员 服务 南航 登机口 上午 ...,出发,1


In [89]:
# evaluate performance
w2v.evaluate_performance(val_df)

  num = df[df > 1].dropna(axis=[0, 1], thresh=1).applymap(lambda n: choose(n, 2)).sum().sum() - np.float64(nis2 * njs2) / n2


Confusion Matrix:

Predicted  中转  出发  到达  售后  性能  机上  行程  计划  设计  预订  __all__
Actual                                                    
中转          3   5   0   2   0   3   0  10   0   3       26
出发          5  16  10   4   1   6   0  19   0  14       75
到达          0   2  11   0   0   4   0  17   0   0       34
售后          0   2   0  10   3   1   1   2   0   8       27
性能          0   1   0   0   8   1   0   3   0   9       22
机上          0   4   4   1   2  34   1   9   0   1       56
行程          0   0   0   0   3   0   0   2   0   8       13
计划          0   0   0   0   3   0   0   5   0   1        9
设计          0   0   0   0   3   0   0   0   1   1        5
预订          0   2   0   2   9   0   0   7   0  23       43
__all__     8  32  25  19  32  49   2  74   1  68      310


Overall Statistics:

Accuracy: 0.3580645161290323
95% CI: (0.304666114521708, 0.4142047918295947)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.6248443439288728e-06
Kappa: 0.27474723724429817
Mcnemar's Test P-

  return(np.float64(self.TPR) / self.FPR)


In [87]:
# implement oversampling
predicted_label_list_os = w2v.over_sampling()

(1472, 1000) (1472, 10)




Train on 1472 samples, validate on 310 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
acc: 35.81%


In [91]:
val_df = incorporate_pred_label(w2v)
val_df_os = map_label(w2v,val_df,predicted_label_list_os)
val_df_os.head()

310


Unnamed: 0,review,review_tokens,label,label_encoded,pred_label
1419,我投诉南航大连机场行李查询工作人员就坐在办公室玩手机不给客人解决问题,投诉 南航 大连 机场 行李 查询 工作人员 坐在 办公室 玩 手机 客人 解决问题,到达,2,计划
214,食物的新鲜度令人质疑，我吃的在飞机上上吐下泻，我伴侣也闹肚子了。非常不愉快的旅行经历...,食物 新鲜度 令人 质疑 吃 飞机 上吐下泻 伴侣 闹肚子 愉快 旅行 经历,机上,5,机上
1513,在北京中转办理托运行李希望改进 ，不要客户在服务大厅等托运行李，希望要改进,北京 中转 办理 托运 行李 希望 改进 客户 服务 大厅 托运 行李 希望 改进,到达,2,计划
1234,很完美。。就是我申请了WiFi这个机舱竟然没有,完美 申请 机舱,性能,4,性能
1106,进休息室才知道，真行啊你们南航！,进 休息室 真行 南航,出发,1,机上


In [92]:
# evaluate performance after over sampling
w2v.evaluate_performance(val_df_os)

Confusion Matrix:

Predicted  中转  出发  到达  售后  性能  机上  行程  计划  设计  预订  __all__
Actual                                                    
中转          3   5   0   2   0   3   0  10   0   3       26
出发          5  16  10   4   1   6   0  19   0  14       75
到达          0   2  11   0   0   4   0  17   0   0       34
售后          0   2   0  10   3   1   1   2   0   8       27
性能          0   1   0   0   8   1   0   3   0   9       22
机上          0   4   4   1   2  34   1   9   0   1       56
行程          0   0   0   0   3   0   0   2   0   8       13
计划          0   0   0   0   3   0   0   5   0   1        9
设计          0   0   0   0   3   0   0   0   1   1        5
预订          0   2   0   2   9   0   0   7   0  23       43
__all__     8  32  25  19  32  49   2  74   1  68      310


Overall Statistics:

Accuracy: 0.3580645161290323
95% CI: (0.304666114521708, 0.4142047918295947)
No Information Rate: ToDo
P-Value [Acc > NIR]: 1.6248443439288728e-06
Kappa: 0.27474723724429817
Mcnemar's Test P-

In [None]:
# summary: oversampling vs. non-oversampling has very similar result

In [96]:
from sklearn.metrics import roc_auc_score

# evaluate_ROC_AUC
y_val_true = val_df.label.values
y_val_pred = val_df.pred_label.values
score = roc_auc_score(y_val_true, y_val_pred)


ValueError: multiclass format is not supported