README:
- Updated: reorganized the code based on v5.
- Uses a vectorized method for multi-class classification.

In [1]:
import pandas as pd
import os
# NOTE(review): hard-coded, machine-specific working directory. Presumably it is
# needed so that the project-local modules imported below (modeling_main,
# tokenization, help, prepare_data, modeling) and the relative data path used
# later resolve -- confirm, and consider making the path configurable.
os.chdir('/Users/liyuan/desktop/CSAir/codes')
import numpy as np
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB  
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from modeling_main import ReviewClassify
from tokenization import Tokenization
from help import get_tokenized_sent, get_stopwords

# updated: 04/17/2019
from prepare_data import PrepareData
from modeling import Modeling

import keras
from keras.utils import to_categorical
from sklearn.multiclass import OneVsRestClassifier
import seaborn as sns
from sklearn.metrics import confusion_matrix
from pandas_ml import ConfusionMatrix

Using TensorFlow backend.


In [2]:
def predict_label(model, X_train, y_train, X_test=None):
    '''Fit a one-vs-rest classifier and return class probabilities for the test features.

    Parameters
    ----------
    model : estimator
        Base estimator wrapped in ``OneVsRestClassifier``.
    X_train, y_train
        Training features and targets, passed unchanged to ``fit``.
    X_test : optional
        Features to score. When omitted, the notebook-level ``X_test`` variable
        is used, which is what this function always did before the parameter
        existed, so existing three-argument calls keep working.

    Returns
    -------
    The result of ``predict_proba`` on the test features (one row per sample,
    one column per class).

    Raises
    ------
    NameError
        If ``X_test`` is not passed and no notebook-level ``X_test`` exists.
    '''
    if X_test is None:
        # Backward compatibility: older callers relied on the global X_test.
        try:
            X_test = globals()['X_test']
        except KeyError:
            raise NameError(
                "name 'X_test' is not defined; pass X_test explicitly"
            ) from None

    multi_class_clf = OneVsRestClassifier(model, n_jobs=-1)
    multi_class_clf.fit(X_train, y_train)
    # One probability per class for every row of X_test.
    return multi_class_clf.predict_proba(X_test)

def get_class_label_name(scores,idx):
    '''Return the label name predicted for the review at position ``idx``.

    The predicted class is the position of the largest value in
    ``scores[idx]``; that code is translated back to a name through the
    notebook-level ``labels_index`` mapping (label name -> code).
    Raises IndexError when no label carries that code.
    '''
    best_code = np.argmax(scores[idx])
    # Collect every name mapped to the winning code; the last match is returned.
    candidates = [name for name, code in labels_index.items() if code == best_code]
    return candidates.pop()

def add_pred_to_df(scores, df):
    '''Attach the predicted label of every scored review to ``df``.

    A ``pred_label`` column is written into ``df`` itself (the frame is
    modified in place) and the same frame is returned. One label is produced
    per row of ``scores`` via ``get_class_label_name``.
    '''
    df['pred_label'] = [
        get_class_label_name(scores, row) for row in range(len(scores))
    ]
    return df

def get_confusion_matrix(y_test,y_pred):
    '''Print confusion-matrix statistics for true vs. predicted labels.

    Builds a ``ConfusionMatrix`` from ``y_test`` and ``y_pred`` (in that
    order) and calls its ``print_stats``. Nothing is returned.
    '''
    ConfusionMatrix(y_test, y_pred).print_stats()

In [3]:
## Data preprocessing
# Load the labeled reviews from CSV and split them into train and test frames.
data_p = PrepareData()
data = data_p.load_data('../res/labeled_data_with_without_tk.csv')
train, test = data_p.split_data()

# X_train / X_test are the vectorized tokens that represent each review;
# y_train / y_test are the matching targets (presumably integer label codes,
# since they are fed to to_categorical below -- confirm in prepare_data).
X_train, y_train, X_test, y_test = data_p.preprocess_data()

# Label dictionary; later cells look codes up in it to recover label names.
labels_index = data_p.get_labels_index()
print(labels_index)

# Convert the encoded labels to one-hot vectors with keras' to_categorical.
# NOTE(review): num_classes is not passed, so the number of columns is inferred
# from the data; if a split lacks the highest label code the width would be
# smaller than len(labels_index) -- confirm whether that can happen.
y_train_transformed = to_categorical(y_train, dtype='float32')
y_test_transformed = to_categorical(y_test, dtype='float32')

training data has 1038 examples
test data has 512 examples
{'计划': 7, '机上': 5, '中转': 0, '售后': 3, '预订': 9, '设计': 8, '出发': 1, '性能': 4, '行程': 6, '到达': 2}


In [4]:
# Base estimator; predict_label wraps it in a one-vs-rest classifier.
model = LogisticRegression()
# Predicted class probabilities for each test review. predict_label is not
# given the test features here: it scores the notebook-level X_test.
scores = predict_label(model,X_train,y_train_transformed)
# add_pred_to_df writes a 'pred_label' column into `test` in place and returns
# that same frame, which is why `test.pred_label` can be read below.
test_with_predictions = add_pred_to_df(scores, test)

# NOTE(review): this rebinds y_test, which the preprocessing cell set from
# preprocess_data(), to the `label` column of `test`. Any cell that expects the
# earlier value must not run after this one.
y_test = test.label.values
y_pred = test.pred_label.values
# Print the confusion-matrix statistics for true vs. predicted labels.
get_confusion_matrix(y_test,y_pred)

Confusion Matrix:

Predicted  中转   出发  到达  售后  性能   机上  行程  计划  设计  预订  __all__
Actual                                                      
中转         24   19   1   1   0    8   0   0   0   1       54
出发          7   68  12   7   1   16   0   0   0   1      112
到达          2    8  21   0   0   16   0   0   0   0       47
售后          0   13   0  24   1    5   1   0   0   4       48
性能          0    8   0   4  15    9   2   0   0  17       55
机上          1   16   2   0   0   72   0   0   0   2       93
行程          0    6   0   2   0    2   5   0   0   3       18
计划          0    0   0   0   1    2   0   0   0   4        7
设计          0    1   0   0   4    2   0   0   1   4       12
预订          0    6   1   3   3    5   0   0   0  48       66
__all__    34  145  37  41  25  137   8   0   1  84      512


Overall Statistics:

Accuracy: 0.54296875
95% CI: (0.49870046846820293, 0.5867382423122843)
No Information Rate: ToDo
P-Value [Acc > NIR]: 7.479843526496029e-35
Kappa: 0.4567639257294429

In [6]:
# Preview the first rows of the test frame, now including the pred_label column.
test_with_predictions.head()

Unnamed: 0,review,review_tokens,label,label_encoded,pred_label
1520,行李箱丢了！在荷兰机场语言不通，你们航空公司也没个人来管事，解释！,行李箱 丢 荷兰 机场 语言不通 航空公司 管事 解释,到达,2,出发
1442,昨晚（11月11日）我搭乘cz8278次18:25长沙飞海口的航班，飞机下降临近海口时，我突...,昨晚 月 日 搭乘 cz 次 长沙 飞 海口 航班 飞机 下降 临近 海口 时 突发...,到达,2,机上
351,延误能不能早点通知，我就好修改航班，下午16点到机场，延误就早点通知，我就能修改18点的航班...,延误 早点 通知 修改 航班 下午 点到 机场 延误 早点 通知 修改 点 航班 延误 ...,中转,0,出发
352,严重晚点,晚点,中转,0,出发
578,我在网上补登里程、只能䃼登上9月24日南京-广州、国际段䃼登不上、麻烦处理一下。 zhu n...,网上 补登 里程 只能 䃼 登上 月 日 南京 广州 国际 段 䃼 登不上 麻烦 zh...,售后,3,售后
