README: 
- Reorganize the code based on v5.
- Use a vectorized method for multi-class classification.
- Handle cross-validation with stratified k-fold cross-validation => [reference](https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py)

In [2]:
import pandas as pd
import os
os.chdir('/Users/liyuan/desktop/CSAir/codes')
import numpy as np
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB  
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb

from modeling_main import ReviewClassify
from tokenization import Tokenization
from help import get_tokenized_sent, get_stopwords

from prepare_data import PrepareData
from modeling import Modeling

import keras
from keras.utils import to_categorical
from sklearn.multiclass import OneVsRestClassifier
import seaborn as sns
from sklearn.metrics import confusion_matrix
from pandas_ml import ConfusionMatrix

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

Using TensorFlow backend.


In [3]:
def predict_label(model,X_train,y_train,X_eval=None):
    '''Fit a one-vs-rest classifier and return class probabilities for the evaluation set.

    Parameters
    ----------
    model : estimator wrapped in ``OneVsRestClassifier``.
    X_train, y_train : training features and labels passed to ``fit``.
    X_eval : features to score. Optional; when omitted the notebook-level
        ``X_test`` variable is used, which is what the original version of
        this function always did.

    Returns
    -------
    The result of ``predict_proba`` on the evaluation features
    (one row of per-class scores per review).
    '''
    # Fall back to the global only when the caller did not pass the data
    # explicitly, so existing three-argument calls behave exactly as before.
    features = X_test if X_eval is None else X_eval
    multi_class_clf = OneVsRestClassifier(model, n_jobs=-1)
    multi_class_clf.fit(X_train, y_train)
    # each review gets one probability per class
    return multi_class_clf.predict_proba(features)

def get_class_label_name(scores,idx,label_map=None):
    '''Return the label name with the highest score for one review.

    Parameters
    ----------
    scores : indexable of per-class score rows (e.g. a 2-D array).
    idx : position of the review in ``scores``.
    label_map : dict mapping label name -> encoded class index. Optional;
        when omitted the notebook-level ``labels_index`` dict is used,
        as in the original version of this function.

    Returns
    -------
    The label name whose encoded index equals ``argmax(scores[idx])``.
    If several names share that index, the last one in dict order wins.

    Raises
    ------
    IndexError
        If no label name maps to the winning index.
    '''
    if label_map is None:
        label_map = labels_index
    label_encoded = np.argmax(scores[idx])
    matches = [name for name, code in label_map.items() if code == label_encoded]
    if not matches:
        # Same exception type as the original bare ``.pop()`` on an empty
        # list, but with a message that says what went wrong.
        raise IndexError(
            'no label name is mapped to class index {}'.format(label_encoded))
    return matches[-1]

def add_pred_to_df(scores, df):
    '''Attach the predicted label of every review to ``df``.

    For each row of ``scores`` the predicted label name is looked up with
    ``get_class_label_name`` and the resulting list is stored in a new
    ``pred_label`` column. ``df`` is modified in place and also returned.
    '''
    df['pred_label'] = [
        get_class_label_name(scores, row) for row in range(len(scores))
    ]
    return df

def get_confusion_matrix(y_test,y_pred):
    '''Print the confusion-matrix report for true vs. predicted labels.

    Builds a ``ConfusionMatrix`` from ``y_test`` (actual) and ``y_pred``
    (predicted) and prints its statistics. Nothing is returned.
    '''
    ConfusionMatrix(y_test, y_pred).print_stats()

In [82]:
## Data Preprocess
# Load the labeled reviews and split them into train / test frames.
# NOTE(review): the calls on data_p below appear to be order-dependent
# (each presumably relies on state set by the previous one) - confirm in PrepareData.
data_p = PrepareData()
data = data_p.load_data('../res/labeled_data_with_without_tk.csv')
train, test = data_p.split_data()

# X_train / X_test are the vectorized tokens that represent each review;
# y_train / y_test are the encoded labels (passed to to_categorical below).
X_train, y_train, X_test, y_test = data_p.preprocess_data()

# Label dictionary: label name -> encoded class index (see printed output).
# Other cells read this as a notebook-level variable.
labels_index = data_p.get_labels_index()
print(labels_index)

# Convert the encoded labels to one-hot vectors with keras' to_categorical.
y_train_transformed = to_categorical(y_train, dtype='float32')
y_test_transformed = to_categorical(y_test, dtype='float32')

training data has 1038 examples
test data has 512 examples
{'计划': 7, '机上': 5, '中转': 0, '售后': 3, '预订': 9, '设计': 8, '出发': 1, '性能': 4, '行程': 6, '到达': 2}


In [7]:
def modeling_with_stratified_CV(parameters,classifier,
                                data_path='../res/labeled_data_with_without_tk.csv',
                                n_splits=6):
    '''Grid-search a classifier with stratified k-fold CV and report test results.

    Parameters
    ----------
    parameters : dict passed to ``GridSearchCV`` as ``param_grid``.
    classifier : estimator to tune; it must support ``predict_proba``.
    data_path : CSV file handed to ``PrepareData.load_data``. Defaults to the
        path that was previously hard-coded.
    n_splits : number of stratified folds. Defaults to the previous value, 6.

    Side effects
    ------------
    Prints the best parameters, the class-probability array for the test
    set and the confusion-matrix report. Returns nothing.
    '''
    # data preprocess
    data_p = PrepareData()
    data_p.load_data(data_path)
    train, test = data_p.split_data()
    # X_train, X_test are the vectorized tokens that represent each review
    X_train, y_train, X_test, y_test = data_p.preprocess_data()
    # label dictionary (name -> encoded index), inverted once for lookups
    labels_index = data_p.get_labels_index()
    index_to_label = {code: name for name, code in labels_index.items()}

    cv = StratifiedKFold(n_splits=n_splits)
    clf = GridSearchCV(classifier, param_grid=parameters, cv=cv).fit(X_train, y_train)
    print('best params found:',clf.best_params_)
    # Probability of each test sample for each class in the model.
    proba = clf.predict_proba(X_test)
    print(proba)

    ## pick the label with the highest probability for each review
    # Columns of predict_proba follow clf.classes_, so the winning column is
    # translated through classes_ rather than assumed to equal the encoded
    # label. np.argmax takes the first column on an exact tie.
    encoded_pred = clf.classes_[np.argmax(proba, axis=1)]
    labels_pred = [index_to_label[code] for code in encoded_pred]

    # Work on a copy so the frame returned by split_data is not mutated.
    test = test.copy()
    test['label_pred'] = labels_pred
    get_confusion_matrix(test.label,test.label_pred)

In [91]:
## SVC
# Grid over kernel type and regularization strength C.
parameters = {'kernel':('rbf', 'sigmoid','linear','poly'), 'C':[0.01,0.1,1,10]}
# probability=True is needed because the helper calls predict_proba.
classifier = SVC(probability=True, random_state=0)
# Pass the grid defined above; the old call used `params`, a name that is
# never defined in this notebook and only worked from leftover kernel state.
modeling_with_stratified_CV(parameters,classifier)

training data has 1038 examples
test data has 512 examples
{'计划': 7, '机上': 5, '中转': 0, '售后': 3, '预订': 9, '设计': 8, '出发': 1, '性能': 4, '行程': 6, '到达': 2}




best params found: {'C': 1, 'kernel': 'linear'}
[[0.0682616  0.71582707 0.08192494 ... 0.01786782 0.0044849  0.00973469]
 [0.02495059 0.03520913 0.03886073 ... 0.0101893  0.00453401 0.00986273]
 [0.30916227 0.55167862 0.00960373 ... 0.00852011 0.00440798 0.03519039]
 ...
 [0.02693399 0.20677117 0.0099614  ... 0.02608977 0.00960379 0.11591225]
 [0.00516026 0.01115477 0.00249253 ... 0.00672308 0.01097724 0.92266424]
 [0.19303635 0.26402876 0.13225824 ... 0.01090331 0.00482456 0.08883291]]
corresponding classes: [0 1 2 3 4 5 6 7 8 9]
Confusion Matrix:

Predicted  中转   出发  到达  售后  性能  机上  行程  计划  设计  预订  __all__
Actual                                                     
中转         31   18   1   0   0   3   0   0   0   1       54
出发          9   61  17   8   2  10   1   0   1   3      112
到达          2    6  28   0   0  11   0   0   0   0       47
售后          0   13   0  25   1   1   1   0   2   5       48
性能          1    5   0   4  23   2   2   0   2  16       55
机上          5   12   3  

In [10]:
## logistic regression
# Grid over penalty type and inverse regularization strength C.
parameters = {'penalty':('l2', 'l1'), 'C':[0.01,0.1,1,10]}
# The grid includes 'l1', which the current default solver (lbfgs) does not
# support. Pin 'liblinear', which handles both penalties and was the default
# in older scikit-learn releases, so the search runs regardless of version.
# NOTE(review): recent scikit-learn releases may warn about liblinear on
# multiclass targets - confirm against the pinned version.
classifier = LogisticRegression(solver='liblinear')
modeling_with_stratified_CV(parameters,classifier)

training data has 1038 examples
test data has 512 examples




best params found: {'C': 1, 'penalty': 'l1'}
[[0.08620987 0.18791991 0.30881663 ... 0.02604515 0.02716385 0.08687026]
 [0.04419972 0.13000754 0.09866927 ... 0.02329662 0.02429727 0.07299397]
 [0.36781948 0.40697256 0.02438994 ... 0.02042302 0.02130023 0.02286403]
 ...
 [0.08057969 0.25028551 0.06349886 ... 0.03464047 0.03612836 0.12716453]
 [0.03059674 0.11014725 0.03203041 ... 0.01826327 0.01904772 0.66219275]
 [0.12718712 0.16021288 0.03661956 ... 0.0223019  0.02325982 0.06098841]]
Confusion Matrix:

Predicted  中转   出发  到达  售后  性能   机上  行程  计划  设计  预订  __all__
Actual                                                      
中转         29   11   1   1   0   11   0   0   0   1       54
出发          8   61  13   7   1   20   0   0   0   2      112
到达          2    2  28   0   0   15   0   0   0   0       47
售后          0   11   0  27   1    6   1   0   0   2       48
性能          1    6   0   4  16   10   2   0   2  14       55
机上          3   12   3   0   0   71   0   0   0   4       93
行程  

In [13]:
## XGBoost (takes too long)
# Hyper-parameter grid handed to the stratified-CV helper.
parameters = {
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1, 1, 10],
    'alpha': [0.01, 0.1, 1, 10],
    'n_estimators': [5, 10, 15],
}
classifier = xgb.XGBClassifier()
modeling_with_stratified_CV(parameters, classifier)

training data has 1038 examples
test data has 512 examples




best params found: {'alpha': 0.01, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 5}
[[0.09920567 0.11119309 0.11131599 ... 0.09874339 0.0960158  0.09678072]
 [0.09713943 0.10487739 0.10077821 ... 0.09727766 0.09724468 0.09801938]
 [0.11060452 0.0992583  0.09915625 ... 0.09823646 0.09820315 0.09898549]
 ...
 [0.10103454 0.09919333 0.09808734 ... 0.09781905 0.09778588 0.09856489]
 [0.09744515 0.09895479 0.09785146 ... 0.09758382 0.09755073 0.11711488]
 [0.09696807 0.10774171 0.0967259  ... 0.09646133 0.09642863 0.09719682]]
Confusion Matrix:

Predicted  中转  出发  到达  售后  性能   机上  行程  计划  设计  预订  __all__
Actual                                                     
中转         27   6   4   1   1   14   0   0   0   1       54
出发         11  58  18   9   0   11   1   0   1   3      112
到达          3   1  24   0   2   17   0   0   0   0       47
售后          1   5   2  27   1    5   1   2   0   4       48
性能          1   5   0   3  15   10   1   1   3  16       55
机上          8   7   3  

In [12]:
## multinomial NB
# Tune only the smoothing parameter alpha.
classifier = MultinomialNB()
parameters = dict(alpha=[0.01, 0.1, 1, 10])
modeling_with_stratified_CV(parameters, classifier)

training data has 1038 examples
test data has 512 examples




best params found: {'alpha': 0.1}
[[0.04287305 0.85944752 0.0308812  ... 0.00393023 0.00170072 0.00560451]
 [0.00368573 0.00144378 0.00821474 ... 0.00196728 0.00138827 0.00102845]
 [0.14359389 0.57234101 0.01697995 ... 0.00471432 0.0038382  0.10255332]
 ...
 [0.05126846 0.23771444 0.01794068 ... 0.0138162  0.00811133 0.15947844]
 [0.00711496 0.14430022 0.00297002 ... 0.00215432 0.00608253 0.7888894 ]
 [0.15764098 0.21831626 0.28708494 ... 0.00572531 0.00502447 0.07252129]]
Confusion Matrix:

Predicted  中转   出发  到达  售后  性能  机上  行程  计划  设计  预订  __all__
Actual                                                     
中转         13   27   1   2   0  10   0   0   0   1       54
出发          8   72  12   8   1   7   1   0   0   3      112
到达          2   18  18   0   0   9   0   0   0   0       47
售后          1   15   0  23   1   0   0   0   2   6       48
性能          0    7   0   3  19   3   1   0   1  21       55
机上          5   19   3   0   0  63   0   0   0   3       93
行程          0    7   0 