readme: 
- for building 10 classifiers
- implement cross validation for each label class + hyperparameter tuning

In [6]:
import pandas as pd
import os
os.chdir('/Users/liyuan/desktop/CSAir/codes')
import numpy as np
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt

from sklearn.naive_bayes import MultinomialNB  
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

from modeling_main import ReviewClassify
from tokenization import Tokenization
from help import get_tokenized_sent, get_stopwords

from prepare_data import PrepareData
from modeling import Modeling

import keras
from keras.utils import to_categorical
from sklearn.multiclass import OneVsRestClassifier
import seaborn as sns
from sklearn.metrics import confusion_matrix
from pandas_ml import ConfusionMatrix

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 

In [7]:
def predict_label(model,X_train,y_train):
    ''' predict label for each review, by picking the class with highest probability'''
    multi_class_clf = OneVsRestClassifier(model, n_jobs=-1)
    multi_class_clf.fit(X_train, y_train)
    # each review has proba for 10 classes
    scores = multi_class_clf.predict_proba(X_test)
    return scores

def get_class_label_name(scores,idx):
    ''' input a review index, and get the predicted label 
    (the one with highest probability) for this review'''
    label_encoded = np.argmax(scores[idx])
    return [key for key in labels_index if labels_index[key] ==label_encoded ].pop()

def add_pred_to_df(scores, df):
    '''add predicted labels to original df'''
    predicted_labels = []
    for i in range(len(scores)):
        label_pred = get_class_label_name(scores,i)
        predicted_labels.append(label_pred)
    # add predicted labels to original test df
    df['pred_label'] = predicted_labels
    return df

def get_confusion_matrix(y_test,y_pred):
    '''get confusion matrix (tp,tn,fp,fn) for each class'''
    cm = ConfusionMatrix(y_test, y_pred)
    cm.print_stats()

In [3]:
def load_data(file_path):
    data = pd.read_csv(file_path)
    # updated: drop na values
    data = data.dropna()
    return data

def split_data(data):
    train, test = train_test_split(data, test_size = 0.33, random_state=42)
#     print('training data has %d examples' %len(train))
#     print('test data has %d examples' %len(test))
    return train, test

def preprocess_data(data, train, test):
    '''use countvectorizer and tf-idf transformer to get valid one-hot encoding for reviews'''
    # use countVectorizer for one-hot encoding
    count_v0= CountVectorizer()
    counts_all = count_v0.fit_transform(data['review_tokens'])
    count_v1= CountVectorizer(vocabulary=count_v0.vocabulary_)  
    counts_train = count_v1.fit_transform(train.review_tokens)

    count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_)
    counts_test = count_v2.fit_transform(test.review_tokens)

    # implement tf-idf
    tfidftransformer = TfidfTransformer()
    train_data = tfidftransformer.fit(counts_train).transform(counts_train)
    test_data = tfidftransformer.fit(counts_test).transform(counts_test)

    X_train = train_data
    # y_train = train.label_encoded
    y_train = train.label_or_not.values
    X_test = test_data
    # y_test = test.label_encoded
    y_test = test.label_or_not.values
    return X_train, y_train, X_test, y_test

# multi-class modeling
def multi_class_preprocess(data,label):
    '''data preprocess for multi-class'''
    data.loc[data.label == label, 'label_or_not'] = 1
    data.loc[data.label != label, 'label_or_not'] = 0
    return data

def get_class_prior(data, label):
    '''get class prior
    class_prior = class_size / data_size'''
    class_prior = len(data[data['label']== label]) / len(data)
    return class_prior

def get_class_threshold(class_prob, class_prior):
    '''use class_priors are percentile for each class label '''
    # there are 10 class in total
    # col = 1 represent 'positive'
    # first index represents the class, e.g., prob_scores[0][:,1] -> prob. when labeled as class 0 for each review
    # class_prob = prob_scores[:,1] 
    # get the higher bound percentile
    percentile = (1 - class_prior)*100
    threshold = np.percentile(class_prob, percentile) 
    return threshold

def get_label(idx, labels, positive_review_dct):
    '''input an index and output a list of predicted labels'''
    label_pred = []
    for label in labels:
        if idx in positive_review_dct[label]:
            label_pred.append(label)
    return label_pred

def get_prob(data, model, parameters, label):
    '''get probability predicted for one label class'''
    label_data = multi_class_preprocess(data,label)
    # split data
    train, test = split_data(label_data)
    # vectorize reviews
    X_train, y_train, X_test, y_test = preprocess_data(label_data, train, test)
    # implement gridSearch CV
    model = GridSearchCV(model, parameters, cv=5)

    # modeling
    model.fit(X_train,y_train)
    print('best params found:',model.best_params_)
    
    # get the proba score for one class (using the best model from gridSearch to predict)
    class_prob = model.predict_proba(X_test)[:,1] 
    # e.g., class_proba: [0.15,0.3,...] => 512 records in total
    class_prob_values = class_prob.reshape(-1,1)
    return class_prob_values


def manual_classify(data, label, class_prob):
    '''classify by setting manual threshold of probability (for one class)'''
    # get class_prior
    class_prior = get_class_prior(data, label)
    # set manual threshol
    threshold = get_class_threshold(class_prob, class_prior)
    class_labels = []
    proba_dct = {}
    for score in class_prob:
        if score > threshold:
            label = 1
        else:
            label = 0
        class_labels.append(label)
    positive_reviews = [ idx for idx in range(len(class_labels)) if class_labels[idx] == 1  ]
    return positive_reviews

# get label_picked for review with multiple labels predicted
def pick_label(review_idx, label_pred):
    '''compare proba of each label_pred, and pick the one with higher proba'''
    # get label_idx
    label_idx_dct = {}
    for idx,label in enumerate(labels):
        label_idx_dct[label] = idx
  
    # pick label
    label_proba_dct = {}
    for i,label_pred in enumerate(label_pred[review_idx]):
        label_index = label_idx_dct.get(label_pred)
        label_proba_dct[label_pred] = prob_matrix[review_idx,label_index]
    label_picked = [key for key in label_proba_dct if label_proba_dct[key] == max(label_proba_dct.values())]
    return label_picked

In [None]:

data = load_data('../res/labeled_data_with_without_tk.csv')
# train, test data that include all label classes
train, test = split_data(data)
class_prob_values_dct = {}
labels = data.label.unique().tolist()
print(labels)
for label in labels:
    model = xgb.XGBClassifier()
    parameters = {'max_depth':[3,5,10], 'learning_rate':[0.01,0.1,1,10],'alpha':[0.01,0.1,1,10], 'n_estimators' :[5,10,15]}
    class_prob_values = get_prob(data, model, parameters, label)
    class_prob_values_dct[label] = class_prob_values
    
prob_matrix = np.hstack((list(class_prob_values_dct.values())))
prob_matrix

# get a dictionary: {0:[0.13,0.25,...], 1:[..], 2:[..],.. 511:[..]} that list the probability 
# of each user review across all 10 classes (each list within the values of the dictionary has 10 proba values)

['计划', '机上', '中转', '售后', '预订', '设计', '出发', '性能', '行程', '到达']
best params found: {'alpha': 0.01, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 5}
best params found: {'alpha': 0.01, 'learning_rate': 1, 'max_depth': 5, 'n_estimators': 5}
best params found: {'alpha': 0.01, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 10}
best params found: {'alpha': 0.01, 'learning_rate': 1, 'max_depth': 3, 'n_estimators': 10}
best params found: {'alpha': 0.01, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 5}
best params found: {'alpha': 0.01, 'learning_rate': 10, 'max_depth': 3, 'n_estimators': 5}


In [10]:
def get_pred_performance(prob_matrix, labels, test):
    '''convert prediction result in proba_matrix to acutal df, and compute confusion matrix'''
    # add prediction results into a dictionary
    positive_review_dct = {}
    for i,label in enumerate(labels):
        positive_reviews = manual_classify(data, label, prob_matrix[:,i])
        positive_review_dct[label] = positive_reviews
    
    # get reversed dictionary: key is the index of user review, value is the labels predicted
    test_label_pred = {}
    for idx in range(len(test)):
        label_pred = get_label(idx,labels,positive_review_dct)
        test_label_pred[idx] = label_pred
    
    for i in range(len(test)):
    # handle review with multiple labels
        if len(test_label_pred[i]) > 1: 
            label_picked = pick_label(i,test_label_pred)
            test_label_pred[i] = label_picked

    # reput prediction into original dataframe
    test_ = test.copy()
    test_['predicted_labels'] = list(test_label_pred.values())
    def formatting(row):
        '''remove [] in the prediction result'''
        if len(row) > 0:
            return row[0]
        else:
            # np.nan is float, not supported for confusion matrix calculation, so change it to 'N/A'
            return 'N/A'
    test_['predicted_labels'] = test_['predicted_labels'].apply(formatting)
    test_.head()
    
    # get confusion matrix
    get_confusion_matrix(test_.label,test_.predicted_labels)

In [11]:
get_pred_performance(prob_matrix, labels, test)

Confusion Matrix:

Predicted  N/A  中转  出发  到达  售后  性能  机上  行程  计划  设计  预订  __all__
Actual                                                         
N/A          0   0   0   0   0   0   0   0   0   0   0        0
中转          16  18   6   1   1   0  11   0   0   0   1       54
出发          19   7  45  15  10   0  11   3   0   0   2      112
到达          15   1   1  20   0   0   9   0   1   0   0       47
售后          12   1   4   1  24   0   0   1   2   0   3       48
性能          35   0   2   0   3   0   2   3   2   0   8       55
机上          26   5   3   2   0   0  52   0   0   0   5       93
行程           5   0   0   0   0   0   1  10   2   0   0       18
计划           3   0   0   0   0   0   0   0   2   0   2        7
设计          10   1   0   0   0   0   0   0   0   0   1       12
预订          18   0   1   2   4   0   2   0   0   0  39       66
__all__    159  33  62  41  42   0  88  17   9   0  61      512


Overall Statistics:

Accuracy: 0.41015625
95% CI: (0.3672010335798694, 0.4541547668