**Naive Bayes Pipeline**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
from time import time

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn import decomposition
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.utils.multiclass import unique_labels

In [4]:
hotels=pd.read_csv("data/reviews_mod5.csv")

In [5]:
hotels = hotels.drop(hotels.columns[0], axis=1)

__Function Definitions__

In [6]:
def split_train_test_data(input_data, target, ratio=0.3, rand_state=42):
    return train_test_split(input_data, target, test_size=ratio, stratify=target, random_state=rand_state)

def apply_grid_search_cv(pipe, param_grid, X_train, y_train, X_test, y_test, print_flag=True, score_matrix=f1_score, n_jobs=-1, cv=5):
    grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring=make_scorer(score_matrix), n_jobs=n_jobs, cv=cv)
    t0 = time()
    res = grid_search.fit(X_train, y_train)
    if print_flag:
        print("done in %0.3fs" % (time() - t0))
        print("best params:")
        print(res.best_params_)
        print("Best cross-validation score: {:.3f}".format(grid_search.best_score_))
        print("Test-set score: {:.3f}".format(grid_search.score(X_test, y_test)))        
    return grid_search

def save_class_report_cv(grid_search, X_test, y_test, target_names, filename):
    y_pred = grid_search.best_estimator_.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
    df = pd.DataFrame(report).transpose()
    df.to_csv(filename)
    return df

In [18]:
def save_cross_validation_results(grid_search, filename, print_flag=True):
    param_keys = list(grid_search.cv_results_["params"][0].keys())
    matrix_list = ["mean_test_score", "std_test_score", "rank_test_score", "mean_fit_time"] 
    col = param_keys + matrix_list

    cv_results = []
    cv_results.append(col)    
    
    for param, score, std, rank, time in zip(grid_search.cv_results_["params"], grid_search.cv_results_["mean_test_score"],grid_search.cv_results_["std_test_score"],
                                             grid_search.cv_results_["rank_test_score"], grid_search.cv_results_["mean_fit_time"]):
        row_item = list(param.values())
        row_item.append(score)
        row_item.append(std)
        row_item.append(rank)
        row_item.append(time)
        cv_results.append(row_item)        
    cv_results = pd.DataFrame(cv_results) 
    header = cv_results.iloc[0] 
    cv_results = cv_results[1:]
    cv_results = cv_results.rename(columns = header)
    cv_results = cv_results.sort_values(by=['rank_test_score'])
    cv_results.to_csv(filename)
    if print_flag:
        print(cv_results.head(6))
        print(cv_results.tail(6))

__1. Choose input data and target__

In [8]:
X = hotels["norm"]
y = hotels["deceptive"]

__2. Split data__

In [9]:
X_train, X_test, y_train, y_test = split_train_test_data(X, y)

__3. Build pipeline__

In [10]:
steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]

pipe = Pipeline(steps)

__4. Construct parameters for cross validation testing__

In [11]:
param_grid = [
    {'vect__stop_words': ['english', None],
        'vect__min_df': [1, 2, 5], 
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'clf__alpha': (1, 1e-2, 1e-3)}
]

__5. Apply Grid Serach CV__

In [12]:
nb_grid_search = apply_grid_search_cv(pipe, param_grid, X_train, y_train, X_test, y_test)

done in 54.964s
best params:
{'clf__alpha': 0.01, 'vect__min_df': 2, 'vect__ngram_range': (1, 3), 'vect__stop_words': None}
Best cross-validation score: 0.883
Test-set score: 0.888


__6. Save Classification Report to File__

In [19]:
save_cross_validation_results(nb_grid_search, "output/nb_norm1_validation_res.csv", True)

   clf__alpha vect__min_df vect__ngram_range vect__stop_words mean_test_score  \
30       0.01            2            (1, 3)             None        0.883126   
16          1            5            (1, 2)             None        0.881645   
11          1            2            (1, 3)          english        0.881515   
12          1            2            (1, 3)             None        0.880358   
9           1            2            (1, 2)          english        0.879127   
22       0.01            1            (1, 2)             None        0.879048   

   std_test_score rank_test_score mean_fit_time  
30      0.0173371               1       1.89402  
16      0.0128866               2      0.934683  
11      0.0318756               3       1.14633  
12      0.0176003               4       1.94367  
9       0.0184544               5      0.608048  
22      0.0148647               6      0.892472  
   clf__alpha vect__min_df vect__ngram_range vect__stop_words mean_test_score  \
2

In [20]:
save_class_report_cv(nb_grid_search, X_test, y_test, ["truthful", "deceptive"], "output/nb_norm1_report.csv")

Unnamed: 0,f1-score,precision,recall,support
truthful,0.883227,0.900433,0.866667,240.0
deceptive,0.887526,0.871486,0.904167,240.0
micro avg,0.885417,0.885417,0.885417,480.0
macro avg,0.885376,0.885959,0.885417,480.0
weighted avg,0.885376,0.885959,0.885417,480.0


__7. Save model to file__

In [21]:
from sklearn.externals import joblib
joblib.dump(nb_grid_search.best_estimator_, 'output/nb_norm1.pkl')

['output/nb_norm1.pkl']

__8. Sentiment Analysis__

In [22]:
y = hotels["polarity"]
X_train, X_test, y_train, y_test = split_train_test_data(X, y)
nb_grid_search_p = apply_grid_search_cv(pipe, param_grid, X_train, y_train, X_test, y_test)


done in 55.699s
best params:
{'clf__alpha': 0.01, 'vect__min_df': 1, 'vect__ngram_range': (1, 3), 'vect__stop_words': None}
Best cross-validation score: 0.956
Test-set score: 0.958


In [23]:
save_cross_validation_results(nb_grid_search, "output/nb_norm1_validation_res_sent.csv", True)

   clf__alpha vect__min_df vect__ngram_range vect__stop_words mean_test_score  \
30       0.01            2            (1, 3)             None        0.883126   
16          1            5            (1, 2)             None        0.881645   
11          1            2            (1, 3)          english        0.881515   
12          1            2            (1, 3)             None        0.880358   
9           1            2            (1, 2)          english        0.879127   
22       0.01            1            (1, 2)             None        0.879048   

   std_test_score rank_test_score mean_fit_time  
30      0.0173371               1       1.89402  
16      0.0128866               2      0.934683  
11      0.0318756               3       1.14633  
12      0.0176003               4       1.94367  
9       0.0184544               5      0.608048  
22      0.0148647               6      0.892472  
   clf__alpha vect__min_df vect__ngram_range vect__stop_words mean_test_score  \
2

In [24]:
save_class_report_cv(nb_grid_search_p, X_test, y_test, ["negative", "positive"], "output/nb_norm1_report_sent.csv")

Unnamed: 0,f1-score,precision,recall,support
negative,0.958333,0.958333,0.958333,240.0
positive,0.958333,0.958333,0.958333,240.0
micro avg,0.958333,0.958333,0.958333,480.0
macro avg,0.958333,0.958333,0.958333,480.0
weighted avg,0.958333,0.958333,0.958333,480.0


In [25]:
from sklearn.externals import joblib
joblib.dump(nb_grid_search.best_estimator_, 'output/nb_norm1_sent.pkl')

['output/nb_norm1_sent.pkl']