**SVM Pipeline**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
from time import time

In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn import decomposition
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.utils.multiclass import unique_labels

In [3]:
hotels=pd.read_csv("data/reviews_mod5.csv")

In [4]:
hotels = hotels.drop(hotels.columns[0], axis=1)

__Function Definitions__

In [5]:
def split_train_test_data(input_data, target, ratio=0.3, rand_state=42):
    return train_test_split(input_data, target, test_size=ratio, stratify=target, random_state=rand_state)

def apply_grid_search_cv(pipe, param_grid, X_train, y_train, X_test, y_test, print_flag=True, score_matrix=f1_score, n_jobs=-1, cv=5):
    grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring=make_scorer(score_matrix), n_jobs=n_jobs, cv=cv)
    t0 = time()
    res = grid_search.fit(X_train, y_train)
    if print_flag:
        print("done in %0.3fs" % (time() - t0))
        print("best params:")
        print(res.best_params_)
        print("Best cross-validation score: {:.3f}".format(grid_search.best_score_))
        print("Test-set score: {:.3f}".format(grid_search.score(X_test, y_test)))        
    return grid_search

def save_class_report_cv(grid_search, X_test, y_test, target_names, filename):
    y_pred = grid_search.best_estimator_.predict(X_test)
    report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
    df = pd.DataFrame(report).transpose()
    df.to_csv(filename)
    return df

In [6]:
def save_cross_validation_results(grid_search, filename, print_flag=True):
    param_keys = list(grid_search.cv_results_["params"][0].keys())
    matrix_list = ["mean_test_score", "std_test_score", "rank_test_score", "mean_fit_time"] 
    col = param_keys + matrix_list

    cv_results = []
    cv_results.append(col)    
    
    for param, score, std, rank, time in zip(grid_search.cv_results_["params"], grid_search.cv_results_["mean_test_score"],grid_search.cv_results_["std_test_score"],
                                             grid_search.cv_results_["rank_test_score"], grid_search.cv_results_["mean_fit_time"]):
        row_item = list(param.values())
        row_item.append(score)
        row_item.append(std)
        row_item.append(rank)
        row_item.append(time)
        cv_results.append(row_item)        
    cv_results = pd.DataFrame(cv_results) 
    header = cv_results.iloc[0] 
    cv_results = cv_results[1:]
    cv_results = cv_results.rename(columns = header)
    cv_results = cv_results.sort_values(by=['rank_test_score'])
    cv_results.to_csv(filename)
    if print_flag:
        print(cv_results.head(6))
        print(cv_results.tail(6))

__1. Choose input data and target__

In [7]:
X = hotels["norm"]
y = hotels["deceptive"]

__2. Split data__

In [8]:
X_train, X_test, y_train, y_test = split_train_test_data(X, y)

__3. Build pipeline__

In [9]:
steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC()),
]

pipe = Pipeline(steps)

__4. Construct parameters for cross validation testing__

In [10]:
param_grid1 = [
    {'vect__min_df': [1, 2, 5], 
        'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
        'clf__kernel': ['rbf'], 'clf__C': [0.1, 1, 10, 100], 
         'clf__gamma': [0.01, 0.1, 1, 10]}
]

In [11]:
# using a different kernel
param_grid2 = [
    {'vect__min_df': [1, 2, 5], 
         'vect__ngram_range': [(1, 1), (1, 2), (1, 3)], 
         'clf__kernel': ['linear'], 'clf__C': [0.1, 1, 10, 100]}
]

__5. Apply Grid Serach CV__

In [12]:
grid_search = apply_grid_search_cv(pipe, param_grid1, X_train, y_train, X_test, y_test)

done in 594.586s
best params:
{'clf__C': 100, 'clf__gamma': 0.01, 'clf__kernel': 'rbf', 'vect__min_df': 2, 'vect__ngram_range': (1, 3)}
Best cross-validation score: 0.876
Test-set score: 0.918


__6. Save Classification Report to File__

In [14]:
save_cross_validation_results(grid_search, "output/svm_norm1_validation_res.csv", True)

    clf__C clf__gamma clf__kernel vect__min_df vect__ngram_range  \
114    100       0.01         rbf            2            (1, 3)   
123    100        0.1         rbf            2            (1, 3)   
86      10        0.1         rbf            2            (1, 2)   
122    100        0.1         rbf            2            (1, 2)   
110    100       0.01         rbf            1            (1, 2)   
87      10        0.1         rbf            2            (1, 3)   

    mean_test_score std_test_score rank_test_score mean_fit_time  
114          0.8758      0.0219081               1       4.68852  
123        0.875361      0.0241683               2       4.57393  
86          0.87453      0.0188827               3       3.03406  
122        0.873756      0.0207876               4       3.10983  
110        0.873668         0.0141               5       3.23239  
87         0.872851      0.0247937               6       4.82305  
   clf__C clf__gamma clf__kernel vect__min_df vect__ng

In [15]:
save_class_report_cv(grid_search, X_test, y_test, ["truthful", "deceptive"], "output/svm_norm1_report.csv")

Unnamed: 0,f1-score,precision,recall,support
truthful,0.915254,0.931034,0.9,240.0
deceptive,0.918033,0.903226,0.933333,240.0
micro avg,0.916667,0.916667,0.916667,480.0
macro avg,0.916644,0.91713,0.916667,480.0
weighted avg,0.916644,0.91713,0.916667,480.0


__7. Save model to file__

In [16]:
from sklearn.externals import joblib
joblib.dump(grid_search.best_estimator_, 'output/svm_norm1.pkl')

['output/svm_norm1.pkl']

__8. SVM Model 2 - Linear Kernel__

In [17]:
y = hotels["deceptive"]
X_train, X_test, y_train, y_test = split_train_test_data(X, y)
grid_search2 = apply_grid_search_cv(pipe, param_grid2, X_train, y_train, X_test, y_test)

done in 138.400s
best params:
{'clf__C': 1, 'clf__kernel': 'linear', 'vect__min_df': 2, 'vect__ngram_range': (1, 3)}
Best cross-validation score: 0.874
Test-set score: 0.909


In [18]:
save_cross_validation_results(grid_search2, "output/svm_norm2_validation_res.csv", True)

   clf__C clf__kernel vect__min_df vect__ngram_range mean_test_score  \
15      1      linear            2            (1, 3)        0.874125   
33    100      linear            2            (1, 3)        0.873989   
24     10      linear            2            (1, 3)        0.873989   
20     10      linear            1            (1, 2)        0.871906   
29    100      linear            1            (1, 2)        0.871906   
32    100      linear            2            (1, 2)        0.871338   

   std_test_score rank_test_score mean_fit_time  
15      0.0210368               1       4.30018  
33      0.0216878               2       4.33107  
24      0.0216878               2       4.23071  
20      0.0157255               4         3.426  
29      0.0157255               4       3.34749  
32      0.0187081               6       2.94377  
   clf__C clf__kernel vect__min_df vect__ngram_range mean_test_score  \
34    100      linear            5            (1, 1)        0.828616   
2

In [19]:
save_class_report_cv(grid_search2, X_test, y_test, ["truthful", "deceptive"], "output/svm_norm2_report.csv")

Unnamed: 0,f1-score,precision,recall,support
truthful,0.907563,0.915254,0.9,240.0
deceptive,0.909091,0.901639,0.916667,240.0
micro avg,0.908333,0.908333,0.908333,480.0
macro avg,0.908327,0.908447,0.908333,480.0
weighted avg,0.908327,0.908447,0.908333,480.0


In [20]:
joblib.dump(grid_search.best_estimator_, 'output/svm_norm2.pkl')

['output/svm_norm2.pkl']

__9. Sentiment - RBF Kernel__

In [22]:
y = hotels["polarity"]
X_train, X_test, y_train, y_test = split_train_test_data(X, y)
grid_search_p = apply_grid_search_cv(pipe, param_grid1, X_train, y_train, X_test, y_test)


done in 554.610s
best params:
{'clf__C': 100, 'clf__gamma': 0.1, 'clf__kernel': 'rbf', 'vect__min_df': 2, 'vect__ngram_range': (1, 2)}
Best cross-validation score: 0.959
Test-set score: 0.962


In [25]:
save_cross_validation_results(grid_search_p, "output/svm_norm1_validation_res_sent.csv", True)

    clf__C clf__gamma clf__kernel vect__min_df vect__ngram_range  \
122    100        0.1         rbf            2            (1, 2)   
86      10        0.1         rbf            2            (1, 2)   
113    100       0.01         rbf            2            (1, 2)   
125    100        0.1         rbf            5            (1, 2)   
90      10        0.1         rbf            5            (1, 3)   
87      10        0.1         rbf            2            (1, 3)   

    mean_test_score std_test_score rank_test_score mean_fit_time  
122        0.958532      0.0143336               1       3.40155  
86         0.957676      0.0149199               2       3.13113  
113        0.957667       0.014396               3       2.82825  
125        0.956002      0.0151397               4       2.76727  
90         0.955983      0.0157994               5       3.59012  
87          0.95581       0.014025               6       4.22101  
   clf__C clf__gamma clf__kernel vect__min_df vect__ng

In [26]:
save_class_report_cv(grid_search_p, X_test, y_test, ["negative", "positive"], "output/svm_norm1_report_sent.csv")

Unnamed: 0,f1-score,precision,recall,support
negative,0.96281,0.954918,0.970833,240.0
positive,0.962185,0.970339,0.954167,240.0
micro avg,0.9625,0.9625,0.9625,480.0
macro avg,0.962497,0.962629,0.9625,480.0
weighted avg,0.962497,0.962629,0.9625,480.0


In [27]:
from sklearn.externals import joblib
joblib.dump(grid_search.best_estimator_, 'output/svm_norm1_sent.pkl')

['output/svm_norm1_sent.pkl']

__10. Sentiment Analysis - Linear Kernel__

In [28]:
grid_search_p2 = apply_grid_search_cv(pipe, param_grid2, X_train, y_train, X_test, y_test)

done in 131.224s
best params:
{'clf__C': 10, 'clf__kernel': 'linear', 'vect__min_df': 2, 'vect__ngram_range': (1, 2)}
Best cross-validation score: 0.959
Test-set score: 0.966


In [29]:
save_cross_validation_results(grid_search_p, "output/svm_norm2_validation_res_sent.csv", True)

    clf__C clf__gamma clf__kernel vect__min_df vect__ngram_range  \
122    100        0.1         rbf            2            (1, 2)   
86      10        0.1         rbf            2            (1, 2)   
113    100       0.01         rbf            2            (1, 2)   
125    100        0.1         rbf            5            (1, 2)   
90      10        0.1         rbf            5            (1, 3)   
87      10        0.1         rbf            2            (1, 3)   

    mean_test_score std_test_score rank_test_score mean_fit_time  
122        0.958532      0.0143336               1       3.40155  
86         0.957676      0.0149199               2       3.13113  
113        0.957667       0.014396               3       2.82825  
125        0.956002      0.0151397               4       2.76727  
90         0.955983      0.0157994               5       3.59012  
87          0.95581       0.014025               6       4.22101  
   clf__C clf__gamma clf__kernel vect__min_df vect__ng

In [30]:
save_class_report_cv(grid_search_p2, X_test, y_test, ["negative", "positive"], "output/svm_norm2_report_sent.csv")

Unnamed: 0,f1-score,precision,recall,support
negative,0.966942,0.959016,0.975,240.0
positive,0.966387,0.974576,0.958333,240.0
micro avg,0.966667,0.966667,0.966667,480.0
macro avg,0.966664,0.966796,0.966667,480.0
weighted avg,0.966664,0.966796,0.966667,480.0


In [31]:
from sklearn.externals import joblib
joblib.dump(grid_search.best_estimator_, 'output/svm_norm2_sent.pkl')

['output/svm_norm2_sent.pkl']