__LIWC__

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
from time import time

In [3]:
from sklearn.svm import LinearSVC, SVC
from sklearn import decomposition
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [4]:
def split_train_test_data(input_data, target, ratio=0.3, rand_state=42):
    """Split features and labels into stratified train and test partitions.

    Parameters
    ----------
    input_data : features, handed to ``train_test_split`` unchanged.
    target : labels; also used as the ``stratify`` argument so both
        partitions keep the label proportions of ``target``.
    ratio : forwarded as ``test_size`` (default 0.3).
    rand_state : forwarded as ``random_state`` (default 42).

    Returns
    -------
    The value returned by ``train_test_split`` for the two inputs, which the
    notebook unpacks as ``X_train, X_test, y_train, y_test``.
    """
    split_options = {
        "test_size": ratio,
        "stratify": target,
        "random_state": rand_state,
    }
    return train_test_split(input_data, target, **split_options)

def apply_grid_search_cv(pipe, param_grid, X_train, y_train, X_test, y_test, print_flag=True, score_matrix=f1_score, n_jobs=-1, cv=5):
    """Fit a ``GridSearchCV`` over ``pipe`` and optionally print a summary.

    Parameters
    ----------
    pipe : estimator or pipeline to tune.
    param_grid : grid passed to ``GridSearchCV(param_grid=...)``.
    X_train, y_train : data the search is fitted on.
    X_test, y_test : held-out data; only used for the printed test-set score.
    print_flag : when true, print the fit time, best parameters, best
        cross-validation score and test-set score.
    score_matrix : metric function wrapped with ``make_scorer`` and used as
        the search's ``scoring``.
    n_jobs, cv : forwarded to ``GridSearchCV``.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    scorer = make_scorer(score_matrix)
    grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring=scorer, n_jobs=n_jobs, cv=cv)

    started = time()
    fitted = grid_search.fit(X_train, y_train)
    elapsed = time() - started

    # Quiet mode: nothing to report, hand back the fitted search.
    if not print_flag:
        return grid_search

    print("done in %0.3fs" % elapsed)
    print("best params:")
    print(fitted.best_params_)
    print("Best cross-validation score: {:.3f}".format(grid_search.best_score_))
    print("Test-set score: {:.3f}".format(grid_search.score(X_test, y_test)))
    return grid_search

def save_class_report_cv(grid_search, X_test, y_test, target_names, filename):
    """Write the best estimator's classification report to a CSV file.

    Parameters
    ----------
    grid_search : fitted search exposing ``best_estimator_``.
    X_test, y_test : data the report is computed on.
    target_names : display names passed to ``classification_report``.
    filename : path of the CSV file to write.

    Returns
    -------
    pandas.DataFrame
        The report with one row per entry of the report dict (transposed so
        the metrics become columns) - the same frame that is written to disk.
    """
    predictions = grid_search.best_estimator_.predict(X_test)
    report_dict = classification_report(
        y_test,
        predictions,
        target_names=target_names,
        output_dict=True,
    )
    report_frame = pd.DataFrame(report_dict).T
    report_frame.to_csv(filename)
    return report_frame

In [5]:
def save_cross_validation_results(grid_search, filename, print_flag=True):
    """Save a ranked table of grid-search candidates to CSV.

    One row is produced per candidate in ``grid_search.cv_results_``. The
    columns are the tuned parameters followed by ``mean_test_score``,
    ``std_test_score``, ``rank_test_score`` and ``mean_fit_time``; rows are
    sorted by ``rank_test_score`` (best first).

    Parameters
    ----------
    grid_search : object exposing a ``cv_results_`` mapping with the keys
        ``"params"`` (list of dicts) and the four metric keys named above.
    filename : path of the CSV file to write.
    print_flag : when true, print the first six rows of the table.

    Returns
    -------
    pandas.DataFrame
        The sorted table that was written to ``filename``.
    """
    results = grid_search.cv_results_
    metric_cols = ["mean_test_score", "std_test_score", "rank_test_score", "mean_fit_time"]

    # Collect parameter names over *all* candidates, in first-seen order.
    # A param grid given as several dicts can yield candidates with different
    # keys, so the first candidate alone is not a reliable header.
    param_cols = []
    for params in results["params"]:
        for name in params:
            if name not in param_cols:
                param_cols.append(name)

    # Build records keyed by column name so a value can never land under the
    # wrong header; parameters a candidate does not use are left missing.
    records = []
    for position, params in enumerate(results["params"]):
        record = {name: params.get(name) for name in param_cols}
        for metric in metric_cols:
            record[metric] = results[metric][position]
        records.append(record)

    cv_results = pd.DataFrame(records, columns=param_cols + metric_cols)
    # Keep the one-based candidate numbering used in previously written files.
    cv_results.index = range(1, len(cv_results) + 1)
    # Stable sort: candidates with equal rank keep their original order.
    cv_results = cv_results.sort_values(by="rank_test_score", kind="mergesort")
    cv_results.to_csv(filename)
    if print_flag:
        print(cv_results.head(6))
    return cv_results

In [6]:
# Load the LIWC-annotated hotel review table.
hotels = pd.read_csv("data/LIWC2015_mod5.csv")

In [7]:
# Let pandas display up to 500 columns so wide frames are not elided.
pd.set_option('display.max_columns', 500)

In [8]:
# Drop the first column, selected by position.
# NOTE(review): this cell is not idempotent - each re-run removes one more
# column from `hotels`. Re-run the read_csv cell before running it again.
hotels = hotels.drop(hotels.columns[0], axis=1)

In [9]:
# Summary statistics for every column; include="all" adds the non-numeric ones.
hotels.describe(include="all")

Unnamed: 0,deceptive,hotel,polarity,source,text,text_length,lower_case,lc_no_punct,norm,norm_lemma,norm_stem,norm_lemma_stopword,norm_stem_stopword,ADJ_count,ADP_count,ADV_count,AUX_count,CCONJ_count,DET_count,INTJ_count,NOUN_count,NUM_count,PART_count,PRON_count,PROPN_count,PUNCT_count,SCONJ_count,SYM_count,VERB_count,X_count,class,WC,Analytic,Clout,Authentic,Tone,WPS,Sixltr,Dic,function,pronoun,ppron,i,we,you,shehe,they,ipron,article,prep,auxverb,adverb,conj,negate,verb,adj,compare,interrog,number,quant,affect,posemo,negemo,anx,anger,sad,social,family,friend,female,male,cogproc,insight,cause,discrep,tentat,certain,differ,percept,see,hear,feel,bio,body,health,sexual,ingest,drives,affiliation,achieve,power,reward,risk,focuspast,focuspresent,focusfuture,relativ,motion,space,time,work,leisure,home,money,relig,death,informal,swear,netspeak,assent,nonflu,filler,AllPunc,Period,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
count,1600.0,1600,1600.0,1600,1600,1600.0,1600,1600,1600,1600,1600,1600,1600,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0,1600.0
unique,,20,,3,1596,,1596,1596,1596,1596,1596,1596,1596,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
top,,allegro,,MTurk,My daughter and I woke in the morning wanting ...,,the omni was chosen for it's location whichwor...,the omni was chosen for it s location whichwor...,the omni was chosen for it s location whichwor...,my daughter and i woke in the morning wanting ...,i d been search for a cool non chain hotel for...,disappointed stay chicago monoco stay ma...,omni wa chosen locat whichwork perfectli ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,,80,,800,2,,2,2,2,2,2,2,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,0.5,,0.5,,,806.39125,,,,,,,,0.099315,0.105465,0.073256,0.0,0.0,0.124403,0.0,0.249986,0.01351,0.0,0.063546,0.0,0.0,0.0,0.0,0.196686,0.00016,2.5,149.373125,71.799387,54.789069,62.433331,66.70815,16.388063,17.162369,88.76245,53.783787,11.534156,7.6238,3.905612,2.055,0.660881,0.218681,0.78355,3.895581,9.8068,12.801825,8.915756,5.446219,6.384119,1.677969,15.360131,6.0504,2.071431,0.933763,1.591325,1.943287,5.907506,4.553938,1.12075,0.163806,0.199269,0.203594,7.5057,0.359981,0.442594,0.325388,0.327144,8.955763,1.1454,0.797769,1.566944,1.757069,1.761981,2.854931,2.1814,0.910044,0.462844,0.480525,1.801225,0.252956,0.231481,0.010756,1.20815,8.111631,3.2419,1.098881,2.213175,1.785962,0.384556,7.265406,6.288981,0.765519,17.468119,2.915563,11.037831,4.88925,2.436375,3.751388,2.768525,1.240731,0.02215,0.02055,0.406875,0.018712,0.043844,0.149638,0.181806,0.00905,14.055519,6.612544,3.591619,0.087137,0.066175,0.076775,0.767031,0.681212,0.0,1.280112,0.5267,0.36615
std,0.500156,,0.500156,,,467.260647,,,,,,,,0.033587,0.026239,0.027617,0.0,0.0,0.028481,0.0,0.038522,0.014884,0.0,0.030471,0.0,0.0,0.0,0.0,0.034197,0.001135,1.118384,87.739431,18.697766,25.82183,26.023894,34.070231,7.226241,4.38268,4.14512,5.090643,3.915191,3.297535,3.034989,2.542078,1.105959,0.570362,1.037403,1.975704,2.581639,2.861762,2.497208,2.447376,1.966778,1.399532,3.391309,2.658416,1.498134,0.925727,1.507279,1.40312,2.870457,3.081086,1.28789,0.397944,0.464278,0.42458,4.030817,0.734593,0.696959,0.721457,0.650519,3.416747,1.065891,0.839687,1.235211,1.336183,1.384616,1.940218,1.597797,1.086196,0.74582,0.750665,1.694788,0.488442,0.484716,0.091442,1.498268,3.705509,3.115135,1.071992,1.435299,1.474134,0.62348,3.413554,3.136674,0.876047,4.175587,1.56133,3.249295,2.529072,1.69943,2.162818,1.720677,1.26993,0.142495,0.137384,0.680934,0.124303,0.21132,0.410125,0.454386,0.08024,5.047531,2.942552,2.465118,0.397933,0.322842,0.351067,1.384438,1.239694,0.0,1.450216,1.094702,0.850661
min,0.0,,0.0,,,151.0,,,,,,,,0.019048,0.0,0.0,0.0,0.0,0.0,0.0,0.115942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,1.0,25.0,3.86,3.48,1.0,1.0,5.23,5.45,71.79,31.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.69,1.09,0.0,0.0,0.0,2.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,,0.0,,,487.0,,,,,,,,0.07533,0.090526,0.054348,0.0,0.0,0.106557,0.0,0.223832,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.175676,0.0,1.75,89.0,60.275,32.465,42.3475,35.755,12.725,14.1675,86.3,50.9325,8.89,5.3275,1.39,0.0,0.0,0.0,0.0,2.61,8.1075,10.9875,7.28,3.73,5.05,0.6875,13.1525,4.21,1.02,0.0,0.43,0.9675,3.92,2.21,0.0,0.0,0.0,0.0,4.62,0.0,0.0,0.0,0.0,6.63,0.0,0.0,0.69,0.7975,0.81,1.47,1.1,0.0,0.0,0.0,0.5775,0.0,0.0,0.0,0.0,5.26,0.6175,0.0,1.23,0.76,0.0,5.0,4.095,0.0,14.7775,1.83,8.8475,3.08,1.27,2.2,1.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.52,4.96,1.74,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.5,,0.5,,,700.0,,,,,,,,0.096541,0.106667,0.072289,0.0,0.0,0.12416,0.0,0.247807,0.010363,0.0,0.061377,0.0,0.0,0.0,0.0,0.197346,0.0,2.5,128.0,75.015,54.525,66.74,81.23,15.4,16.985,89.125,54.43,11.585,7.765,3.45,0.935,0.0,0.0,0.49,3.75,9.785,12.85,8.94,5.26,6.32,1.49,15.38,5.71,1.85,0.8,1.325,1.75,5.49,3.965,0.82,0.0,0.0,0.0,7.045,0.0,0.0,0.0,0.0,8.86,1.03,0.69,1.45,1.61,1.54,2.675,1.95,0.69,0.0,0.0,1.48,0.0,0.0,0.0,0.79,7.785,2.455,0.95,2.02,1.52,0.0,7.505,5.85,0.59,17.43,2.78,10.82,4.71,2.15,3.475,2.61,0.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.395,6.095,3.34,0.0,0.0,0.0,0.0,0.0,0.0,0.92,0.0,0.0
75%,1.0,,1.0,,,987.5,,,,,,,,0.1193,0.121527,0.090109,0.0,0.0,0.141747,0.0,0.274336,0.020426,0.0,0.083333,0.0,0.0,0.0,0.0,0.22,0.0,3.25,183.0,87.0525,77.235,84.275,98.87,18.6325,19.6525,91.55,57.2725,14.18,9.88,5.94,3.555,0.98,0.0,1.2,5.0,11.49,14.75,10.37,6.85,7.6125,2.5,17.6875,7.485,2.94,1.49,2.3525,2.7625,7.32,6.435,1.81,0.0,0.0,0.2325,10.095,0.51,0.76,0.3525,0.48,11.11,1.7225,1.29,2.29,2.59,2.5,3.98,3.03,1.43,0.78,0.8,2.58,0.4,0.32,0.0,1.87,10.47,5.0725,1.6325,3.03,2.5,0.7025,9.64,8.0,1.22,20.1325,3.81,13.06,6.38,3.23,4.9425,3.7225,1.85,0.0,0.0,0.68,0.0,0.0,0.0,0.0,0.0,16.67,7.69,5.13,0.0,0.0,0.0,1.015,1.01,0.0,2.0525,0.6825,0.38


In [10]:
# Label-based column slice: every column from "WC" through "OtherP", both ends included.
liwc = hotels.loc[:, "WC":"OtherP"]

In [11]:
# Peek at the first two rows of the LIWC feature slice.
liwc.head(2)

Unnamed: 0,WC,Analytic,Clout,Authentic,Tone,WPS,Sixltr,Dic,function,pronoun,ppron,i,we,you,shehe,they,ipron,article,prep,auxverb,adverb,conj,negate,verb,adj,compare,interrog,number,quant,affect,posemo,negemo,anx,anger,sad,social,family,friend,female,male,cogproc,insight,cause,discrep,tentat,certain,differ,percept,see,hear,feel,bio,body,health,sexual,ingest,drives,affiliation,achieve,power,reward,risk,focuspast,focuspresent,focusfuture,relativ,motion,space,time,work,leisure,home,money,relig,death,informal,swear,netspeak,assent,nonflu,filler,AllPunc,Period,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
0,107,61.61,61.05,35.01,77.39,13.38,13.08,82.24,49.53,6.54,3.74,0.93,1.87,0.93,0.0,0.0,2.8,7.48,10.28,10.28,9.35,6.54,1.87,12.15,3.74,0.0,0.0,4.67,0.93,4.67,3.74,0.93,0.0,0.0,0.0,6.54,0.93,0.0,0.0,0.0,5.61,0.0,0.0,0.93,0.0,0.93,3.74,0.0,0.0,0.0,0.0,3.74,0.0,0.0,0.0,3.74,6.54,3.74,0.93,0.93,1.87,0.0,9.35,1.87,0.0,14.95,1.87,8.41,6.54,0.93,3.74,2.8,1.87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.82,11.21,1.87,0.0,0.0,0.0,0.0,0.0,0,0.0,3.74,0.0
1,44,97.59,41.03,19.82,94.75,8.8,29.55,77.27,31.82,2.27,0.0,0.0,0.0,0.0,0.0,0.0,2.27,6.82,13.64,4.55,4.55,2.27,0.0,6.82,13.64,6.82,2.27,2.27,6.82,4.55,4.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.55,0.0,0.0,0.0,2.27,0.0,2.27,6.82,6.82,0.0,0.0,4.55,0.0,0.0,0.0,4.55,4.55,0.0,0.0,2.27,2.27,0.0,6.82,0.0,0.0,13.64,2.27,11.36,0.0,4.55,4.55,6.82,2.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,11.36,9.09,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,4.55


In [12]:
# List the names of the selected LIWC columns.
liwc.columns

Index(['WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic',
       'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they',
       'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate',
       'verb', 'adj', 'compare', 'interrog', 'number', 'quant', 'affect',
       'posemo', 'negemo', 'anx', 'anger', 'sad', 'social', 'family', 'friend',
       'female', 'male', 'cogproc', 'insight', 'cause', 'discrep', 'tentat',
       'certain', 'differ', 'percept', 'see', 'hear', 'feel', 'bio', 'body',
       'health', 'sexual', 'ingest', 'drives', 'affiliation', 'achieve',
       'power', 'reward', 'risk', 'focuspast', 'focuspresent', 'focusfuture',
       'relativ', 'motion', 'space', 'time', 'work', 'leisure', 'home',
       'money', 'relig', 'death', 'informal', 'swear', 'netspeak', 'assent',
       'nonflu', 'filler', 'AllPunc', 'Period', 'Comma', 'Colon', 'SemiC',
       'QMark', 'Exclam', 'Dash', 'Quote', 'Apostro', 'Parenth', 'OtherP'],
      dtype='o

In [13]:
# Number of LIWC feature columns.
len(liwc.columns)

93

__1. Choose input data and target__

In [14]:
# Features: the LIWC columns only.
X = liwc

In [15]:
# Target: the `deceptive` column.
y = hotels["deceptive"]

__2. Split data__

In [16]:
# Stratified split using the helper's defaults (ratio=0.3, rand_state=42).
X_train, X_test, y_train, y_test = split_train_test_data(X, y)

__3. Build pipeline__

In [17]:
# LIWC-only model: standardise every feature, then classify with an SVC.
steps_svm = [("scale", StandardScaler()), ("clf", SVC())]

pipe_svm = Pipeline(steps=steps_svm)

__4. Construct parameters for cross validation testing__

In [18]:
# Candidate SVC settings; the `clf__` prefix addresses the 'clf' pipeline step.
param_grid_svm = [
    {
        "clf__kernel": ["rbf"],
        "clf__C": [10, 100],
        "clf__gamma": [0.01, 0.1],
    },
]

__5. Apply Grid Search CV__

In [19]:
# Tune the LIWC-only pipeline with the helper's defaults (f1_score scorer,
# cv=5, n_jobs=-1); timing and scores are printed because print_flag defaults to True.
svm_grid_search = apply_grid_search_cv(pipe_svm, param_grid_svm, X_train, y_train, X_test, y_test)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


done in 3.958s
best params:
{'clf__C': 100, 'clf__gamma': 0.01, 'clf__kernel': 'rbf'}
Best cross-validation score: 0.771
Test-set score: 0.750


__6. Save Classification Report to File__

In [20]:
# Write the test-set classification report to CSV and display the returned frame.
# NOTE(review): assumes deceptive == 0 is "truthful" and 1 is "deceptive" - confirm
# that the target_names order matches the label order.
save_class_report_cv(svm_grid_search, X_test, y_test, ["truthful", "deceptive"], "output/svm_liwc1_report.csv")

  Xt = transform.transform(Xt)


Unnamed: 0,f1-score,precision,recall,support
truthful,0.75,0.75,0.75,240.0
deceptive,0.75,0.75,0.75,240.0
micro avg,0.75,0.75,0.75,480.0
macro avg,0.75,0.75,0.75,480.0
weighted avg,0.75,0.75,0.75,480.0


In [21]:
# Write the ranked per-candidate CV table to CSV; the trailing True is
# print_flag, so the top rows are printed as well.
save_cross_validation_results(svm_grid_search, "output/svm_liwc1_validation_res.csv", True)

  clf__C clf__gamma clf__kernel mean_test_score std_test_score  \
3    100       0.01         rbf        0.770545      0.0209934   
1     10       0.01         rbf        0.769851      0.0204686   
2     10        0.1         rbf        0.626371      0.0921483   
4    100        0.1         rbf        0.626371      0.0921483   

  rank_test_score mean_fit_time  
3               1      0.176827  
1               2       0.18311  
2               3      0.186729  
4               3      0.147308  


__7. Save model to file__

In [22]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package and fall back to the vendored
# copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the refitted best pipeline; dump() returns the list of written files.
joblib.dump(svm_grid_search.best_estimator_, 'output/svm_liwc1.pkl')

['output/svm_liwc1.pkl']

__8. Sentiment Analysis__

In [23]:
# Same LIWC features and pipeline, now predicting the `polarity` column.
# NOTE: this rebinds y, X_train, X_test, y_train and y_test, so any earlier
# cell re-run after this one will see the polarity split instead.
y = hotels["polarity"]
X_train, X_test, y_train, y_test = split_train_test_data(X, y)
svm_grid_search_p = apply_grid_search_cv(pipe_svm, param_grid_svm, X_train, y_train, X_test, y_test)

done in 0.942s
best params:
{'clf__C': 10, 'clf__gamma': 0.01, 'clf__kernel': 'rbf'}
Best cross-validation score: 0.892
Test-set score: 0.873


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


In [24]:
# Write the ranked CV table for the polarity search; print_flag defaults to
# True, so the top rows are printed too.
save_cross_validation_results(svm_grid_search_p, "output/svm_liwc_sent_validation_res.csv")

  clf__C clf__gamma clf__kernel mean_test_score std_test_score  \
1     10       0.01         rbf        0.892358      0.0136111   
3    100       0.01         rbf        0.892179      0.0137926   
2     10        0.1         rbf        0.768246       0.018295   
4    100        0.1         rbf        0.768246       0.018295   

  rank_test_score mean_fit_time  
1               1      0.112298  
3               2      0.103919  
2               3      0.180516  
4               3      0.160171  


In [25]:
# Write the polarity classification report to CSV and display the returned frame.
# NOTE(review): assumes polarity == 0 is "negative" and 1 is "positive" - confirm
# that the target_names order matches the label order.
save_class_report_cv(svm_grid_search_p, X_test, y_test, ["negative", "positive"], "output/svm_liwc_report_sent.csv")

  Xt = transform.transform(Xt)


Unnamed: 0,f1-score,precision,recall,support
negative,0.873181,0.871369,0.875,240.0
positive,0.872651,0.874477,0.870833,240.0
micro avg,0.872917,0.872917,0.872917,480.0
macro avg,0.872916,0.872923,0.872917,480.0
weighted avg,0.872916,0.872923,0.872917,480.0


__Combined TFIDF with LIWC Analysis__

In [26]:
# Column-wise concat: the `norm` text column followed by all LIWC columns.
hotels_comb = pd.concat([hotels["norm"], liwc], axis=1)
hotels_comb.head(2)

Unnamed: 0,norm,WC,Analytic,Clout,Authentic,Tone,WPS,Sixltr,Dic,function,pronoun,ppron,i,we,you,shehe,they,ipron,article,prep,auxverb,adverb,conj,negate,verb,adj,compare,interrog,number,quant,affect,posemo,negemo,anx,anger,sad,social,family,friend,female,male,cogproc,insight,cause,discrep,tentat,certain,differ,percept,see,hear,feel,bio,body,health,sexual,ingest,drives,affiliation,achieve,power,reward,risk,focuspast,focuspresent,focusfuture,relativ,motion,space,time,work,leisure,home,money,relig,death,informal,swear,netspeak,assent,nonflu,filler,AllPunc,Period,Comma,Colon,SemiC,QMark,Exclam,Dash,Quote,Apostro,Parenth,OtherP
0,we stayed for a one night getaway with family ...,107,61.61,61.05,35.01,77.39,13.38,13.08,82.24,49.53,6.54,3.74,0.93,1.87,0.93,0.0,0.0,2.8,7.48,10.28,10.28,9.35,6.54,1.87,12.15,3.74,0.0,0.0,4.67,0.93,4.67,3.74,0.93,0.0,0.0,0.0,6.54,0.93,0.0,0.0,0.0,5.61,0.0,0.0,0.93,0.0,0.93,3.74,0.0,0.0,0.0,0.0,3.74,0.0,0.0,0.0,3.74,6.54,3.74,0.93,0.93,1.87,0.0,9.35,1.87,0.0,14.95,1.87,8.41,6.54,0.93,3.74,2.8,1.87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.82,11.21,1.87,0.0,0.0,0.0,0.0,0.0,0,0.0,3.74,0.0
1,triple a rate with upgrade to view room was le...,44,97.59,41.03,19.82,94.75,8.8,29.55,77.27,31.82,2.27,0.0,0.0,0.0,0.0,0.0,0.0,2.27,6.82,13.64,4.55,4.55,2.27,0.0,6.82,13.64,6.82,2.27,2.27,6.82,4.55,4.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.55,0.0,0.0,0.0,2.27,0.0,2.27,6.82,6.82,0.0,0.0,4.55,0.0,0.0,0.0,4.55,4.55,0.0,0.0,2.27,2.27,0.0,6.82,0.0,0.0,13.64,2.27,11.36,0.0,4.55,4.55,6.82,2.27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,11.36,9.09,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,4.55


__1. Choose input data and target__

In [27]:
# Features: text column plus LIWC columns; target: the `deceptive` column.
# Both names are rebound here, replacing the values used in the LIWC-only section.
X = hotels_comb
y = hotels["deceptive"]

__2. Split data__

In [28]:
# Stratified split of the combined frame using the helper's defaults (ratio=0.3, rand_state=42).
X_train, X_test, y_train, y_test = split_train_test_data(X, y)

__3. Build pipeline__

__4. Construct parameters for cross validation testing__

In [29]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

In [30]:
class ItemExcluder(BaseEstimator, TransformerMixin):
    """Transformer that returns a DataFrame without one named column.

    Parameters
    ----------
    key : label of the column to remove.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, data_frame):
        """Return ``data_frame`` with the ``key`` column dropped."""
        remaining = data_frame.drop(columns=[self.key])
        return remaining
    
class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks ``data_frame[key]`` out of its input.

    Parameters
    ----------
    key : label used to index the incoming DataFrame.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, data_frame):
        """Return the ``key`` entry of ``data_frame``."""
        return data_frame[self.key]

In [31]:
# Two feature branches are built from the same DataFrame and joined by
# FeatureUnion before the classifier:
#   terms - the 'norm' column -> CountVectorizer -> TfidfTransformer
#   liwc  - every column except 'norm' -> StandardScaler
steps_combined = [
    ('preprocess', FeatureUnion(transformer_list=[
        ('terms', Pipeline(steps=[
            ('selector', ItemSelector(key='norm')),
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
        ])),
        ('liwc', Pipeline(steps=[
            ('selector', ItemExcluder(key='norm')),
            ('scale', StandardScaler()),
        ])),
    ])),
    ('clf', SVC()),
]

pipe_comb = Pipeline(steps=steps_combined)

In [32]:
# Search space for the combined model. The double-underscore paths walk the
# nested steps: preprocess -> terms -> vect for the vectorizer, clf for the SVC.
param_grid_comb = [
    {
        'preprocess__terms__vect__stop_words': ['english', None],
        'preprocess__terms__vect__min_df': [1, 2],
        'preprocess__terms__vect__ngram_range': [(1, 2), (1, 3)],
        'clf__kernel': ['rbf'],
        'clf__gamma': [0.01, 0.1],
        'clf__C': [1, 10, 100],
    },
]

__5. Apply Grid Search CV__

In [33]:
# Tune the combined text + LIWC pipeline with the helper's defaults
# (f1_score scorer, cv=5, n_jobs=-1) and print the summary.
svm_comb_grid_search = apply_grid_search_cv(pipe_comb, param_grid_comb, X_train, y_train, X_test, y_test)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


done in 153.009s
best params:
{'clf__C': 1, 'clf__gamma': 0.01, 'clf__kernel': 'rbf', 'preprocess__terms__vect__min_df': 2, 'preprocess__terms__vect__ngram_range': (1, 3), 'preprocess__terms__vect__stop_words': 'english'}
Best cross-validation score: 0.789


  Xt = transform.transform(Xt)


Test-set score: 0.799


__6. Save Reports to File__

In [34]:
# Write the ranked per-candidate CV table for the combined model to CSV; the
# trailing True is print_flag, so the top rows are printed as well.
save_cross_validation_results(svm_comb_grid_search, "output/svm_lwic_comb1_validation_res.csv", True)

  clf__C clf__gamma clf__kernel preprocess__terms__vect__min_df  \
7      1       0.01         rbf                               2   
5      1       0.01         rbf                               2   
6      1       0.01         rbf                               2   
1      1       0.01         rbf                               1   
2      1       0.01         rbf                               1   
3      1       0.01         rbf                               1   

  preprocess__terms__vect__ngram_range preprocess__terms__vect__stop_words  \
7                               (1, 3)                             english   
5                               (1, 2)                             english   
6                               (1, 2)                                None   
1                               (1, 2)                             english   
2                               (1, 2)                                None   
3                               (1, 3)                        

In [35]:
# Write the combined model's test-set classification report to CSV and display it.
# NOTE(review): assumes deceptive == 0 is "truthful" and 1 is "deceptive" - confirm
# that the target_names order matches the label order.
save_class_report_cv(svm_comb_grid_search, X_test, y_test, ["truthful", "deceptive"], "output/svm_lwic_comb1_report.csv")

  Xt = transform.transform(Xt)


Unnamed: 0,f1-score,precision,recall,support
truthful,0.78355,0.815315,0.754167,240.0
deceptive,0.799197,0.771318,0.829167,240.0
micro avg,0.791667,0.791667,0.791667,480.0
macro avg,0.791373,0.793317,0.791667,480.0
weighted avg,0.791373,0.793317,0.791667,480.0


__7. Save model to file__

In [36]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package and fall back to the vendored
# copy only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the refitted best combined pipeline; dump() returns the list of written files.
joblib.dump(svm_comb_grid_search.best_estimator_, 'output/svm_lwic_comb1.pkl')

['output/svm_lwic_comb1.pkl']

__8. Sentiment__

In [37]:
# Same combined features and pipeline, now predicting the `polarity` column.
# NOTE: this rebinds y, X_train, X_test, y_train and y_test, so any earlier
# cell re-run after this one will see the polarity split instead.
y = hotels["polarity"]
X_train, X_test, y_train, y_test = split_train_test_data(X, y)
grid_search_p = apply_grid_search_cv(pipe_comb, param_grid_comb, X_train, y_train, X_test, y_test)



  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


done in 125.580s
best params:
{'clf__C': 1, 'clf__gamma': 0.01, 'clf__kernel': 'rbf', 'preprocess__terms__vect__min_df': 1, 'preprocess__terms__vect__ngram_range': (1, 2), 'preprocess__terms__vect__stop_words': 'english'}
Best cross-validation score: 0.904
Test-set score: 0.892


  Xt = transform.transform(Xt)


In [38]:
# Write the ranked CV table for the combined polarity search to CSV; the
# trailing True is print_flag, so the top rows are printed as well.
save_cross_validation_results(grid_search_p, "output/svm_lwic_comb1_validation_res_sent.csv", True)

  clf__C clf__gamma clf__kernel preprocess__terms__vect__min_df  \
1      1       0.01         rbf                               1   
2      1       0.01         rbf                               1   
3      1       0.01         rbf                               1   
4      1       0.01         rbf                               1   
5      1       0.01         rbf                               2   
6      1       0.01         rbf                               2   

  preprocess__terms__vect__ngram_range preprocess__terms__vect__stop_words  \
1                               (1, 2)                             english   
2                               (1, 2)                                None   
3                               (1, 3)                             english   
4                               (1, 3)                                None   
5                               (1, 2)                             english   
6                               (1, 2)                        

In [39]:
# Write the combined model's polarity classification report to CSV and display it.
# NOTE(review): assumes polarity == 0 is "negative" and 1 is "positive" - confirm
# that the target_names order matches the label order.
save_class_report_cv(grid_search_p, X_test, y_test, ["negative", "positive"], "output/svm_lwic_comb1_report_sent.csv")

  Xt = transform.transform(Xt)


Unnamed: 0,f1-score,precision,recall,support
negative,0.895277,0.882591,0.908333,240.0
positive,0.892178,0.905579,0.879167,240.0
micro avg,0.89375,0.89375,0.89375,480.0
macro avg,0.893727,0.894085,0.89375,480.0
weighted avg,0.893727,0.894085,0.89375,480.0
