In [43]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import os
from keras.utils import np_utils

from keras.preprocessing import sequence
from keras.models import Sequential
from sklearn.metrics import classification_report
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import text_to_word_sequence

from sklearn.model_selection import train_test_split
from sklearn import svm

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [44]:
# Pickle file holding the review DataFrame; passed to getData() further down.
mixeddataname = "mixed_20568.pkl"
# NOTE(review): not referenced by any cell visible here — confirm before removing.
modelname = 'SVM.h5'

# NOTE(review): dim_output, size_batch, num_epoch and l2_lambda are not used by
# the cells visible here (the classifier trained below is a LinearSVC) —
# presumably leftovers from a neural-network variant; confirm before removing.
dim_output = 2

size_batch = 10
num_epoch = 100
l2_lambda = 0.00001

# Shared NLTK helpers used by tokenize(): an English Snowball stemmer and a
# regex tokenizer that keeps runs of lowercase letters and apostrophes only
# (uppercase characters are not matched by this pattern).
stemmer = SnowballStemmer("english")
tokenizer = RegexpTokenizer("[a-z']+")

In [45]:
def tokenize(text):
    """Tokenize ``text`` and stem every token.

    The text is split with the module-level ``tokenizer`` and each resulting
    token is passed through the module-level ``stemmer``.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the stemmed tokens, in their original order.
    """
    raw_tokens = tokenizer.tokenize(text)
    return list(map(stemmer.stem, raw_tokens))

def get_tf(data, use_idf, max_df=1.0, min_df=1, ngram_range=(1,1)):
    """Fit a bag-of-words vectorizer on ``data`` and transform it.

    Args:
        data: Iterable of raw text documents handed to ``fit_transform``.
        use_idf: If truthy a ``TfidfVectorizer`` is used, otherwise a
            ``CountVectorizer``.
        max_df: Forwarded to the vectorizer's ``max_df`` argument.
        min_df: Forwarded to the vectorizer's ``min_df`` argument.
        ngram_range: Forwarded to the vectorizer's ``ngram_range`` argument.

    Returns:
        A ``(vectorizer, matrix)`` tuple: the fitted vectorizer and the
        document-term matrix produced by ``fit_transform``.
    """
    # Both vectorizers take the same configuration; only the class differs.
    vectorizer_cls = TfidfVectorizer if use_idf else CountVectorizer
    vectorizer = vectorizer_cls(
        max_df=max_df,
        min_df=min_df,
        stop_words='english',
        ngram_range=ngram_range,
        tokenizer=tokenize,
    )

    doc_term_matrix = vectorizer.fit_transform(data)
    return vectorizer, doc_term_matrix

In [46]:
def getData(filename, adv_size=20568):
    """Load the pickled reviews, vectorize them and split off the trailing slice.

    The ``reviewText`` column is turned into TF-IDF features with ``get_tf``
    (``max_df=0.90``, ``min_df=10``) and the ``overall`` column is used as the
    label. Rows are then cut positionally into a leading part and a trailing
    part of ``adv_size`` rows.

    Args:
        filename: Path of the pickle file read with ``pd.read_pickle``.
        adv_size: Number of rows in the trailing (``*_adv``) slice. Defaults to
            20568, the value that was previously hard-coded.

    Returns:
        ``(X, Y, X_adv, Y_adv)`` where ``X``/``X_adv`` are row slices of the
        TF-IDF matrix and ``Y``/``Y_adv`` the matching slices of
        ``data.overall``.

    Raises:
        ValueError: If the data has too few rows to cut off ``adv_size`` rows.
    """
    print("---------------")
    print("| Getting data...")
    print("---------------")

    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    data = pd.read_pickle(filename)
    _, tfidf_d = get_tf(data['reviewText'], use_idf=True, max_df=0.90, min_df=10)

    # NOTE(review): the boundary is shifted by one and the very last row is
    # dropped (slice end of -1), exactly as in the original code. This looks
    # like a possible off-by-one — confirm against how the pickle was built
    # before changing it, since it affects the reported metrics.
    split_at = tfidf_d.shape[0] - adv_size - 1
    if split_at < 0:
        raise ValueError(
            "data has %d rows, too few to split off %d trailing rows"
            % (tfidf_d.shape[0], adv_size)
        )

    X = tfidf_d[0:split_at]
    X_adv = tfidf_d[split_at:-1]

    # Labels are sliced with the same positional bounds as the features.
    # (The original referenced an undefined name, `tfidf_d_mixed`, here.)
    data_Y = data.overall
    Y = data_Y.iloc[0:split_at]
    Y_adv = data_Y.iloc[split_at:-1]
    return X, Y, X_adv, Y_adv

In [47]:
def splitData(x, y):
    """Split features and labels into train, validation and test parts.

    Two consecutive ``train_test_split`` calls are made, each holding out 20%
    with ``random_state=42``: first the test set is taken from the full data,
    then the validation set is taken from what remains.

    Args:
        x: Feature matrix.
        y: Labels aligned with ``x``.

    Returns:
        ``(X_train, X_validation, X_test, Y_train, Y_validation, Y_test)``.
    """
    split_opts = {"test_size": 0.2, "random_state": 42}

    # First cut: hold out the test set.
    x_rest, x_test, y_rest, y_test = train_test_split(x, y, **split_opts)
    # Second cut: hold out the validation set from the remainder.
    x_train, x_val, y_train, y_val = train_test_split(x_rest, y_rest, **split_opts)

    return x_train, x_val, x_test, y_train, y_val, y_test

In [48]:
from sklearn.metrics import accuracy_score
def train(x_train, y_train):
    """Fit a ``LinearSVC`` on the training data and return it.

    A banner and the wall-clock time before and after fitting are printed.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels aligned with ``x_train``.

    Returns:
        The fitted ``svm.LinearSVC`` model.
    """
    # Imported here because no cell imports `datetime`; without this the
    # function raises NameError on a fresh kernel.
    from datetime import datetime

    print("---------------")
    print("| Training...")
    print("---------------")

    print(str(datetime.now()))

    # Alternative kernel-SVM configuration (disabled):
#     model=svm.SVC(gamma=0.01,C=100.,decision_function_shape='ovo')
    model = svm.LinearSVC()
    model.fit(x_train, y_train)

    print(str(datetime.now()))

    return model

In [49]:
def test(model, x_test, y_test):
    """Evaluate ``model`` on a labelled set and print the results.

    Prints a banner, the wall-clock time before and after evaluation, the
    accuracy score and scikit-learn's classification report.

    Args:
        model: A fitted estimator exposing ``predict``.
        x_test: Feature matrix to predict on.
        y_test: True labels aligned with ``x_test``.

    Returns:
        None. All results are printed.
    """
    # Imported here because no cell imports `datetime`; without this the
    # function raises NameError on a fresh kernel.
    from datetime import datetime

    print("---------------")
    print("| Testing...")
    print("---------------")

    print(str(datetime.now()))

    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    print(classification_report(y_test, y_pred))

    print(str(datetime.now()))

In [50]:
# def main():
# X, Y = getData(dataname)
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# model = train(X_train,Y_train)
# test(model, X_test, Y_test)

In [51]:
# main()

### Test with Adversarial Dataset

In [54]:
# Load the mixed dataset: (X, Y) is the leading slice, (X_adv, Y_adv) the trailing one.
X, Y, X_adv, Y_adv = getData(mixeddataname)
# Hold out 20% of the leading slice for testing (fixed seed for repeatability).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Fit the classifier on the remaining 80%.
model = train(X_train,Y_train)

---------------
| Getting data...
---------------
---------------
| Training...
---------------
2018-06-18 03:01:14.052377
[LibLinear]2018-06-18 03:01:15.143194


In [55]:
# Evaluate on the 20% held out from the leading slice by train_test_split.
test(model, X_test, Y_test)

---------------
| Testing...
---------------
2018-06-18 03:01:15.148165
0.8670264488525865
             precision    recall  f1-score   support

          0       0.87      0.86      0.87     10392
          1       0.86      0.87      0.87     10176

avg / total       0.87      0.87      0.87     20568

2018-06-18 03:01:15.173098


In [56]:
# Evaluate on the trailing (X_adv, Y_adv) slice returned by getData; the model never saw these rows during fit.
test(model, X_adv, Y_adv)

---------------
| Testing...
---------------
2018-06-18 03:01:15.183071
0.7990567872423182
             precision    recall  f1-score   support

          0       0.78      0.83      0.81     10330
          1       0.82      0.77      0.79     10238

avg / total       0.80      0.80      0.80     20568

2018-06-18 03:01:15.198032
