In [11]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import os
from keras.utils import np_utils

from keras.preprocessing import sequence
from keras.models import Sequential
from sklearn.metrics import classification_report
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import text_to_word_sequence

from sklearn.model_selection import train_test_split
from sklearn import svm

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [12]:
# --- Input data / model artefact names (smaller dataset) ---
mixeddataname = "mixed_s.pkl"
modelname = 'SVM_s.h5'

# Row count of each of the four trailing slices that getData cuts off.
num_adv = 20535

# --- Numeric settings (not referenced by the cells visible in this notebook) ---
dim_output = 2
size_batch, num_epoch = 10, 100
l2_lambda = 1e-5

# --- Text preprocessing helpers used by tokenize() ---
tokenizer = RegexpTokenizer("[a-z']+")
stemmer = SnowballStemmer("english")

In [13]:
def tokenize(text):
    """Tokenize ``text`` and stem every token.

    Uses the module-level ``tokenizer`` to split the text and the module-level
    ``stemmer`` to reduce each token.

    Returns:
        list: ``stemmer.stem(token)`` for each token, in tokenizer order.
    """
    words = tokenizer.tokenize(text)
    stems = []
    for word in words:
        stems.append(stemmer.stem(word))
    return stems

def get_tf(data, use_idf, max_df=1.0, min_df=1, ngram_range=(1,1)):
    """Fit a text vectorizer on ``data`` and return it with the transformed data.

    Args:
        data: Collection of raw documents passed to ``fit_transform``.
        use_idf: Truthy selects ``TfidfVectorizer``; falsy selects
            ``CountVectorizer``.
        max_df: Forwarded unchanged to the vectorizer.
        min_df: Forwarded unchanged to the vectorizer.
        ngram_range: Forwarded unchanged to the vectorizer.

    Returns:
        tuple: ``(vectorizer, matrix)`` where ``matrix`` is the result of
        ``vectorizer.fit_transform(data)``.
    """
    # Both vectorizers receive the same keyword arguments; only the class differs.
    vectorizer_cls = TfidfVectorizer if use_idf else CountVectorizer
    vectorizer = vectorizer_cls(
        max_df=max_df,
        min_df=min_df,
        stop_words='english',
        ngram_range=ngram_range,
        tokenizer=tokenize,
    )
    matrix = vectorizer.fit_transform(data)
    return vectorizer, matrix

In [14]:
def getData(filename, n_adv=None):
    """Load a pickled frame, TF-IDF its text, and cut it into five row slices.

    The frame is read with ``pd.read_pickle``; its ``reviewText`` column is
    vectorised through ``get_tf`` (TF-IDF, ``max_df=0.90``, ``min_df=10``) and
    its ``overall`` column supplies the labels. The vectorizer is fitted on
    every row *before* slicing, so the vocabulary and IDF weights are learned
    from all rows, including those returned in the trailing slices.

    Args:
        filename: Path of the pickle file to load.
        n_adv: Number of rows in each of the four trailing slices. When
            ``None`` (the default) the module-level ``num_adv`` is used, which
            is what the original implementation always did.

    Returns:
        tuple: ``(X, Y, X_adv, Y_adv, X_adv1, Y_adv1, X_adv2, Y_adv2,
        X_adv3, Y_adv3, n_features)`` where each ``X*`` is a row slice of the
        TF-IDF matrix, each ``Y*`` the matching slice of ``data.overall`` and
        ``n_features`` the size of the fitted vocabulary.

    Raises:
        ValueError: If the data has fewer than ``4 * n_adv + 1`` rows, which
            would otherwise produce negative slice bounds and silently wrong
            splits.
    """
    print("---------------")
    print("| Getting data...")
    print("---------------")

    # SECURITY: unpickling can execute arbitrary code -- only load trusted files.
    data = pd.read_pickle(filename)
    tfidf_m, tfidf_d = get_tf(data['reviewText'], use_idf=True, max_df=0.90, min_df=10)

    adv_size = num_adv if n_adv is None else n_adv
    n_rows = tfidf_d.shape[0]

    # NOTE(review): every boundary is shifted by one row, exactly as in the
    # original code: the very last row is never returned and the first trailing
    # slice starts one row earlier than ``n_rows - 4 * adv_size``. Presumably
    # the pickle carries one extra trailing row -- confirm against the data.
    last = n_rows - 1
    bounds = [last - k * adv_size for k in (4, 3, 2, 1, 0)]
    if bounds[0] < 0:
        raise ValueError(
            "dataset has %d rows, but at least %d are required for n_adv=%d"
            % (n_rows, 4 * adv_size + 1, adv_size)
        )

    labels = data.overall
    starts = [0] + bounds[:-1]

    # Build X, Y, X_adv, Y_adv, ... in the order of the return tuple.
    parts = []
    for lo, hi in zip(starts, bounds):
        parts.append(tfidf_d[lo:hi])
        # .iloc makes the positional intent explicit regardless of index type.
        parts.append(labels.iloc[lo:hi])

    # ``vocabulary_`` maps each term to its column, so its length is the
    # feature count; unlike ``get_feature_names()`` (removed in
    # scikit-learn 1.2) it is available on every version.
    return tuple(parts) + (len(tfidf_m.vocabulary_),)

In [15]:
def splitData(x, y):
    """Partition ``x``/``y`` into train, validation and test subsets.

    Two successive ``train_test_split`` calls are made, each with
    ``test_size=0.2`` and ``random_state=42``: the first holds out the test
    set, the second carves the validation set out of what remains.

    Returns:
        tuple: ``(X_train, X_validation, X_test, Y_train, Y_validation, Y_test)``.
    """
    split_opts = dict(test_size=0.2, random_state=42)

    x_rest, x_test, y_rest, y_test = train_test_split(x, y, **split_opts)
    x_train, x_val, y_train, y_val = train_test_split(x_rest, y_rest, **split_opts)

    return x_train, x_val, x_test, y_train, y_val, y_test

In [16]:
from sklearn.metrics import accuracy_score
def train(x_train, y_train, random_state=42):
    """Fit a linear SVM classifier and return it.

    Args:
        x_train: Training feature matrix passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        random_state: Seed forwarded to ``svm.LinearSVC`` so repeated runs
            give the same model. Defaults to 42, the seed already used for
            the data splits in this notebook.

    Returns:
        The fitted ``svm.LinearSVC`` instance.
    """
    print("---------------")
    print("| Training...")
    print("---------------")

    # Alternative kept for reference:
    #     svm.SVC(gamma=0.01, C=100., decision_function_shape='ovo')
    model = svm.LinearSVC(random_state=random_state)
    model.fit(x_train, y_train)

    return model

In [17]:
def test(model, x_test, y_test):
    print("---------------")
    print("| Testing...")
    print("---------------")

    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    print(classification_report(y_test,y_pred))

In [8]:
# def main():
# X, Y = getData(dataname)
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# model = train(X_train,Y_train)
# test(model, X_test, Y_test)

In [28]:
# main()

### Test with Adversarial Dataset

#### Smaller Set

In [43]:
# Load the vectorised data: X/Y are the leading rows, the four *_adv* pairs
# are the trailing slices cut by getData, and dim_input is the feature count.
(
    X, Y,
    X_adv, Y_adv,
    X_adv1, Y_adv1,
    X_adv2, Y_adv2,
    X_adv3, Y_adv3,
    dim_input,
) = getData(mixeddataname)

# Hold out 20% of the leading rows for testing (fixed seed).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [33]:
# Fit the classifier on the training split.
model = train(x_train=X_train, y_train=Y_train)

---------------
| Training...
---------------


In [34]:
# Score the model on the held-out test split.
test(model, x_test=X_test, y_test=Y_test)

---------------
| Testing...
---------------
0.8412736995624697
             precision    recall  f1-score   support

          0       0.84      0.84      0.84      2054
          1       0.84      0.84      0.84      2060

avg / total       0.84      0.84      0.84      4114



In [35]:
# Score the model on X_adv, the first trailing slice returned by getData.
test(model, x_test=X_adv, y_test=Y_adv)

---------------
| Testing...
---------------
0.7766739712685659
             precision    recall  f1-score   support

          0       0.74      0.86      0.79     10264
          1       0.83      0.70      0.76     10271

avg / total       0.78      0.78      0.78     20535



In [36]:
# Score the model on X_adv1, the second trailing slice returned by getData.
test(model, x_test=X_adv1, y_test=Y_adv1)

---------------
| Testing...
---------------
0.9121499878256635
             precision    recall  f1-score   support

          0       0.91      0.92      0.91     10264
          1       0.92      0.90      0.91     10271

avg / total       0.91      0.91      0.91     20535



In [37]:
# Score the model on X_adv2, the third trailing slice returned by getData.
test(model, x_test=X_adv2, y_test=Y_adv2)

---------------
| Testing...
---------------
0.900608716824933
             precision    recall  f1-score   support

          0       0.89      0.91      0.90     10264
          1       0.91      0.89      0.90     10271

avg / total       0.90      0.90      0.90     20535



In [38]:
# Score the model on X_adv3, the last trailing slice returned by getData.
test(model, x_test=X_adv3, y_test=Y_adv3)

---------------
| Testing...
---------------
0.8849281714146578
             precision    recall  f1-score   support

          0       0.87      0.91      0.89     10264
          1       0.90      0.86      0.88     10271

avg / total       0.89      0.88      0.88     20535



#### Whole Set

In [18]:
# Switch the configuration over to the whole dataset.
mixeddataname = "mixed.pkl"
num_adv = 20568  # row count of each trailing slice cut by getData

# The earlier configuration cell names this setting `modelname`, while this
# cell introduced `name_model`; assign both so neither keeps a stale value.
modelname = name_model = 'SVM.h5'

In [19]:
# Reload with the current configuration: X/Y are the leading rows, the four
# *_adv* pairs are the trailing slices cut by getData.
(
    X, Y,
    X_adv, Y_adv,
    X_adv1, Y_adv1,
    X_adv2, Y_adv2,
    X_adv3, Y_adv3,
    dim_input,
) = getData(mixeddataname)

# Hold out 20% of the leading rows for testing (fixed seed), then fit.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
model = train(x_train=X_train, y_train=Y_train)

---------------
| Getting data...
---------------
---------------
| Training...
---------------


In [20]:
# Score the model on the held-out split first, then on each trailing slice,
# in the same order as the individual cells of the smaller-set section.
evaluation_sets = (
    (X_test, Y_test),
    (X_adv, Y_adv),
    (X_adv1, Y_adv1),
    (X_adv2, Y_adv2),
    (X_adv3, Y_adv3),
)
for features, labels in evaluation_sets:
    test(model, features, labels)

---------------
| Testing...
---------------
0.8703325554259043
             precision    recall  f1-score   support

          0       0.87      0.87      0.87     10321
          1       0.87      0.87      0.87     10247

avg / total       0.87      0.87      0.87     20568

---------------
| Testing...
---------------
0.7833527810190587
             precision    recall  f1-score   support

          0       0.75      0.86      0.80     10364
          1       0.83      0.70      0.76     10204

avg / total       0.79      0.78      0.78     20568

---------------
| Testing...
---------------
0.8764099572150914
             precision    recall  f1-score   support

          0       0.88      0.88      0.88     10363
          1       0.88      0.87      0.88     10205

avg / total       0.88      0.88      0.88     20568

---------------
| Testing...
---------------
0.861192143134967
             precision    recall  f1-score   support

          0       0.85      0.88      0.86    