# Benchmark Model

In this notebook, we will train benchmark models
* We choose Naive Bayes as the benchmark model
* For comparison purpose, we will also train a SVM model

In [243]:
import numpy as np
import sys
import string
import json

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

## Load Data

In [244]:
def load_data(data_file=None):
    if data_file == None:
        return
    with open(data_file) as f:
        data = json.load(f)
    return data

In [245]:
folder = './data/large_files/stanford_sentiment/parsed_data/'
word2idx = load_data(folder + "sentiment_word2idx.json")
sentiment_binary_train = load_data(folder + "sentiment_binary_train.json")
sentiment_train = load_data(folder + "sentiment_train.json")
sentiment_binary_test = load_data(folder + "sentiment_binary_test.json")
sentiment_test = load_data(folder + "sentiment_test.json")

## Preprocessing Data

### Exclude neutral samples

* The loaded samples has three type of labels -1,0,1, in which -1 indicates neutral sentiment.
* We exclude samples with neutral sentiment.

In [246]:
def exclude_neutral_sample(samples:dict):
    ssamples = {}
    for k, v in samples.items():
        if v[3][-1] != -1:
            ssamples[k] = v
    return ssamples
        
train_b = exclude_neutral_sample(sentiment_binary_train)
test_b = exclude_neutral_sample(sentiment_binary_test)

print("After filtering: # of training samples and # of test samples")
print("# of traing samples: ", len(train_b))
print("# of test samples: ", len(test_b))

After filtering: # of training samples and # of test samples
# of traing samples:  6920
# of test samples:  1821


### Convert training/test data to sentences
* Currently, the training/test data are in the form of integer sequences, which are directly parsed from Stanford sentimental analysis raw data. We have not done any preprocessing nor feature extracting on those data yet. 
* The purpose of coverting training/test data into sentences is that we will do some preprocessing and feature extracting on these sentences. Then, we will convert sentences back to integer sequences.

In [247]:
# convert review comment in form of integers to the form of words
def get_comment(wordidx, idx2word:dict):
    wordlist = []
    for idx in wordidx:
        if idx != -1:
            token = idx2word[idx]
            # remove punctuation
            if token not in string.punctuation:
                wordlist.append(token)
    return wordlist

In [248]:
def get_comments_samples(samples:dict, idx2word:dict):
    comments = []
    targets = []
    for _, v in samples.items():
        if v[3][-1] != -1:
            # concatenate word list to a string
            comment = " ".join(get_comment(v[0], idx2word))
            label = v[3][-1]
            comments.append(comment)
            targets.append(label) 
    return comments, targets

In [249]:
idx2word = {v:k for k, v in word2idx.items()}
train_comments_o, train_targets = get_comments_samples(train_b, idx2word)
test_comments_o, test_targets = get_comments_samples(test_b, idx2word)

In [250]:
print(len(train_comments_o))
print(len(test_comments_o))
print(len(train_targets))
print(len(test_targets))

6920
1821
6920
1821


### Resplit data
* Since we will add more data to the training set for training Recurrent Neural Network and Recursive Neural Network, we want to do the same thing for training Naive Bayes. 
* However, After experiments, we found that with more training data, the accuray of Naive Bayes model on the test data set was decreased dramatically, from 0.811 to 0.689. Even the accuracy on training data set was decreased from 0.911 to 0.897 (need to further investigate the reason).
* Therefore, we will just use 6920 examples for training and 1821 examples for testing. 

In [251]:
# train_comments_o  = train_comments_o + test_comments_o[:1000]
# test_comments_o = test_comments_o[1000:]

# train_targets = train_targets + test_targets[:1000]
# test_targets = test_targets[1000:]

In [252]:
print(len(train_comments_o))
print(len(test_comments_o))
print(len(train_targets))
print(len(test_targets))

6920
1821
6920
1821


### Extract features

In [253]:
all_comments = train_comments_o + test_comments_o

print('# of train comments:', len(train_comments_o))
print('# of test comments:', len(train_comments_o))
print("total # of comments:", len(all_comments))

# Initialize a CoutVectorizer to use NLTK's tokenizer instead of its 
# default one (which ignores punctuation and stopwords). 
# Minimum document frequency set to 1. 
foovec = CountVectorizer(max_features=6000)
# sentences turned into sparse vector of word frequency counts
foovec = foovec.fit(train_comments_o)
train_comments = foovec.transform(train_comments_o)
test_comments = foovec.transform(test_comments_o)

# of train comments: 6920
# of test comments: 6920
total # of comments: 8741


In [254]:
print(train_comments.shape)
print(test_comments.shape)

(6920, 6000)
(1821, 6000)


In [255]:
tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(train_comments)
X_test = tfidf_transformer.transform(test_comments)

print(train_comments.shape)
print(test_comments.shape)
print(X_train.shape)
print(X_test.shape)

(6920, 6000)
(1821, 6000)
(6920, 6000)
(1821, 6000)


## Using Naive Bayes and SVM 

### Train and validate Naive Bayes Model

In [256]:
# Train a Multimoda Naive Bayes classifier
clf = MultinomialNB().fit(X_train, train_targets)

In [257]:
# Predicting the test set results, find accuracy
y_pred = clf.predict(X_test)
print('accuracy on test data set:', accuracy_score(test_targets, y_pred))

accuracy on test data set: 0.8116419549697969


In [258]:
y_pred2 = clf.predict(X_train)
print('accuracy on training data set:',accuracy_score(train_targets, y_pred2))

accuracy on training data set: 0.911271676300578


In [259]:
# Making the Confusion Matrix
cm = confusion_matrix(test_targets, y_pred)
cm

array([[699, 213],
       [130, 779]])

### Train and validate SVM Model

In [260]:
classifier_rbf = SVC(kernel='linear').fit(X_train, train_targets)
y_pred = classifier_rbf.predict(X_test)
print('accuracy on test data set:', accuracy_score(test_targets, y_pred))

accuracy on test data set: 0.8088962108731467


In [261]:
cm = confusion_matrix(test_targets, y_pred)
cm

array([[726, 186],
       [162, 747]])