In [2]:
import os

# Root directory of the dataset, relative to the notebook's working directory.
# IMDBDataset (below) reads its 'train' and 'test' subdirectories.
data_loc = "../aclImdb"

def build_dataset(root):
    """Read a labelled text corpus from ``root/pos`` and ``root/neg``.

    Every file directly inside ``root/pos`` is read as one example with
    label 1, and every file directly inside ``root/neg`` as one example with
    label 0. Positive examples come first in the returned lists.

    Args:
        root: Directory containing the ``pos`` and ``neg`` subdirectories.

    Returns:
        A ``(data, labels)`` tuple of two parallel lists: the full text of
        each file, and the matching integer label (1 or 0).

    Raises:
        OSError: If either subdirectory is missing or a file cannot be read.
    """
    data = []
    labels = []
    for subdir, label in (('pos', 1), ('neg', 0)):
        folder = os.path.join(root, subdir)
        # os.listdir returns entries in arbitrary order; sort so the dataset
        # ordering is reproducible across runs and machines.
        for filename in sorted(os.listdir(folder)):
            # Context manager closes each handle promptly instead of leaving
            # thousands of open files for the garbage collector.
            with open(os.path.join(folder, filename)) as in_file:
                data.append(in_file.read())
            labels.append(label)
    return data, labels

class IMDBDataset(object):
    """In-memory container for the train and dev splits of the dataset.

    Both splits are loaded eagerly when the object is constructed.

    Attributes (set in ``__init__``):
        train_data / train_labels: texts and labels read from ``root/train``.
        dev_data / dev_labels: texts and labels read from ``root/test``;
            the on-disk 'test' split is used as the dev set here.
    """

    def __init__(self, root):
        """Load both splits from the dataset directory ``root``."""
        train_dir = os.path.join(root, 'train')
        dev_dir = os.path.join(root, 'test')

        train_split = build_dataset(train_dir)
        self.train_data = train_split[0]
        self.train_labels = train_split[1]

        dev_split = build_dataset(dev_dir)
        self.dev_data = dev_split[0]
        self.dev_labels = dev_split[1]

In [3]:
# Read both splits from disk into memory (IMDBDataset loads eagerly in __init__).
imdbDataset = IMDBDataset(data_loc)

In [4]:
# Train the bag-of-words MLP on the training split, then score it on the dev split.
from BagOfWordsNN import BoWMLP
# NOTE(review): rare_word_threshold presumably controls which low-frequency
# words are excluded from the vocabulary — confirm in BagOfWordsNN.
clf = BoWMLP(imdbDataset.train_data, imdbDataset.train_labels, rare_word_threshold=15)
clf.fit()
# dev_data / dev_labels come from the dataset's 'test' directory (see IMDBDataset).
clf.test(imdbDataset.dev_data, imdbDataset.dev_labels)

Done building vocab! Vocab size: 12314
Done building train dev!
Training...
Iteration 1, loss = 0.34067612
Iteration 2, loss = 0.15657234
Iteration 3, loss = 0.06402856
Iteration 4, loss = 0.01649610
Iteration 5, loss = 0.00646667
Iteration 6, loss = 0.00777718
Iteration 7, loss = 0.01170576
Iteration 8, loss = 0.01035176
Iteration 9, loss = 0.01007133
Iteration 10, loss = 0.00332316
Iteration 11, loss = 0.00200266
Iteration 12, loss = 0.00059611
Iteration 13, loss = 0.00044441
Iteration 14, loss = 0.00042735
Iteration 15, loss = 0.00042113
Iteration 16, loss = 0.00041667
Iteration 17, loss = 0.00041297
Iteration 18, loss = 0.00040972
Iteration 19, loss = 0.00040679
Iteration 20, loss = 0.00040407
Iteration 21, loss = 0.00040149
Iteration 22, loss = 0.00039900
Iteration 23, loss = 0.00039657
Iteration 24, loss = 0.00039418
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Training done!
Testing on dev...
Done! Scores below:
0.8732
0.85664


In [5]:
import joblib

# Persist the fitted MLP classifier so later cells can reload it without retraining.
joblib.dump(clf, "BoWMLP-IMDB.joblib")

['BoWMLP-IMDB.joblib']

In [None]:
# Fit the bag-of-words logistic-regression model on the same training split
# and with the same rare_word_threshold as the MLP above.
from BagOfWordsLogReg import BoWLR
clf2 = BoWLR(imdbDataset.train_data, imdbDataset.train_labels, rare_word_threshold=15)
clf2.fit()

In [8]:
# Score the logistic-regression model on the dev split (the dataset's 'test' directory).
clf2.test(imdbDataset.dev_data, imdbDataset.dev_labels)

0.86056


In [10]:
# Persist the fitted logistic-regression classifier.
# Relies on `joblib` having been imported by the earlier cell that saved the MLP.
joblib.dump(clf2, "BoWLogReg-IMDB.joblib")

['BoWLogReg-IMDB.joblib']

In [2]:
# classifying on test points

import joblib
# NOTE(review): these classes are not referenced by name below; presumably they
# are imported so the pickled objects can be resolved on load — confirm.
from BagOfWordsLogReg import BoWLR
from BagOfWordsNN import BoWMLP

# Reload the classifiers saved earlier in this notebook.
# joblib.load unpickles its input and can execute arbitrary code, so only load
# artifacts from a trusted source.
clf_lr = joblib.load("BoWLogReg-IMDB.joblib")
clf_nn = joblib.load("BoWMLP-IMDB.joblib")

In [6]:
# NOTE(review): absolute, machine-specific path; this cell only runs where that
# file exists. Consider making it relative or configurable.
test_point_file_loc = "/home/yicheng-wang/CS-Stuff/machine_learning/Sentence-VAE/test_points.txt"

# The file holds one test point per line.
with open(test_point_file_loc, 'r') as in_file:
    test_points = in_file.read().split('\n')

# A trailing newline leaves one empty string at the end of the split; drop only
# that. Slicing off the last element unconditionally would silently discard the
# final test point whenever the file does not end with a newline.
if test_points and test_points[-1] == '':
    test_points.pop()

print(len(test_points))

2401


In [7]:
# Run the logistic-regression classifier over all test points.
# The printed shape is a sanity check against len(test_points).
preds_lr = clf_lr.classify(test_points)
print(preds_lr.shape)

(2401,)


In [8]:
# Run the MLP classifier over the same test points.
# The printed shape is a sanity check against len(test_points).
preds_nn = clf_nn.classify(test_points)
print(preds_nn.shape)

(2401,)


In [46]:
from utils import find_decision_boundary, write_out_csv

In [41]:
# NOTE(review): find_decision_boundary lives in utils and is not visible here.
# The 4 and 7 arguments possibly describe a 4-dimensional grid with 7 points
# per axis (7**4 == 2401, the number of test points) — confirm in utils.
lr_boundaries = find_decision_boundary(preds_lr, 4, 7)

In [42]:
# Same boundary search as for the logistic-regression predictions, applied to the MLP's.
# NOTE(review): meaning of the 4 and 7 arguments is defined in utils — confirm there.
nn_boundaries = find_decision_boundary(preds_nn, 4, 7)

In [43]:
# NOTE(review): absolute, machine-specific path to predictions produced outside
# this notebook; this cell only runs where that file exists.
lstm_pred_file_loc = "/home/yicheng-wang/CS-Stuff/machine_learning/fictional-garbanzo/imdb_test_points.h5f"

import h5py

# The context manager closes the HDF5 file even if the read raises.
# Slicing with [:] copies the 'predictions' dataset into memory, so rnn_preds
# stays usable after the file is closed.
with h5py.File(lstm_pred_file_loc, 'r') as f:
    rnn_preds = f['predictions'][:]

# Same boundary search (and same 4, 7 arguments) as for the other two models.
lstm_boundaries = find_decision_boundary(rnn_preds, 4, 7)

In [44]:
# Compare how many boundary entries were found for each model (LR, MLP, LSTM).
print(len(lr_boundaries))
print(len(nn_boundaries))
print(len(lstm_boundaries))

196
511
253


In [47]:
# Map numeric class ids to readable names; 0/1 match the labels assigned in
# build_dataset ('neg' -> 0, 'pos' -> 1).
p2l = {0: 'negative', 1: 'positive'}
# Write one CSV per model from its predictions and boundary results.
# NOTE(review): write_out_csv is defined in utils; the exact CSV layout is not
# visible here — confirm there.
write_out_csv(test_points, preds_lr, lr_boundaries, p2l, 'imdb_lr_boundary.csv')
write_out_csv(test_points, preds_nn, nn_boundaries, p2l, 'imdb_mlp_boundary.csv')
write_out_csv(test_points, rnn_preds, lstm_boundaries, p2l, 'imdb_rnn_boundary.csv')