This notebook is for training the vector machine initially. In the webpage portion, I will be using the provided pickled files.

## Turn raw data into csv file

In [8]:
import pyprind
import pandas as pd
import os
# change the 'basepath' to the directory of the
# unzipped movie dataset
basepath = 'dataset\\aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = pd.concat([df, pd.DataFrame([[txt, labels[l]]])], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:06:17


## Save as csv file

In [9]:
df.to_csv('imdb.csv', index = False)

## Shuffle data

In [12]:
import numpy as np

df = pd.read_csv('dataset\\imdb.csv')
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('dataset\\shuffled_data.csv', index=False)

In [25]:
df = pd.read_csv('dataset\\shuffled_data.csv')

## Initialize vectorizer

In [26]:
from sklearn.feature_extraction.text import HashingVectorizer
import re
import pickle
cur_dir = ''
stop = pickle.load(open(os.path.join(cur_dir, 'pkl_objects', 'stopwords.pkl'), 'rb'))
def tokenizer(text):
   text = re.sub('<[^>]*>', '', text)
   emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
   text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
   tokenized = [w for w in text.split() if w not in stop]
   return tokenized

vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer)
print ('finish')

finish


  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
  text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')


## Implement read from file and get batch of lines

In [27]:
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

## Split dataset into test/train

In [21]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

## Train HashingVectorizer

In [32]:
import pyprind
import numpy as np
import pickle
from sklearn.linear_model import SGDClassifier

pbar = pyprind.ProgBar(45)
classes = np.array([0,1])

doc_stream = stream_docs(path='dataset\\shuffled_data.csv')
clf = SGDClassifier(loss='log_loss', random_state=1)

for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:21


## Test 

In [33]:
label = {0:'negative', 1:'positive'}
example = ["I love this movie. It rocks."]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%' %(label[clf.predict(X)[0]], np.max(clf.predict_proba(X))*100))

example = ["I hate this movie. It sucks."]
X = vect.transform(example)
print('Prediction: %s\nProbability: %.2f%%' %(label[clf.predict(X)[0]], np.max(clf.predict_proba(X))*100))

Prediction: positive
Probability: 78.80%
Prediction: negative
Probability: 73.07%


## Create SQLite database

In [2]:
import sqlite3
import os
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
c.execute('DROP TABLE IF EXISTS review_db')
c.execute('CREATE TABLE review_db'\
          ' (review TEXT, sentiment INTEGER, date TEXT)')
conn.commit()
conn.close()