In [3]:
import gzip

In [4]:
# Decompress the gzip archive into a plain CSV next to it, copying the
# decompressed bytes across one line at a time.
with gzip.open('movie_data.csv.gz') as compressed:
    with open('movie_data.csv', 'wb') as plain:
        for chunk in compressed:
            plain.write(chunk)

In [5]:
import nltk
# Fetch the NLTK 'stopwords' package; it is read further down through
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/timco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# English stop words; `tokenizer` drops every token found in this collection
# and the same object is pickled later for the web application.
# NOTE(review): membership tests against this collection run once per token;
# wrapping it in set(...) would presumably speed that up -- confirm that the
# consumers of stopwords.pkl only use `in` before changing the type.
stop = stopwords.words('english')
# NOTE(review): `porter` is not referenced by any code visible in this
# notebook section -- confirm whether it is still needed.
porter = PorterStemmer()

def tokenizer(text):
    """Clean a raw review and split it into lowercase, stop-word-free tokens.

    Processing order:

    1. remove anything that looks like an HTML tag (``<...>``);
    2. collect emoticons from the lowercased text;
    3. lowercase the text and collapse each run of non-word characters
       into a single space;
    4. append the collected emoticons (with any ``-`` removed);
    5. split on whitespace and drop tokens found in the module-level ``stop``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The remaining tokens, in order of appearance.
    """
    # Raw string literals: '\W', '\)' and '\(' are regex escapes, not Python
    # string escapes. Without the r-prefix newer Python versions warn about
    # invalid escape sequences. The compiled patterns are unchanged.
    text = re.sub(r'<[^>]*>', '', text)
    lowered = text.lower()

    # NOTE(review): the pattern lists uppercase 'D' and 'P' but is matched
    # against lowercased text, so emoticons such as ':D' or ':P' are never
    # captured. Left as is so the produced features do not change.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', lowered)

    # NOTE(review): no separator is inserted before the emoticons, so when the
    # cleaned text does not end in a space the first emoticon is glued onto
    # the last word (e.g. 'good :) movie' -> [..., 'movie:)']). Left as is so
    # the produced features do not change.
    text = re.sub(r'[\W]+', ' ', lowered) + ' '.join(emoticons).replace('-', '')

    return [w for w in text.split() if w not in stop]

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV file.

    The first line is treated as a header and skipped. Every following
    physical line is split at its *last* comma: everything before it is
    yielded verbatim as the text (no CSV unquoting is performed), and
    everything after it is converted with ``int`` to form the label.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The review text and its integer class label.

    Raises
    ------
    ValueError
        If the part after the last comma of a line is not an integer.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        # The default argument keeps an empty file from raising StopIteration
        # inside the generator (which Python turns into a RuntimeError).
        next(csv_file, None)  # skip header
        for line in csv_file:
            # Strip the line terminator first so the final line parses the
            # same way whether or not the file ends with a newline.
            line = line.rstrip('\r\n')
            if not line:
                continue  # ignore blank lines
            text, _, label = line.rpartition(',')
            yield text, int(label)

In [8]:
next(stream_docs(path='movie_data.csv'))

('"In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />""Murder in Greenwich"" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich f

In [9]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` ``(text, label)`` pairs from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs; it is advanced in place.
    size : int
        Maximum number of documents to collect.

    Returns
    -------
    tuple
        ``(docs, y)`` -- two parallel lists of texts and labels. When the
        stream runs out part-way through, the documents collected so far are
        returned as a shorter batch instead of being thrown away.
        ``(None, None)`` is returned only when the stream is exhausted and
        nothing at all was collected.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            if not docs:
                # Nothing left at all: signal exhaustion to the caller.
                return None, None
            break  # keep the partial batch rather than discarding it
        docs.append(text)
        y.append(label)
    return docs, y

In [10]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hash tokens produced by `tokenizer` into 2**21 feature columns.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

# 'log_loss' selects logistic regression. scikit-learn releases before 1.1
# spelled this 'log'; that spelling was deprecated in 1.1 and removed in 1.3.
clf = SGDClassifier(loss='log_loss', random_state=1, max_iter=1)

# One shared generator: the training, evaluation and final-update cells each
# consume the next documents from it.
doc_stream = stream_docs(path='movie_data.csv')

In [11]:
import pyprind
# Number of mini-batches and documents per batch. Defined once so the
# progress bar length and the loop bound cannot drift apart.
N_BATCHES = 45
BATCH_SIZE = 1000

pbar = pyprind.ProgBar(N_BATCHES)

# The full label set is passed to partial_fit on every call.
classes = np.array([0, 1])
for _ in range(N_BATCHES):
    X_train, y_train = get_minibatch(doc_stream, size=BATCH_SIZE)
    if not X_train:
        # Stream exhausted before N_BATCHES batches were read.
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:28


In [12]:
# Take the next 5000 documents from the shared stream as a hold-out set,
# vectorize them and report the classifier's score on them.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
accuracy = clf.score(X_test, y_test)
print('Accuracy: %.3f' % accuracy)

Accuracy: 0.868


In [13]:
# Also update the classifier with the documents that were just used for
# scoring; after this line X_test / y_test are no longer unseen data for clf.
clf = clf.partial_fit(X_test, y_test)

After we trained the logistic regression model as shown above, we now save the classifier along with the stop words, Porter Stemmer, and `HashingVectorizer` as serialized objects to our local disk so that we can use the fitted classifier in our web application later.


# Serializing fitted scikit-learn estimators

In [18]:
import pickle
import os

dest = os.path.join('movieclassifier', 'pkl_objects')
# exist_ok=True replaces the separate existence check and keeps the cell
# safe to re-run.
os.makedirs(dest, exist_ok=True)

# Use context managers so each file is flushed and closed as soon as the
# dump finishes, even if pickling raises.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as f:
    pickle.dump(stop, f, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as f:
    pickle.dump(clf, f, protocol=4)

Next, we save the `HashingVectorizer` in a separate file so that we can import it later.

Using the preceding code, we created a movieclassifier directory where we will later store the files and data for our web application. Within this movieclassifier directory, we created a pkl_objects subdirectory to save the serialized Python objects to our local hard drive or solid-state drive. Via the dump method of the pickle module, we then serialized the trained logistic regression model as well
as the stop-word set from the `Natural Language Toolkit` (NLTK) library, so that we don't have to install the NLTK vocabulary on our server.

In [19]:
import os
# Only change directory when we are not already inside it, so re-running
# this cell does not fail with FileNotFoundError.
if os.path.basename(os.getcwd()) != 'movieclassifier':
    os.chdir('movieclassifier')

In [20]:
import pickle
import re
import os
from vectorizer import vect

# SECURITY: unpickling can execute arbitrary code -- only load pickle files
# that you created yourself or otherwise fully trust.
# The context manager closes the file handle once the object is loaded.
with open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb') as f:
    clf = pickle.load(f)

In [23]:
import numpy as np
# Map the integer class labels to human-readable names.
label = {0: 'negative', 1: 'positive'}
example = ["I love this movie. It's amazing."]

# Transform the simple example document into a word vector.
X = vect.transform(example)

# predict gives the class label; the largest predict_proba entry is reported
# as the probability of that prediction.
predicted_class = clf.predict(X)[0]
confidence = np.max(clf.predict_proba(X)) * 100
print('Prediction: %s\n Probability: %.2f%%.' % (label[predicted_class], confidence))

Prediction: positive
 Probability: 95.55%.


- Used HashingVectorizer to transform the simple example document into a word vector - X
- predict method of the logistic regression classifier to predict the class label, as well as the predict_proba method to return the corresponding probability of prediction. 
- Note that the predict_proba method call returns an array with a probability value for each unique class label. Since the class label with the largest probability corresponds to the class label that is returned by the predict call, we used the np.max function to return the probability of the predicted class.

# Setting up an SQLite database for data storage

- set up a simple SQLite database to collect optional feedback about the predictions from users of the web application
- SQLite is an open source SQL database engine that doesn't require a separate server to operate, which makes it ideal for smaller projects and simple web applications
- there is already an API in the Python standard library, sqlite3, which allows us to work with SQLite databases

Before you execute this code, please make sure that you are currently in the `movieclassifier` directory.


In [27]:
import sqlite3
import os

# (Re)create the review_db table in reviews.sqlite (relative to the current
# working directory) and seed it with two example rows.
conn = sqlite3.connect('reviews.sqlite')
try:
    c = conn.cursor()
    # Start from a clean table so the cell can be re-run.
    c.execute('DROP TABLE IF EXISTS review_db')
    c.execute('CREATE TABLE review_db (review TEXT, sentiment INTEGER, date TEXT)')

    example1 = 'I love this movie'
    example2 = 'I disliked this movie'
    # `?` placeholders let sqlite3 bind the values instead of building the SQL
    # by string formatting; one executemany replaces the two identical INSERTs.
    c.executemany(
        "INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))",
        [(example1, 1), (example2, 0)],
    )

    conn.commit()
finally:
    # Release the connection even when one of the statements above raises.
    conn.close()

connect - create a connection to new database "reviews.sqlite" in 'movieclassifier' directory

cursor - create a cursor, allows us to traverse over the database using SQL syntax

execute - create a new database table - review_db, also create 3 columns (review, sentiment, date) - store 2 examples

DATETIME('now') - adds the current date and time to each entry; ? - placeholders through which we pass the movie review text and the corresponding class label (1/0)

commit - to save changes we made to the database and close the connection via 'close'

Test if the entries have been stored in the database table correctly:

In [30]:
# Read back every row whose date lies between the start of 2017 and now.
conn = sqlite3.connect('reviews.sqlite')
try:
    c = conn.cursor()
    c.execute("SELECT * FROM review_db WHERE date BETWEEN '2017-01-01 00:00:00' AND DATETIME('now')")
    results = c.fetchall()
finally:
    # Release the connection even when the query raises
    # (for example if the table does not exist yet).
    conn.close()
print(results)

[('I love this movie', 1, '2020-04-01 14:46:15'), ('I disliked this movie', 0, '2020-04-01 14:46:15')]


# Flask web application