In [1]:
import numpy as np
import re
from nltk.corpus import stopwords

In [2]:
# English stop-word list taken from NLTK's `stopwords` corpus; tokenizer()
# filters tokens against it and it is pickled at the end of the notebook.
# NOTE(review): presumably requires the corpus to be installed locally
# (e.g. via nltk.download('stopwords')) — confirm for a fresh environment.
stop = stopwords.words('english')
def tokenizer(text):
    """Split a raw review into lowercase word tokens followed by emoticons.

    HTML tags are stripped, emoticons such as ``:)``, ``;-(`` or ``=D`` are
    collected, every run of non-word characters is collapsed to one space,
    and the emoticons (with the ``-`` "nose" removed) are appended as
    separate tokens. Tokens contained in the module-level ``stop``
    collection are dropped.

    Parameters
    ----------
    text : str
        Raw document text, possibly containing HTML markup.

    Returns
    -------
    list of str
        Word tokens in document order, then the emoticons in the order
        they were found.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-case text: once lower-cased, the `D` and `P`
    # alternatives of the pattern could never match.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'[\W]+', ' ', text.lower())
    # The explicit separator keeps the first emoticon from being glued onto
    # the last word when the text ends in a word character.
    text = words + ' ' + ' '.join(emoticons).replace('-', '')
    # Looked up at call time so later changes to `stop` are honoured; the
    # set gives constant-time membership tests per token.
    stop_words = frozenset(stop)
    return [w for w in text.split() if w not in stop_words]

In [3]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a UTF-8 CSV file.

    The first line is treated as a header and skipped. Each remaining line
    is split at its last comma: everything before it is returned verbatim
    as the text (surrounding CSV quotes are not removed) and everything
    after it is parsed as the integer label. Blank lines are skipped.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Yields
    ------
    tuple of (str, int)
        The document text and its label.

    Raises
    ------
    ValueError
        If the part after the last comma of a line is not an integer.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        # Default of None: an empty file simply yields nothing instead of
        # leaking StopIteration out of the generator.
        next(handle, None)
        for line in handle:
            line = line.rstrip('\r\n')
            if not line:
                continue
            # Splitting on the last comma does not depend on a trailing
            # newline being present or on the label being one character.
            text, _, label = line.rpartition(',')
            yield text, int(label)

In [4]:
# Sanity check: pull the first (text, label) pair from a throwaway generator.
# The generator created here is separate from the `doc_stream` used for
# training further down, so nothing is consumed from that stream.
next(stream_docs(path='./movie_data.csv'))

('"I read a few reviews of the movie and got the impression that it was not as good as the previous Karate Kid installments. Although my favorite is still Karate Kid II, I felt this fourth installment of the movie series was consistent with the others and had some important lessons to share. Unlike the previous versions, the karate student is a female teenager who takes a somewhat different learning path, rather than a male teenager. Maggi finds this a little more challenging, but rises to the occasion. The plot twists are believable and predictable. I found that the bad guys are a little one dimensional, but this weakness is present in all the installments in varying degrees. The camera work is impressive and pans across some beautiful scenery from time to time. The Zen monastery is both austere and charming. The Zen monks add some humor and lightness to the narrative flow. I liked the ""Zen Bowling"" scenes which are a humorous counterpoint to the more serious Zen archery scene earli

In [5]:
def get_minibatch(doc_stream, size):
    """Collect up to ``size`` documents and labels from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator of (str, int)
        Source of ``(text, label)`` pairs, e.g. the generator returned by
        ``stream_docs``.
    size : int
        Maximum number of documents to pull.

    Returns
    -------
    tuple
        ``(docs, y)`` as two parallel lists. When the stream ends before
        ``size`` documents were read, the shorter batch collected so far is
        returned rather than thrown away. ``(None, None)`` is returned only
        when a non-empty batch was requested and the stream had nothing
        left.
    """
    docs, y = [], []
    # zip() advances `range` first, so once `size` items have been taken no
    # additional document is consumed from the stream.
    for _, (text, label) in zip(range(size), doc_stream):
        docs.append(text)
        y.append(label)
    if size > 0 and not docs:
        return None, None
    return docs, y

In [6]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

In [7]:
# The hashing vectorizer is used without a fitting step (only transform() is
# called on it below), so it can be applied to one mini-batch at a time.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
# `n_iter` is deliberately not passed: the model is only ever trained through
# partial_fit(), which performs a single pass per call regardless, and newer
# scikit-learn releases reject the argument.
# NOTE(review): recent scikit-learn spells this loss 'log_loss' and no longer
# accepts 'log' — confirm against the installed version.
clf = SGDClassifier(loss='log', random_state=1)
# Single shared stream: the training loop and the evaluation cell both draw
# from it, so they see disjoint documents.
doc_stream = stream_docs(path='./movie_data.csv')

In [8]:
import pyprind

# Out-of-core training: 45 mini-batches of 1,000 documents each.
n_batches, batch_size = 45, 1000
pbar = pyprind.ProgBar(n_batches)
# partial_fit() is told the full label set on every call.
classes = np.array([0, 1])
for _ in range(n_batches):
    X_train, y_train = get_minibatch(doc_stream, size=batch_size)
    if not X_train:
        # A missing or empty batch means the stream has run dry.
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:20


In [9]:
# Evaluate on the next 5,000 documents of the same stream, i.e. documents the
# training loop above did not consume.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
accuracy = clf.score(X_test, y_test)
print('Accuracy: %.3f' % accuracy)

Accuracy: 0.870


In [10]:
# After scoring, also train on the held-out batch so that the model pickled
# below has seen every document pulled from the stream.
# NOTE(review): rebinding `clf` presumes partial_fit() returns the estimator
# itself — confirm.
clf = clf.partial_fit(X_test, y_test)

In [12]:
import pickle
import os
# Directory that receives the pickled objects.
# NOTE(review): 'movieclssifier' looks like a misspelling of
# 'movieclassifier' — confirm before renaming, other code may read this path.
dest = os.path.join('movieclssifier', 'pkl_objects')
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(dest, exist_ok=True)

In [13]:
# Persist the stop-word list and the trained classifier. The context managers
# guarantee each file is flushed and closed as soon as it has been written,
# instead of leaving the handles open until garbage collection.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stop_file:
    pickle.dump(stop, stop_file, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol=4)