In [1]:
import numpy as np
import re

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from nltk.corpus import stopwords

import pyprind

In [2]:
# English stop-word list from the NLTK stopwords corpus. `tokenizer` below
# drops every token that appears in this list.
stop = stopwords.words('english')

def tokenizer(text):
    """Turn a raw review into a list of lower-cased tokens plus emoticons.

    Steps:
      1. Remove anything that looks like an HTML/XML tag.
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:-P``.
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the emoticons (with the optional ``-`` nose removed) as
         separate tokens at the end.
      5. Split on whitespace and drop tokens found in the module-level
         ``stop`` list.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    list of str
        The remaining word tokens followed by the emoticon tokens.
    """
    text = re.sub(r'<[^>]*>', '', text)
    # Search the original-case text: the pattern contains upper-case `D` and
    # `P`, which could never match once the text had been lower-cased.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'[\W]+', ' ', text.lower())
    # The explicit ' ' separator keeps the last word from fusing with the
    # first emoticon when the cleaned text does not end in whitespace.
    text = words + ' ' + ' '.join(emoticons).replace('-', '')
    return [w for w in text.split() if w not in stop]

In [None]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review file, one per line.

    The first line is treated as a header and skipped. Every following
    non-empty line is expected to end with ``,<digit>``: the last character
    is parsed as the integer label, and everything before the separating
    comma is returned unchanged as the text (no CSV unquoting is done).

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded file.

    Yields
    ------
    tuple of (str, int)
        The document text and its label.

    Raises
    ------
    ValueError
        If the last character of a line is not a digit.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        # The default makes an empty file produce an empty stream; a bare
        # next() would leak StopIteration, which becomes RuntimeError inside
        # a generator (PEP 479).
        next(csv_file, None)  # skip header
        for line in csv_file:
            # Strip the newline first so a final line without one is parsed
            # the same way as every other line.
            line = line.rstrip('\n')
            if not line:
                continue  # tolerate blank lines
            text, label = line[:-2], int(line[-1])
            yield text, label

In [None]:
# Sanity check: pull the first (text, label) pair from a fresh stream.
next(stream_docs(path='./../data/IMDb/movie_data.csv'))

In [None]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator
        An iterator yielding ``(text, label)`` pairs; it is advanced with
        ``next()``, so it keeps its position between calls.
    size : int
        Maximum number of documents to collect.

    Returns
    -------
    tuple
        ``(docs, y)`` as two parallel lists. If the stream runs out part-way
        through, the documents collected so far are returned as a shorter
        final batch instead of being thrown away. ``(None, None)`` is
        returned only when the stream is already exhausted and nothing
        could be read.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            if not docs:
                return None, None
            break  # keep the partial final batch
        docs.append(text)
        y.append(label)
    return docs, y

In [None]:
# Hash tokens straight into 2**21 feature columns using the custom tokenizer.
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2 ** 21,
                         preprocessor=None,
                         tokenizer=tokenizer)
# Logistic-regression loss trained by SGD. `n_iter` was removed from
# scikit-learn in favour of `max_iter`, and the loss formerly spelled 'log'
# is now 'log_loss' (on scikit-learn < 1.1 use loss='log' instead).
clf = SGDClassifier(loss='log_loss', random_state=1, max_iter=1)
# Single shared stream: the training and evaluation cells both consume it.
stream = stream_docs(path='./../data/IMDb/movie_data.csv')

In [None]:
# Incrementally fit the classifier on up to 45 mini-batches of 1000
# documents each, stopping early if the stream yields nothing.
n_batches = 45
classes = np.array([0, 1])  # full label set handed to every partial_fit call
pbar = pyprind.ProgBar(n_batches)
for _ in range(n_batches):
    X_train, y_train = get_minibatch(stream, size=1000)
    if not X_train:
        # No documents came back: nothing left to train on.
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

In [None]:
# Evaluate on the next 5000 documents from the stream.
X_test, y_test = get_minibatch(stream, size=5000)
if not X_test:
    # get_minibatch signals an exhausted stream with (None, None); fail with
    # a clear message instead of an obscure error inside vect.transform.
    raise RuntimeError('Document stream exhausted: no documents left for evaluation.')
X_test = vect.transform(X_test)
print(f'Accuracy: {clf.score(X_test, y_test):.3f}')

### Serialization

In [3]:
import pickle
import os

# Persist the stop-word list and the classifier to
# ../models/movie_classifier.
destination = os.path.join(os.path.pardir, 'models', 'movie_classifier')
# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(destination, exist_ok=True)

# Context managers guarantee the files are flushed and closed even if
# pickling raises part-way through.
with open(os.path.join(destination, 'stopwords.pkl'), 'wb') as stop_file:
    pickle.dump(stop, stop_file, protocol=4)
with open(os.path.join(destination, 'clf.pkl'), 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol=4)

'..'