In [1]:
# out-of-core Learning

import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')

# 1. tokenizer
# ------------
def tokenizer(text):
    """Clean a raw document and split it into tokens.

    HTML-like tags are removed, emoticons such as ``:)`` or ``;-D`` are
    captured before punctuation is stripped, the remaining text is
    lowercased, and every token found in the module-level ``stop`` list is
    dropped.

    Parameters
    ----------
    text : str
        Raw document text, possibly containing markup.

    Returns
    -------
    list of str
        Lowercased word tokens followed by the captured emoticons (with the
        ``-`` "nose" removed, original letter case kept), minus stop words.
    """
    # Raw strings throughout: '\)', '\(' and '\W' are invalid escape
    # sequences in ordinary string literals and trigger warnings on
    # current Python versions.
    text = re.sub(r'<[^>]*>', '', text)
    # Capture emoticons first -- the substitution below wipes punctuation.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'[\W]+', ' ', text.lower())
    # The explicit ' ' separator keeps the first emoticon from being glued
    # onto the last word when the text ends in a word character
    # (e.g. 'great :) movie' previously produced the token 'movie:)').
    # NOTE(review): if a copy of this tokenizer is used at prediction time,
    # it must tokenize the same way -- confirm.
    text = words + ' ' + ' '.join(emoticons).replace('-', '')
    return [w for w in text.split() if w not in stop]

# 2. stream controller
# --------------------
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a CSV file, one per line.

    The first line is treated as a header and skipped. Every other
    non-blank line is expected to end with ``,<digit>``: the final character
    is parsed as the integer label and everything before the separating
    comma is yielded verbatim as the text (surrounding quotes, if present,
    are kept).

    Parameters
    ----------
    path : str
        Path to a UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The document text and its single-digit label.

    Raises
    ------
    ValueError
        If a non-blank line does not end with a digit.
    """
    with open(path, 'r', encoding='utf-8') as csv:
        # Default of None: a bare next() on an empty file would raise
        # StopIteration inside the generator, which Python 3.7+ turns into
        # a RuntimeError.
        next(csv, None)
        for line in csv:
            # Strip the line terminator explicitly instead of assuming one
            # is present -- the last line of a file often has none.
            line = line.rstrip('\r\n')
            if not line:
                continue
            text, label = line[:-2], int(line[-1])
            yield text, label

# Smoke check: pull the first (text, label) pair from a throwaway stream to
# confirm the file can be opened and parsed.
next(stream_docs('movie_data.csv'))

# 3. get batches of docs (partial X and y)
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Parameters
    ----------
    doc_stream : iterator of (text, label)
        Iterator advanced with ``next()``; it is consumed in place.
    size : int
        Maximum number of documents to draw.

    Returns
    -------
    tuple
        ``(X, y)`` lists of texts and labels. If the stream runs out
        part-way through, the shorter final batch is returned rather than
        discarded. ``(None, None)`` is returned only when the stream was
        already exhausted and no document could be drawn.
    """
    X, y = [], []
    exhausted = False
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            exhausted = True
            break
        X.append(text)
        y.append(label)
    # Keep the (None, None) sentinel callers test for, but only when there
    # is truly nothing to return -- a partial last batch is still data.
    if exhausted and not X:
        return None, None
    return X, y

# 4. use Hash Vectorizer and SGD classifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# A single generator over the whole CSV; the training and evaluation cells
# below consume it in order.
doc_stream = stream_docs(path='movie_data.csv')

# Logistic-regression loss, trained incrementally via partial_fit.
clf = SGDClassifier(loss='log_loss', random_state=1)

# The vectorizer is configured with the custom tokenizer defined above and
# a 2**21-wide feature space.
hash_vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)

In [2]:
# Train on up to 45 mini-batches of 1000 documents each, i.e. the first
# 45,000 documents of the stream; whatever remains in doc_stream is left
# for evaluation.
import pyprind

classes = np.array([0, 1])
pbar = pyprind.ProgBar(45)
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if X_train:
        # Vectorize the raw texts, then update the classifier on this batch.
        X_train = hash_vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()
    else:
        # The stream ran dry before all 45 batches were drawn.
        break

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:13


In [3]:
# Draw the next 5000 documents from the stream as one held-out test batch
# and report the classifier's accuracy on it.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = hash_vect.transform(X_test)
test_accuracy = clf.score(X_test, y_test)
print('Accuracy: %.3f' % test_accuracy)

# Once scored, fold the test documents into the model as well so the final
# classifier has been updated on every document drawn from the stream.
clf = clf.partial_fit(X_test, y_test, classes=classes)

Accuracy: 0.868


In [4]:
import pickle
import os

# Persist the stop-word list and the trained classifier under
# movieclassifier/pkl_objects.
dest = os.path.join('movieclassifier', 'pkl_objects')
# exist_ok=True replaces the racy "check, then create" pair.
os.makedirs(dest, exist_ok=True)

# Context managers guarantee each file is flushed and closed as soon as the
# dump finishes, instead of leaving the handles open until garbage
# collection.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as pkl_file:
    pickle.dump(stop, pkl_file, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as pkl_file:
    pickle.dump(clf, pkl_file, protocol=4)