In [37]:
import numpy as np
import re
import pyprind
from nltk.corpus import stopwords

In [32]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

In [2]:
# English stop-word list from NLTK; tokenizer() drops any token found in it.
stop = stopwords.words('english')

In [4]:
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', ''))
    
    tokenized = [w for w in text.split() if w not in stop]
    
    return tokenized

In [13]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a CSV file, one per data line.

    The first line of the file is treated as a header and skipped. Every
    other non-blank line is split at its last comma: the part before it is
    yielded as the text exactly as it appears in the file (no CSV unquoting
    is performed), and the part after it is converted with ``int``.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        Raw text and integer label for each data line.

    Raises
    ------
    ValueError
        If the text after the last comma of a line is not an integer.
    """
    with open(path, 'r', encoding='utf-8') as csv_file:
        # A default is supplied so an empty file ends the generator cleanly
        # instead of leaking StopIteration (a RuntimeError inside generators).
        next(csv_file, None)

        for line in csv_file:
            record = line.rstrip('\r\n')
            if not record.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            # Splitting at the last comma works with or without a trailing
            # newline, unlike fixed offsets counted from the end of the line.
            text, _, label = record.rpartition(',')
            yield text, int(label)

In [14]:
# Location of the review CSV consumed by stream_docs().
# NOTE(review): machine-specific absolute path; it must be adjusted before
# the notebook can run anywhere else.
path = 'D:/LocalData/N196003/Desktop/movie_data.csv'

In [29]:
def get_minibatch(doc_stream, size):
    """Pull up to ``size`` ``(text, label)`` pairs from an iterator.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs; it is advanced with
        ``next`` and therefore consumed by this call.
    size : int
        Maximum number of pairs to collect.

    Returns
    -------
    tuple
        ``(docs, y)`` as two parallel lists. If the stream runs out part-way
        through, the pairs collected so far are returned as a shorter batch
        instead of being thrown away. ``(None, None)`` is returned only when
        the stream was already exhausted and nothing could be collected.
    """
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(doc_stream)
        except StopIteration:
            if not docs:
                # Nothing left at all: keep the sentinel callers test for.
                return None, None
            break  # keep the partial final batch rather than discarding it
        docs.append(text)
        y.append(label)

    return docs, y

In [42]:
# Separate generator over the CSV, independent of `doc_stream` used for training.
stream = stream_docs(path)

In [43]:
# Pull the first 100 documents from `stream`; this does not advance `doc_stream`.
x, y = get_minibatch(stream, size=100)

In [33]:
# Hashing-based vectorizer: 2**21 feature columns, no separate preprocessing
# step, and tokenizer() used to split each document into tokens.
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)

In [35]:
# Logistic-regression classifier trained incrementally with SGD.
# `n_iter` is intentionally not passed: the parameter was removed from
# scikit-learn in 0.21, and partial_fit() performs one epoch per call anyway.
# NOTE(review): scikit-learn >= 1.1 names this loss 'log_loss' ('log' was
# removed in 1.3) — switch the string if the environment is upgraded.
clf = SGDClassifier(loss='log', random_state=1)

In [36]:
# Generator feeding both the training loop and the later test batch.
doc_stream = stream_docs(path=path)

In [38]:
# Progress bar sized to the 45 iterations of the training loop.
pbar = pyprind.ProgBar(45)

In [39]:
# The two label values, passed to clf.partial_fit() as `classes`.
classes = np.array([0, 1])

In [41]:
# Out-of-core training: up to 45 mini-batches of 1000 documents each are read
# from `doc_stream`, vectorized, and used for one partial_fit() call.
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    
    # Stop early when no batch came back (None or an empty list).
    if not X_train:
        break
    
    # X_train is rebound from a list of raw strings to the vectorizer's output.
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:08:53


In [50]:
# Evaluation set: the next (up to) 5000 documents from `doc_stream`, i.e. the
# ones following whatever the training loop consumed.
# NOTE(review): get_minibatch() can return (None, None) when the stream is
# exhausted; there is no guard before transform() here.
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)

In [51]:
# Report clf.score() on the held-out batch, formatted to three decimals.
print('Acc %.3f' % clf.score(X_test, y_test))

Acc 0.867


In [52]:
import pickle
import os

In [53]:
# Output directory for the pickled objects, relative to the working directory.
# NOTE(review): 'movieclassifer' may be a misspelling of 'movieclassifier' —
# confirm against whatever code loads these files before changing it.
dest = os.path.join('movieclassifer', 'pkl_objects')

In [55]:
# Create the output directory (and any missing parents). exist_ok=True makes
# the cell safe to re-run and avoids the check-then-create race.
os.makedirs(dest, exist_ok=True)

In [56]:
# Serialize the stop-word list and the trained classifier with pickle
# protocol 4. Context managers ensure each file is flushed and closed as soon
# as it has been written, instead of leaving the handles open.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as f:
    pickle.dump(stop, f, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as f:
    pickle.dump(clf, f, protocol=4)

In [58]:
# Display the repr of the vectorized test batch (shape and stored-element count).
X_test

<5000x2097152 sparse matrix of type '<class 'numpy.float64'>'
	with 492540 stored elements in Compressed Sparse Row format>