In [None]:
import pandas as pd
import numpy as np
import math, random
import matplotlib.pyplot as plt
%matplotlib inline

In [19]:
import pyprind
import os

In [9]:
# change the `basepath` to the directory of the
# unzipped movie dataset

#basepath = '/Users/Sebastian/Desktop/aclImdb/'
basepath = 'aclImdb/'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:03:58


In [10]:
df.head(5)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [14]:
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False)

In [15]:
df = pd.read_csv('movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,In Canadian director Kari Skogland's film adap...,1
1,"I agree with the previous comment, what a disa...",0
2,"If you are looking for King Kong, you mispelle...",1


In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
count = CountVectorizer()
docs = np.array(['The sun is shining', 'The weather is sweet', 'The sun is shining and the weather is sweet'])
bag = count.fit_transform(docs)

In [22]:
print count.vocabulary_

{u'and': 0, u'weather': 6, u'sweet': 4, u'sun': 3, u'is': 1, u'the': 5, u'shining': 2}


In [23]:
print bag.toarray()

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [24]:
from sklearn.feature_extraction.text import TfidfTransformer

In [76]:
def tfidf(bag):
    tf = []
    nd = float(bag.shape[0])
    unnorm = bag.multiply((np.log((1+nd)/(1.0+bag.sum(axis=0)))+1.0)) #multiply bag(tf) by (idf+1)
    for v in unnorm:
        tf.append(v/np.linalg.norm(v))
    return np.array(tf)

In [30]:
tfidf = TfidfTransformer()
np.set_printoptions(precision=3)
print tfidf.fit_transform(count.fit_transform(docs)).toarray()

[[ 0.     0.434  0.558  0.558  0.     0.434  0.   ]
 [ 0.     0.434  0.     0.     0.558  0.434  0.558]
 [ 0.405  0.478  0.308  0.308  0.308  0.478  0.308]]


In [36]:
unnorm = bag.multiply((np.log((1+3.0)/(1.0+bag.sum(axis=0)))+1.0))
print unnorm

[[ 0.     0.777  1.288  1.288  0.     0.777  0.   ]
 [ 0.     0.777  0.     0.     1.288  0.777  1.288]
 [ 1.693  1.554  1.288  1.288  1.288  1.554  1.288]]


In [43]:
normed = unnorm/(np.sqrt(np.sum(unnorm.dot(unnorm.T), axis=1)))
print normed

[[ 0.     0.229  0.38   0.38   0.     0.229  0.   ]
 [ 0.     0.229  0.     0.     0.38   0.229  0.38 ]
 [ 0.333  0.306  0.254  0.254  0.254  0.306  0.254]]


In [48]:
np.linalg.norm(unnorm, axis=1)

array([ 2.127,  2.127,  3.785])

In [50]:
from sklearn.preprocessing import normalize

In [60]:
x = np.random.rand(1000)*10
norm1 = x / np.linalg.norm(x)
norm2 = normalize(x[:,np.newaxis], axis=0).ravel()
print np.allclose(norm1, norm2)

True


In [77]:
print tfidf(bag)

[[[ 0.     0.365  0.605  0.605  0.     0.365  0.   ]]

 [[ 0.     0.365  0.     0.     0.605  0.365  0.605]]

 [[ 0.447  0.41   0.34   0.34   0.34   0.41   0.34 ]]]


In [79]:
import re

In [82]:
df.loc[1, 'review'][-50:]

" you'll waste time and money.<br /><br />Boring!!!"

In [81]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

In [83]:
preprocessor(df.loc[1, 'review'][-50:])

' you ll waste time and money boring '

In [84]:
df['review'] = df['review'].apply(preprocessor)

In [85]:
df.shape

(50000, 2)

In [86]:
def tokenizer(text):
    return text.split()

In [87]:
from nltk.stemm.porter import PorterStemmer

In [88]:
porter = PorterStemmer()

In [89]:
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [90]:
import nltk

In [91]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Steve\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [92]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
print stop

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u

In [93]:
[w for w in tokenizer_porter('a runner likes running and runs a lot') if w not in stop]

[u'runner', u'like', u'run', u'run', u'lot']

In [94]:
xtrain = df.loc[:25000, 'review'].values
ytrain = df.loc[:25000, 'sentiment'].values
xtest = df.loc[25000:, 'review'].values
ytest = df.loc[25000:, 'sentiment'].values

In [96]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [97]:
TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

In [98]:
lr_tfidf = Pipeline([('vect', TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None, stop_words=None, 
                            tokenizer=tokenizer, ngram_range=(1,1))), 
                     ('clf', LogisticRegression(random_state=0, C=10.0, penalty='l2'))])

In [100]:
lr_tfidf.fit(xtrain, ytrain)

Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...ovr',
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0))])

In [101]:
from sklearn.metrics import accuracy_score

In [102]:
print "Training accuracy:", accuracy_score(ytrain, lr_tfidf.predict(xtrain))

Training accuracy: 0.988680452782


In [103]:
print "Test accuracy:", accuracy_score(ytest, lr_tfidf.predict(xtest))

Test accuracy: 0.89696


In [104]:
def stream_docs(path):
    with open(path, 'r') as csv:
        next(csv) #skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [106]:
next(stream_docs(path='movie_data.csv'))

('"In Canadian director Kari Skogland\'s film adaptation of the Margaret Laurence novel The Stone Angel Ellen Burstyn is Hagar Shipley, a proud and cantankerous woman approaching her nineties who wishes to remain independent until the very end, stubbornly refusing to be placed in a nursing home by her well-meaning son Marvin. Filmed in Manitoba, Canada and set in the fictional town of Manawaka, The Stone Angel is a straightforward and conventional interpretation of the book that has been required reading in Canadian high school English classes for almost half a century.<br /><br />The title of the film comes from the stone statue erected on Hagar\'s mother\'s grave which serves as a metaphor for Hagar\'s inability to express emotion during her tumultuous lifetime. Burstyn brings vulnerability and humor to the role but is a bit too likable to fully realize the ego-driven, self-defeating character who managed to alienate her wealthy father, her well-meaning but alcoholic husband, and bot

In [107]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [108]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

In [109]:
vect = HashingVectorizer(decode_error='ignore', 
                         n_features=2**21,
                         preprocessor=None, 
                         tokenizer=tokenizer)

clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='movie_data.csv')

In [111]:
pbar = pyprind.ProgBar(45)
classes = np.array([0,1])
for _ in range(45):
    xtrain, ytrain = get_minibatch(doc_stream, size=1000) #train with 1000 documents at a time
    if not xtrain:
        break
    xtrain = vect.transform(xtrain)
    clf.partial_fit(xtrain, ytrain, classes=classes)
    pbar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:15


In [112]:
xtest, ytest = get_minibatch(doc_stream, size=5000)
xtest = vect.transform(xtest)
print "Accuracy:", clf.score(xtest, ytest)

Accuracy: 0.8212


In [113]:
clf = clf.partial_fit(xtest, ytest)