In [48]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.pipeline import Pipeline

## Amazon dataset

In [60]:
import gzip

# Start small: read only about the first 10 MB worth of review lines.
# readlines(hint) stops once the total size of the lines read so far exceeds
# the hint, so the last line is still complete.
# The context manager closes the gzip handle as soon as the lines are in
# memory, instead of leaving it open for the lifetime of the kernel.
with gzip.open('amaz_movies_tv.json.gz', 'rt') as reviews_file:
    review_lines = reviews_file.readlines(10*1024*1024)
len(review_lines)

8767

In [79]:
import json

# Each raw line is parsed as one JSON document; the parsed records are
# collected into a DataFrame in one go.
df = pd.DataFrame([json.loads(raw_line) for raw_line in review_lines])

In [80]:
# Keep only the rating column and the review-text column.
wanted_columns = ['overall', 'reviewText']
df = df[wanted_columns]
df.head()

Unnamed: 0,overall,reviewText
0,4.0,This is a charming version of the classic Dick...
1,3.0,It was good but not as emotionally moving as t...
2,3.0,"Don't get me wrong, Winkler is a wonderful cha..."
3,5.0,Henry Winkler is very good in this twist on th...
4,4.0,This is one of the best Scrooge movies out. H...


In [81]:
# Rename positionally: the first column becomes 'label', the second 'text'.
df.columns = pd.Index(['label', 'text'])
df.columns

Index(['label', 'text'], dtype='object')

In [82]:
# Binarise the ratings: below 3 -> 0, above 3 -> 1, exactly 3 -> dropped.
# The assignments go through .loc so they write into `df` itself; the chained
# form `df.label[mask] = value` assigns through an intermediate Series, which
# raises SettingWithCopyWarning and is not guaranteed to update the frame.
df.loc[df.label < 3, 'label'] = 0
df.loc[df.label > 3, 'label'] = 1
# Rows still equal to 3 at this point are the original 3-star reviews.
df = df.drop(df[df.label == 3].index)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,label,text
0,1.0,This is a charming version of the classic Dick...
3,1.0,Henry Winkler is very good in this twist on th...
4,1.0,This is one of the best Scrooge movies out. H...
5,1.0,This has been a favorite movie of mine for a l...
6,1.0,This is the American adaptation of the Charles...


In [83]:
# Cast the label column to integers.
df['label'] = df['label'].astype(int)
df.head()

Unnamed: 0,label,text
0,1,This is a charming version of the classic Dick...
3,1,Henry Winkler is very good in this twist on th...
4,1,This is one of the best Scrooge movies out. H...
5,1,This has been a favorite movie of mine for a l...
6,1,This is the American adaptation of the Charles...


In [50]:
# Hold out 10% of the rows as a test set. The split is stratified on the label
# column and uses a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    df.text,
    df.label,
    test_size=0.1,
    random_state=42,
    stratify=df.label,
)

# NOTE(review): df_test is created empty here and is not filled in by any of
# the visible cells - confirm whether it is still needed.
df_test = pd.DataFrame()

In [51]:
# Keep only the final 22 whitespace-separated tokens of every training review,
# re-joined with single spaces. Only X_train is shortened in this cell.
X_train = X_train.str.split().apply(lambda words: ' '.join(words[-22:]))

In [52]:
# Stopwords
STOPWORDS = [
    'by', 'does', 'was', 'were', 'the',
    'of', 'end', 'and', 'is',
]

In [53]:
# NOTE(review): `cvect` is not referenced by the pipeline below, which builds
# its own CountVectorizer - confirm whether it is still needed.
cvect = CountVectorizer()

# NOTE(review): `n_iter` is believed to be deprecated in later scikit-learn
# releases in favour of `max_iter` - confirm against the installed version.
classifier = PassiveAggressiveClassifier(
    C=0.001,
    fit_intercept=False,
    shuffle=False,
    n_iter=91,
    n_jobs=-1,
)

# Binary presence features over 1- to 4-grams, with the custom stopword list
# removed, feeding the classifier defined above.
pipeline = Pipeline(steps=[
    (
        'vectorizer',
        CountVectorizer(binary=True, ngram_range=(1, 4), stop_words=STOPWORDS),
    ),
    ('classifier', classifier),
])

In [54]:
# Fit the whole pipeline on the training split.
model = pipeline.fit(X_train, y_train)

In [55]:
# Compare validation accuracy on the RT, IMDB and mixed test sets.
# NOTE(review): X_test_rt / y_test_rt and X_test_imdb / y_test_imdb are not
# defined in any of the visible cells - confirm they are created elsewhere
# before this cell is run.
y_pred_rt = model.predict(X_test_rt)
y_pred_imdb = model.predict(X_test_imdb)
y_pred = model.predict(X_test)

acc_rt = metrics.accuracy_score(y_test_rt, y_pred_rt)
print("Accuracy RT :", acc_rt)
acc_imdb = metrics.accuracy_score(y_test_imdb, y_pred_imdb)
print("Accuracy IMDB :", acc_imdb)
acc_mixed = metrics.accuracy_score(y_test, y_pred)
print("Accuracy RT+IMDB :", acc_mixed)

Accuracy RT : 0.812396452587
Accuracy IMDB : 0.9034
Accuracy RT+IMDB : 0.842212174825


In [44]:
# Persist the pipeline object to disk so it can be reloaded later.
joblib.dump(value=pipeline, filename='output.pkl')

['output.pkl']