In [37]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.pipeline import Pipeline

In [38]:
# Data import: load the two review corpora. Both CSV files use the same
# pipe character as their field separator, so it is named once and reused.
FIELD_SEP = '|'
rt = pd.read_csv('reviews_rt_all.csv', sep=FIELD_SEP)
imdb = pd.read_csv('imdb_small.csv', sep=FIELD_SEP)

In [39]:
# Split the RT and IMDB datasets separately. Each corpus gets its own
# 80/20 train/test partition, stratified on its label column, with the
# same random seed for both calls.
SPLIT_OPTIONS = dict(test_size=0.2, random_state=42)

X_train_rt, X_test_rt, y_train_rt, y_test_rt = train_test_split(
    rt.text, rt.label, stratify=rt.label, **SPLIT_OPTIONS
)
X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb = train_test_split(
    imdb.text, imdb.label, stratify=imdb.label, **SPLIT_OPTIONS
)

# Then concatenate the per-corpus pieces (RT first, IMDB second) into the
# combined train and test sets.
X_train = pd.concat([X_train_rt, X_train_imdb])
y_train = pd.concat([y_train_rt, y_train_imdb])
X_test = pd.concat([X_test_rt, X_test_imdb])
y_test = pd.concat([y_test_rt, y_test_imdb])

In [40]:
# Keep only the final 22 whitespace-separated tokens of every training
# review, re-joined with single spaces.
# NOTE(review): only X_train is shortened here; the X_test* series are
# passed to the model at full length in the cells visible in this notebook.
# Confirm that this train/test difference is intended.
X_train = X_train.str.split().apply(lambda words: ' '.join(words[-22:]))

In [41]:
# Stopwords: custom list handed to the pipeline's CountVectorizer through
# its `stop_words` argument.
STOPWORDS = [
    'by',
    'does',
    'was',
    'were',
    'the',
    'of',
    'end',
    'and',
    'is',
]

In [42]:
# Stand-alone bag-of-words fit on the training text.
# NOTE(review): `cvect` and `counts` are not referenced by any later cell
# visible here; the pipeline below constructs its own vectorizer. Confirm
# whether this extra fit is still needed.
cvect = CountVectorizer()
counts = cvect.fit_transform(X_train)

# Linear Passive-Aggressive classifier: no intercept term, no shuffling
# of the training data, 91 passes.
# NOTE(review): `n_iter` was superseded by `max_iter` in later scikit-learn
# releases. Check it against the installed version before upgrading.
classifier = PassiveAggressiveClassifier(
    C=0.001,
    fit_intercept=False,
    shuffle=False,
    n_iter=91,
    n_jobs=-1,
)

# Pipeline: binary presence features over 1- to 4-grams (with the custom
# stop words removed) feeding the classifier above.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1, 4), stop_words=STOPWORDS)),
    ('classifier', classifier),
])
model = pipeline.fit(X_train, y_train)

In [43]:
# Compare validation accuracy on the RT-only, IMDB-only and mixed test sets.
# Each set is predicted and scored first; the three results are printed
# together afterwards.
y_pred_rt = model.predict(X_test_rt)
accuracy_rt = metrics.accuracy_score(y_true=y_test_rt, y_pred=y_pred_rt)

y_pred_imdb = model.predict(X_test_imdb)
accuracy_imdb = metrics.accuracy_score(y_true=y_test_imdb, y_pred=y_pred_imdb)

y_pred = model.predict(X_test)
accuracy_mixed = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy RT :", accuracy_rt)
print("Accuracy IMDB :", accuracy_imdb)
print("Accuracy RT+IMDB :", accuracy_mixed)

Accuracy RT : 0.812396452587
Accuracy IMDB : 0.9034
Accuracy RT+IMDB : 0.842212174825


In [44]:
# Persist the fitted pipeline (vectorizer + classifier) to 'output.pkl'.
# Left as a bare expression so the cell displays the value returned by dump.
joblib.dump(value=pipeline, filename='output.pkl')

['output.pkl']