In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.pipeline import Pipeline

In [2]:
# Load the pipe-delimited Amazon training reviews and display how many rows were read.
df = pd.read_csv('amazon_train.csv', sep='|')
df.shape[0]

1346607

In [21]:
# Use every row of `df` for training; no hold-out set is created in this cell.
# The stratified 90/10 split below is disabled and kept only for reference:
# X_train, X_test, y_train, y_test = train_test_split(
#     df.text, df.label, test_size=0.1, random_state=42, stratify=df.label)
X_train = df.text
y_train = df.label

CPU times: user 25 ms, sys: 1.69 ms, total: 26.7 ms
Wall time: 26.6 ms


In [22]:
# Coerce every training text to str so the string operations applied to
# X_train further down work on every row.
# NOTE(review): presumably guards against non-string entries (e.g. NaN) in the
# text column — confirm.
X_train = X_train.astype(str)

CPU times: user 369 ms, sys: 4.04 ms, total: 373 ms
Wall time: 372 ms


In [23]:
%%time

# Take the last 22 words from each review in the train set
X_train = X_train.str.split().apply(lambda x:  ' '.join(x for x in x[-22:]))

CPU times: user 1min 23s, sys: 3min 16s, total: 4min 39s
Wall time: 7min 16s


In [24]:
# Words the vectorizer is told to ignore (passed as `stop_words` when the pipeline is built).
STOPWORDS = [
    'by', 'does', 'was', 'were', 'the',
    'of', 'end', 'and', 'is',
]

In [25]:
# Stand-alone vectorizer with default settings; it is not one of the pipeline steps below.
cvect = CountVectorizer()

# Linear classifier used as the final pipeline step.
# NOTE(review): `n_iter` was deprecated in later scikit-learn releases in
# favour of `max_iter` — confirm against the installed version.
classifier = PassiveAggressiveClassifier(
    C=0.001,
    fit_intercept=False,
    shuffle=False,
    n_iter=91,
    n_jobs=-1,
)

# Binary presence features over 1- to 4-grams, with the custom stop words excluded.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 4), stop_words=STOPWORDS)

pipeline = Pipeline(steps=[('vectorizer', ngram_vectorizer), ('classifier', classifier)])

In [26]:
%%time

# Fit the vectorizer + classifier pipeline on the training texts and labels.
# NOTE(review): Pipeline.fit should return the pipeline itself, so `model` and
# `pipeline` presumably name the same fitted object — confirm.
model = pipeline.fit(X_train, y_train)

CPU times: user 14min 55s, sys: 28min 42s, total: 43min 38s
Wall time: 1h 26s


In [27]:
%%time

rt = pd.read_csv('reviews_rt_all.csv', sep = '|')
imdb = pd.read_csv('imdb_small.csv', sep = '|')
imdb2 = pd.read_csv('imdb2.csv', sep = '|')
amz = pd.read_csv('amazon_test.csv', sep = '|')

X_test_rt, y_test_rt = (rt.text, rt.label)
X_test_imdb, y_test_imdb = (imdb.text, imdb.label)
X_test_imdb2, y_test_imdb2 = (imdb2.text, imdb2.label)
X_test_amz, y_test_amz = (amz.text, amz.label)

CPU times: user 3.43 s, sys: 947 ms, total: 4.38 s
Wall time: 5.98 s


In [29]:
# Predict labels for each evaluation set.
# Every input is cast to str (as X_train was before fitting) so that a
# non-string entry such as NaN in any of the sets cannot break vectorization;
# texts that are already strings are unaffected by the cast.
# NOTE(review): the training texts were cut to their last 22 words, whereas
# these texts are scored in full — confirm that asymmetry is intended.
y_pred_rt = model.predict(X_test_rt.astype(str))
y_pred_imdb = model.predict(X_test_imdb.astype(str))
y_pred_imdb2 = model.predict(X_test_imdb2.astype(str))
y_pred_amz = model.predict(X_test_amz.astype(str))

In [43]:
# Print the plain accuracy obtained on each evaluation set, one line per set.
for caption, truth, predicted in [
    ("Accuracy RT :", y_test_rt, y_pred_rt),
    ("Accuracy IMDB :", y_test_imdb, y_pred_imdb),
    ("Accuracy IMDB2 :", y_test_imdb2, y_pred_imdb2),
    ("Accuracy Amazon :", y_test_amz, y_pred_amz),
]:
    print(caption, metrics.accuracy_score(truth, predicted))

Accuracy RT : 0.757499269077
Accuracy IMDB : 0.73768
Accuracy IMDB2 : 0.703095684803
Accuracy Amazon : 0.929683740576


In [45]:
# Write the training texts (as they currently stand in X_train) together with
# their labels to a pipe-delimited CSV, leaving out the index column.
train_22_columns = {"text": X_train, "label": y_train}
df_train_22 = pd.DataFrame(train_22_columns)
df_train_22.to_csv('amazon_train_22.csv', sep='|', index=False)

In [49]:
# Optional: persist the fitted pipeline to disk (intentionally left disabled).
#joblib.dump(pipeline, 'output.pkl')

In [46]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the pipeline on the training texts and labels;
# the array of per-fold scores is displayed as the cell output.
scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
scores

array([ 0.92144006,  0.92085652,  0.92194073,  0.92249398,  0.92163998])