In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.pipeline import Pipeline

In [2]:
# Load the pipe-delimited review corpora used in the cells below.
rt = pd.read_csv('reviews_rt_all.csv', sep='|')
imdb = pd.read_csv('imdb_small.csv', sep='|')
# Amazon test file: rows containing any missing value are discarded up front.
amazon_test = pd.read_csv('amazon_all_test.csv', sep='|').dropna()

In [3]:
import os
import numpy
# Settings read by load_data().
# An empty directory makes os.path.join() return the bare file name, so the
# files are resolved relative to the current working directory.
# (Disabled alternative kept for reference: BASE_DIR = '../' combined with
# BASE_DIR + '../data/test'.)
TEXT_DATA_DIR = ''
TEXT_DATA_FILE_1 = "rt-polarity_neg.txt"
TEXT_DATA_FILE_2 = "rt-polarity_pos.txt"
# When true, load_data() discards the first line of each file.
HEADER = True

def load_data(data_dir=None, files=None, header=None):
    """Read labelled text samples from plain-text files, one sample per line.

    Every line of every file becomes one sample (the trailing newline is
    stripped). All samples from a file share one label: 1 when the file name
    without its extension ends in "pos", otherwise 0.

    Args:
        data_dir: Directory that contains the files. Defaults to the
            module-level TEXT_DATA_DIR.
        files: Iterable of file names, read in the given order. Defaults to
            [TEXT_DATA_FILE_1, TEXT_DATA_FILE_2].
        header: If true, the first line of each file is skipped. Defaults to
            the module-level HEADER.

    Returns:
        A tuple (x, y) where x is the list of text lines and y the parallel
        list of integer labels.
    """
    # Resolve defaults at call time so the notebook constants can be changed
    # between calls without redefining this function.
    if data_dir is None:
        data_dir = TEXT_DATA_DIR
    if files is None:
        files = [TEXT_DATA_FILE_1, TEXT_DATA_FILE_2]
    if header is None:
        header = HEADER

    x = []
    y = []
    for name in files:
        # Derive the label from the file stem instead of a fixed-width slice,
        # so extensions of any length are handled.
        stem = os.path.splitext(os.path.basename(name))[0]
        label = 1 if stem.endswith("pos") else 0
        # Undecodable bytes are dropped rather than raising.
        with open(os.path.join(data_dir, name), "r", encoding='utf-8', errors='ignore') as f:
            if header:
                # The default argument keeps an empty file from raising StopIteration.
                next(f, None)
            for line in f:
                x.append(line.rstrip("\n"))
                y.append(label)

    return x, y


# Read the raw lines and labels, then rebind `data` to a two-column frame
# (columns: label, text).
data, labels = load_data()
data = pd.DataFrame(dict(label=labels, text=data))

In [4]:
# Amazon training file, with rows containing missing values removed.
df = pd.read_csv("amazon_all_train.csv", sep='|').dropna()
df_bad = df[df.label == 0]
df_good = df[df.label == 1]
# Down-sample the label-1 rows to a fixed count with a fixed seed.
# NOTE(review): 185376 is presumably the number of label-0 rows, giving a
# balanced set; confirm against len(df_bad).
df_good_new = df_good.sample(n=185376, random_state=42)
result = pd.concat([df_good_new, df_bad])
# Shuffle the combined rows. The seed is fixed so that the later train/test
# split and the order-sensitive training (the classifier uses shuffle=False)
# are reproducible across kernel restarts.
result = result.sample(frac=1, random_state=42)

In [5]:
# 80/20 split of the Amazon frame `result`, stratified by label, fixed seed.
X_train_am, X_test_am, y_train_am, y_test_am = train_test_split(
    result.text,
    result.label,
    test_size=0.2,
    random_state=42,
    stratify=result.label,
)


In [6]:
# Split the RT and IMDB datasets separately (80/20, stratified by label, fixed seed).
X_train_rt, X_test_rt, y_train_rt, y_test_rt = train_test_split(
    rt.text, rt.label, test_size=0.2, random_state=42, stratify=rt.label
)
X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb = train_test_split(
    imdb.text, imdb.label, test_size=0.2, random_state=42, stratify=imdb.label
)

# Then concatenate the RT and IMDB parts: training pair first, test pair second.
X_train = pd.concat([X_train_rt, X_train_imdb])
y_train = pd.concat([y_train_rt, y_train_imdb])
X_test = pd.concat([X_test_rt, X_test_imdb])
y_test = pd.concat([y_test_rt, y_test_imdb])

# Extend the Amazon training portion with the RT training portion.
# NOTE(review): this rebinds X_train_am / y_train_am, so running this cell a
# second time without re-running the Amazon split appends the RT rows again.
X_train_am = pd.concat([X_train_am, X_train_rt])
y_train_am = pd.concat([y_train_am, y_train_rt])

In [8]:
# Custom stop-word list handed to the pipeline's CountVectorizer.
STOPWORDS = 'by does was were the of end and is'.split()

In [9]:
# Stand-alone bag-of-words fit on the training text.
# NOTE(review): `counts` is not consumed by any cell visible here; the
# pipeline below fits its own, differently configured vectorizer.
cvect = CountVectorizer()
counts = cvect.fit_transform(X_train_am)

# NOTE(review): newer scikit-learn releases replace `n_iter` with `max_iter`;
# confirm against the installed version before upgrading.
classifier = PassiveAggressiveClassifier(
    C=0.001,
    fit_intercept=False,
    shuffle=False,
    n_iter=91,
    n_jobs=-1,
)
# Binary presence features over 1- to 4-grams, custom stop words removed,
# feeding the passive-aggressive classifier.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(binary=True, ngram_range=(1, 4), stop_words=STOPWORDS)),
    ('classifier', classifier),
])
model = pipeline.fit(X=X_train_am, y=y_train_am)

In [10]:
# RT polarity sentences loaded via load_data(): predict, then report.
y_pred_rt_new = model.predict(data.text)
print("Accuracy RT :", metrics.accuracy_score(data.label, y_pred_rt_new))
print("F1 RT :", metrics.f1_score(data.label, y_pred_rt_new))

# Amazon test file: predict, then report.
y_pred_amazon = model.predict(amazon_test.text)
print("Accuracy Amazon :", metrics.accuracy_score(amazon_test.label, y_pred_amazon))
print("F1 Amazon :", metrics.f1_score(amazon_test.label, y_pred_amazon))

Accuracy RT : 0.773639774859
F1 RT : 0.773405953611
Accuracy Amazon : 0.918692815711
F1 Amazon : 0.951080626897


In [None]:
# Persist the fitted pipeline (vectorizer + classifier) to disk.
joblib.dump(model, filename='output_amazon.pkl')