In [1]:
import pandas as pd
import re

from sklearn.linear_model import SGDClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.externals import joblib

In [2]:
# Both CSV files are parsed with '|' as the field separator instead of ','.
df1 = pd.read_csv(filepath_or_buffer="../data/reviews_rt_all.csv", sep="|")
df2 = pd.read_csv(filepath_or_buffer="../data/imdb_small.csv", sep="|")

In [3]:
# Stack the two frames row-wise and renumber the index 0..n-1 so that row
# labels from the two sources cannot collide.
df = pd.concat(
    objs=[df1, df2],
    copy=False,
    ignore_index=True,
)

In [4]:
# Quick sanity check: (rows, columns) of the combined frame.
df.shape

(152610, 2)

In [5]:
def preprocessor(text):
    """Clean one raw review string before it is vectorised.

    Steps, in order:

    1. Remove anything that looks like a markup tag (``<...>``).
    2. Collect emoticons from the tag-free text: eyes ``:``, ``;`` or ``=``,
       an optional ``-`` nose, and a mouth ``)``, ``(``, ``D`` or ``P``.
    3. Lowercase the text and collapse every run of non-word characters
       into a single space.
    4. Append the collected emoticons (noses removed, original case kept)
       directly after the cleaned text, with no separator between them.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text followed by the concatenated emoticons.
    """
    # All patterns are raw strings: sequences such as ``\)`` and ``\W`` are
    # invalid escapes in ordinary string literals and trigger warnings on
    # modern Python versions.
    text = re.sub(r'<[^>]*>', '', text)
    # Emoticons are captured before lowercasing, so ':D' / ':P' keep their case.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'[\W]+', ' ', text.lower())
    # The '-' removal applies only to the joined emoticons, not to `words`.
    return words + ''.join(emoticons).replace('-', '')

In [6]:
# Overwrite the raw 'text' column with its cleaned form by running
# preprocessor over every review.
df['text'] = df['text'].apply(func=preprocessor)

In [7]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions equal in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

In [8]:
# Hand-picked stop words handed to the vectoriser.
STOPWORDS = [
    'a', 'an', 'by',
    'did', 'does',
    'was', 'were',
    'i',
]

In [9]:
# Text-classification pipeline:
#   vect  - unigram + bigram counts with the custom STOPWORDS removed
#   tfidf - TF-IDF re-weighting with sublinear term-frequency scaling
#   clsf  - linear classifier trained by stochastic gradient descent
pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2), stop_words=STOPWORDS)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    # SGD shuffles the training data; pinning the seed (same value as the
    # train/test split) makes the fitted model and its scores reproducible
    # across kernel restarts.
    ('clsf', SGDClassifier(random_state=42)),
])
# fit() returns the pipeline itself, so `model` and `pipeline` refer to the
# same fitted object.
model = pipeline.fit(X=X_train, y=y_train)

In [10]:
# Score the fitted model on the held-out split: overall accuracy first, then
# the per-class precision / recall / F1 table.
y_pred = model.predict(X_test)
print(
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

0.777046065133
             precision    recall  f1-score   support

          0       0.87      0.54      0.67     12590
          1       0.74      0.95      0.83     17932

avg / total       0.80      0.78      0.76     30522



In [11]:
# Persist the fitted pipeline (vectoriser, TF-IDF weights and classifier) to disk.
joblib.dump(value=pipeline, filename='../dumps/m_sgdc_mix_out.pkl')

['../dumps/m_sgdc_mix_out.pkl']