In [1]:
import pandas as pd
import re

# from sklearn.naive_bayes import MultinomialNB
# from sklearn.svm import LinearSVR, SVC
from sklearn.linear_model import SGDClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.externals import joblib

In [2]:
df = pd.DataFrame()
df = pd.read_csv("../data/reviews_rt_all.csv", sep="|")
# df = pd.read_csv("../data/imdb_small.csv", sep="|")

In [3]:
print(df.shape)

(102610, 2)


In [4]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ''.join(emoticons).replace('-', '')
    return text

In [5]:
# apply preprocessor to our dataset
df['text'] = df['text'].apply(preprocessor)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.text, 
                                                    df.label, 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=df.label)

In [7]:
STOPWORDS = ['a','an','by','did','does', 'was', 'were', 'i']

In [8]:
pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1,2), stop_words=STOPWORDS)),
                     ('tfidf', TfidfTransformer(sublinear_tf=True)),
                     ('clsf',  SGDClassifier())])
model = pipeline.fit(X=X_train, y=y_train)

In [9]:
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.731994932268
             precision    recall  f1-score   support

          0       0.82      0.35      0.49      7590
          1       0.71      0.96      0.82     12932

avg / total       0.76      0.73      0.70     20522



In [10]:
joblib.dump(pipeline, '../dumps/m_sgdc_rt_out.pkl')

['../dumps/m_sgdc_rt_out.pkl']