In [None]:
import os
import re

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package (a scikit-learn dependency) and
# fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# Load the two review datasets; both files are read with "|" as the separator.
rt_reviews_path = "../data/reviews_rt_all.csv"
imdb_reviews_path = "../data/imdb_small.csv"

df1 = pd.read_csv(rt_reviews_path, sep="|")
df2 = pd.read_csv(imdb_reviews_path, sep="|")

In [None]:
# Stack the two datasets row-wise into a single frame with a fresh 0..n-1 index.
# The former `copy=False` argument was dropped: the `copy` keyword of pd.concat
# is deprecated in pandas 3.0, and it does not change the concatenated values.
df = pd.concat([df1, df2], ignore_index=True)

In [None]:
# Print the (rows, columns) size of the combined frame.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

In [None]:
# Print a summary of the combined frame (columns, dtypes, non-null counts)
# to check what was loaded before any preprocessing.
df.info()

In [None]:
# Show how the values of the `label` column are distributed, as fractions of
# all rows (normalize=True) rather than absolute counts.
# Alternative quick inspections, left disabled for interactive use:
# df.describe()
# df.describe(include=['object'])
# df['label'].value_counts()
df['label'].value_counts(normalize=True)

In [None]:
# Patterns are compiled once and written as raw strings so that the regex
# escapes (`\)`, `\(`, `\W`) are not treated as (invalid) Python string escapes.
_HTML_TAG_RE = re.compile(r'<[^>]*>')
_EMOTICON_RE = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')
_NON_WORD_RE = re.compile(r'[\W]+')


def preprocessor(text):
    """Normalise one raw review string for vectorisation.

    Steps, in order:
      1. strip anything that looks like an HTML/XML tag (``<...>``);
      2. collect emoticons such as ``:)``, ``;-(``, ``=D``, ``:P`` from the
         tag-free text (before lower-casing, so ``D``/``P`` keep their case);
      3. lower-case the text and collapse every run of non-word characters
         into a single space;
      4. append the collected emoticons, with any ``-`` nose removed.

    Parameters
    ----------
    text : str or None or float NaN
        Raw review text. Missing values (``None`` / ``NaN``) are treated as
        an empty document.

    Returns
    -------
    str
        The cleaned text followed by the emoticons.

    Raises
    ------
    TypeError
        If ``text`` is neither a string nor a missing value.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = _HTML_TAG_RE.sub('', text)
    emoticons = _EMOTICON_RE.findall(text)
    # NOTE(review): emoticons are concatenated with no separator, so they can
    # end up attached to each other or to the last word -- kept as-is to
    # preserve the existing output; confirm whether that is intended.
    text = _NON_WORD_RE.sub(' ', text.lower()) + ''.join(emoticons).replace('-', '')
    return text

In [None]:
# Clean every review with `preprocessor` and store the result back in the
# `text` column, replacing the raw text.
cleaned_text = df['text'].apply(preprocessor)
df['text'] = cleaned_text

In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# label column and uses a fixed seed so it is repeatable.
texts, labels = df.text, df.label
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
# Custom stop-word list handed to the vectoriser; tokens listed here are
# dropped from the extracted features.
STOPWORDS = [
    'a',
    'an',
    'by',
    'did',
    'does',
    'was',
    'were',
    'i',
]

In [None]:
# Feature extraction: token counts over unigrams and bigrams, with the custom
# stop words removed.
vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words=STOPWORDS)

# Linear SVM classifier. `random_state` pins the solver's internal shuffling
# so that re-running the notebook produces the same model; 42 is the same seed
# used for the train/test split.
classifier = LinearSVC(C=0.04, random_state=42)

# Step names ('vect', 'clsf') are kept unchanged so that any parameter lookups
# such as `clsf__C` keep working.
pipeline = Pipeline([('vect', vectorizer),
                     ('clsf', classifier)])

In [None]:
# Train the whole pipeline on the training split. `fit` returns the fitted
# pipeline, which is what `model` refers to from here on.
model = pipeline.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test texts with the fitted model.
y_pred = model.predict(X_test)

In [None]:
# Evaluate on the held-out split: overall accuracy first, then the per-class
# classification report.
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

test_report = classification_report(y_test, y_pred)
print(test_report)

In [None]:
# Persist the fitted pipeline to disk for later reuse.
# The target directory is created first (no-op if it already exists) so that a
# missing folder does not make the dump fail after training has completed.
MODEL_PATH = '../dumps/m_lin_svc_out.pkl'
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(pipeline, MODEL_PATH)