In [1]:
import os
import pickle
import re

import nltk
import pandas as pd
from nltk import word_tokenize as nltk_wtknz
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle

In [2]:
# Read the two review corpora. Both files are pipe-delimited, so the
# separator option is declared once and shared by the two calls.
csv_options = {"sep": "|"}
data_rt = pd.read_csv("../data/reviews_rt_all.csv", **csv_options)
data_imdb = pd.read_csv("../data/imdb_small.csv", **csv_options)

In [3]:
# Stack the two corpora into one frame with a fresh 0..n-1 index, then
# shuffle the rows. The shuffle is seeded with 42 (the seed the train/test
# splits in this notebook use) so the row order is identical on every
# Restart & Run All instead of changing from run to run.
data_df = pd.concat([data_rt, data_imdb], ignore_index=True, copy=False)
data_df = shuffle(data_df, random_state=42)

In [4]:
# Dimensions of the combined frame, printed as (rows, columns).
n_rows, n_cols = data_df.shape
print((n_rows, n_cols))

(152610, 2)


In [5]:
# Summary of the combined frame: index range, per-column non-null counts,
# dtypes and memory usage.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 152610 entries, 81330 to 30903
Data columns (total 2 columns):
label    152610 non-null int64
text     152610 non-null object
dtypes: int64(1), object(1)
memory usage: 3.5+ MB


In [6]:
# Share of each label in the combined corpus, as fractions of all rows.
# Alternative summaries (uncomment to run):
#   data_df.describe()
#   data_df.describe(include=['object'])
#   data_df['label'].value_counts()
data_df.label.value_counts(normalize=True)

1    0.587498
0    0.412502
Name: label, dtype: float64

In [7]:
def tokenize(text):
    """Convert raw text into a list of English Snowball stems.

    Every character that is not an ASCII letter (a-z, A-Z) is replaced by a
    space, the cleaned string is split into tokens with NLTK's
    ``word_tokenize``, and each token is passed through a
    ``SnowballStemmer("english")``.

    Parameters
    ----------
    text : str
        The document to tokenize.

    Returns
    -------
    list
        One stem per token, in the order the tokens appear in ``text``.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    tokens = nltk_wtknz(letters_only)
    to_stem = SnowballStemmer("english").stem
    return list(map(to_stem, tokens))

In [8]:
# Hold out 20% of `data_rt` for evaluation. The fixed seed keeps the split
# identical between runs.
split_rt = train_test_split(
    data_rt["text"],
    data_rt["label"],
    test_size=0.2,
    random_state=42,
)
X_train_rt, X_test_rt, y_train_rt, y_test_rt = split_rt

In [9]:
# Hold out 20% of `data_imdb` for evaluation, with the same test fraction
# and seed as the `data_rt` split.
split_imdb = train_test_split(
    data_imdb["text"],
    data_imdb["label"],
    test_size=0.2,
    random_state=42,
)
X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb = split_imdb

In [10]:
# Merge the per-source splits into one training set and one test set.
# Texts and labels are concatenated in the same source order (rt first,
# then imdb) so each text stays paired with its label.
X_train = pd.concat(objs=[X_train_rt, X_train_imdb])
y_train = pd.concat(objs=[y_train_rt, y_train_imdb])

X_test = pd.concat(objs=[X_test_rt, X_test_imdb])
y_test = pd.concat(objs=[y_test_rt, y_test_imdb])

In [11]:
# Text-classification pipeline: TF-IDF features feeding a linear SVM.
#
# Vectorizer settings:
#   tokenizer=tokenize   - use the notebook's custom stemming tokenizer
#   ngram_range=(1, 3)   - word unigrams, bigrams and trigrams
#   binary=True          - term presence rather than raw counts for the tf part
#   max_df=0.75          - ignore terms found in more than 75% of documents
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    ngram_range=(1, 3),
    analyzer='word',
    binary=True,
    max_df=0.75,
)

# random_state pins the solver's internal shuffling so that refitting on the
# same data yields the same model and the same reported metrics; 42 matches
# the seed used for the train/test splits in this notebook.
linear_svm = LinearSVC(C=100, random_state=42)

pipeline = Pipeline([
    ('vectorizer', tfidf_vectorizer),
    ('classifier', linear_svm),
])

In [12]:
# Fit the vectorizer and the classifier on the combined training split.
# NOTE(review): `model` is whatever `fit` returns; scikit-learn documents that
# as the pipeline itself, so `model` and `pipeline` should be the same
# object. Confirm before relying on that.
model = pipeline.fit(X_train, y_train)

In [13]:
# Evaluate on the combined (rt + imdb) test split: overall accuracy first,
# then the per-class precision/recall/F1 report.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)
print(classification_report(y_true=y_test, y_pred=y_pred))

0.848895878383
             precision    recall  f1-score   support

          0       0.83      0.79      0.81     12576
          1       0.86      0.89      0.87     17946

avg / total       0.85      0.85      0.85     30522



In [14]:
# Evaluate on the `data_rt` test split only: overall accuracy first, then
# the per-class precision/recall/F1 report.
y_pred = model.predict(X_test_rt)

accuracy = accuracy_score(y_true=y_test_rt, y_pred=y_pred)
print(accuracy)
print(classification_report(y_true=y_test_rt, y_pred=y_pred))

0.815953610759
             precision    recall  f1-score   support

          0       0.77      0.71      0.74      7521
          1       0.84      0.88      0.86     13001

avg / total       0.81      0.82      0.81     20522



In [15]:
# Evaluate on the `data_imdb` test split only: overall accuracy first, then
# the per-class precision/recall/F1 report.
y_pred = model.predict(X_test_imdb)

accuracy = accuracy_score(y_true=y_test_imdb, y_pred=y_pred)
print(accuracy)
print(classification_report(y_true=y_test_imdb, y_pred=y_pred))

0.9165
             precision    recall  f1-score   support

          0       0.92      0.91      0.92      5055
          1       0.91      0.92      0.92      4945

avg / total       0.92      0.92      0.92     10000



In [16]:
# Persist the pipeline so it can be reused outside this notebook.
#
# NOTE: the vectorizer step was built with `tokenizer=tokenize`, a function
# defined in this notebook. pickle stores functions by qualified name, not by
# value, so code that loads this file must have an identical `tokenize`
# available under the same module name before calling `pickle.load`.
dump_path = '../dumps/m_lin_svc_mix_out.pkl'

# Create the target directory up front: without it, open() raises
# FileNotFoundError and the (expensive) fitted model is never written.
os.makedirs(os.path.dirname(dump_path), exist_ok=True)

with open(dump_path, 'wb') as dump_file:
    pickle.dump(pipeline, dump_file)