In [1]:
import pandas as pd
import numpy as np
import re
import pickle

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle

from nltk.stem import SnowballStemmer
from nltk import word_tokenize as nltk_wtknz

In [5]:
# Read the pipe-delimited MTV training reviews, capped at the first 200011
# data rows of the file.
data_mtv = pd.read_csv(
    "../data/mtv_train_2.csv",
    sep="|",
    nrows=200011,
)

In [8]:
# Read the two pipe-delimited review files used for cross-source evaluation.
data_rt = pd.read_csv(
    "../data/reviews_rt_all.csv",
    sep="|",
)
data_imdb = pd.read_csv(
    "../data/imdb_small.csv",
    sep="|",
)

In [9]:
# Report the (rows, columns) dimensions of the MTV frame.
print(tuple(data_mtv.shape))

(200000, 2)


In [6]:
# Discard every row that has a missing value in any column
# (axis=0 / how="any" are the dropna defaults, spelled out here).
data_mtv = data_mtv.dropna(axis=0, how="any")

In [7]:
# Re-check the dimensions once incomplete rows have been dropped.
n_rows, n_cols = data_mtv.shape
print((n_rows, n_cols))

(200000, 2)


In [10]:
# Print a summary of the MTV frame: index range, per-column non-null counts,
# dtypes and memory usage.
data_mtv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 200010
Data columns (total 2 columns):
overall       200000 non-null int64
reviewText    200000 non-null object
dtypes: int64(1), object(1)
memory usage: 4.6+ MB


In [11]:
# Preview the first ten rows (label in `overall`, review body in `reviewText`).
data_mtv.head(n=10)

Unnamed: 0,overall,reviewText
0,1,This is a charming version of the classic Dick...
1,1,Henry Winkler is very good in this twist on th...
2,1,This is one of the best Scrooge movies out. H...
3,1,This has been a favorite movie of mine for a l...
4,1,This is the American adaptation of the Charles...
5,1,Glad that this american classic came out on dv...
6,1,A good Christmas carol dhenry winkler one duri...
7,1,How a bitter old man comes to know the true me...
8,1,"The small historic Canadian town of Elora, wit..."
9,1,Even though i don't care for Henry Winklers a...


In [12]:
# Relative frequency of each label value; the recorded output shows the
# classes are heavily imbalanced (roughly 90% label 1 vs 10% label 0).
data_mtv.overall.value_counts(normalize=True)

1    0.895805
0    0.104195
Name: overall, dtype: float64

In [13]:
# Hold out 20% of the MTV reviews for evaluation; the fixed seed keeps the
# partition identical between runs.
X_train_mtv, X_test_mtv, y_train_mtv, y_test_mtv = train_test_split(
    data_mtv["reviewText"],
    data_mtv["overall"],
    test_size=0.2,
    random_state=42,
)

In [14]:
# Same 80/20 seeded split for the RT reviews (columns `text` / `label`).
X_train_rt, X_test_rt, y_train_rt, y_test_rt = train_test_split(
    data_rt["text"],
    data_rt["label"],
    test_size=0.2,
    random_state=42,
)

In [15]:
# Same 80/20 seeded split for the IMDB reviews (columns `text` / `label`).
X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb = train_test_split(
    data_imdb["text"],
    data_imdb["label"],
    test_size=0.2,
    random_state=42,
)

In [16]:
# Combine the RT and IMDB frames into a single frame with a fresh 0..n-1
# index, then shuffle its rows. The shuffle is seeded (same seed as the
# train/test splits) so that re-running the notebook from a fresh kernel
# produces the same row order every time.
data_mix = pd.concat([data_rt, data_imdb], ignore_index=True, copy=False)
data_mix = shuffle(data_mix, random_state=42)

In [17]:
# Build the mixed-source sets by stacking the already-split RT and IMDB
# pieces (RT rows first), so the mixed train/test sets follow exactly the
# same partition as the per-source sets.
X_train_mix = pd.concat([X_train_rt, X_train_imdb], axis=0)
y_train_mix = pd.concat([y_train_rt, y_train_imdb], axis=0)

X_test_mix = pd.concat([X_test_rt, X_test_imdb], axis=0)
y_test_mix = pd.concat([y_test_rt, y_test_imdb], axis=0)

In [18]:
# Built once at cell level rather than inside tokenize(): the function is
# handed to the vectorizer as its tokenizer and is therefore called for
# every document, so re-creating the stemmer per call is wasted work.
_STEMMER = SnowballStemmer("english")
_NON_LETTERS = re.compile("[^a-zA-Z]")


def tokenize(text):
    """Split a review into English Snowball stems.

    Every character that is not an ASCII letter (digits, punctuation,
    accented letters, ...) is first replaced by a space, the remainder is
    split into words with NLTK's ``word_tokenize``, and each word is stemmed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order; empty if ``text`` contains
        no ASCII letters.

    Notes
    -----
    The pipeline that uses this tokenizer is pickled later in the notebook.
    Pickle stores functions by reference, so code that loads that pickle has
    to define ``tokenize`` together with ``_STEMMER`` and ``_NON_LETTERS``.
    """
    letters_only = _NON_LETTERS.sub(" ", text)
    return [_STEMMER.stem(word) for word in nltk_wtknz(letters_only)]

In [19]:
# TF-IDF over stemmed word 1- to 3-grams, feeding a linear SVM.
#   binary=True -> the term-frequency part is 0/1 (presence) instead of a count
#   max_df=0.75 -> ignore terms that occur in more than 75% of the documents
#   C=100       -> weak regularisation of the SVM
# random_state pins LinearSVC's internal pseudo-random number generator so
# that repeated fits on the same data are reproducible.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=tokenize, ngram_range=(1, 3),
                                   analyzer='word', binary=True, max_df=0.75)),
    ('classifier', LinearSVC(C=100, random_state=42)),
])

In [None]:
# Train on the MTV training split only. Pipeline.fit returns the pipeline
# itself, so `model` and `pipeline` refer to the same fitted object.
model = pipeline.fit(X_train_mtv, y_train_mtv)

In [None]:
# In-domain check: score the model on the held-out MTV reviews.
y_pred = model.predict(X_test_mtv)

print(
    accuracy_score(y_test_mtv, y_pred),
    classification_report(y_test_mtv, y_pred),
    sep="\n",
)

In [None]:
# Cross-source check: score the MTV-trained model on the combined RT + IMDB
# test set.
y_pred = model.predict(X_test_mix)

print(
    accuracy_score(y_test_mix, y_pred),
    classification_report(y_test_mix, y_pred),
    sep="\n",
)

In [None]:
# Cross-source check: score the MTV-trained model on the RT test set alone.
y_pred = model.predict(X_test_rt)

print(
    accuracy_score(y_test_rt, y_pred),
    classification_report(y_test_rt, y_pred),
    sep="\n",
)

In [None]:
# Cross-source check: score the MTV-trained model on the IMDB test set alone.
y_pred = model.predict(X_test_imdb)

print(
    accuracy_score(y_test_imdb, y_pred),
    classification_report(y_test_imdb, y_pred),
    sep="\n",
)

In [None]:
# Serialise the pipeline object to disk. Its tokenizer is the notebook-level
# `tokenize` function, which pickle records by name only, so whoever loads
# this file must have that function defined.
with open('../dumps/m_lin_svc_mtv_200_out.pkl', 'wb') as dump_file:
    pickle.dump(pipeline, dump_file)