In [1]:
import sys
from hashlib import sha1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the training reviews. The file is read without a header row, so the
# column labels are supplied here.
# NOTE(review): the label "stuning" looks like it was copied from the title of
# the first record ("Stuning even for the non-gamer"); the column appears to
# hold review titles -- confirm before renaming, other cells may rely on it.
train_df = pd.read_csv(
    "data/train.csv",
    header=None,
    names=["target", "stuning", "reviews"],
)

In [4]:
# Preview the first rows of the training frame to check how the columns parsed.
train_df.head()

Unnamed: 0,target,stuning,reviews
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [13]:
# Summarize the training frame: row count, column dtypes and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   target   int64 
 1   stuning  object
 2   reviews  object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB


In [5]:
# Discard every training row that has a missing value in any column
# (these are the dropna defaults, spelled out for the reader).
train_df = train_df.dropna(axis=0, how="any")

In [6]:
# Check how many training rows remain after rows with missing values were dropped.
train_df.shape

(3599923, 3)

In [7]:
# Load the test reviews with the same column labels used for the training
# file; the file is read without a header row.
# NOTE(review): "stuning" appears to hold review titles and looks like a
# mislabel -- confirm before renaming, other cells may rely on it.
test_df = pd.read_csv(
    "data/test.csv",
    header=None,
    names=["target", "stuning", "reviews"],
)

In [17]:
# Preview the first rows of the test frame to check how the columns parsed.
test_df.head()

Unnamed: 0,target,stuning,reviews
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [8]:
# Discard every test row that has a missing value in any column
# (these are the dropna defaults, spelled out for the reader).
test_df = test_df.dropna(axis=0, how="any")

In [9]:
# Summarize the test frame after dropping missing rows: non-null counts,
# column dtypes and memory usage.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 399990 entries, 0 to 399999
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   target   399990 non-null  int64 
 1   stuning  399990 non-null  object
 2   reviews  399990 non-null  object
dtypes: int64(1), object(2)
memory usage: 12.2+ MB


In [10]:
# Check how many test rows remain after rows with missing values were dropped.
test_df.shape

(399990, 3)

In [11]:
# Features are the raw review text; the label is the "target" column.
X_train = train_df["reviews"]
y_train = train_df["target"]

X_test = test_df["reviews"]
y_test = test_df["target"]

In [12]:
# use the mean_std_cross_val_scores from class notes
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Run cross-validation and summarize every reported metric as "mean (+/- std)".

    Parameters
    ----------
    model :
        Estimator or pipeline, passed unchanged to ``cross_validate``.
    X_train :
        Training features, passed unchanged to ``cross_validate``.
    y_train :
        Training targets, passed unchanged to ``cross_validate``.
    **kwargs :
        Extra keyword arguments forwarded to ``cross_validate``
        (e.g. ``cv``, ``n_jobs``, ``return_train_score``).

    Returns
    -------
    pandas.Series
        One string per key returned by ``cross_validate``, indexed by that
        key, formatted as ``"<mean> (+/- <std>)"`` with three decimals. The
        spread is ``DataFrame.std`` across folds (pandas default settings).
    """
    # Build the per-fold table once and derive both statistics from it.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = scores.mean()
    std_scores = scores.std()

    # Pair the values by position with zip instead of ``series[i]``: integer
    # keys on a string-labelled Series rely on a positional fallback that
    # pandas has deprecated.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [13]:
# Bag-of-words counts feeding a Bernoulli Naive Bayes classifier.
pipe_nb = make_pipeline(CountVectorizer(), BernoulliNB(alpha=0.01))

# 3-fold cross-validation on the training reviews, using all cores and
# keeping the train scores; summaries are collected per model name.
results = {
    "Naive Bayes": mean_std_cross_val_scores(
        pipe_nb,
        X_train,
        y_train,
        cv=3,
        n_jobs=-1,
        return_train_score=True,
    )
}

In [15]:
# Show the collected summaries as a table: one row per model, one column per metric.
pd.DataFrame(results).T

Unnamed: 0,fit_time,score_time,test_score,train_score
Naive Bayes,376.912 (+/- 57.009),130.123 (+/- 4.659),0.816 (+/- 0.002),0.849 (+/- 0.001)


In [16]:
# Refit the pipeline on the full training set before scoring on the test set.
pipe_nb.fit(X_train, y_train)

In [17]:
# Evaluate the fitted pipeline on the test reviews and labels; the displayed
# value is what `score` returns (mean accuracy for scikit-learn classifiers).
pipe_nb.score(X_test, y_test)

0.8177179429485737