[YouTube Guide](https://www.youtube.com/watch?v=oXZThwEF4r0&ab_channel=MikeBernico)

In [12]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes, metrics

In [2]:
# load data
df = pd.read_csv('data/UMICH_SI650_Sentiment_Classification.txt', sep='\t', names=['liked', 'txt'])

In [3]:
df.head()

Unnamed: 0,liked,txt
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [4]:
# TFIDF Vectorizer
stopset = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)

In [5]:
# in this case, our dependent variable will be liked as 0 (didn't like the movie) or 1 (liked the movie)
y = df.liked

In [6]:
# covert df.txt from text to  features
X = vectorizer.fit_transform(df.txt)

In [7]:
# 6918 obervations x 2011 unique words
print(y.shape)
print(X.shape)

(6918,)
(6918, 2011)


In [8]:
# test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
# train Naive Bayes classifier
clf = naive_bayes.MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [10]:
# test model's accuracy
y_pred = clf.predict_proba(X_test)[:, 1]
metrics.roc_auc_score(y_test, y_pred)

0.9979292333245913

In [13]:
movie_reviews_array = np.array(['Jupiter Ascending was a disappointing and terrible movie'])
movie_reviews_vector = vectorizer.transform(movie_reviews_array)
print(clf.predict(movie_reviews_vector))

[0]
