# Import data and review the first entry to check the form

In [37]:
with open("./netflix.txt", "r") as text_file:
                         lines = text_file.read().split('\n')
    
with open("./hbo.txt", "r") as text_file:
                         lines = text_file.read().split('\n')    
    

In [38]:
from sklearn.utils import shuffle
import numpy as np

#shuffle data before dividing into training and test sets
lines = np.array(shuffle(lines))

In [39]:
lines[0]

"I loved this movie it was a great portrayal of a family who had it's share of ups and down, but in the end they knew that special love they had for each other.  \t1"

# Edit data: split in sentences and classes

In [40]:
lines = [line.split("\t") for line in lines if len(line.split("\t"))==2 and line.split("\t")]

In [41]:
lines[0]

["I loved this movie it was a great portrayal of a family who had it's share of ups and down, but in the end they knew that special love they had for each other.  ",
 '1']

In [42]:
text = [line[0] for line in lines]

In [43]:
text[0]

"I loved this movie it was a great portrayal of a family who had it's share of ups and down, but in the end they knew that special love they had for each other.  "

In [44]:
labels = [int(line[1]) for line in lines]

# Need numeric data: Count features

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

In [46]:
count_vectorizer = CountVectorizer(binary='true')

In [47]:
text = count_vectorizer.fit_transform(text)

In [48]:
text

<1000x3047 sparse matrix of type '<class 'numpy.int64'>'
	with 12666 stored elements in Compressed Sparse Row format>

# Get the classifiers

In [50]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier

In [51]:
classifierBNB = BernoulliNB().fit(text,labels)
classifierRF = RandomForestClassifier(max_depth=2, random_state=0).fit(text, labels)

# Perform cross validation and compute mean accuracy for BernoulliNB

In [52]:
from sklearn.model_selection import cross_validate

In [53]:
cv_results = cross_validate(classifierBNB, text, labels, return_train_score = False)
cv_results['test_score'].mean()

0.7740182286030349

# Perform cross validation and compute mean accuracy for Random Forest Classifier

In [54]:
cv_results = cross_validate(classifierRF, text, labels, return_train_score = False)
cv_results['test_score'].mean()

0.5529904047327032

# Split the data into training and test sets, fit and predict

In [59]:
#select 20% of data to form the test set
split_point = int(0.2*text.shape[0])
test_data, train_data = text[:split_point], text[split_point:]
test_labels, train_labels = labels[:split_point], labels[split_point:]


In [60]:
#fit using training sample
classifierBNB = BernoulliNB().fit(train_data,train_labels)
classifierRF = RandomForestClassifier(max_depth=2, random_state=0).fit(train_data,train_labels)

#predict using test sample
predictionBNB = classifierBNB.predict(test_data)
predictionRF = classifierRF.predict(test_data)

# Compute the confusion matrix

In [62]:
from sklearn import metrics

In [63]:
confusionBNB = metrics.confusion_matrix(test_labels, predictionBNB)
confusionRF = metrics.confusion_matrix(test_labels, predictionRF)

In [64]:
confusionBNB

array([[84,  8],
       [39, 69]])

In [65]:
confusionRF

array([[87,  5],
       [88, 20]])

In [74]:
#percentage of correctly predicted positive reviews with Naive Bayes
confusionBNB[1,1]/float(confusionBNB[1,1] + confusionBNB[1,0])

0.6388888888888888

In [75]:
#percentage of correctly predicted positive reviews with Random forest
confusionRF[1,1]/float(confusionRF[1,1] + confusionRF[1,0])

0.18518518518518517

In [76]:
#percentage of correctly predicted negative reviews with Naive Bayes
confusionBNB[0,0]/float(confusionBNB[0,0] + confusionBNB[0,1])

0.9130434782608695

In [77]:
#percentage of correctly predicted negative reviews with Random forest
confusionRF[0,0]/float(confusionRF[0,0] + confusionRF[0,1])

0.9456521739130435