# Machine Learning Project - Naive Bayes Implementation (without using libraries)
## Reviews Classification (Positive or Negative)
## by Vinay Kumar Ranganath Babu

In [63]:
# Importing the libraries
import os, sys
from collections import Counter
import numpy as np
import re
import random
from math import log

In [64]:
# The positive reviews are present in pos folder and negative in neg
# We need to extract all the files from these folders and read them
pos_path = "pos"
neg_path = "neg"
pos_dirs = os.listdir(pos_path)
neg_dirs = os.listdir(neg_path)

In [65]:
# Initialise the lists for future reviews processing
pos_reviews = []
neg_reviews = []
train = []
test = []

In [66]:
# Read all the review contents into the corresponding lists
for review in pos_dirs:
    with open(os.path.join(pos_path, review),'r',encoding="latin-1") as file:
        contents = file.read()
        pos_reviews.append([contents,1])

for review in neg_dirs:
    with open(os.path.join(neg_path, review), 'r',encoding="latin-1") as file:
        contents = file.read()
        neg_reviews.append([contents, -1])

In [67]:
# Place all the reviews into one list
for review in pos_reviews:
    train.append(review)

for review in neg_reviews:
    train.append(review)

In [68]:
# Perform the Cross-Validation to divide the data into train and test sets
np.random.shuffle(train)
test = train[1000:]
train = train[0:1000]

In [69]:
len(train) # Training set 

1000

In [70]:
len(test) # Test set

400

In [71]:
# Function to get the text from each review based on the score
def get_text(reviews, score):
    # Join together the text in the reviews for a particular tone.
    # We lowercase to avoid "Not" and "not" being seen as different words, for example.
    return " ".join([r[0].lower() for r in reviews if r[1] == score])

In [72]:
# Function to count each text for its appearance
def count_text(text):
    # Split text into words based on whitespace.  Simple but effective.
    words = re.split("\s+", text)
    # Count up the occurence of each word.
    return Counter(words)

In [73]:
# Store the negative and positive texts separately
negative_text = get_text(train, -1)
positive_text = get_text(train, 1)

In [74]:
negative_text[0:100]

"actually , i'm fairly sure the experience of having my flesh torn and mutilated by barbed wire would"

In [75]:
positive_text[0:100]

'starring : jamie lee curtis ( lauris strode ) , donald pleasance ( dr . sam loomis ) directed by : j'

In [76]:
# Generate word counts for negative tone.
negative_counts = count_text(negative_text)

# Generate word counts for positive tone.
positive_counts = count_text(positive_text)

In [77]:
negative_counts

Counter({'': 1,
         "alex's": 2,
         'wetsuits': 1,
         'willingness': 1,
         'soldiers': 12,
         "patrick's": 7,
         'gothic': 4,
         'authors': 3,
         'chinese-american': 1,
         'roescher': 2,
         'moresco': 4,
         'fielding': 1,
         'actors': 184,
         'wigger': 3,
         "filmmakers'": 1,
         'time-filler': 1,
         'entrepreneur': 3,
         'sgt': 4,
         'manson': 2,
         'louie': 1,
         'perky': 1,
         'substitute': 2,
         'dmx': 4,
         'villainous': 4,
         'refer': 4,
         'stellar': 4,
         "ratt's": 1,
         'philosophising': 1,
         'h': 6,
         'throat': 5,
         'changes': 19,
         'punishing': 1,
         'fund': 2,
         'intrigued': 1,
         'obi-wan': 3,
         'mumble': 3,
         'embarrasses': 1,
         'caused': 8,
         'staterooms': 2,
         'crazed': 4,
         'calculators': 1,
         'hopefully': 15,
       

In [78]:
positive_counts

Counter({'': 1,
         "alex's": 3,
         'facto': 2,
         'soldiers': 41,
         'period-piece': 1,
         'gothic': 10,
         'authors': 1,
         'robocop-style': 1,
         'cubans': 3,
         'roadshow': 2,
         'chinese-american': 1,
         'christie': 3,
         'actors': 167,
         "bowfinger's": 3,
         'entrepreneur': 1,
         'paranormale': 1,
         'sgt': 2,
         "nbc's": 2,
         'manson': 1,
         'perky': 5,
         'kwc': 1,
         'substitute': 3,
         'dmx': 1,
         'spilled': 4,
         'refer': 1,
         'beaten': 10,
         'h': 17,
         'throat': 4,
         'changes': 26,
         'fund': 3,
         'intrigued': 3,
         'obi-wan': 7,
         'look-alike': 2,
         'exits': 1,
         'embarrasses': 2,
         'caused': 9,
         'crazed': 4,
         'hopefully': 10,
         'mockery': 3,
         'they¹re': 1,
         'germany': 6,
         'coterie': 1,
         'creeping': 1,

In [79]:
def get_y_count(score):
    # Compute the count of each classification occurring in the data.
    return len([r for r in train if r[1] == score])

In [80]:
# We need these counts to use for smoothing when computing the prediction.
positive_review_count = get_y_count(1)
negative_review_count = get_y_count(-1)

In [81]:
positive_review_count

485

In [82]:
negative_review_count

515

In [83]:
# These are the class probabilities (P(y) in the Bayes formula).
prob_positive = float(positive_review_count) / float(len(train))
prob_negative = float(negative_review_count) / float(len(train))

In [84]:
prob_positive

0.485

In [85]:
prob_negative

0.515

In [86]:
# The prediction function
def make_class_prediction(text, counts, class_prob, class_count):
    prediction = 0
    text_counts = Counter(re.split("\s+", text))
    for word in text_counts:
        # For every word in the text, we get the number of times that word occured in the reviews for a given class, add 1 to smooth the value, and divide by the total number of words in the class (plus the class_count to also smooth the denominator).
        # Smoothing ensures that we don't multiply the prediction by 0 if the word didn't exist in the training data.
        # We also smooth the denominator counts to keep things even.
        prediction += log(float(text_counts.get(word)) * (float(counts.get(word, 0) + 1) / (float(sum(counts.values())) + float(class_count))))
        # Now we multiply by the probability of the class existing in the documents.

    return prediction + log(class_prob)

In [87]:
# The decision maker function
def make_decision(text, make_class_prediction):
    # Compute the negative and positive probabilities.
    negative_prediction = make_class_prediction(text, negative_counts, prob_negative, negative_review_count)
    positive_prediction = make_class_prediction(text, positive_counts, prob_positive, positive_review_count)

    # We assign a classification based on which probability is greater.
    if negative_prediction > positive_prediction:
      return -1
    return 1

In [88]:
# Make the predictions using the above functions
predictions = [make_decision(r[0], make_class_prediction) for r in test]
print("The predicted values are: {0}".format(predictions))

The predicted values are: [1, 1, -1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, 1, 1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -

In [89]:
# The actual values for the reviews 
actual = [int(r[1]) for r in test]

In [90]:
# Print the actual values to compare them between the predicted values
print("The actual values are: {0}".format(actual))

The actual values are: [1, 1, -1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, 1, 1, -1, -1, 1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, -1, 

In [91]:
# ROC curve generation using scikits-learn
from sklearn import metrics

# Generate the roc curve using scikits-learn.
fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)

# Measure the area under the curve.  The closer to 1, the "better" the predictions.
print("AUC of the predictions: {0}".format(metrics.auc(fpr, tpr)))

AUC of the predictions: 0.8214959145191704
