Import all the packages needed for the program.

In [None]:
import os
import numpy as np
import sklearn as skl
import scipy as sp
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download("punkt")
nltk.download('wordnet')
nltk.download('omw-1.4')



Initialisation of the reviews into positive and negative (break data into test, evaluation and training).

In [None]:
def _load_reviews(directory):
    """Read every file in *directory* and return the cleaned review texts.

    Each review has its HTML line breaks ("<br />") replaced by a space and
    all ASCII punctuation removed.

    Args:
        directory: Path of the folder holding one review per file.

    Returns:
        A 1-D NumPy array with one cleaned review string per file, in the
        order reported by ``os.listdir``.
    """
    # Build the translation table once instead of once per file.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    reviews = []
    for file in os.listdir(directory):
        # The context manager closes the handle even if read() raises.
        with open(directory + "/" + file, 'r') as f:
            data = f.read().replace("<br />", ' ')
        # Collect in a list: np.append would re-copy the whole array per file.
        reviews.append(data.translate(strip_punctuation))
    return np.array(reviews)


pos = _load_reviews("data/pos")
neg = _load_reviews("data/neg")

#10% for evaluation, 20% for testing, and 70% for training

3 methods of tokenisation and the different arrays that they form (split by whitespace, lemmatized, stemmed).

In [None]:
# Whitespace tokenisation: every review becomes a list of its space-separated words.
white_space_pos = np.char.split(pos)
white_space_neg = np.char.split(neg)


# Lemmatisation: start from a fresh whitespace split and rewrite each token in place.
# Every token is passed through the lemmatizer once per part-of-speech tag, in this
# fixed order: verb, noun, adjective, adverb, satellite adjective.
lemmatizer = WordNetLemmatizer()
lemmatized_pos = np.char.split(pos)
lemmatized_neg = np.char.split(neg)

for review_set in (lemmatized_pos, lemmatized_neg):
    for tokens in review_set:
        for k, token in enumerate(tokens):
            for tag in ("v", "n", "a", "r", "s"):
                token = lemmatizer.lemmatize(token, pos=tag)
            tokens[k] = token


# Stemming: again start from a fresh split and replace each token by its Porter stem.
stemmer = PorterStemmer()
stemmed_pos = np.char.split(pos)
stemmed_neg = np.char.split(neg)

for review_set in (stemmed_pos, stemmed_neg):
    for tokens in review_set:
        # Slice assignment keeps the same list object, i.e. an in-place update.
        tokens[:] = [stemmer.stem(token) for token in tokens]


Finding the words that are very common within the reviews in order to remove them.

In [None]:
# Frequency table of all words in the positive and negative reviews:
# `words` holds the distinct words and `freq` their counts, index-aligned.
words = np.array([])
freq = np.array([])
def find(x,y):
    """Linear search for y in the sequence x.

    Returns (True, index) for the first element equal to y, or (False, 0)
    when no element matches.
    """
    for index, item in enumerate(x):
        if item == y:
            return (True, index)
    return (False,0)

# Goes through all the reviews word by word and counts every stemmed token.
#
# A dict is used for the counting: looking a word up with find() and growing the
# arrays with np.append re-scanned / re-copied the whole table for every token.
# Dicts preserve insertion order, so `words` is still in first-seen order
# (positive reviews first, then negative) and `freq[k]` is the count of `words[k]`.
_word_counts = {}
for _review_set in (stemmed_pos, stemmed_neg):
    for _review in _review_set:
        for _token in _review:
            _word_counts[_token] = _word_counts.get(_token, 0) + 1

words = np.array(list(_word_counts))
# Counts are stored as floats, as they were when accumulated with np.append.
freq = np.array(list(_word_counts.values()), dtype=float)

In [None]:
# Builds the vocabulary that is kept (`nodelete`): every word except the very
# common ones (more than 10000 occurrences) and the very short ones (2 characters
# or fewer).
todelete = np.where(freq > 10000)[0]
nodelete = np.delete(words,todelete)
nd = np.char.str_len(nodelete)
delete = np.where(nd<= 2)[0]
nodelete = np.delete(nodelete,delete)


def _filter_reviews(stemmed, lemmatized, white_space, keep_vocab):
    """Drop, in place, every token whose stem is not in *keep_vocab*.

    The three tokenisations of a review are index-aligned (same number of
    tokens), so the decision is made on the stemmed token and the same
    positions are removed from the lemmatized and white-space versions.

    Args:
        stemmed, lemmatized, white_space: Parallel object arrays holding one
            token sequence per review.
        keep_vocab: Set of stems that must be kept.
    """
    for i in range(len(stemmed)):
        keep = np.array([token in keep_vocab for token in stemmed[i]], dtype=bool)
        if keep.all():
            # Nothing to remove (this also covers an empty review).
            continue
        # One boolean mask per review replaces one np.delete call per token.
        stemmed[i] = np.asarray(stemmed[i])[keep]
        lemmatized[i] = np.asarray(lemmatized[i])[keep]
        white_space[i] = np.asarray(white_space[i])[keep]


# Goes through the review features and removes every feature/word that is not in
# the kept vocabulary. A set gives constant-time membership tests; the previous
# linear scan of the vocabulary for every token is what made this step slow.
#
# As all features are the same length at this point, the stemmed features are used
# to decide which positions are removed from all three feature arrays.
_keep_vocab = set(nodelete.tolist())
_filter_reviews(stemmed_neg, lemmatized_neg, white_space_neg, _keep_vocab)
_filter_reviews(stemmed_pos, lemmatized_pos, white_space_pos, _keep_vocab)

As the searching of common words and their deletion took a long time, progress was saved by putting the reviews without the common phrases into new files.

In [None]:
def _read_saved_reviews(count, suffix):
    """Load the saved reviews "data/0<suffix>" ... "data/<count-1><suffix>".

    Args:
        count: Number of consecutively numbered files to read.
        suffix: File-name ending after the index, e.g. "p.txt".

    Returns:
        A 1-D NumPy array holding the content of each file, in index order.
    """
    reviews = []
    for i in range(count):
        # The context manager closes the handle even if read() raises.
        with open("data/" + str(i) + suffix, 'r') as f:
            reviews.append(f.read())
    return np.array(reviews)


# 2000 positive and 1997 negative reviews are read back from disk.
pos = _read_saved_reviews(2000, "p.txt")
neg = _read_saved_reviews(1997, "n.txt")


Code for Compositional Phrases:\
One part is for finding n-grams of the words. Those n-grams got stored as the process took relatively long to compute. The second part is PoS and constituency parsing for noun phrases.

In [None]:
# Finds the n-grams when n = 2
#
# Goes through a sample of 500 of the positive and 500 of the negative reviews to find
# common n-grams. 500 was picked as the sample size as it is large enough to represent
# the dataset well, while not taking a long time to run.
#
# Please note that the variables of this code were changed between stemmed, lemmatized and
# white spaces to acquire the different n-grams for each tokenisation method. The code below
# is the version for the lemmatisation method.
def _count_ngrams(reviews, n, counts):
    """Add the n-grams of every review to *counts* (updated in place).

    An n-gram is n consecutive tokens joined by "-". *counts* maps each n-gram
    to a float count; dict insertion order records first-seen order.
    """
    for tokens in reviews:
        for j in range(len(tokens) - (n - 1)):
            gram = "-".join(tokens[j:j + n])
            counts[gram] = counts.get(gram, 0.0) + 1


_bigram_counts = {}
# Slicing (instead of range(500)) also copes with fewer than 500 reviews.
_count_ngrams(lemmatized_pos[:500], 2, _bigram_counts)
_count_ngrams(lemmatized_neg[:500], 2, _bigram_counts)

# `words` and `freq` are index-aligned lists: freq[k] is the count of words[k].
words = list(_bigram_counts)
freq = list(_bigram_counts.values())



In [None]:
# Same procedure as for the bigrams, but for n-grams where n = 3.
def _count_ngrams(reviews, n, counts):
    """Add the n-grams of every review to *counts* (updated in place).

    An n-gram is n consecutive tokens joined by "-". *counts* maps each n-gram
    to a float count; dict insertion order records first-seen order.
    """
    for tokens in reviews:
        for j in range(len(tokens) - (n - 1)):
            gram = "-".join(tokens[j:j + n])
            counts[gram] = counts.get(gram, 0.0) + 1


_trigram_counts = {}
# Sample of (at most) 500 positive and 500 negative reviews.
_count_ngrams(lemmatized_pos[:500], 3, _trigram_counts)
_count_ngrams(lemmatized_neg[:500], 3, _trigram_counts)

# `words_3` and `freq_3` are index-aligned lists: freq_3[k] is the count of words_3[k].
words_3 = list(_trigram_counts)
freq_3 = list(_trigram_counts.values())



As the n-gram search process was quite time consuming to run, the features were stored in text files for future use.

In [None]:
# Saves the trigrams: line 1 holds the space-separated trigrams, line 2 their counts.
s3w = " ".join(str(x) for x in words_3)
s3f = " ".join(str(x) for x in freq_3)
# The context manager flushes and closes the file even if a write fails.
with open("data/3ngraml.txt","w") as f:
    f.write(s3w+"\n")
    f.write(s3f+"\n")

In [None]:
# Saves the bigrams: line 1 holds the space-separated bigrams, line 2 their counts.
s2w = " ".join(str(x) for x in words)
s2f = " ".join(str(x) for x in freq)
# The context manager flushes and closes the file even if a write fails.
with open("data/ngramwl.txt","w") as f:
    f.write(s2w+"\n")
    f.write(s2f+"\n")


This code was run after the n-grams of every tokenisation method had been found, as it reads the findings back from their assigned files. This was done to avoid losing time rerunning the previous code sections in case of a crash or other technical issues.

In [None]:
def _read_ngram_file(path):
    """Read one saved n-gram file.

    The file holds two lines: the space-separated n-grams, then their
    space-separated counts.

    Args:
        path: Location of the n-gram file.

    Returns:
        A tuple (ngrams, raw_counts, counts): the n-grams as a NumPy array,
        the counts as the list of strings read from the file, and the counts
        converted to a float NumPy array.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, "r") as f:
        ngrams = f.readline().replace("\n", '').split()
        raw_counts = f.readline().replace("\n", '').split()
    return np.array(ngrams), raw_counts, np.array([float(x) for x in raw_counts])


# Stemmed bigrams / trigrams
s2w, s2fw, freq2s = _read_ngram_file("data/ngrams.txt")
s3w, s3fw, freq3s = _read_ngram_file("data/3ngrams.txt")

# Lemmatized bigrams / trigrams
l2w, l2fw, freq2l = _read_ngram_file("data/ngramwl.txt")
l3w, l3fw, freq3l = _read_ngram_file("data/3ngraml.txt")

# White-space bigrams / trigrams
ws2w, ws2fw, freq2ws = _read_ngram_file("data/ngramws.txt")
ws3w, ws3fw, freq3ws = _read_ngram_file("data/3ngramws.txt")

In [None]:
# White-space tokenisation: keep only the n-grams seen at least 30 times.
ws_nwords = np.delete(ws2w, np.flatnonzero(freq2ws < 30))
ws_2words = ws_nwords.tolist()
# Hand-picked noun phrases for the white-space tokens.
ws_poswords = ['special-effects','New-York','low-budget']
todelete = np.flatnonzero(freq3ws < 30)
ws_3words = np.delete(ws3w, todelete).tolist()

In [None]:
# Lemmatized tokenisation: keep only the n-grams seen at least 30 times.
todelete = np.where(freq2l < 30)[0]
lem_nwords = np.delete(l2w,todelete)
lem_2words = lem_nwords.tolist()
# Hand-picked noun phrases for the lemmatized tokens.
lem_poswords = ['special-effect','horror-film','main-character','horror-movie','New-York','bad-movie', 'good-movie']
todelete = np.where(freq3l < 30)[0]
lem_3words = np.delete(l3w,todelete)
# Fix: the trigram list was previously assigned to lem_2words, which discarded the
# bigram list and left lem_3words as an ndarray (unlike the ws_/stem_ cells).
lem_3words = lem_3words.tolist()


In [None]:
# Stemmed tokenisation: keep only the n-grams seen at least 30 times.
stem_nwords = np.delete(s2w, np.flatnonzero(freq2s < 30))
stem_2words = stem_nwords.tolist()
# Hand-picked noun phrases for the stemmed tokens.
stem_poswords = ['special-effect','horror-film','main-charact','horror-movi', 'low-budget', 'new-york']
todelete = np.flatnonzero(freq3s < 30)
stem_3words = np.delete(s3w, todelete).tolist()

In [None]:
def _merge_ngrams(tokens, bigrams, trigrams):
    """Return a new token list in which known n-grams are fused into one token.

    The review is scanned left to right. At each position a trigram match is
    preferred over a bigram match; matched tokens are replaced by a single
    "-"-joined token, which is not itself considered for further merging.

    Args:
        tokens: Token sequence of one review (list or array); not modified.
        bigrams: Set of "a-b" strings to fuse.
        trigrams: Set of "a-b-c" strings to fuse.

    Returns:
        A new plain Python list of str.
    """
    # Work on a private list of plain strings. Writing the merged token back into
    # a fixed-width NumPy string array silently truncates it to the array's item
    # width, and writing into the input would also alter the source tokenisation.
    merged = [str(token) for token in tokens]
    limit = len(merged) - 2  # positions at which a full 3-token window still fits
    j = 0
    while j < limit:
        trigram = "-".join(merged[j:j + 3])
        bigram = "-".join(merged[j:j + 2])
        if trigram in trigrams:
            merged[j:j + 3] = [trigram]
            limit -= 2
        elif bigram in bigrams:
            merged[j:j + 2] = [bigram]
            limit -= 1
        j += 1

    # The loop stops two tokens before the end, so the final pair is checked here.
    if j < len(merged) - 1:
        bigram = "-".join(merged[j:j + 2])
        if bigram in bigrams:
            merged[j:j + 2] = [bigram]
    return merged


def _merge_all_ngrams(reviews, bigrams, trigrams):
    """Apply _merge_ngrams to every review; returns an object array of lists."""
    # Sets give constant-time membership tests inside the scan.
    bigrams = set(bigrams)
    trigrams = set(trigrams)
    result = np.empty(len(reviews), dtype=object)
    for i, tokens in enumerate(reviews):
        result[i] = _merge_ngrams(tokens, bigrams, trigrams)
    return result


# White Space
ws_ngramsp = _merge_all_ngrams(white_space_pos, ws_2words, ws_3words)
ws_ngramsn = _merge_all_ngrams(white_space_neg, ws_2words, ws_3words)

# Lemmatized
lem_ngramsp = _merge_all_ngrams(lemmatized_pos, lem_2words, lem_3words)
lem_ngramsn = _merge_all_ngrams(lemmatized_neg, lem_2words, lem_3words)

# Stemmed
stem_ngramsp = _merge_all_ngrams(stemmed_pos, stem_2words, stem_3words)
stem_ngramsn = _merge_all_ngrams(stemmed_neg, stem_2words, stem_3words)

In [None]:
def _merge_noun_pairs(tokens, phrases):
    """Return a new token list in which listed two-word phrases are fused.

    The review is scanned left to right; whenever two neighbouring tokens joined
    by "-" appear in *phrases*, they are replaced by that single token.

    Args:
        tokens: Token sequence of one review (list or array); not modified.
        phrases: Set of "a-b" strings to fuse.

    Returns:
        A new plain Python list of str.
    """
    # Work on a private list of plain strings. Writing the merged token back into
    # a fixed-width NumPy string array silently truncates it to the array's item
    # width, and writing into the input would also alter the source tokenisation.
    merged = [str(token) for token in tokens]
    limit = len(merged) - 1  # positions that still have a right-hand neighbour
    j = 0
    while j < limit:
        pair = merged[j] + "-" + merged[j + 1]
        if pair in phrases:
            merged[j:j + 2] = [pair]
            limit -= 1
        j += 1
    return merged


def _merge_all_noun_pairs(reviews, phrases):
    """Apply _merge_noun_pairs to every review; returns an object array of lists."""
    phrases = set(phrases)
    result = np.empty(len(reviews), dtype=object)
    for i, tokens in enumerate(reviews):
        result[i] = _merge_noun_pairs(tokens, phrases)
    return result


#Stemmed
stem_posp = _merge_all_noun_pairs(stemmed_pos, stem_poswords)
stem_posn = _merge_all_noun_pairs(stemmed_neg, stem_poswords)

#Lemmatized
lem_posp = _merge_all_noun_pairs(lemmatized_pos, lem_poswords)
lem_posn = _merge_all_noun_pairs(lemmatized_neg, lem_poswords)

#White Space
ws_posp = _merge_all_noun_pairs(white_space_pos, ws_poswords)
ws_posn = _merge_all_noun_pairs(white_space_neg, ws_poswords)

Code for Bonus Feature: \
This code splits contracted words (e.g. "wouldnt", "youre") into two separate words.

In [None]:
# Contractions (with the apostrophe already stripped by the punctuation removal)
# and the two words each one is replaced by. Matching is case-insensitive.
_CONTRACTIONS = {
    # Irregular forms that can't simply be split by getting rid of the "nt"
    "wont": ("will", "not"),  # fix: was expanded to "would not"
    "cant": ("can", "not"),
    # Regular forms: the ending is removed and added as a separate word
    "wouldnt": ("would", "not"),
    "couldnt": ("could", "not"),
    "youre": ("you", "are"),
    "youll": ("you", "will"),
    "youve": ("you", "have"),
    "ive": ("I", "have"),
}


def _expand_contractions(tokens):
    """Return a new token list with every known contraction split in two.

    Args:
        tokens: Token sequence of one review (list or array); not modified.

    Returns:
        A new plain Python list of str. Inserted words are not re-examined.
    """
    expanded = []
    for token in tokens:
        word = str(token)
        replacement = _CONTRACTIONS.get(word.lower())
        if replacement is None:
            # Not a known contraction: keep the word as it is
            expanded.append(word)
        else:
            expanded.extend(replacement)
    return expanded


feature_set = np.append(lem_ngramsp,lem_ngramsn)
for i in range(len(feature_set)): #Goes through each review in the set
    # A new list is stored for each review. Editing the review in place would also
    # change lem_ngramsp / lem_ngramsn, because np.append copies only the references
    # to the per-review token sequences, not the sequences themselves.
    feature_set[i] = _expand_contractions(feature_set[i])

In [None]:
# An algorithmic part-of-speech (PoS) implementation was attempted. However, after hours
# of difficulties with NLTK's pos_tag, a manual method was used instead: going through the
# n-grams to see which nouns occur together. The string literal below holds the code that
# would have been run if no issues with pos_tag had occurred. Because it is a string, it
# is never executed.
#
# NOTE(review): the disabled code references `noun_phrase_regex`, which is not defined in
# the visible code (only the grammar string `regexes` is) - presumably a chunk parser built
# from `regexes` was intended; confirm before reviving this code.
'''
words_pos = []
freq_pos = []
phrases_pos = []
regexes = 'CHUNK: {<NOUN> <NOUN>}'
for i in range(500):
    tokens = nltk.word_tokenize(stemmed_pos[i])
    tag = nltk.pos_tag(tokens, tagset='universal')
    tree = noun_phrase_regex.parse(tag)
    for subtree in tree.subtrees():
        if subtree.label() == 'CHUNK':
            if subtree in words_pos:
                words_pos = np.array(words_pos)
                x = np.where(words_pos == subtree)[0][0]
                freq_pos[x] += 1
                words_pos = words_pos.tolist()
            else:
                words_pos.append(subtree)
                freq_pos = np.append(freq_pos,1)
                freq_pos = freq_pos.tolist()
print("Positive explored...")
for i in range(500):
    tokens = nltk.word_tokenize(stemmed_neg[i])
    tag = nltk.pos_tag(tokens, tagset='universal')
    tree = noun_phrase_regex.parse(tag)
    for subtree in tree.subtrees():
        if subtree.label() == 'CHUNK':
            if subtree in words_pos:
                words_pos = np.array(words_pos)
                x = np.where(words_pos == subtree)[0][0]
                freq_pos[x] += 1
                words_pos = words_pos.tolist()
            else:
                words_pos.append(subtree)
                freq_pos = np.append(freq_pos,1)
                freq_pos = freq_pos.tolist()
for wordy in words_pos:
    leaves = wordy.leaves()
    phrases_pos.append(' '.join([word for word, _ in leaves]))
'''


Code that puts the data in a format that can then be split into training and testing data.

In [None]:
# Number of positive / negative reviews that are turned back into strings.
# Fix: the negative loop previously stopped at 1996, one short of the 1997 negative
# reviews that are loaded and labelled elsewhere in this notebook, so the feature
# arrays and the label array had different lengths.
_N_POS = 2000
_N_NEG = 1997


def _join_reviews(reviews, count):
    """Join the tokens of the first *count* reviews into space-separated strings.

    Args:
        reviews: Sequence holding one token sequence per review.
        count: Maximum number of reviews to convert.

    Returns:
        A 1-D NumPy array with one string per review.
    """
    # Build a list first: np.append would re-copy the whole array per review.
    return np.array([" ".join(str(x) for x in tokens) for tokens in reviews[:count]])


stemposp = _join_reviews(stem_posp, _N_POS)
stemngramsp = _join_reviews(stem_ngramsp, _N_POS)
lemposp = _join_reviews(lem_posp, _N_POS)
lemngramsp = _join_reviews(lem_ngramsp, _N_POS)
wsposp = _join_reviews(ws_posp, _N_POS)
wsngramsp = _join_reviews(ws_ngramsp, _N_POS)

stemposn = _join_reviews(stem_posn, _N_NEG)
stemngramsn = _join_reviews(stem_ngramsn, _N_NEG)
lemposn = _join_reviews(lem_posn, _N_NEG)
lemngramsn = _join_reviews(lem_ngramsn, _N_NEG)
wsposn = _join_reviews(ws_posn, _N_NEG)
wsngramsn = _join_reviews(ws_ngramsn, _N_NEG)

In [None]:
from sklearn.model_selection import train_test_split

# Labels: 1 for each of the 2000 positive reviews, then 0 for each of the 1997 negative ones.
labels = np.append(np.ones(2000),np.zeros(1997))

# Every feature set lists the positive reviews first, then the negative ones,
# which is the same order as the labels.
stem_pos_data = np.append(stemposp,stemposn)
lem_pos_data = np.append(lemposp,lemposn)
ws_pos_data = np.append(wsposp,wsposn)

stem_ngrams_data = np.append(stemngramsp,stemngramsn)
lem_ngrams_data = np.append(lemngramsp,lemngramsn)
ws_ngrams_data = np.append(wsngramsp,wsngramsn)

data = np.append(pos,neg)


def _split(samples):
    """Split one feature set 70/30 against `labels`, always with the same seed."""
    return train_test_split(samples, labels, test_size=0.30, random_state=24)


data_train, data_test, label_train, label_test = _split(data)

(stem_pos_data_train, stem_pos_data_test,
 stem_pos_label_train, stem_pos_label_test) = _split(stem_pos_data)
(lem_pos_data_train, lem_pos_data_test,
 lem_pos_label_train, lem_pos_label_test) = _split(lem_pos_data)
(ws_pos_data_train, ws_pos_data_test,
 ws_pos_label_train, ws_pos_label_test) = _split(ws_pos_data)

(stem_ngrams_data_train, stem_ngrams_data_test,
 stem_ngrams_label_train, stem_ngrams_label_test) = _split(stem_ngrams_data)
(lem_ngrams_data_train, lem_ngrams_data_test,
 lem_ngrams_label_train, lem_ngrams_label_test) = _split(lem_ngrams_data)
(ws_ngrams_data_train, ws_ngrams_data_test,
 ws_ngrams_label_train, ws_ngrams_label_test) = _split(ws_ngrams_data)

Code for Normalisation section:\
These are the two normalisation methods that were made from scratch. The first one being TF-IDF and the second being TF-RF.

In [None]:
class tfidfvectoriser:
    """TF-IDF vectoriser written from scratch.

    Reviews are split on whitespace and tokens are matched case-insensitively.
    TF is a token's count divided by the number of in-vocabulary tokens of the
    review; IDF is log((1 + N) / (1 + df)), where N is the number of reviews and
    df the number of reviews containing the token.

    Changes compared with the first version:
      * The IDF weights are learned in fit_transform and reused by transform.
        They used to be recomputed from whatever batch was being transformed,
        so a single document always came out as all zeros.
      * A review without any in-vocabulary token gives an all-zero row instead
        of a NaN row (which was also counted as containing every token).
      * Vocabulary look-ups use a dict instead of list/array scans.
    """
    # Vocabulary in first-seen order; becomes a per-instance list once fitted.
    dictionary = np.array([])
    # IDF weights learned by fit_transform (None until fitted).
    _idf = None
    # Cached word -> column mapping and the vocabulary object it was built from.
    _index = None
    _index_source = None

    def fit_transform(self, array):
        """Learn vocabulary and IDF from *array* and return its TF-IDF matrix.

        Args:
            array: Array-like of review strings.

        Returns:
            NumPy array of shape (number of reviews, vocabulary size).
        """
        tokenised = np.char.split(array)
        self.create_dict(tokenised) #Fitting part of the fit_transform
        counts = self._count_matrix(tokenised)
        self._idf = self._compute_idf(counts)
        return self._term_frequencies(counts) * self._idf #Transforming part

    def transform(self,array):
        """Return the TF-IDF matrix of *array* using the fitted vocabulary.

        The IDF weights learned by fit_transform are used. If none are
        available for the current vocabulary (fit_transform was never called,
        or the vocabulary changed size since), they are computed from *array*.

        Args:
            array: Array-like of review strings.

        Returns:
            NumPy array of shape (number of reviews, vocabulary size).
        """
        counts = self._count_matrix(np.char.split(array))
        idf = self._idf
        if idf is None or len(idf) != counts.shape[1]:
            idf = self._compute_idf(counts)
        return self._term_frequencies(counts) * idf #Acquires the TF-IDF values

    def count_words(self,array):
        """Return the per-vocabulary-word counts of one tokenised review.

        Args:
            array: Sequence of tokens of a single review.

        Returns:
            Float NumPy array, one count per vocabulary word.
        """
        index = self._word_index()
        freq = np.zeros(len(self.dictionary))
        for word in array:
            column = index.get(word.lower()) #Looks up the lowercase version of the word
            if column is not None:
                freq[column] += 1
        return freq

    def create_dict(self,array):
        """Add every not yet known word (lower-cased) of *array* to the vocabulary.

        Args:
            array: Sequence of tokenised reviews (each a sequence of tokens).
        """
        vocabulary = list(self.dictionary)
        known = set(vocabulary)
        for review in array: #Goes through all reviews
            for word in review: #Goes through all words in the review
                key = word.lower()
                if key not in known:
                    known.add(key)
                    vocabulary.append(key)
        # A new list object is stored, which also invalidates the cached index.
        self.dictionary = vocabulary

    def _word_index(self):
        """Return the word -> column dict, rebuilt whenever the vocabulary changed."""
        stale = (
            self._index is None
            or self._index_source is not self.dictionary
            or len(self._index_source) != len(self.dictionary)
        )
        if stale:
            index = {}
            for column, word in enumerate(self.dictionary):
                # setdefault keeps the first column should a word be listed twice.
                index.setdefault(str(word), column)
            self._index = index
            self._index_source = self.dictionary
        return self._index

    def _count_matrix(self, tokenised):
        """Return the (reviews x vocabulary) matrix of raw token counts."""
        counts = np.zeros((len(tokenised), len(self.dictionary)))
        for row, review in enumerate(tokenised):
            counts[row] = self.count_words(review)
        return counts

    @staticmethod
    def _term_frequencies(counts):
        """Divide every row by its total; rows without any count stay all zero."""
        totals = counts.sum(axis=1, keepdims=True)
        totals[totals == 0] = 1 #Avoids 0/0 = NaN for reviews with no known word
        return counts / totals

    @staticmethod
    def _compute_idf(counts):
        """Return log((1 + N) / (1 + df)) for every vocabulary word."""
        document_frequency = np.count_nonzero(counts, axis=0)
        return np.log((1 + counts.shape[0]) / (1 + document_frequency))

class tfrfvectoriser():
    """Bag-of-words vectoriser weighting term frequency by relevance frequency.

    Attributes:
        dictionary: Lower-cased vocabulary in first-seen order; a term's
            position is its column in every feature vector.
        rf: One relevance-frequency weight per vocabulary term, filled in by
            ``create_rf``.
    """

    # Class-level defaults only: every method rebinds these on the instance
    # instead of mutating them in place, so instances never share state.
    dictionary = np.array([])
    rf = np.array([])

    def fit_transform(self, array,lables):
        """Learn vocabulary and RF weights from ``array``, then vectorise it.

        Args:
            array: Reviews as whitespace-separated strings.
            lables: One label per review; 1 marks positive, 0 negative.

        Returns:
            list with one TF-RF vector (numpy.ndarray) per review.
        """
        self.create_dict(np.char.split(array)) #Fitting part of the fit_transform
        self.create_rf(array,lables) #Acquires RF values of terms based on labels
        return self.transform(array) #Transforming part of the fit_transform

    def transform(self,array):
        """Turn reviews into TF-RF vectors using the fitted vocabulary.

        Args:
            array: Reviews as whitespace-separated strings.

        Returns:
            list with one numpy.ndarray per review. A review without any
            in-vocabulary token yields an all-zero vector (the previous code
            divided by zero there and produced NaN).
        """
        terms = []
        for tokens in np.char.split(array):
            counts = self.count_words(tokens)
            total = counts.sum()
            # Guard the TF normalisation against reviews with no known token.
            tf = counts / total if total > 0 else counts
            terms.append(tf * self.rf)
        return terms

    def count_words(self,array):
        """Count how often each dictionary term occurs in one tokenised review.

        Args:
            array: Sequence of string tokens of a single review.

        Returns:
            numpy.ndarray of floats aligned with ``self.dictionary``; matching
            is case-insensitive and unknown tokens are ignored.
        """
        lookup = self._term_index()
        freq = np.zeros(len(self.dictionary))
        for token in array:
            column = lookup.get(token.lower())
            if column is not None:
                freq[column] += 1
        return freq

    def create_dict(self,array):
        """Extend the vocabulary with every new lower-cased word in ``array``.

        Args:
            array: Sequence of tokenised reviews (each a sequence of strings).

        Existing entries are kept, new words are appended in first-seen order,
        and ``self.dictionary`` is a plain list afterwards.
        """
        words = list(self.dictionary)
        known = set(words)  # O(1) membership instead of scanning the list
        for review in array:
            for token in review:
                word = token.lower()
                if word not in known:
                    known.add(word)
                    words.append(word)
        self.dictionary = words

    def create_rf(self,array,lables):
        """Compute the relevance-frequency weight of every vocabulary term.

        For each term, ``a`` is the number of reviews labelled 1 that contain
        it and ``c`` is the number of reviews labelled 0 that do not contain
        it; the weight is ``log(2 + a / max(1, c))``. A term counts as present
        only when it is one of the review's whitespace-separated tokens
        (case-insensitive) -- the earlier substring test also matched a term
        inside longer words.

        NOTE(review): the published TF-RF definition appears to use the number
        of negative documents that *contain* the term and a base-2 logarithm;
        this keeps the file's existing variant -- confirm which is intended.

        Args:
            array: Reviews as whitespace-separated strings.
            lables: One label per review; reviews whose label is neither 1
                nor 0 do not contribute to either count.
        """
        lookup = self._term_index()
        n_terms = len(self.dictionary)
        positive_with_term = np.zeros(n_terms)  # "a" for every term
        negative_with_term = np.zeros(n_terms)
        negative_total = 0

        for j, tokens in enumerate(np.char.split(array)):
            label = lables[j]
            if label != 1 and label != 0:
                continue
            # Distinct vocabulary columns present in this review.
            present = {
                lookup[word]
                for word in (token.lower() for token in tokens)
                if word in lookup
            }
            columns = np.fromiter(present, dtype=np.intp, count=len(present))
            if label == 1:
                positive_with_term[columns] += 1
            else:
                negative_total += 1
                negative_with_term[columns] += 1

        # c = negative reviews *without* the term, floored at 1 (the max()
        # of the formula) so the division is always defined.
        negative_without_term = np.maximum(1, negative_total - negative_with_term)
        self.rf = np.log(2 + positive_with_term / negative_without_term)

    def _term_index(self):
        """Return a ``{term: column}`` lookup for the current dictionary.

        The lookup is cached and rebuilt only when ``self.dictionary`` has
        been rebound or has changed length.
        """
        vocabulary = self.dictionary
        cached = getattr(self, "_index_cache", None)
        if (cached is not None and cached[0] is vocabulary
                and cached[1] == len(vocabulary)):
            return cached[2]
        lookup = {}
        for column, term in enumerate(vocabulary):
            # setdefault keeps the first column should a term be repeated.
            lookup.setdefault(term, column)
        self._index_cache = (vocabulary, len(vocabulary), lookup)
        return lookup




Code for Feature Selection: \
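Runs every combination of tokenisation (stemmed, lemmatised, whitespace), feature type (n-grams, PoS) and term weighting (TF-IDF, TF-RF) on the evaluation data and prints the accuracy, precision, and recall of each feature set.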

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

def _evaluate_feature_set(title, vectoriser, train_docs, train_labels,
                          eval_docs, eval_labels, fit_with_labels=False):
    """Fit one feature set, score it on the evaluation data and print the metrics.

    Args:
        title: Heading printed above the metrics.
        vectoriser: Fresh vectoriser exposing ``fit_transform`` and ``transform``.
        train_docs: Training reviews handed to ``fit_transform``.
        train_labels: Labels of the training reviews.
        eval_docs: Evaluation reviews handed to ``transform``.
        eval_labels: Labels of the evaluation reviews.
        fit_with_labels: Pass ``train_labels`` to ``fit_transform`` as well
            (needed by the TF-RF vectoriser).

    Returns:
        Tuple ``(train_features, eval_features, model, predictions)``.
    """
    if fit_with_labels:
        train_features = vectoriser.fit_transform(train_docs, train_labels)
    else:
        train_features = vectoriser.fit_transform(train_docs)
    eval_features = vectoriser.transform(eval_docs)

    model = MultinomialNB()
    model.fit(train_features, train_labels)
    predictions = model.predict(eval_features)

    print(title)
    print("Accuracy", accuracy_score(eval_labels, predictions))
    print("Precision", precision_score(eval_labels, predictions))
    print("Recall", recall_score(eval_labels, predictions))
    print("")
    return train_features, eval_features, model, predictions


# The first 33% of the test split serves as the evaluation set for every run.
eval_end = int(len(data_test)*0.33)

# ---- TF-IDF weighting ----
tfvectorizer = tfidfvectoriser()
(stem_ngrams_data_train_tfidf, stem_ngrams_data_test_tfidf,
 stem_ngrams_tf_mnb, stem_ngrams_tf_y) = _evaluate_feature_set(
    "Stem N-grams TF-IDF", tfvectorizer,
    stem_ngrams_data_train, stem_ngrams_label_train,
    stem_ngrams_data_test[0:eval_end], stem_ngrams_label_test[0:eval_end])

tfvectorizer = tfidfvectoriser()
(lem_ngrams_data_train_tfidf, lem_ngrams_data_test_tfidf,
 lem_ngrams_tf_mnb, lem_ngrams_tf_y) = _evaluate_feature_set(
    "Lem N-grams TF-IDF", tfvectorizer,
    lem_ngrams_data_train, lem_ngrams_label_train,
    lem_ngrams_data_test[0:eval_end], lem_ngrams_label_test[0:eval_end])

tfvectorizer = tfidfvectoriser()
(ws_ngrams_data_train_tfidf, ws_ngrams_data_test_tfidf,
 ws_ngrams_tf_mnb, ws_ngrams_tf_y) = _evaluate_feature_set(
    "White Space N-grams TF-IDF", tfvectorizer,
    ws_ngrams_data_train, ws_ngrams_label_train,
    ws_ngrams_data_test[0:eval_end], ws_ngrams_label_test[0:eval_end])

tfvectorizer = tfidfvectoriser()
(stem_pos_data_train_tfidf, stem_pos_data_test_tfidf,
 stem_pos_tf_mnb, stem_pos_tf_y) = _evaluate_feature_set(
    "Stem PoS TF-IDF", tfvectorizer,
    stem_pos_data_train, stem_pos_label_train,
    stem_pos_data_test[0:eval_end], stem_pos_label_test[0:eval_end])

tfvectorizer = tfidfvectoriser()
(lem_pos_data_train_tfidf, lem_pos_data_test_tfidf,
 lem_pos_tf_mnb, lem_pos_tf_y) = _evaluate_feature_set(
    "Lem PoS TF-IDF", tfvectorizer,
    lem_pos_data_train, lem_pos_label_train,
    lem_pos_data_test[0:eval_end], lem_pos_label_test[0:eval_end])

tfvectorizer = tfidfvectoriser()
(ws_pos_data_train_tfidf, ws_pos_data_test_tfidf,
 ws_pos_tf_mnb, ws_pos_tf_y) = _evaluate_feature_set(
    "White Space PoS TF-IDF", tfvectorizer,
    ws_pos_data_train, ws_pos_label_train,
    ws_pos_data_test[0:eval_end], ws_pos_label_test[0:eval_end])


# ---- TF-RF weighting (the vectoriser also needs the training labels) ----
cvectorizer = tfrfvectoriser()
(stem_ngrams_data_train_count, stem_ngrams_data_test_count,
 stem_ngrams_count_mnb, stem_ngrams_count_y) = _evaluate_feature_set(
    "Stem N-grams TF-RF", cvectorizer,
    stem_ngrams_data_train, stem_ngrams_label_train,
    stem_ngrams_data_test[0:eval_end], stem_ngrams_label_test[0:eval_end],
    fit_with_labels=True)

cvectorizer = tfrfvectoriser()
(lem_ngrams_data_train_count, lem_ngrams_data_test_count,
 lem_ngrams_count_mnb, lem_ngrams_count_y) = _evaluate_feature_set(
    "Lem N-grams TF-RF", cvectorizer,
    lem_ngrams_data_train, lem_ngrams_label_train,
    lem_ngrams_data_test[0:eval_end], lem_ngrams_label_test[0:eval_end],
    fit_with_labels=True)

cvectorizer = tfrfvectoriser()
(ws_ngrams_data_train_count, ws_ngrams_data_test_count,
 ws_ngrams_count_mnb, ws_ngrams_count_y) = _evaluate_feature_set(
    "White Space N-grams TF-RF", cvectorizer,
    ws_ngrams_data_train, ws_ngrams_label_train,
    ws_ngrams_data_test[0:eval_end], ws_ngrams_label_test[0:eval_end],
    fit_with_labels=True)

cvectorizer = tfrfvectoriser()
(ws_pos_data_train_count, ws_pos_data_test_count,
 ws_pos_count_mnb, ws_pos_count_y) = _evaluate_feature_set(
    "White Space PoS TF-RF", cvectorizer,
    ws_pos_data_train, ws_pos_label_train,
    ws_pos_data_test[0:eval_end], ws_pos_label_test[0:eval_end],
    fit_with_labels=True)

cvectorizer = tfrfvectoriser()
(stem_pos_data_train_count, stem_pos_data_test_count,
 stem_pos_count_mnb, stem_pos_count_y) = _evaluate_feature_set(
    "Stem PoS TF-RF", cvectorizer,
    stem_pos_data_train, stem_pos_label_train,
    stem_pos_data_test[0:eval_end], stem_pos_label_test[0:eval_end],
    fit_with_labels=True)

cvectorizer = tfrfvectoriser()
(lem_pos_data_train_count, lem_pos_data_test_count,
 lem_pos_count_mnb, lem_pos_count_y) = _evaluate_feature_set(
    "Lem PoS TF-RF", cvectorizer,
    lem_pos_data_train, lem_pos_label_train,
    lem_pos_data_test[0:eval_end], lem_pos_label_test[0:eval_end],
    fit_with_labels=True)


Code for Naive Bayes: \
This is the model that was made from scratch.

In [None]:
class naive_bayes:
    """Two-class multinomial Naive Bayes written from scratch.

    Label 1 is the positive class; every other label is treated as negative.

    Attributes set by ``fit``:
        pos_word_count, neg_word_count: Per-feature sums over each class.
        pos_likelihood, neg_likelihood: Laplace-smoothed feature likelihoods.
        pos_total_count, neg_total_count: Number of (document, feature) cells
            seen for each class, i.e. class size times feature count.
        pos_av, neg_av: The totals above divided by the number of training
            documents. They are proportional to the class frequencies, so
            their logs act as (unnormalised) class priors in ``predict``.
    """

    def fit(self, data, labels):
        """Estimate feature likelihoods and class weights from training data.

        Args:
            data: Rectangular collection of feature vectors, one row per
                document.
            labels: One label per row of ``data``; 1 marks positive.

        Raises:
            ValueError: If ``data`` is empty or not two-dimensional.
        """
        features = np.asarray(data, dtype=float)
        if features.ndim != 2 or features.shape[0] == 0:
            raise ValueError("fit() needs a non-empty 2-D collection of feature vectors")
        n_docs, n_features = features.shape

        # Only the labels belonging to the provided rows are consulted.
        is_positive = np.asarray(labels)[:n_docs] == 1
        n_positive = int(np.count_nonzero(is_positive))

        #Total number of (document, feature) cells in each type of review
        self.pos_total_count = n_positive * n_features
        self.neg_total_count = (n_docs - n_positive) * n_features

        #Total weight of each word in each type of review
        self.pos_word_count = features[is_positive].sum(axis=0)
        self.neg_word_count = features[~is_positive].sum(axis=0)

        #Class weights; proportional to how many documents each class has
        self.pos_av = self.pos_total_count / n_docs
        self.neg_av = self.neg_total_count / n_docs

        #Likelihood of each word affecting the label
        #+1 included for Laplace smoothing (avoids zero likelihoods)
        self.pos_likelihood = (self.pos_word_count + 1) / (np.sum(self.pos_word_count + 1))
        self.neg_likelihood = (self.neg_word_count + 1) / (np.sum(self.neg_word_count + 1))

    def predict(self, data):
        """Label each feature vector as positive (1) or negative (0).

        Args:
            data: Collection of feature vectors laid out like the training
                data.

        Returns:
            numpy.ndarray of floats, 1.0 where the positive score is strictly
            larger than the negative score and 0.0 otherwise (ties are
            negative).
        """
        if len(data) == 0:
            return np.zeros(0)
        features = np.asarray(data, dtype=float)

        # The log-likelihoods do not depend on the document, so take them once.
        log_pos = np.log(self.pos_likelihood)
        log_neg = np.log(self.neg_likelihood)

        #Log-score of every document under each class
        pos = features @ log_pos + np.log(self.pos_av)
        neg = features @ log_neg + np.log(self.neg_av)

        return (pos > neg).astype(float)




Code for Naive Bayes:\
This runs the scikit-learn and the from-scratch Naive Bayes models on the test data of the chosen feature set (lemmatised n-grams with TF-IDF).

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
tfvectorizer = tfidfvectoriser()

# The first 33% of the test split was used for feature selection, so both
# models are compared on the remaining part of it.
final_start = int(len(data_test)*0.33)

lem_ngrams_data_train_tfidf = tfvectorizer.fit_transform(lem_ngrams_data_train)
lem_ngrams_data_test_tfidf = tfvectorizer.transform(lem_ngrams_data_test[final_start:])

# Reference implementation from scikit-learn.
lem_ngrams_tf_mnb = MultinomialNB()
lem_ngrams_tf_mnb.fit(lem_ngrams_data_train_tfidf,lem_ngrams_label_train)
lem_ngrams_tf_y = lem_ngrams_tf_mnb.predict(lem_ngrams_data_test_tfidf)
print("Sklearn")
print("Accuracy", accuracy_score(lem_ngrams_label_test[final_start:], lem_ngrams_tf_y))
print("Precision", precision_score(lem_ngrams_label_test[final_start:], lem_ngrams_tf_y))
print("Recall", recall_score(lem_ngrams_label_test[final_start:], lem_ngrams_tf_y))
print("")

# From-scratch model on exactly the same features. The variable was
# previously misspelt "em_ngrams_tf_mnbs", so the fit() below raised NameError.
lem_ngrams_tf_mnbs = naive_bayes()
lem_ngrams_tf_mnbs.fit(lem_ngrams_data_train_tfidf,lem_ngrams_label_train)
lem_ngrams_tf_ys = lem_ngrams_tf_mnbs.predict(lem_ngrams_data_test_tfidf)
print("Scratch")
print("Accuracy", accuracy_score(lem_ngrams_label_test[final_start:], lem_ngrams_tf_ys))
print("Precision", precision_score(lem_ngrams_label_test[final_start:], lem_ngrams_tf_ys))
print("Recall", recall_score(lem_ngrams_label_test[final_start:], lem_ngrams_tf_ys))
print("")
