In [16]:
# IMPORTS AND LOADING DATA
import numpy as np
import re
from collections import Counter
import matplotlib.pyplot as plt

# Seed NumPy's global RNG; it is used later for the train/test split and weight initialisation.
np.random.seed(10)

# load in data: one parallel list per column of the tab-separated reviews file
reviews=[]
sentiment_ratings=[]
product_types=[]
helpfulness_ratings=[]

with open("Compiled_Reviews.txt") as f:
    # The first line is not data (presumably a header row); the default argument
    # keeps an empty file from raising StopIteration.
    next(f, None)
    for line in f:
        # Skip blank lines (e.g. a trailing newline at end of file) rather than
        # failing with IndexError on fields[1].
        if not line.strip():
            continue
        fields = line.rstrip().split('\t')
        reviews.append(fields[0])
        sentiment_ratings.append(fields[1])
        product_types.append(fields[2])
        helpfulness_ratings.append(fields[3])


In [None]:
# BAR CHART OF POSITIVES V NEGATIVES
# Encode every label as 1 when it is exactly 'positive' and 0 otherwise.
coded_sentiment_ratings = [int(label == 'positive') for label in sentiment_ratings]
total_reviews = len(coded_sentiment_ratings)
num_positive = coded_sentiment_ratings.count(1)
num_negative = total_reviews - num_positive
categories = ['Positives', 'Negatives']
counts = [num_positive, num_negative]

# One bar per class
plt.bar(categories, counts)
plt.title('Number of Positives vs. Negatives')
plt.xlabel('Category')
plt.ylabel('Count')
plt.show()

# Class balance as percentages of all reviews
positive_share = num_positive/total_reviews
negative_share = num_negative/total_reviews
print(f'{positive_share*100}% positive')
print(f'{negative_share*100}% negative')


In [17]:
# HELPER FUNCTION FOR REMOVING STEMS
def stem_word(word, min_stem_len=1):
    """Strip one common English suffix from ``word`` (a deliberately crude stemmer).

    Suffixes are tried longest first, so "boxes" loses "es" (-> "box") rather
    than just "s". At most one suffix is removed.

    Args:
        word: The token to stem.
        min_stem_len: Minimum number of characters that must remain after the
            suffix is removed. With the default of 1 a word that *is* a suffix
            (e.g. "s", "ed", "ing") is returned unchanged instead of becoming
            the empty string. Pass 0 to allow empty stems.

    Returns:
        The word without its suffix, or the word unchanged when no suffix
        matches or the remaining stem would be shorter than ``min_stem_len``.
    """
    # Already ordered longest-first; a literal tuple avoids rebuilding and
    # re-sorting the list on every call.
    suffixes = ('ment', 'tion', 'ing', 'est', 'ed', 'ly', 'es', 'er', 's')

    for suffix in suffixes:
        if word.endswith(suffix):
            stem = word[:-len(suffix)]
            # Only the longest matching suffix is considered; if removing it
            # leaves too little, keep the original word.
            return stem if len(stem) >= min_stem_len else word
    return word

# LOAD IN STOPWORDS
# One stop word per line, with surrounding whitespace removed.
# The context manager closes the file even if reading raises.
with open("data/stopwords.txt", "r") as f:
    stopwords = [line.strip() for line in f]

# Punctuation/symbol tokens that are filtered out together with the stop words.
punct = ['#', '"', '""', '%', '$', '&', ')', '(', '+', '*', '-']


In [18]:
# LOAD IN STOPWORDS
# NOTE: this cell repeats the stop-word/punctuation setup of the preceding cell;
# running it again simply rebuilds the same two lists.
# One stop word per line, with surrounding whitespace removed.
# The context manager closes the file even if reading raises.
with open("data/stopwords.txt", "r") as f:
    stopwords = [line.strip() for line in f]

# Punctuation/symbol tokens that are filtered out together with the stop words.
punct = ['#', '"', '""', '%', '$', '&', ')', '(', '+', '*', '-']

In [19]:
# VOCABULARY - BASE MODEL

# tokenise reviews: a token is any run of characters other than
# space . ? ! : , ) ( "
# Raw string: inside [...] the dot is literal, so no backslash is needed
# (a non-raw "\." is an invalid string escape).
token_def = re.compile(r'[^ .?!:,)("]+')
tokenized_sents = [token_def.findall(txt) for txt in reviews]
tokens=[]
for s in tokenized_sents:
    # FOR BASE MODEL: keep tokens exactly as found (case-sensitive, unfiltered)
    tokens.extend(s)

# use a counter to count the tokens
counts=Counter(tokens)
# all token types, most frequent first (ties keep first-seen order);
# yields an empty tuple, not an IndexError, when there are no tokens
so=tuple(tok for tok, _ in counts.most_common())

# get 5000 most common into vocabulary
type_list=so[0:5000]
vocab_list = type_list


In [21]:
# VOCABULARY - DOWNLOADED VOCAB
# Every line of each file is taken verbatim (whitespace-stripped) as one entry.
# NOTE(review): any header, comment or blank lines in these files would become
# vocabulary entries too — confirm the files contain only words.

# ENGLISH DICTIONARY WORDS
with open("data/popular.txt", "r") as f:
    tokens = [line.strip() for line in f]
vocab_list = tokens

# POSITIVE AND NEGATIVE SENTIMENT WORDS
with open("data/positive-words.txt", "r") as f:
    positive = [line.strip() for line in f]

with open("data/negative-words.txt", "r") as f:
    negative = [line.strip() for line in f]

# NOTE: this replaces the dictionary vocabulary assigned above, so after the
# cell runs vocab_list holds only the sentiment words. Comment out one of the
# two sections to choose which downloaded vocabulary is used.
tokens = positive + negative
vocab_list = tokens

In [None]:
# VOCABULARY - REMOVED STOPWORDS

# tokenise reviews: a token is any run of characters other than
# space . ? ! : , ) ( "
# Raw string: inside [...] the dot is literal, so no backslash is needed
# (a non-raw "\." is an invalid string escape).
token_def = re.compile(r'[^ .?!:,)("]+')
tokenized_sents = [token_def.findall(txt) for txt in reviews]

# Set lookup is constant-time per token; scanning the stop-word list for every
# token of every review is needlessly slow.
stopword_set = set(stopwords)

tokens=[]
for s in tokenized_sents:
    # lowercase each token; drop stop words (compared in lowercase) and punctuation tokens
    filtered_tokens = [t.lower() for t in s if t.lower() not in stopword_set and t not in punct]
    tokens.extend(filtered_tokens)


# use a counter to count the tokens
counts=Counter(tokens)
# all token types, most frequent first (ties keep first-seen order);
# yields an empty tuple, not an IndexError, when there are no tokens
so=tuple(tok for tok, _ in counts.most_common())

# get 5000 most common into vocabulary
type_list=so[0:5000]
vocab_list = type_list


In [None]:
# VOCABULARY - STEM WORDS

# tokenise reviews: a token is any run of characters other than
# space . ? ! : , ) ( "
# Raw string: inside [...] the dot is literal, so no backslash is needed
# (a non-raw "\." is an invalid string escape).
token_def = re.compile(r'[^ .?!:,)("]+')
tokenized_sents = [token_def.findall(txt) for txt in reviews]
tokens=[]
for s in tokenized_sents:
    tokens.extend(s)


# FOR REMOVING STEMS
# Drop stop words and punctuation tokens (case-sensitive comparison), then stem.
stopword_set = set(stopwords)
pre_stem_tokens = [stem_word(t) for t in tokens if t not in stopword_set and t not in punct]
# Keep EVERY occurrence: de-duplicating before counting would make every count 1
# and turn the "5000 most common" into an arbitrary subset of the stems.
# Empty stems (a token that consisted only of a suffix) carry no information
# and are dropped.
tokens = [stem for stem in pre_stem_tokens if stem]

# use a counter to count the tokens
counts=Counter(tokens)
# all stems, most frequent first (ties keep first-seen order);
# yields an empty tuple, not an IndexError, when there are no tokens
so=tuple(tok for tok, _ in counts.most_common())

# get 5000 most common into vocabulary
type_list=so[0:5000]
vocab_list = type_list


In [None]:
# VOCABULARY - BIGRAMS

# tokenise reviews: a token is any run of characters other than
# space . ? ! : , ) ( "
# Raw string: inside [...] the dot is literal, so no backslash is needed
# (a non-raw "\." is an invalid string escape).
token_def = re.compile(r'[^ .?!:,)("]+')
tokenized_sents = [token_def.findall(txt) for txt in reviews]
tokens=[]
for s in tokenized_sents:
    # unigrams of this review
    tokens.extend(s)
    # bigrams of adjacent tokens within this review, joined as "first_second"
    # (built from s, the current review — not from the running tokens list)
    tokens.extend(f"{first}_{second}" for first, second in zip(s, s[1:]))

# use a counter to count the tokens
counts=Counter(tokens)
# all unigrams/bigrams, most frequent first (ties keep first-seen order);
# yields an empty tuple, not an IndexError, when there are no tokens
so=tuple(tok for tok, _ in counts.most_common())

# get 5000 most common into vocabulary
type_list=so[0:5000]
vocab_list = type_list


In [None]:
# CREATE AN EMBEDDING MATRIX
# # rerun this if you change the vocab
# M has one row per review and one column per vocabulary entry;
# M[i, j] is 1 when vocabulary entry j occurs in review i, otherwise 0.
M = np.zeros((len(reviews), len(vocab_list)))
print(len(reviews))
for i, rev in enumerate(reviews):
    if i%1000 == 0:
        print(i)

    # Tokenisation variants: the active one should match how vocab_list was built.

    # FOR BASE MODEL
    tokens = token_def.findall(rev)

    # FOR LOWERCASING AND REMOVING STOPWORDS
    # tokens = [t.lower() for t in token_def.findall(rev) if t.lower() not in stopwords and t not in punct]

    # FOR BIGRAMS (the bigrams must end up in `tokens`, which is what gets matched below)
    # tokens = token_def.findall(rev)
    # bigrams = [f"{tokens[j]}_{tokens[j+1]}" for j in range(len(tokens) - 1)]
    # tokens = tokens + bigrams

    # FOR REMOVING STEMS (applied on top of the tokens produced above)
    # tokens = [stem_word(token) for token in tokens]

    # A set gives constant-time membership tests; scanning the token list once
    # per vocabulary entry is needlessly slow.
    token_set = set(tokens)

    # iterate over vocab
    for j, vocab_token in enumerate(vocab_list):
        # if the current word j occurs in the current review i then set the matrix element at i,j to be one. Otherwise leave as zero.
        if vocab_token in token_set:

            # FOR ONE HOT
            M[i,j] = 1

            # FOR BAG OF WORDS (number of occurrences rather than presence)
            # M[i, j] = tokens.count(vocab_token)



# Draw 80% of the review indices (without replacement) for training;
# every index that was not drawn goes to the test set.
n_reviews = len(reviews)
n_train = int(n_reviews*0.8)
train_ints = np.random.choice(n_reviews, n_train, replace=False)
held_out = set(range(0, n_reviews)).difference(set(train_ints))
test_ints = list(held_out)

# Rows of the feature matrix for each partition
M_train = M[train_ints, :]
M_test = M[test_ints, :]

# for labels, use a vector representation (same order as the matrix rows)
labels_train = [sentiment_ratings[idx] for idx in train_ints]
labels_test = [sentiment_ratings[idx] for idx in test_ints]


In [None]:
# BASE MODEL
# Logistic regression fitted with full-batch gradient descent on M_train.
y = np.array([int(l == "positive") for l in labels_train])  # 1 = positive, 0 = otherwise
num_features=len(vocab_list)
weights = np.random.rand(num_features)
bias=np.random.rand(1)
n_iters = 6000
lr=0.225
eps=0.00001  # keeps log2 finite when a predicted probability is exactly 0 or 1
logistic_loss=[]
num_samples=len(y)

for i in range(n_iters):
    if i % 100 == 0: # for logging progress
        print(i)

    # forward pass: weighted input, then sigmoid -> predicted P(positive)
    z = M_train.dot(weights) + bias
    q = 1/(1+np.exp(-z))

    # cross-entropy summed over all training rows, using log base 2
    loss = -np.sum(y*np.log2(q+eps) + (1-y)*np.log2(1-q+eps))
    logistic_loss.append(loss)

    # BASE MODEL: gradients averaged over the training rows
    error = q - y
    dw = error.dot(M_train) / num_samples

    # L2 REGULARISATION
    # dw = (error.dot(M_train) / num_samples) + (0.001*weights)

    db = np.sum(error) / num_samples

    # gradient-descent step
    weights = weights - lr*dw
    bias = bias - lr*db


# the first recorded loss (random initial weights) is left out of the plot
plt.plot(range(1,n_iters),logistic_loss[1:])
plt.xlabel("number of epochs")
plt.ylabel("loss")

In [None]:
# MODEL WITH L2 REGULARISATION
# Logistic regression fitted with full-batch gradient descent on M_train,
# with an L2 penalty term added to the weight gradient (the bias is not penalised).
y = np.array([int(l == "positive") for l in labels_train])  # 1 = positive, 0 = otherwise
num_features=len(vocab_list)
weights = np.random.rand(num_features)
bias=np.random.rand(1)
n_iters = 6000
lr=0.225
l2_lambda=0.001  # strength of the L2 penalty in the weight gradient
eps=0.00001  # keeps log2 finite when a predicted probability is exactly 0 or 1
logistic_loss=[]
num_samples=len(y)

for i in range(n_iters):
    if i % 100 == 0: # for logging progress
        print(i)

    # forward pass: weighted input, then sigmoid -> predicted P(positive)
    z = M_train.dot(weights) + bias
    q = 1/(1+np.exp(-z))

    # cross-entropy summed over all training rows, using log base 2
    # (data term only: the L2 penalty itself is not added to the logged loss)
    loss = -np.sum(y*np.log2(q+eps) + (1-y)*np.log2(1-q+eps))
    logistic_loss.append(loss)

    # L2 REGULARISATION: averaged data gradient plus the penalty gradient.
    # This is the line that distinguishes this cell from the base model.
    error = q - y
    dw = (error.dot(M_train) / num_samples) + (l2_lambda*weights)

    db = np.sum(error) / num_samples

    # gradient-descent step
    weights = weights - lr*dw
    bias = bias - lr*db


# the first recorded loss (random initial weights) is left out of the plot
plt.plot(range(1,n_iters),logistic_loss[1:])
plt.xlabel("number of epochs")
plt.ylabel("loss")

In [None]:
# MODEL WITH BATCH TRAINING
# Logistic regression fitted with mini-batch gradient descent: the weights are
# updated once per batch, and every batch is visited once per epoch.

k = 443  # number of batches per epoch
# Create array of all indices in training data
a=np.arange(M_train.shape[0])
# randomly shuffle indices in place (in case of classes being unequally distributed across positions in data)
np.random.shuffle(a)
# Split indices into k batches. array_split tolerates a row count that is not a
# multiple of k (np.split raises in that case); empty batches, which only occur
# when there are fewer rows than k, are discarded.
batches=[batch for batch in np.array_split(a, k) if len(batch) > 0]

y = np.array([int(l == "positive") for l in labels_train])  # 1 = positive, 0 = otherwise
num_features=len(vocab_list)
weights = np.random.rand(num_features)
bias=np.random.rand(1)
n_iters = 6000
lr=0.225
eps=0.00001  # keeps log2 finite when a predicted probability is exactly 0 or 1
logistic_loss=[]
num_samples=len(y)

for i in range(n_iters):
    if i % 100 == 0: # for logging progress
        print(i)

    # loss accumulated over all batches of this epoch
    loss = 0.0
    for batch in batches:
        this_batch_M_train = M_train[batch]
        this_batch_y_train = y[batch]
        batch_size = len(this_batch_y_train)

        # forward pass: weighted input, then sigmoid -> predicted P(positive)
        z = this_batch_M_train.dot(weights) + bias
        q = 1/(1+np.exp(-z))

        # calculate loss: add this batch's summed log2 cross-entropy to the
        # epoch total (assigning instead of adding would keep only the last batch)
        loss += -np.sum(this_batch_y_train*np.log2(q+eps) + (1-this_batch_y_train)*np.log2(1-q+eps))

        # calculate gradients, averaged over the batch
        error = q - this_batch_y_train
        dw = error.dot(this_batch_M_train) / batch_size
        db = np.sum(error) / batch_size

        # update weights
        weights = weights - lr*dw
        bias = bias - lr*db
    logistic_loss.append(loss)


# the first recorded epoch is left out of the plot
plt.plot(range(1,n_iters),logistic_loss[1:])
plt.xlabel("number of epochs")
plt.ylabel("loss")

In [None]:
# EVALUATE ON TEST SET
def _safe_ratio(numerator, denominator):
    """Return numerator/denominator, or 0.0 when the denominator is zero."""
    return numerator/denominator if denominator else 0.0


z = M_test.dot(weights)+bias # weighted inputs
q = 1/(1+np.exp(-z)) # sigmoided input

# predict "positive" (1) when the probability is strictly above 0.5
y_test_pred = [int(prob > 0.5) for prob in q]
y_test=[int(l == "positive") for l in labels_test]


# EVALUATION METRICS
# accuracy: share of test reviews whose predicted class equals the true class
acc_test=[int(yp == yt) for yp, yt in zip(y_test_pred, y_test)]
print(f'accuracy: {_safe_ratio(sum(acc_test), len(acc_test))}')

# precision and recall, with "positive" as the class of interest
labels_test_pred=["positive" if s == 1 else "negative" for s in y_test_pred]
pred_true_pairs = list(zip(labels_test_pred, labels_test))
true_positives=sum(int(yp == "positive" and yt == "positive") for yp, yt in pred_true_pairs)
false_negatives=sum(int(yp == "negative" and yt == "positive") for yp, yt in pred_true_pairs)
false_positives=sum(int(yp == "positive" and yt == "negative") for yp, yt in pred_true_pairs)
true_negatives=sum(int(yp == "negative" and yt == "negative") for yp, yt in pred_true_pairs)

# Reported as 0.0 instead of raising ZeroDivisionError when nothing was
# predicted positive (precision) or no test review is positive (recall).
precision = _safe_ratio(true_positives, true_positives + false_positives)
recall = _safe_ratio(true_positives, true_positives + false_negatives)
print(f'precision: {precision}')
print(f'recall: {recall}')


In [None]:
# EXAMINING WEIGHTS
# Vocabulary indices ordered by learned weight: most negative first, and the
# same ordering reversed so the most positive come first.
ascending_idx = np.argsort(weights)
descending_idx = ascending_idx[::-1]

# Top 40 vocabulary entries by largest weight, printed in two rows of 20
print("most impactful words for a positive review:")
print([vocab_list[x] for x in descending_idx[:20]])
print([vocab_list[x] for x in descending_idx[20:40]])

# Top 40 vocabulary entries by smallest (most negative) weight
print("\nmost impactful words for a negative review:")
print([vocab_list[x] for x in ascending_idx[:20]])
print([vocab_list[x] for x in ascending_idx[20:40]])