In [80]:
import re
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

# LOAD THE REVIEW DATA
# Each data row is tab-separated; the first four columns are read as
# review text, sentiment rating, product type and helpfulness rating.
reviews, sentiment_ratings, product_types, helpfulness_ratings = [], [], [], []

with open("Compiled_Reviews.txt") as f:
    # The first line of the file is skipped (treated as a header row).
    for line in f.readlines()[1:]:
        fields = line.rstrip().split('\t')
        # Route each of the first four columns to its own parallel list.
        for column, value in zip(
            (reviews, sentiment_ratings, product_types, helpfulness_ratings),
            (fields[0], fields[1], fields[2], fields[3]),
        ):
            column.append(value)

In [82]:
# LOAD IN STOPWORDS
# One stopword per line; surrounding whitespace (including the newline) is stripped.
# The context manager guarantees the file is closed even if reading fails part-way.
with open("data/stopwords.txt", "r") as f:
    stopwords = [line.strip() for line in f]

# Punctuation tokens that are discarded together with the stopwords.
punct = ['#', '"', '""', '%', '$', '&', ')', '(', '+', '*', '-']

In [83]:
# TOKENISE REVIEWS FOR VOCABULARY

# A token is a maximal run of characters that are not a space or one of . ? ! : , ) ( "
# Raw string: the pattern is unchanged, but `\.` is no longer an invalid
# string escape (which newer Python versions warn about).
token_def = re.compile(r'[^ \.?!:,)("]+')
tokenized_sents = [token_def.findall(txt) for txt in reviews]

# Sets give constant-time membership tests; the filter below runs once per token.
stopword_set = set(stopwords)
punct_set = set(punct)

tokens = []
for s in tokenized_sents:
    # Lower-case every token; drop stopwords (compared lower-cased) and punctuation.
    filtered_tokens = [t.lower() for t in s if t.lower() not in stopword_set and t not in punct_set]

    # Add unigrams
    tokens.extend(filtered_tokens)

    # Add bigrams: adjacent filtered tokens joined with an underscore
    tokens.extend(f"{first}_{second}" for first, second in zip(filtered_tokens, filtered_tokens[1:]))

# Final pass over unigrams and bigrams with the same stopword/punctuation filter.
tokens = [t for t in tokens if t not in stopword_set and t not in punct_set]


In [None]:
# GET VOCABULARY - 5000 MOST COMMON WORDS

counts = Counter(tokens)

# Rank token types by descending frequency. sorted() is stable, so types with
# equal counts keep the order in which they were first seen.
ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)

# Keep only the token strings. Unlike list(zip(*ranked))[0], this also works
# when there are no tokens at all (it yields an empty tuple instead of raising).
so = tuple(word for word, _ in ranked)

# get 5000 most common into vocabulary
type_list = so[0:5000]
vocab_list = type_list

In [84]:
# BUILD THE BINARY FEATURE MATRIX
# M[i, j] is 1 when vocabulary entry j (a unigram or an underscore-joined bigram)
# occurs in review i, and 0 otherwise.
M = np.zeros((len(reviews), len(vocab_list)))
print(len(reviews))

# Map each vocabulary entry to its column so every token is resolved with one
# dictionary lookup instead of scanning the whole vocabulary per review.
# NOTE: assumes vocab_list has no duplicate entries (it is built from Counter keys).
vocab_index = {vocab_token: j for j, vocab_token in enumerate(vocab_list)}
stopword_set = set(stopwords)
punct_set = set(punct)

for i, rev in enumerate(reviews):
    if i%1000 == 0:
        print(i)

    # Named rev_tokens (not `tokens`) so the corpus-wide `tokens` list used to
    # build the vocabulary is not overwritten by the last review's tokens.
    rev_tokens = [t.lower() for t in token_def.findall(rev) if t.lower() not in stopword_set and t not in punct_set]

    # Generate bigrams
    bigrams = [f"{first}_{second}" for first, second in zip(rev_tokens, rev_tokens[1:])]

    # Combine unigrams and bigrams
    all_tokens = rev_tokens + bigrams

    # Mark presence (not frequency): each distinct in-vocabulary token sets its column to one.
    for token in set(all_tokens):
        j = vocab_index.get(token)
        if j is not None:
            M[i, j] = 1

In [85]:
# TRAIN / TEST SPLIT (80% / 20%)
# A dedicated, seeded generator makes the split (and therefore the reported
# test metrics) reproducible across kernel restarts, without touching the
# global NumPy random state used elsewhere.
SPLIT_SEED = 0
split_rng = np.random.default_rng(SPLIT_SEED)

n_reviews = len(reviews)
train_ints = split_rng.choice(n_reviews, int(n_reviews*0.8), replace=False).tolist()
# Everything not sampled for training is held out; sorted for a deterministic row order.
test_ints = sorted(set(range(n_reviews)) - set(train_ints))
M_train = M[train_ints]
M_test = M[test_ints]

# for labels, use a vector representation
labels_train = [sentiment_ratings[i] for i in train_ints]
labels_test = [sentiment_ratings[i] for i in test_ints]

In [126]:
# y has to be a vector of integers (0 for negative, 1 for positive)
y = np.array([1 if label == "positive" else 0 for label in labels_train])

In [None]:
# NETWORK SETUP: one hidden layer feeding a single output unit

# binary targets: 1 where the training label is "positive", otherwise 0
y = np.array([int(label == "positive") for label in labels_train])
num_samples = len(y)

# hyperparameters
hidden_size = 3000
n_iters = 2500
lr = 0.01

# one input feature per vocabulary entry
num_features = len(vocab_list)

# parameters, drawn in this order from the global NumPy random state
weights_0_1 = np.random.randn(num_features, hidden_size)  # Shape: (num_features, hidden_size)
weights_1_2 = np.random.randn(hidden_size, 1)  # Shape: (hidden_size, 1)
bias_1 = np.random.randn(hidden_size)  # Shape: (hidden_size,)
bias_2 = np.random.rand(1)  # Shape: (1,) - uniform on [0, 1), unlike the normal draws above

# per-iteration loss history, filled in by the training loop
logistic_loss = []
# previously on iteration 10001

In [None]:
# MODEL
# Full-batch gradient descent for a one-hidden-layer network:
# ReLU hidden layer -> sigmoid output, binary cross-entropy loss, with an
# L2 penalty term added to both weight gradients.
l2_strength = 0.001  # coefficient of the L2 term in the weight gradients
log_every = 100      # print progress every this many iterations
log_floor = 1e-12    # lower bound inside log() so a saturated sigmoid cannot give log(0)

for i in range(n_iters):
    # Throttled progress output: one line per iteration floods the notebook.
    if i % log_every == 0 or i == n_iters - 1:
        print(f"Iteration {i}/{n_iters}")

    # forward pass
    layer_1 = np.maximum(M_train.dot(weights_0_1) + bias_1, 0)
    layer_2 = layer_1.dot(weights_1_2) + bias_2
    # Bound the logit from below so exp() cannot overflow for very negative inputs.
    q = 1/(1+np.exp(-np.maximum(layer_2, -500)))

    # difference between predictions and actual labels, shape (1, num_samples)
    diff = (q.T - y)

    # Binary cross-entropy. Probabilities are floored before log(): without this,
    # q == 0 or q == 1 produces 0 * -inf = nan and the recorded loss becomes NaN.
    loss = -np.mean(y * np.log(np.maximum(q.T, log_floor)) + (1 - y) * np.log(np.maximum(1 - q.T, log_floor)))
    logistic_loss.append(loss)

    # for hidden layer weights
    grad_weights_1_2 = layer_1.T.dot(diff.T) / num_samples + (l2_strength*weights_1_2)
    grad_bias_2 = np.sum(diff) / num_samples

    # for input layer weights
    layer_1_error = diff.T.dot(weights_1_2.T) # errors at hidden nodes
    layer_1_delta = layer_1_error * (layer_1 > 0) # multiply by relu derivative
    grad_weights_0_1 = M_train.T.dot(layer_1_delta) / num_samples + (l2_strength*weights_0_1)
    grad_bias_1 = np.sum(layer_1_delta, axis=0) / num_samples

    # update weights
    weights_0_1 -= lr * grad_weights_0_1
    weights_1_2 -= lr * grad_weights_1_2
    bias_1 -= lr * grad_bias_1
    bias_2 -= lr * grad_bias_2


# The recorded losses are NumPy floats, which have no .get() method (that is a
# CuPy array API); float() converts each entry to a plain Python float.
logistic_loss_np = np.array([float(value) for value in logistic_loss])

# Plot the loss (the first recorded value is skipped). The x-axis is derived
# from the number of recorded losses so it still matches if this cell is run
# more than once and the history is longer than n_iters.
plt.plot(range(1, len(logistic_loss_np)), logistic_loss_np[1:])
plt.xlabel("Number of Iterations")
plt.ylabel("Loss")
plt.title("Logistic Loss Over Iterations")
plt.show()



In [None]:
# EVALUATE ON TEST SET
# Forward pass with the trained parameters; q holds the predicted probability
# of the "positive" class, one row per test review.
layer_1 = np.maximum(M_test.dot(weights_0_1) + bias_1, 0)
layer_2 = layer_1.dot(weights_1_2) + bias_2
# Bound the logit from below so exp() cannot overflow for very negative inputs.
q = 1/(1+np.exp(-np.maximum(layer_2, -500)))

# q is two-dimensional (one column), so iterating it directly yields 1-element
# arrays; int() on those is deprecated in recent NumPy. ravel() yields scalars.
y_test_pred = [int(prob > 0.5) for prob in q.ravel()]
y_test=[int(l == "positive") for l in labels_test]


# EVALUATION METRICS
# accuracy
acc_test=[int(yp == yt) for yp, yt in zip(y_test_pred, y_test)]
print(f'accuracy: {sum(acc_test)/len(acc_test)}')

# precision and recall
labels_test_pred=["positive" if s == 1 else "negative" for s in y_test_pred]
# Confusion-matrix counts, pairing each predicted label with the true label.
true_positives=sum(int(yp == "positive" and yt == "positive") for yp, yt in zip(labels_test_pred, labels_test))
false_negatives=sum(int(yp == "negative" and yt == "positive") for yp, yt in zip(labels_test_pred, labels_test))
false_positives=sum(int(yp == "positive" and yt == "negative") for yp, yt in zip(labels_test_pred, labels_test))
true_negatives=sum(int(yp == "negative" and yt == "negative") for yp, yt in zip(labels_test_pred, labels_test))

# Report 0.0 instead of raising ZeroDivisionError when nothing was predicted
# positive (precision) or the test set contains no positive reviews (recall).
predicted_positives = true_positives + false_positives
actual_positives = true_positives + false_negatives
precision = true_positives/predicted_positives if predicted_positives else 0.0
recall = true_positives/actual_positives if actual_positives else 0.0
print(f'precision: {precision}')
print(f'recall: {recall}')


In [None]:
# EXAMINING WEIGHTS
# Multiply the two weight matrices to get a single score per vocabulary entry.
# This product ignores the ReLU and the biases, so treat it as a rough
# linear indication of each entry's influence rather than an exact attribution.
all_weights = np.dot(weights_0_1, weights_1_2).flatten()

# Vocabulary indices ordered from the lowest score to the highest.
ascending = np.argsort(all_weights)
top_positive = ascending[::-1][0:20]
top_negative = ascending[0:20]

print("most impactful words for a positive review:")
print([vocab_list[x] for x in top_positive])

print("\nmost impactful words for a negative review:")
print([vocab_list[x] for x in top_negative])
