<a href="https://colab.research.google.com/github/tcrawley2/ml-final-project/blob/main/PredictingStarRatings_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy import sparse

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

In [2]:
# Load the pre-shuffled review data and preview the first rows.
# NOTE(review): the path is relative, so the CSV must be in the notebook's working directory.
reviews_path = "shuffled_reviews.csv"
df = pd.read_csv(reviews_path)
df.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,B6xZit8IaIHUo1k8KcOC9g,HDHA_fJ0p60lHqVl3-vkug,3-dQbvti2EpSfs9jKcyIjQ,2,0,0,0,"Food was OK, nothing special. Overpriced. Se...",2009-05-24 19:22:29
1,b3S2wv__m1RqVyBu0TTq2A,kmyO80BMdVGG3kJpr_fymA,HCqmx4ENAZ76SAjoalj-MQ,4,1,0,0,Pucketts is a cozy joint in the heart of the c...,2015-06-21 18:50:19
2,7WvaSQQCiDeHYiF2fIp13w,JBs_IamE1RTMMR5lY1vxzg,mCo2uVTTGYrEhRrkQW-CMw,4,0,0,0,"Ordered lamb stew noodles, Chinese hamburger, ...",2018-07-07 22:26:53
3,3wT0t_CFGSvsxnm-0Q9LZw,rO9lGYmDU3WL8KtF_-PsIg,Gscb04_HRI-p_J2ABHVILA,1,1,1,0,Yesterday was my second time eating at this lo...,2016-09-09 17:29:45
4,aAqoAd90bw93mraf0JG5WQ,Tr3T5hfCRDrLrxP48jFWyg,4SQ5uczVt66-GGGeNQRfEA,1,0,0,0,I never write reviews. This is actually my fir...,2016-01-21 22:32:36


In [3]:
# Sanity check: (rows, columns) of the loaded review frame.
df.shape

(60000, 9)

In [4]:
# Map the star ratings onto integer class ids and store them in a new column.
le = LabelEncoder()
le.fit(df['stars'])
df['encoded_labels'] = le.transform(df['stars'])
# Show the original ratings next to their encoded ids.
df.loc[:, ['stars', 'encoded_labels']]

Unnamed: 0,stars,encoded_labels
0,2,1
1,4,3
2,4,3
3,1,0
4,1,0
...,...,...
59995,5,4
59996,4,3
59997,1,0
59998,2,1


In [5]:
# Fit a word-level tokenizer on the review text.
# max_words caps the vocabulary size and is reused as the embedding's input_dim;
# "<OOV>" is the placeholder token for out-of-vocabulary words.
# NOTE(review): the tokenizer is fitted on every row, including the rows that are
# later held out as the test split.
max_words = 500
tokenizer = Tokenizer(
    oov_token="<OOV>",
    num_words=max_words,
)
tokenizer.fit_on_texts(df['text'])

In [6]:
# Convert each review's text into a sequence of integer token ids using the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(df['text'])

In [7]:
# Pad every sequence at the end ('post') up to the length of the longest review,
# so all rows have the same length.
sequence_lengths = [len(seq) for seq in sequences]
max_sequence_length = max(sequence_lengths)
padded_sequences = pad_sequences(
    sequences,
    padding='post',
    maxlen=max_sequence_length,
)

In [8]:
# Hold out 20% of the reviews for the final evaluation; the fixed random_state
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    df['encoded_labels'],
    test_size=0.2,
    random_state=42,
)
# The test labels were originally bound to the misspelled name `y_text`.
# Keep that name as an alias so cells that still reference it continue to run.
y_text = y_test

In [9]:
# Build the classifier: embedding -> flatten -> small dense layer -> softmax
# with one output unit per encoded rating class.
n_classes = len(le.classes_)
model = Sequential([
    Embedding(input_dim=max_words, output_dim=32, input_length=max_sequence_length),
    Flatten(),
    Dense(16, activation='relu'),
    Dense(n_classes, activation='softmax'),
])

In [10]:
# Configure training: Adam optimizer, sparse categorical cross-entropy
# (the targets are integer class ids), and accuracy as the reported metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [11]:
# Train for 5 epochs in mini-batches of 32, reserving 20% of the training rows
# for validation; verbose=2 logs one summary line per epoch.
fit_options = dict(batch_size=32, epochs=5, validation_split=0.2, verbose=2)
model.fit(X_train, y_train, **fit_options)

Epoch 1/5
1200/1200 - 17s - loss: 1.2848 - accuracy: 0.4004 - val_loss: 1.0633 - val_accuracy: 0.5252 - 17s/epoch - 14ms/step
Epoch 2/5
1200/1200 - 15s - loss: 1.0133 - accuracy: 0.5582 - val_loss: 1.0708 - val_accuracy: 0.5241 - 15s/epoch - 13ms/step
Epoch 3/5
1200/1200 - 15s - loss: 0.9280 - accuracy: 0.6000 - val_loss: 1.0809 - val_accuracy: 0.5255 - 15s/epoch - 13ms/step
Epoch 4/5
1200/1200 - 17s - loss: 0.8184 - accuracy: 0.6565 - val_loss: 1.1289 - val_accuracy: 0.5181 - 17s/epoch - 14ms/step
Epoch 5/5
1200/1200 - 16s - loss: 0.6998 - accuracy: 0.7134 - val_loss: 1.2568 - val_accuracy: 0.5046 - 16s/epoch - 14ms/step


<keras.src.callbacks.History at 0x7bb2edb99d80>

Training accuracy improved steadily with every epoch; however, validation accuracy plateaued and then declined while validation loss rose (from about 1.06 to 1.26), which is a sign that the model is overfitting the training data. I'll go ahead and evaluate the model on the test data to see how it performs.

In [12]:
# Score the trained model on the held-out test split and report its accuracy.
test_metrics = model.evaluate(X_test, y_text, verbose=2)
loss, accuracy = test_metrics
print(f'Accuracy: {accuracy}')

375/375 - 1s - loss: 1.2799 - accuracy: 0.5036 - 1s/epoch - 3ms/step
Accuracy: 0.5035833120346069


In [13]:
# Get the model's per-class scores for the test reviews, then keep the index
# of the highest-scoring class in each row as the predicted label.
predictions = model.predict(X_test)
predicted_labels = predictions.argmax(axis=1)



In [18]:
# Print the first five true labels beside the model's predictions.
# Both columns are encoded class ids, not the original star ratings.
print("Reality vs predictions")
first_pairs = list(zip(y_text[:5], predicted_labels[:5]))
for actual, predicted in first_pairs:
  print(f'{actual} ----- {predicted}')

Reality vs predictions
2 ----- 2
0 ----- 0
2 ----- 2
0 ----- 0
0 ----- 1


(12000, 986)