In this notebook we preprocess 100k instances of the data and then train the GRU model for 10 epochs. This requires the dataset file (`goodreads_reviews_dedup.json`) to be available on Google Drive, mounted under `/content/drive`.

In [None]:
import dask.dataframe as dd
# Build a dask dataframe over the line-delimited JSON file, asking the reader
# for only the first 100000 records, then materialise it as a pandas DataFrame.
ddf = dd.read_json(
    "/content/drive/MyDrive/goodreads_reviews_dedup.json",
    lines=True,
    nrows=100000,
)
df = ddf.compute()

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
#importing necessary packages
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
# Dropping erroneous values: rows whose rating is 0 are discarded, and only the
# two columns used downstream (review text and rating) are kept.
# The explicit .copy() makes `df` an independent frame, so adding the tokenized
# column in a later cell does not trigger pandas' SettingWithCopyWarning.
df = (
    df.loc[df['rating'] != 0, ['review_text', 'rating']]
    .reset_index(drop=True)
    .copy()
)

In [None]:
#Adapted from this source: https://medium.com/@jozsef.dudas/predicting-wine-review-scores-from-text-using-lstm-bcfdbf7b4c6

# Download necessary resources
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer models from 'punkt_tab' rather than
# 'punkt'; requesting both keeps word_tokenize working on either version.
nltk.download('punkt_tab')
nltk.download('wordnet')

# Built once when this cell runs instead of on every call: preprocess_text is
# applied to each review, so rebuilding these per call is pure overhead.
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


# Text preprocessing function
def preprocess_text(text: str) -> list:
    """Clean a review and return its lemmatized, stop-word-free tokens.

    Steps, in order: lowercase the text, strip every character that is not a
    word character or whitespace, tokenize with NLTK's ``word_tokenize``,
    drop English stop words, and lemmatize each remaining token.

    Args:
        text: Raw review text.

    Returns:
        A list of token strings (not a single string). Non-string input
        (for example a missing value) yields an empty list instead of raising.
    """
    # Missing reviews can arrive as NaN/None; treat them as having no tokens.
    if not isinstance(text, str):
        return []

    # Text cleaning
    cleaned = re.sub(r'[^\w\s]', '', text.lower())

    # Tokenization, stop word removal and lemmatization in a single pass
    return [
        _LEMMATIZER.lemmatize(token)
        for token in word_tokenize(cleaned)
        if token not in _STOP_WORDS
    ]

In [None]:
# Run the preprocessing function element-wise over every review and store the
# resulting token lists in a new column.
df['review_text_tokenized'] = df['review_text'].map(preprocess_text)

In [None]:
#Adapted from this source: https://medium.com/@jozsef.dudas/predicting-wine-review-scores-from-text-using-lstm-bcfdbf7b4c6

# Tokenize the text and convert to sequences.
# The vocabulary is capped at 10000 so that every word index produced by
# texts_to_sequences is below the Embedding layer's input_dim (10000) used by
# the model in this notebook; without the cap, rarer words get indices that
# fall outside the embedding table.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df['review_text_tokenized'])
sequences = tokenizer.texts_to_sequences(df['review_text_tokenized'])

# Pad sequences to a fixed length
max_sequence_length = 100  # Maximum sequence length to pad
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Split the data into training and testing sets.
# A fixed random_state makes the split (and therefore the reported test
# error) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    df['rating'],
    test_size=0.2,
    random_state=42,
)

And now we can train the GRU!

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Embedding
from tensorflow.keras.optimizers import Adam

# Regression network: token ids -> 64-dim embeddings -> GRU with 128 units ->
# a single linear output unit that predicts the rating.
model = Sequential([
    Embedding(input_dim=10000, output_dim=64),
    GRU(128),
    Dense(1, activation='linear'),
])

In [None]:
# Train against mean squared error with Adam; the same quantity is also
# registered as a metric so it is reported alongside the loss.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mean_squared_error',
    metrics=['mean_squared_error'],
)

In [None]:
# Fit for 10 epochs in batches of 64, holding out 20% of the training data
# for validation; the per-epoch metrics are kept in `history`.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/10
[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - loss: 1.9433 - mean_squared_error: 1.9433 - val_loss: 0.6992 - val_mean_squared_error: 0.6992
Epoch 2/10
[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 0.6333 - mean_squared_error: 0.6333 - val_loss: 0.6692 - val_mean_squared_error: 0.6692
Epoch 3/10
[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 0.5582 - mean_squared_error: 0.5582 - val_loss: 0.6533 - val_mean_squared_error: 0.6533
Epoch 4/10
[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 0.5131 - mean_squared_error: 0.5131 - val_loss: 0.6622 - val_mean_squared_error: 0.6622
Epoch 5/10
[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 0.4656 - mean_squared_error: 0.4656 - val_loss: 0.7377 - val_mean_squared_error: 0.7377
Epoch 6/10
[1m889/889[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - l

In [None]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is the
# mean squared error metric.
eval_results = model.evaluate(X_test, y_test, verbose=0)
mse = eval_results[1]  # Get MSE
print(f'Test Mean Squared Error: {mse}')

Test Mean Squared Error: 0.7722707986831665
