In this notebook we load the data, which requires the dataset to be in Google Drive as explained in the README. We then clean and preprocess the data, tokenize it, and embed it with DistilBERT. The prepared dataset is saved as a pickle file (`eng_df.pkl`). If you are running on Colab, download that file from the session's file browser once the notebook has finished.

## Load Data

In [None]:
import pandas as pd
import json

In [None]:
# Mount Google Drive at '/content/drive' so the dataset stored there can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Pre-process the JSON-lines file: keep only the lines that parse as valid JSON,
# up to `max_lines` reviews, then load them into a DataFrame.
from io import StringIO

file_path = '/content/drive/My Drive/goodreads_reviews_dedup.json'
#file_path = 'goodreads_reviews_dedup.json'

removed_lines_count = 0  # lines skipped because they are not valid JSON
valid_lines = []
max_lines = 100000  # cap on the number of valid reviews to load

# Explicit encoding so reading does not depend on the platform's default locale.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            continue  # ignore blank lines
        try:
            json.loads(line)  # validation only; the parsed value is discarded
        except json.JSONDecodeError:
            removed_lines_count += 1
            continue
        valid_lines.append(line)
        if len(valid_lines) >= max_lines:  # stop once `max_lines` valid reviews are collected
            break

cleaned_data = ''.join(valid_lines)
# Wrap the text in StringIO: passing a literal JSON string to read_json is deprecated.
df = pd.read_json(StringIO(cleaned_data), lines=True)

  df = pd.read_json(cleaned_data, lines=True)


In [None]:
# Preview the first rows of the loaded reviews.
df.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,24375664,5cd416f3efc3f944fce4ce2db2290d5e,5,Mind blowingly cool. Best science fiction I've...,Fri Aug 25 13:55:02 -0700 2017,Mon Oct 09 08:55:59 -0700 2017,Sat Oct 07 00:00:00 -0700 2017,Sat Aug 26 00:00:00 -0700 2017,16,0
1,8842281e1d1347389f2ab93d60773d4d,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,Sun Jul 30 07:44:10 -0700 2017,Wed Aug 30 00:00:26 -0700 2017,Sat Aug 26 12:05:52 -0700 2017,Tue Aug 15 13:23:18 -0700 2017,28,1
2,8842281e1d1347389f2ab93d60773d4d,6392944,5e212a62bced17b4dbe41150e5bb9037,3,I haven't read a fun mystery book in a while a...,Mon Jul 24 02:48:17 -0700 2017,Sun Jul 30 09:28:03 -0700 2017,Tue Jul 25 00:00:00 -0700 2017,Mon Jul 24 00:00:00 -0700 2017,6,0
3,8842281e1d1347389f2ab93d60773d4d,22078596,fdd13cad0695656be99828cd75d6eb73,4,"Fun, fast paced, and disturbing tale of murder...",Mon Jul 24 02:33:09 -0700 2017,Sun Jul 30 10:23:54 -0700 2017,Sun Jul 30 15:42:05 -0700 2017,Tue Jul 25 00:00:00 -0700 2017,22,4
4,8842281e1d1347389f2ab93d60773d4d,6644782,bd0df91c9d918c0e433b9ab3a9a5c451,4,A fun book that gives you a sense of living in...,Mon Jul 24 02:28:14 -0700 2017,Thu Aug 24 00:07:20 -0700 2017,Sat Aug 05 00:00:00 -0700 2017,Sun Jul 30 00:00:00 -0700 2017,8,0


In [None]:
# Number of lines that failed JSON parsing and were skipped while loading.
print(removed_lines_count)

0


In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

(100000, 11)

## Pre-Processing

In [None]:
# Data cleaning: drop rows whose rating is 0 and keep only the two columns used downstream.
# Written as a single chain without in-place mutation; `reset_index` returns a fresh
# DataFrame (not a slice of the raw one), so adding columns to `df` later does not
# trigger SettingWithCopyWarning.
df = (
    df.loc[df['rating'] != 0, ['review_text', 'rating']]
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,review_text,rating
0,Mind blowingly cool. Best science fiction I've...,5
1,This is a special book. It started slow for ab...,5
2,I haven't read a fun mystery book in a while a...,3
3,"Fun, fast paced, and disturbing tale of murder...",4
4,A fun book that gives you a sense of living in...,4


In [None]:
# Count the missing values in each column of the cleaned frame.
null_val = df.isna().sum()
print(null_val)

review_text    0
rating         0
dtype: int64


In [None]:
# Shape after removing zero ratings and the unused columns.
df.shape

(96804, 2)

In [None]:
# Bind eng_df to the same object as df (an alias, not a copy).
# eng_df is reassigned to the English-only subset further down once the language filter has run.
# NOTE(review): if the language-filter cells are skipped, later column assignments on eng_df also modify df.
eng_df = df

In [None]:
!pip install langdetect

[0m

In [None]:
from langdetect import DetectorFactory, detect

# langdetect is non-deterministic by default; fixing the seed before the first
# detection makes repeated runs classify the same text the same way.
DetectorFactory.seed = 0

# Function to detect language
def is_english(text: str) -> bool:
    """Return True when langdetect classifies ``text`` as English.

    Args:
        text: The review text to classify.

    Returns:
        True if the detected language code is ``'en'``. False for any other
        language, for non-string or blank input, and when detection fails.
    """
    # Non-string values (e.g. NaN) and blank strings cannot be classified.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Best-effort filter: a review that cannot be classified is treated as non-English.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return False

In [None]:
# Flag each review with whether its text is detected as English (stored in a helper column).
df['is_english'] = df['review_text'].map(is_english)

In [None]:
# Select the rows flagged as English and leave the helper flag column behind;
# `drop` returns a new frame, so eng_df is independent of df.
eng_df = df.loc[df['is_english']].drop(columns=['is_english'])

# Show the first few English reviews
print(eng_df.head())

                                         review_text  rating
0  Mind blowingly cool. Best science fiction I've...       5
1  This is a special book. It started slow for ab...       5
2  I haven't read a fun mystery book in a while a...       3
3  Fun, fast paced, and disturbing tale of murder...       4
4  A fun book that gives you a sense of living in...       4


In [None]:
# (rows, columns) of the English-only frame.
eng_df.shape

(936, 2)

## Class Balancing

In [None]:
# Class balancing: inspect how many reviews fall under each rating.
# NOTE(review): this counts `df`, not the English-only `eng_df` — confirm which frame is intended.
current_bal = df['rating'].value_counts()
print(current_bal)

rating
4    3608
5    3047
3    2033
2     680
1     249
Name: count, dtype: int64


In [None]:
# Most ratings are 4 or 5, so every rating class is resampled to the same size.
# Classes with fewer than `rating_size` rows are drawn with replacement (over-sampling);
# larger classes are drawn without replacement (under-sampling).
# NOTE(review): this resamples `df`, not `eng_df`, and `df_bal` is not read by the BERT section — confirm intent.
rating_size = 100000
per_rating_samples = []
for rating, count in df['rating'].value_counts().items():
    rows_for_rating = df[df['rating'] == rating]
    with_replacement = count < rating_size
    per_rating_samples.append(
        rows_for_rating.sample(rating_size, replace=with_replacement, random_state=42)
    )
df_bal = pd.concat(per_rating_samples)

In [None]:
# Shuffle the balanced frame with a fixed seed, renumber the rows, then report the new class counts and shape.
shuffled_rows = df_bal.sample(frac=1, random_state=42)
df_bal = shuffled_rows.reset_index(drop=True)

new_bal = df_bal['rating'].value_counts()
print(new_bal)
print(df_bal.shape)

rating
1    100000
3    100000
2    100000
4    100000
5    100000
Name: count, dtype: int64
(500000, 2)


In [None]:
# Peek at the first ten rows of the balanced, shuffled frame.
df_bal.head(10)

Unnamed: 0,review_text,rating
0,"Firstly, I stopped reading on page 76. This is...",1
1,Can't really say I've enjoyed this book much -...,3
2,This one simply was just not for me. The writi...,2
3,I read this short story because I thought I ha...,3
4,"I tried so hard to finish this, because I love...",1
5,I'm actually happy this book is the last in th...,3
6,I wanted to like this book. I really did. I ha...,1
7,Biographical verse novel about sculptor Mary E...,2
8,Mr. Nesbo takes a vacation \n I have read all ...,2
9,This is not what I expected it to be. I didn't...,2


In [None]:
# Optional sub-sampling, currently disabled: uncomment one of the two lines below
# to replace df with the first 50,000 rows of the balanced or the unbalanced frame.
#df = df_bal.head(50000)
#df = df.head(50000)
df.shape

(9617, 2)

## BERT

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import time
import pickle
import random
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertTokenizer, DistilBertModel

In [None]:
# Load the tokenizer and the encoder from the same pretrained checkpoint so their vocabularies match.
pretrained_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_name)
model = DistilBertModel.from_pretrained(pretrained_name)

In [None]:
def tokenize_text(text, max_length=None):
    """Tokenize a single review into fixed-length DistilBERT inputs.

    Uses the module-level ``tokenizer``.

    Args:
        text: The review text to encode.
        max_length: Length to pad/truncate to. ``None`` (the default) defers to
            the tokenizer's configured model maximum length.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of 1-D PyTorch tensors.
    """
    # Call the tokenizer directly; `encode_plus` is deprecated in favour of `__call__`.
    encoding = tokenizer(
        text,  # Single input text
        padding='max_length',  # Pad to the maximum sequence length
        truncation=True,  # Truncate to the maximum sequence length if necessary
        max_length=max_length,
        return_tensors='pt',  # Return PyTorch tensors
        add_special_tokens=True,  # Add special tokens CLS and SEP
    )
    # Remove only the leading batch dimension; a bare squeeze() would also
    # collapse the sequence dimension if it ever had length 1.
    input_ids = encoding['input_ids'].squeeze(0)
    attention_mask = encoding['attention_mask'].squeeze(0)

    return input_ids, attention_mask

In [None]:
# Tokenize every review, then split the (input_ids, attention_mask) pairs into two columns.
tokenized_pairs = [tokenize_text(review) for review in eng_df['review_text']]
eng_df['input_ids'], eng_df['attention_mask'] = zip(*tokenized_pairs)

In [None]:
# Number of reviews passed through the model per forward pass.
batch_size = 64

In [None]:
def generate_embeddings(eng_df, batch_size):
    """Compute one mean-pooled DistilBERT embedding per row of ``eng_df``.

    Uses the module-level ``model``, which is switched to eval mode and moved
    to the GPU when one is available (a side effect on that global).

    Args:
        eng_df: DataFrame with ``input_ids`` and ``attention_mask`` columns
            holding equal-length 1-D tensors, one pair per row.
        batch_size: Number of rows sent through the model per forward pass.

    Returns:
        A NumPy array with one embedding vector per row, in row order.
    """
    all_embeddings = []
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    with torch.no_grad():
        for i in tqdm(range(0, len(eng_df), batch_size), desc="Generating Embeddings"):
            batch_df = eng_df.iloc[i:i + batch_size]
            input_ids = torch.stack(list(batch_df['input_ids'])).to(device)
            attention_mask = torch.stack(list(batch_df['attention_mask'])).to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            word_embeddings = outputs.last_hidden_state  # Get the embeddings from the last layer

            # Average over real tokens only. The inputs are padded to a fixed
            # length, so a plain mean over the sequence axis would also average
            # in the hidden states of the padding positions and dilute the result.
            token_mask = attention_mask.unsqueeze(-1).to(word_embeddings.dtype)
            summed_embeddings = (word_embeddings * token_mask).sum(dim=1)
            token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            averaged_embeddings = summed_embeddings / token_counts

            all_embeddings.extend(averaged_embeddings.cpu().numpy())

    return np.array(all_embeddings)

In [None]:
# Generate one embedding vector per review in eng_df, processed batch_size rows at a time.
embeddings = generate_embeddings(eng_df, batch_size)

Generating Embeddings: 100%|████████████████| 1513/1513 [04:29<00:00,  5.62it/s]


In [None]:
# Assign the embeddings back to the DataFrame; list() splits the array along its
# first axis so each row receives its own vector.
eng_df['word_embeddings'] = list(embeddings)

In [None]:
# Persist the prepared frame (text, rating, token tensors, embeddings) to 'eng_df.pkl'
# in the current working directory.
import pickle

with open('eng_df.pkl', 'wb') as f:
    pickle.dump(eng_df, f)

In [None]:
# Preview the final frame, including the token tensors and the embedding column.
eng_df.head()

Unnamed: 0,review_text,rating,input_ids,attention_mask,word_embeddings
0,Mind blowingly cool. Best science fiction I've...,5,"[tensor(101), tensor(2568), tensor(11221), ten...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[0.07275456, 0.048345067, 0.069500506, 0.08391..."
1,This is a special book. It started slow for ab...,5,"[tensor(101), tensor(2023), tensor(2003), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[-0.10508166, 0.044605456, 0.15093496, 0.08727..."
2,I haven't read a fun mystery book in a while a...,3,"[tensor(101), tensor(1045), tensor(4033), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[0.09070881, 0.02698918, 0.079526536, 0.053992..."
3,"Fun, fast paced, and disturbing tale of murder...",4,"[tensor(101), tensor(4569), tensor(1010), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[0.12166318, 0.026309181, 0.16683182, 0.057287..."
4,A fun book that gives you a sense of living in...,4,"[tensor(101), tensor(1037), tensor(4569), tens...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[0.12523821, 0.0054550497, 0.047890987, 0.1243..."
