In [18]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

In [19]:
dtypes = {'ISBN': 'str', 'Book-Title': 'str', 'Book-Author': 'str', 'Year-Of-Publication': 'str', 'Publisher': 'str', 'Image-URL-S': 'str', 'Image-URL-M': 'str', 'Image-URL-L': 'str'}

books_df = pd.read_csv('dataset/Books.csv', dtype=dtypes)
users_df = pd.read_csv('dataset/Users.csv')
ratings_df = pd.read_csv('dataset/Ratings.csv')

In [20]:
df = ratings_df.merge(books_df, how="left", on="ISBN")
df.head().to_csv('dataset/test.csv')
df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
3,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...


In [21]:
df['Book-Rating'].value_counts()

0     716109
8     103736
10     78610
7      76457
9      67541
5      50974
6      36924
4       8904
3       5996
2       2759
1       1770
Name: Book-Rating, dtype: int64

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1149780 entries, 0 to 1149779
Data columns (total 10 columns):
 #   Column               Non-Null Count    Dtype 
---  ------               --------------    ----- 
 0   User-ID              1149780 non-null  int64 
 1   ISBN                 1149780 non-null  object
 2   Book-Rating          1149780 non-null  int64 
 3   Book-Title           1031136 non-null  object
 4   Book-Author          1031135 non-null  object
 5   Year-Of-Publication  1031136 non-null  object
 6   Publisher            1031134 non-null  object
 7   Image-URL-S          1031136 non-null  object
 8   Image-URL-M          1031136 non-null  object
 9   Image-URL-L          1031132 non-null  object
dtypes: int64(2), object(8)
memory usage: 96.5+ MB


In [23]:
df['Book-Rating'].fillna(-1, inplace=True)
df['Book-Rating'] = df['Book-Rating'].astype('float32')
df['User-ID'] = df['User-ID'].astype('category').cat.codes
df['ISBN'] = df['ISBN'].astype('category').cat.codes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1149780 entries, 0 to 1149779
Data columns (total 10 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   User-ID              1149780 non-null  int32  
 1   ISBN                 1149780 non-null  int32  
 2   Book-Rating          1149780 non-null  float32
 3   Book-Title           1031136 non-null  object 
 4   Book-Author          1031135 non-null  object 
 5   Year-Of-Publication  1031136 non-null  object 
 6   Publisher            1031134 non-null  object 
 7   Image-URL-S          1031136 non-null  object 
 8   Image-URL-M          1031136 non-null  object 
 9   Image-URL-L          1031132 non-null  object 
dtypes: float32(1), int32(2), object(7)
memory usage: 83.3+ MB


In [24]:
y = df['Book-Rating']  # Target variable (book ratings)

train_df, valid_df = train_test_split(
    df, test_size=0.1, stratify=y, random_state=42
)
train_df.shape


(1034802, 10)

In [25]:
valid_df.shape

(114978, 10)

In [26]:
from bookDataset import BookDataset

train_df['User-ID'] = train_df['User-ID'].astype('category')
train_df['ISBN'] = train_df['ISBN'].astype('category')
# Create train and validation datasets
train_dataset = BookDataset(train_df['User-ID'], train_df['ISBN'], train_df['Book-Rating'])
valid_dataset = BookDataset(valid_df['User-ID'], valid_df['ISBN'], valid_df['Book-Rating'])

# Create train and validation data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=4)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=32, shuffle=False, num_workers=4)

In [27]:
class BookRecommender(torch.nn.Module):
    def __init__(self, num_users, num_isbns, embedding_dim):
        super(BookRecommender, self).__init__()
        self.user_embedding = torch.nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_dim)
        self.isbn_embedding = torch.nn.Embedding(num_embeddings=num_isbns, embedding_dim=embedding_dim)
        self.fc1 = torch.nn.Linear(embedding_dim * 2, 64)
        self.fc2 = torch.nn.Linear(64, 32)
        self.fc3 = torch.nn.Linear(32, 1)

    def forward(self, users, isbns):
        user_embeds = self.user_embedding(users)
        isbn_embeds = self.isbn_embedding(isbns)
        embeds = torch.cat([user_embeds, isbn_embeds], dim=1)
        x = torch.relu(self.fc1(embeds))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x.squeeze()

In [28]:
model = BookRecommender(num_users=len(train_dataset.user_ids.cat.categories),
                        num_isbns=len(train_dataset.isbns.cat.categories),
                        embedding_dim=64)
print(model)

BookRecommender(
  (user_embedding): Embedding(99262, 64)
  (isbn_embedding): Embedding(320166, 64)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=1, bias=True)
)


In [29]:
import torch.optim as optim
# Define loss function and optimizer
optimizer = optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()

In [30]:
from bookDataset import BookDataset

num_batches = len(train_loader)
#print(train_dataset[301769])

#print("Number of batches in the DataLoader:", num_batches)

# Iterate through the first 5 batches
#for batch_idx, batch in enumerate(train_loader):
    #print('Batch Index:', batch_idx)
    #break

KeyError: 301769