In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


In [2]:
dtypes = {'ISBN': 'str', 'Book-Title': 'str', 'Book-Author': 'str', 'Year-Of-Publication': 'str', 'Publisher': 'str', 'Image-URL-S': 'str', 'Image-URL-M': 'str', 'Image-URL-L': 'str'}

books_df = pd.read_csv('dataset/Books.csv', dtype=dtypes)
users_df = pd.read_csv('dataset/Users.csv')
ratings_df = pd.read_csv('dataset/Ratings.csv')

In [3]:
df = ratings_df.merge(books_df, how="left", on="ISBN")
df.head().to_csv('dataset/test.csv')

In [4]:
ratings_df.info()
ratings_df.head

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


<bound method NDFrame.head of          User-ID         ISBN  Book-Rating
0         276725   034545104X            0
1         276726   0155061224            5
2         276727   0446520802            0
3         276729   052165615X            3
4         276729   0521795028            6
...          ...          ...          ...
1149775   276704   1563526298            9
1149776   276706   0679447156            0
1149777   276709   0515107662           10
1149778   276721   0590442449           10
1149779   276723  05162443314            8

[1149780 rows x 3 columns]>

In [5]:
# Create a boolean mask that is True for rows that don't have a Book-Rating of 0
mask = ratings_df['Book-Rating'] != 0

# Use boolean indexing to select only the rows that don't have a Book-Rating of 0
ratings_df = ratings_df[mask]
ratings_df.head


<bound method NDFrame.head of          User-ID         ISBN  Book-Rating
1         276726   0155061224            5
3         276729   052165615X            3
4         276729   0521795028            6
6         276736   3257224281            8
7         276737   0600570967            6
...          ...          ...          ...
1149773   276704   0806917695            5
1149775   276704   1563526298            9
1149777   276709   0515107662           10
1149778   276721   0590442449           10
1149779   276723  05162443314            8

[433671 rows x 3 columns]>

In [None]:
lbl_user = preprocessing.LabelEncoder()
lbl_book = preprocessing.LabelEncoder()
ratings_df['User-ID'] = lbl_user.fit_transform(ratings_df['User-ID'].values)
ratings_df['ISBN'] = lbl_book.fit_transform(ratings_df['ISBN'].values)

In [7]:

train_df, valid_df = train_test_split(
    ratings_df, test_size=0.1
)
train_df.to_csv('dataset/test.csv')
valid_df.shape

(43368, 3)

In [8]:
valid_df.shape

(43368, 3)

In [9]:
from bookDataset import BookDataset

# Create train and validation datasets
train_dataset = BookDataset(train_df['User-ID'].values, train_df['ISBN'].values, train_df['Book-Rating'].values)
valid_dataset = BookDataset(valid_df['User-ID'].values, valid_df['ISBN'].values, valid_df['Book-Rating'].values)
# print(ratings_df.head())
print(len(train_dataset))
print(len(train_dataset.user_ids))
print(len(train_dataset.isbns))
print(len(train_dataset.ratings))
# Create train and validation data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=False, num_workers=0, drop_last=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=32, shuffle=False, num_workers=0, drop_last=True)

390303
390303
390303
390303


In [10]:
class BookRecommender(torch.nn.Module):
    def __init__(self, num_users, num_isbns, embedding_dim):
        super(BookRecommender, self).__init__()
        self.user_embedding = torch.nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_dim)
        self.isbn_embedding = torch.nn.Embedding(num_embeddings=num_isbns, embedding_dim=embedding_dim)
        self.fc1 = torch.nn.Linear(embedding_dim * 2, 64)
        self.fc2 = torch.nn.Linear(64, 32)
        self.fc3 = torch.nn.Linear(32, 1)

    def forward(self, users, isbns):
        user_embeds = self.user_embedding(users.long())
        isbn_embeds = self.isbn_embedding(isbns.long())
        embeds = torch.cat([user_embeds, isbn_embeds], dim=1)
        x = torch.relu(self.fc1(embeds.view(embeds.size(0), -1)))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [11]:
model = BookRecommender(num_users=len(lbl_user.classes_),
                        num_isbns=len(lbl_book.classes_),
                        embedding_dim=64)
print(model)

BookRecommender(
  (user_embedding): Embedding(77805, 64)
  (isbn_embedding): Embedding(185973, 64)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=1, bias=True)
)


In [12]:
import torch.optim as optim

learning_rate = 0.1
num_epochs = 10
batch_size = 64
embedding_dim = 32

# Define loss function and optimizer
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
criterion = torch.nn.MSELoss()

In [13]:
from bookDataset import BookDataset
num_batches = len(train_loader)

print("Number of batches in the DataLoader:", num_batches)

i=0        
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in train_loader:
        # zero the parameter gradients
        optimizer.zero_grad()
#         i += 1
#             print("Id from training loop: ", i)
        # forward + backward + optimize
        outputs = model(batch["user_id"], batch["isbn"])
        loss = criterion(outputs, batch["rating"].unsqueeze(1))
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item() * batch_size
#         print(i)
        i += 1
    epoch_loss = running_loss / len(train_dataset)
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, epoch_loss))

print('Finished Training')

Number of batches in the DataLoader: 12196
Epoch [1/10], Loss: 6.7902
Epoch [2/10], Loss: 6.5811
Epoch [3/10], Loss: 6.4606
Epoch [4/10], Loss: 6.3899
Epoch [5/10], Loss: 6.3290
Epoch [6/10], Loss: 6.2786
Epoch [7/10], Loss: 6.2413
Epoch [8/10], Loss: 6.2001
Epoch [9/10], Loss: 6.1645
Epoch [10/10], Loss: 6.1262
Finished Training
