In [28]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [29]:
dtypes = {'ISBN': 'str', 'Book-Title': 'str', 'Book-Author': 'str', 'Year-Of-Publication': 'str', 'Publisher': 'str', 'Image-URL-S': 'str', 'Image-URL-M': 'str', 'Image-URL-L': 'str'}

# books_df = pd.read_csv('dataset/Books.csv', dtype=dtypes)
# users_df = pd.read_csv('dataset/Users.csv')
# ratings_df = pd.read_csv('dataset/Ratings.csv')
movie_ratings_df = pd.read_csv('dataset/MovieRatings.csv')


In [30]:
# df = ratings_df.merge(books_df, how="left", on="ISBN")
# df.head().to_csv('dataset/test.csv')

In [31]:
# ratings_df.info()
# ratings_df.head
movie_ratings_df.head()
movie_ratings_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [32]:
# # Create a boolean mask that is True for rows that don't have a Book-Rating of 0
# mask = ratings_df['Book-Rating'] != 0

# # Use boolean indexing to select only the rows that don't have a Book-Rating of 0
# ratings_df = ratings_df[mask]
# ratings_df.head


In [33]:
from collections import Counter

# lbl_user = preprocessing.LabelEncoder()
# lbl_book = preprocessing.LabelEncoder()
# ratings_df['User-ID'] = lbl_user.fit_transform(ratings_df['User-ID'].values)
# ratings_df['ISBN'] = lbl_book.fit_transform(ratings_df['ISBN'].values)

# user_ratings_count = Counter(ratings_df['User-ID'])

# # Find users with less than 4 ratings
# users_to_remove = [user_id for user_id, count in user_ratings_count.items() if count < 4]

# # Remove users with less than 4 ratings from the dataset
# ratings_df = ratings_df[~ratings_df['User-ID'].isin(users_to_remove)]
# ratings_df.head

lbl_user = preprocessing.LabelEncoder()
lbl_movie = preprocessing.LabelEncoder()
movie_ratings_df.userId = lbl_user.fit_transform(movie_ratings_df.userId.values)
movie_ratings_df.movieId = lbl_movie.fit_transform(movie_ratings_df.movieId.values)
movie_ratings_df.head


<bound method NDFrame.head of         userId  movieId  rating   timestamp
0            0        0     4.0   964982703
1            0        2     4.0   964981247
2            0        5     4.0   964982224
3            0       43     5.0   964983815
4            0       46     5.0   964982931
...        ...      ...     ...         ...
100831     609     9416     4.0  1493848402
100832     609     9443     5.0  1493850091
100833     609     9444     5.0  1494273047
100834     609     9445     5.0  1493846352
100835     609     9485     3.0  1493846415

[100836 rows x 4 columns]>

In [34]:
# train_df, valid_df = train_test_split(
#     ratings_df, test_size=0.1, stratify=ratings_df['Book-Rating'].values
# )
# train_df.to_csv('dataset/test.csv')

train_df, valid_df = train_test_split(
    movie_ratings_df, test_size=0.05, stratify=movie_ratings_df.rating.values
)
train_df.to_csv('dataset/test.csv')


In [35]:
valid_df.shape

(5042, 4)

In [36]:
from bookDataset import BookDataset

# Create train and validation datasets
# train_dataset = BookDataset(train_df['User-ID'].values, train_df['ISBN'].values, train_df['Book-Rating'].values)
# valid_dataset = BookDataset(valid_df['User-ID'].values, valid_df['ISBN'].values, valid_df['Book-Rating'].values)

train_dataset = BookDataset(train_df.userId.values, train_df.movieId.values, train_df.rating.values)
valid_dataset = BookDataset(valid_df.userId.values, valid_df.movieId.values, valid_df.rating.values)
# print(ratings_df.head())
print(len(train_dataset))
print(len(train_dataset.user_ids))
print(len(train_dataset.isbns))
print(len(train_dataset.ratings))
# Create train and validation data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True, num_workers=4, drop_last=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=64, shuffle=True, num_workers=4, drop_last=True)

95794
95794
95794
95794


In [37]:
class BookRecommender(torch.nn.Module):
    def __init__(self, num_users, num_isbns, embedding_dim):
        super(BookRecommender, self).__init__()
        self.user_embedding = torch.nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_dim)
        self.isbn_embedding = torch.nn.Embedding(num_embeddings=num_isbns, embedding_dim=embedding_dim)
        self.fc1 = torch.nn.Linear(embedding_dim * 2, 64)
        self.fc2 = torch.nn.Linear(64, 32)
        self.fc3 = torch.nn.Linear(32, 1)

    def forward(self, users, isbns):
        user_embeds = self.user_embedding(users.long())
        isbn_embeds = self.isbn_embedding(isbns.long())
        embeds = torch.cat([user_embeds, isbn_embeds], dim=1)
        x = torch.relu(self.fc1(embeds.view(embeds.size(0), -1)))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [38]:
# model = BookRecommender(num_users=len(lbl_user.classes_),
#                         num_isbns=len(lbl_book.classes_),
#                         embedding_dim=64)

model = BookRecommender(num_users=len(lbl_user.classes_),
                        num_isbns=len(lbl_movie.classes_),
                        embedding_dim=64)
print(model)

BookRecommender(
  (user_embedding): Embedding(610, 64)
  (isbn_embedding): Embedding(9724, 64)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=1, bias=True)
)


In [39]:
import torch.optim as optim

learning_rate = 0.1
num_epochs = 20
batch_size = 64

# Define loss function and optimizer
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.7)
criterion = torch.nn.MSELoss()

In [40]:
from bookDataset import BookDataset
num_batches = len(train_loader)

print("Number of batches in the DataLoader:", num_batches)

for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in train_loader:
        # zero the parameter gradients
        optimizer.zero_grad()
        outputs = model(batch["user_id"], batch["isbn"])
        loss = criterion(outputs, batch["rating"].unsqueeze(1))
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item() * batch_size
    epoch_loss = running_loss / len(train_dataset)
    print("Learning Rate:", scheduler.get_last_lr())
    
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, epoch_loss))
    scheduler.step()
print('Finished Training')

Number of batches in the DataLoader: 1496
Learning Rate: [0.1]
Epoch [1/20], Loss: 0.9820
Learning Rate: [0.1]
Epoch [2/20], Loss: 0.8856
Learning Rate: [0.06999999999999999]
Epoch [3/20], Loss: 0.8430
Learning Rate: [0.06999999999999999]
Epoch [4/20], Loss: 0.8250
Learning Rate: [0.048999999999999995]
Epoch [5/20], Loss: 0.7979
Learning Rate: [0.048999999999999995]
Epoch [6/20], Loss: 0.7871
Learning Rate: [0.0343]
Epoch [7/20], Loss: 0.7669
Learning Rate: [0.0343]
Epoch [8/20], Loss: 0.7587
Learning Rate: [0.024009999999999997]
Epoch [9/20], Loss: 0.7433
Learning Rate: [0.024009999999999997]
Epoch [10/20], Loss: 0.7376
Learning Rate: [0.016806999999999996]
Epoch [11/20], Loss: 0.7246
Learning Rate: [0.016806999999999996]
Epoch [12/20], Loss: 0.7211
Learning Rate: [0.011764899999999997]
Epoch [13/20], Loss: 0.7114
Learning Rate: [0.011764899999999997]
Epoch [14/20], Loss: 0.7084
Learning Rate: [0.008235429999999997]
Epoch [15/20], Loss: 0.7010
Learning Rate: [0.008235429999999997]
Epo

In [41]:
from sklearn.metrics import mean_squared_error

total_loss = 0.0
total_correct = 0
total_samples = 0
print('hello', batch_size)
predictions = []
targets = []
model.eval()

with torch.no_grad():
    for batch in valid_loader:
        user_ids, isbns, ratings = batch['user_id'], batch['isbn'], batch['rating']
        outputs = model(user_ids, isbns)
        ratings = ratings.view(-1, 1)  # Reshape the target tensor
        predicted_ratings = torch.round(outputs)  # Round the predicted ratings

        predictions.extend(predicted_ratings.tolist())
        targets.extend(ratings.tolist())
    
        correct = (predicted_ratings == ratings).sum().item()
        total_correct += correct
        total_samples += ratings.size(0)

# Calculate mean squared error
rmse = mean_squared_error(targets, predictions)
print('Validation RMSE: {:.2f}'.format(rmse))

# Calculate accuracy
accuracy = total_correct / total_samples
print('Validation Accuracy: {:.2f}%'.format(accuracy * 100))


hello 64
Validation RMSE: 0.96
Validation Accuracy: 28.87%
