In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [None]:
# Column dtypes for the (currently disabled) Books.csv load: every listed
# column is read as a string.
dtypes = dict.fromkeys(
    [
        'ISBN',
        'Book-Title',
        'Book-Author',
        'Year-Of-Publication',
        'Publisher',
        'Image-URL-S',
        'Image-URL-M',
        'Image-URL-L',
    ],
    'str',
)

# Optional tables that are not loaded at the moment:
# books_df = pd.read_csv('dataset/Books.csv', dtype=dtypes)
# users_df = pd.read_csv('dataset/Users.csv')
# movie_ratings_df = pd.read_csv('dataset/MovieRatings.csv')

# Load the ratings table, halve every 'Book-Rating' value and persist the
# rescaled table next to the original.
# NOTE(review): presumably this maps a 0-10 scale onto 0-5 - confirm against the dataset docs.
ratings_df = pd.read_csv('dataset/Ratings.csv')
ratings_df['Book-Rating'] = ratings_df['Book-Rating'].div(2)
ratings_df.to_csv('dataset/ratings2.csv')

In [None]:
# df = ratings_df.merge(books_df, how="left", on="ISBN")
# df.head().to_csv('dataset/test.csv')

In [None]:
# Print column dtypes / non-null counts, then preview the first rows.
ratings_df.info()
# `.head` must be *called*; the bare attribute only displays the bound-method repr.
ratings_df.head()

In [None]:
# Boolean mask that is True for rows whose Book-Rating is not 0.
# NOTE(review): a rating of 0 presumably marks implicit feedback - confirm against the dataset docs.
mask = ratings_df['Book-Rating'] != 0

# Keep only the non-zero ratings. `.copy()` makes the result an independent
# frame so later column assignments do not raise SettingWithCopyWarning.
ratings_df = ratings_df[mask].copy()

# Preview the filtered frame (call the method; a bare `.head` shows only its repr).
ratings_df.head()


In [None]:
from collections import Counter

# Users with fewer ratings than this are dropped from the dataset.
MIN_RATINGS_PER_USER = 10

# Count ratings per user and collect the users that fall below the threshold.
user_ratings_count = Counter(ratings_df['User-ID'])
users_to_remove = [
    user_id
    for user_id, count in user_ratings_count.items()
    if count < MIN_RATINGS_PER_USER
]

# Drop the sparse users BEFORE fitting the encoders, so that `classes_`
# (used later to size the embedding tables) only covers users and books that
# actually remain in the data. `.copy()` gives an independent frame, which
# keeps the column assignments below free of SettingWithCopyWarning.
ratings_df = ratings_df[~ratings_df['User-ID'].isin(users_to_remove)].copy()

# Map raw user ids and ISBNs onto contiguous integer codes 0..n_classes-1,
# the index form required by the embedding layers.
lbl_user = preprocessing.LabelEncoder()
lbl_book = preprocessing.LabelEncoder()
ratings_df['User-ID'] = lbl_user.fit_transform(ratings_df['User-ID'].values)
ratings_df['ISBN'] = lbl_book.fit_transform(ratings_df['ISBN'].values)

# Preview the encoded frame (call the method; a bare `.head` shows only its repr).
ratings_df.head()



In [None]:
# Hold out 10% of the ratings for validation. The split is stratified on the
# rating value so both parts share the same rating distribution, and
# `random_state` is fixed so Restart & Run All reproduces the same split.
train_df, valid_df = train_test_split(
    ratings_df,
    test_size=0.1,
    stratify=ratings_df['Book-Rating'].values,
    random_state=42,
)

# train_df.to_csv('dataset/test.csv')


In [None]:
# Display the (rows, columns) shape of the validation split.
valid_df.shape

In [None]:
from bookDataset import BookDataset

# Wrap the encoded user ids, encoded ISBNs and ratings of each split in the
# project-local BookDataset.
# NOTE(review): BookDataset is defined outside this notebook; the training and
# evaluation loops read its batches via the keys "user_id", "isbn" and "rating".
train_dataset = BookDataset(train_df['User-ID'].values, train_df['ISBN'].values, train_df['Book-Rating'].values)
valid_dataset = BookDataset(valid_df['User-ID'].values, valid_df['ISBN'].values, valid_df['Book-Rating'].values)

# Training loader: reshuffled every epoch; the incomplete last batch is dropped.
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True, num_workers=4, drop_last=True)

# Validation loader: every sample must be scored exactly once, so the last
# partial batch is kept (drop_last=False) and no shuffling is needed.
valid_loader = DataLoader(dataset=valid_dataset, batch_size=128, shuffle=False, num_workers=4, drop_last=False)


In [None]:
class BookRecommender(torch.nn.Module):
    """Embedding-based rating regressor.

    One embedding is looked up per user and one per ISBN; the two vectors are
    concatenated and passed through a small MLP
    (2 * embedding_dim -> 64 -> 32 -> 1) with ReLU after the first two layers.
    """

    def __init__(self, num_users, num_isbns, embedding_dim):
        """Build the embedding tables and the three linear layers.

        Args:
            num_users: number of rows in the user embedding table.
            num_isbns: number of rows in the ISBN embedding table.
            embedding_dim: width of each embedding vector.
        """
        super().__init__()
        nn = torch.nn
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.isbn_embedding = nn.Embedding(num_isbns, embedding_dim)
        self.fc1 = nn.Linear(in_features=2 * embedding_dim, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=1)

    def forward(self, users, isbns):
        """Predict one rating per (user, isbn) pair.

        Args:
            users: tensor of user indices; cast to int64 before the lookup.
            isbns: tensor of ISBN indices; cast to int64 before the lookup.

        Returns:
            Tensor with one row per sample and a single output column.
        """
        user_vecs = self.user_embedding(users.to(torch.long))
        isbn_vecs = self.isbn_embedding(isbns.to(torch.long))
        joined = torch.cat((user_vecs, isbn_vecs), dim=1)
        # Collapse everything after the batch axis so the MLP always receives a
        # 2-D (batch, features) input, whether the indices were (batch,) or (batch, 1).
        features = joined.reshape(joined.size(0), -1)
        hidden = self.fc1(features).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

In [None]:
# Instantiate the recommender. Each embedding table gets one row per class
# known to the corresponding label encoder; embeddings are 64-dimensional.
model = BookRecommender(
    num_users=lbl_user.classes_.shape[0],
    num_isbns=lbl_book.classes_.shape[0],
    embedding_dim=64,
)

# Show the layer structure.
print(model)

In [None]:
import torch.optim as optim

# Training hyperparameters.
learning_rate, num_epochs, batch_size = 0.01, 20, 128

# Plain SGD over all model parameters.
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate)
# Multiply the learning rate by 0.1 after every 5 scheduler steps.
scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=5, gamma=0.1)
# Smooth L1 loss between predicted and actual ratings.
criterion = torch.nn.SmoothL1Loss()

In [None]:
import matplotlib.pyplot as plt

num_batches = len(train_loader)
losses = []  # mean training loss of each epoch

for epoch in range(num_epochs):
    # Explicitly (re)enter training mode: if the evaluation cell ran earlier the
    # model is still in eval mode. Matches the scenario loop further down.
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(batch["user_id"], batch["isbn"])
        # unsqueeze(1) adds a trailing axis to the targets so they line up with
        # the model's single-column output.
        loss = criterion(outputs, batch["rating"].unsqueeze(1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Average of the per-batch losses for this epoch.
    epoch_loss = running_loss / num_batches
    losses.append(epoch_loss)

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}')
    # The scheduler is stepped once per epoch, after the optimizer updates.
    scheduler.step()

# Plot the training loss per epoch.
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.show()


In [None]:
from sklearn.metrics import mean_squared_error

total_correct = 0
total_samples = 0
predictions = []
targets = []
model.eval()  # evaluation mode for inference

with torch.no_grad():  # no gradients are needed while scoring
    for batch in valid_loader:
        user_ids, isbns, ratings = batch['user_id'], batch['isbn'], batch['rating']
        outputs = model(user_ids, isbns)
        ratings = ratings.view(-1, 1)  # column vector, same layout as `outputs`
        predicted_ratings = torch.round(outputs)  # round predictions to whole numbers
        predictions.extend(predicted_ratings.tolist())
        targets.extend(ratings.tolist())

        # A prediction counts as correct when it equals the target or is
        # exactly 0.5 away from it.
        correct = ((predicted_ratings == ratings) | (predicted_ratings == ratings - 0.5) | (predicted_ratings == ratings + 0.5)).sum().item()
        total_correct += correct
        total_samples += ratings.size(0)

# Root mean squared error: mean_squared_error returns the MSE, so take the
# square root before reporting it as RMSE.
# NOTE(review): this is computed on the *rounded* predictions - confirm that is intended.
rmse = mean_squared_error(targets, predictions) ** 0.5
print('Validation RMSE: {:.2f}'.format(rmse))

# Share of predictions counted as correct.
accuracy = total_correct / total_samples
print('Validation Accuracy: {:.2f}%'.format(accuracy * 100))


In [None]:
from bookDataset import BookDataset
import torch.optim as optim
import matplotlib.pyplot as plt
import os
from sklearn.metrics import mean_squared_error

os.makedirs("plots", exist_ok=True)

# One dict per training run.
#   learning_rate : SGD learning rate
#   loss_function : loss module used for both training and validation
#   num_epochs    : number of passes over the training loader
#   scheduler     : None, or a factory `optimizer -> lr scheduler`. It has to be a
#                   factory because a scheduler is bound to the optimizer it is
#                   constructed with, and each scenario creates its own optimizer.
#   batch_size    : DataLoader batch size
scenarios = [
    {
        'learning_rate': 0.1,
        'loss_function': torch.nn.MSELoss(),
        'num_epochs': 10,
        'scheduler': None,
        'batch_size': 64
    },
    {
        'learning_rate': 0.1,
        'loss_function': torch.nn.MSELoss(),
        'num_epochs': 100,
        'scheduler': None,
        'batch_size': 64
    },
    {
        'learning_rate': 0.01,
        'loss_function': torch.nn.SmoothL1Loss(),
        'num_epochs': 20,
        'scheduler': lambda opt: optim.lr_scheduler.StepLR(opt, step_size=5, gamma=0.1),
        'batch_size': 128
    },
    {
        'learning_rate': 0.01,
        'loss_function': torch.nn.SmoothL1Loss(),
        'num_epochs': 200,
        'scheduler': lambda opt: optim.lr_scheduler.StepLR(opt, step_size=5, gamma=0.1),
        'batch_size': 128
    },
    {
        'learning_rate': 0.001,
        'loss_function': torch.nn.L1Loss(),
        'num_epochs': 5,
        'scheduler': lambda opt: optim.lr_scheduler.ExponentialLR(opt, gamma=0.9),
        'batch_size': 32
    },
    {
        # NOTE(review): CrossEntropyLoss is a classification loss. With this
        # model's single output column the softmax runs over one class, so on
        # PyTorch versions that accept float targets the loss is constantly 0
        # (older versions reject float targets). Confirm whether this scenario
        # should use a regression loss instead.
        'learning_rate': 0.001,
        'loss_function': torch.nn.CrossEntropyLoss(),
        'num_epochs': 15,
        'scheduler': None,
        'batch_size': 256
    }
]


def _train_one_epoch(model, loader, loss_function, optimizer):
    """Run one optimisation pass over `loader` and return the mean batch loss."""
    model.train()  # switch to training mode
    running_loss = 0.0
    for batch in loader:
        optimizer.zero_grad()
        outputs = model(batch["user_id"], batch["isbn"])
        loss = loss_function(outputs, batch["rating"].unsqueeze(1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(loader)


def _evaluate(model, loader, loss_function):
    """Score `model` on `loader`; return (mean batch loss, RMSE, accuracy).

    RMSE and accuracy are computed on predictions rounded with torch.round.
    A prediction counts as correct when it equals the target or is exactly
    0.5 away from it.
    """
    model.eval()  # switch to evaluation mode
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    predictions = []
    targets = []
    with torch.no_grad():
        for batch in loader:
            user_ids, isbns, ratings = batch['user_id'], batch['isbn'], batch['rating']
            outputs = model(user_ids, isbns)
            ratings = ratings.view(-1, 1)  # column vector, same layout as `outputs`
            predicted_ratings = torch.round(outputs)
            predictions.extend(predicted_ratings.tolist())
            targets.extend(ratings.tolist())
            correct = ((predicted_ratings == ratings) | (predicted_ratings == ratings - 0.5) | (predicted_ratings == ratings + 0.5)).sum().item()
            total_correct += correct
            total_samples += ratings.size(0)
            total_loss += loss_function(outputs, ratings).item()
    # mean_squared_error returns the MSE; take the root explicitly instead of
    # relying on the deprecated `squared=False` argument.
    rmse = mean_squared_error(targets, predictions) ** 0.5
    return total_loss / len(loader), rmse, total_correct / total_samples


# Open the output file in append mode
with open("output.txt", "a") as output_file:
    for i, scenario in enumerate(scenarios):
        learning_rate = scenario['learning_rate']
        loss_function = scenario['loss_function']
        num_epochs = scenario['num_epochs']
        batch_size = scenario['batch_size']

        # Fresh model and optimizer for every scenario; the scheduler is built
        # from this scenario's optimizer so that stepping it really changes the
        # learning rate used for training.
        model = BookRecommender(num_users=len(lbl_user.classes_), num_isbns=len(lbl_book.classes_), embedding_dim=64)
        optimizer = optim.SGD(model.parameters(), lr=learning_rate)
        make_scheduler = scenario['scheduler']
        scheduler = make_scheduler(optimizer) if make_scheduler is not None else None

        # Loaders for this scenario's batch size. Validation keeps every sample
        # (no shuffling, no dropped tail batch) so metrics cover the full split.
        train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
        valid_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False, num_workers=4, drop_last=False)

        # Scenario header; the column header lists one name per value written per epoch.
        output_file.write(f"Scenario {i+1}: Learning Rate = {learning_rate}, Loss Function = {loss_function.__class__.__name__}\n")
        output_file.write(f"Batch Size: {batch_size}\n")
        if scheduler is not None:
            output_file.write(f"Scheduler: {scheduler.__class__.__name__}\n")
        output_file.write("Epoch\tLoss\tValidation Loss\tValidation RMSE\tValidation Accuracy\n")

        train_losses = []  # mean training loss per epoch
        valid_losses = []  # mean validation loss per epoch
        valid_accuracies = []  # validation accuracy per epoch
        rmse_values = []  # validation RMSE per epoch

        for epoch in range(num_epochs):
            epoch_loss = _train_one_epoch(model, train_loader, loss_function, optimizer)
            train_losses.append(epoch_loss)

            valid_loss, rmse, accuracy = _evaluate(model, valid_loader, loss_function)
            valid_losses.append(valid_loss)
            rmse_values.append(rmse)
            valid_accuracies.append(accuracy)

            # Save the epoch results; flush so progress of long runs is visible on disk.
            output_file.write(f"{epoch+1}\t{epoch_loss:.4f}\t{valid_loss:.4f}\t{rmse:.2f}\t{accuracy:.2f}\n")
            output_file.flush()

            # Adjust the learning rate once per epoch if a scheduler is configured.
            if scheduler is not None:
                scheduler.step()

        output_file.write("\n")  # separator between scenarios

        # Training loss, validation loss and validation RMSE per epoch.
        fig, ax = plt.subplots()
        ax.plot(train_losses, label='Training Loss')
        ax.plot(valid_losses, label='Validation Loss')
        ax.plot(rmse_values, label='Validation RMSE')
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Loss / RMSE')
        ax.set_title(f'Loss and RMSE - Scenario {i+1}')
        ax.legend()
        plot_file = os.path.join("plots", f'scenario_{i+1}_loss_plot.png')
        fig.savefig(plot_file)
        plt.close(fig)  # release the figure; many are created in this loop

        # Validation accuracy per epoch.
        fig, ax = plt.subplots()
        ax.plot(valid_accuracies)
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Accuracy')
        ax.set_title(f'Validation Accuracy - Scenario {i+1}')
        plot_file = os.path.join("plots", f'scenario_{i+1}_accuracy_plot.png')
        fig.savefig(plot_file)
        plt.close(fig)

print('Finished Training')
