In [1]:
import pandas as pd
import numpy as np
from model import Recommender
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from dataset import MyDataset

In [2]:
dtypes = {'ISBN': 'str', 'Book-Title': 'str', 'Book-Author': 'str', 'Year-Of-Publication': 'str', 'Publisher': 'str', 'Image-URL-S': 'str', 'Image-URL-M': 'str', 'Image-URL-L': 'str'}

# books_df = pd.read_csv('dataset/Books.csv', dtype=dtypes)
# users_df = pd.read_csv('dataset/Users.csv')
ratings_df = pd.read_csv('dataset/Ratings.csv')
# movie_ratings_df = pd.read_csv('dataset/MovieRatings.csv')
ratings_df['Book-Rating'] = ratings_df['Book-Rating'] / 2
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   User-ID      1149780 non-null  int64  
 1   ISBN         1149780 non-null  object 
 2   Book-Rating  1149780 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 26.3+ MB


In [3]:
# df = ratings_df.merge(books_df, how="left", on="ISBN")
# df.head().to_csv('dataset/test.csv')

In [4]:
ratings_df.info()
ratings_df.head

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   User-ID      1149780 non-null  int64  
 1   ISBN         1149780 non-null  object 
 2   Book-Rating  1149780 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 26.3+ MB


<bound method NDFrame.head of          User-ID         ISBN  Book-Rating
0         276725   034545104X          0.0
1         276726   0155061224          2.5
2         276727   0446520802          0.0
3         276729   052165615X          1.5
4         276729   0521795028          3.0
...          ...          ...          ...
1149775   276704   1563526298          4.5
1149776   276706   0679447156          0.0
1149777   276709   0515107662          5.0
1149778   276721   0590442449          5.0
1149779   276723  05162443314          4.0

[1149780 rows x 3 columns]>

In [5]:
# Create a boolean mask that is True for rows that don't have a Book-Rating of 0
mask = ratings_df['Book-Rating'] != 0

# Use boolean indexing to select only the rows that don't have a Book-Rating of 0
ratings_df = ratings_df[mask]
ratings_df.head


<bound method NDFrame.head of          User-ID         ISBN  Book-Rating
1         276726   0155061224          2.5
3         276729   052165615X          1.5
4         276729   0521795028          3.0
6         276736   3257224281          4.0
7         276737   0600570967          3.0
...          ...          ...          ...
1149773   276704   0806917695          2.5
1149775   276704   1563526298          4.5
1149777   276709   0515107662          5.0
1149778   276721   0590442449          5.0
1149779   276723  05162443314          4.0

[433671 rows x 3 columns]>

In [6]:
from collections import Counter

lbl_user = preprocessing.LabelEncoder()
lbl_book = preprocessing.LabelEncoder()
ratings_df['User-ID'] = lbl_user.fit_transform(ratings_df['User-ID'].values)
ratings_df['ISBN'] = lbl_book.fit_transform(ratings_df['ISBN'].values)

user_ratings_count = Counter(ratings_df['User-ID'])

# Find users with less than 4 ratings
users_to_remove = [user_id for user_id, count in user_ratings_count.items() if count < 10]

# Remove users with less than 10 ratings from the dataset
ratings_df = ratings_df[~ratings_df['User-ID'].isin(users_to_remove)]
ratings_df.head



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_df['User-ID'] = lbl_user.fit_transform(ratings_df['User-ID'].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings_df['ISBN'] = lbl_book.fit_transform(ratings_df['ISBN'].values)


<bound method NDFrame.head of          User-ID    ISBN  Book-Rating
133        77209    2884          5.0
134        77209   15804          4.5
135        77209   16137          5.0
136        77209   17027          4.5
137        77209   39925          4.5
...          ...     ...          ...
1149743    77175  117346          5.0
1149744    77175  117457          5.0
1149745    77175  125788          5.0
1149746    77175  134545          3.0
1149747    77175  141488          3.5

[295561 rows x 3 columns]>

In [7]:
train_df, valid_df = train_test_split(
    ratings_df, test_size=0.1, stratify=ratings_df['Book-Rating'].values
)

# train_df.to_csv('dataset/test.csv')


In [8]:
valid_df.shape

(29557, 3)

In [9]:
# Create train and validation datasets
train_dataset = MyDataset(train_df['User-ID'].values, train_df['ISBN'].values, train_df['Book-Rating'].values)
valid_dataset = MyDataset(valid_df['User-ID'].values, valid_df['ISBN'].values, valid_df['Book-Rating'].values)

# Create train and validation data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True, num_workers=4, drop_last=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=128, shuffle=True, num_workers=4, drop_last=True)


In [10]:
model = Recommender(num_users=len(lbl_user.classes_),
                        num_isbns=len(lbl_book.classes_),
                        embedding_dim=64)

print(model)

Recommender(
  (user_embedding): Embedding(77805, 64)
  (isbn_embedding): Embedding(185973, 64)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)


In [11]:
import torch.optim as optim
import torch
learning_rate = 0.01
num_epochs = 20
batch_size = 128

# Define loss function and optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler= optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
criterion = torch.nn.SmoothL1Loss()

In [None]:
import torch.optim as optim
import matplotlib.pyplot as plt
import os
import torch
from sklearn.metrics import mean_squared_error

os.makedirs("two_fc_layer/Book_plots", exist_ok=True)

scenarios = [
    {
        'optimizer': 'SGD',
        'learning_rate': 0.1,
        'loss_function': torch.nn.MSELoss(),
        'num_epochs': 10,
        'scheduler': None,
        'num_batches': 64
    },
    {
        'optimizer': 'Adam',
        'learning_rate': 0.1,
        'loss_function': torch.nn.MSELoss(),
        'num_epochs': 10,
        'scheduler': None,
        'num_batches': 64
    },
    {
        'optimizer': 'Adam',
        'learning_rate': 0.1,
        'loss_function': torch.nn.MSELoss(),
        'num_epochs': 20,
        'scheduler': None,
        'num_batches': 32
    },
    {
        'optimizer': 'Adam',
        'learning_rate': 0.1,
        'loss_function': torch.nn.MSELoss(),
        'num_epochs': 100,
        'scheduler': None,
        'num_batches': 64
    },
    {
        'optimizer': 'Adam',
        'learning_rate': 0.01,
        'loss_function': torch.nn.SmoothL1Loss(),
        'num_epochs': 20,
        'scheduler': optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1),
        'num_batches': 128
    },
    {
        'optimizer': 'Adam',
        'learning_rate': 0.01,
        'loss_function': torch.nn.MSELoss(),
        'num_epochs': 200,
        'scheduler': optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1),
        'num_batches': 128
    }
]

# Open the output file in append mode
with open("two_fc_layer/book_results.txt", "a") as output_file:
    for i, scenario in enumerate(scenarios):
        learning_rate = scenario['learning_rate']
        loss_function = scenario['loss_function']
        num_epochs = scenario['num_epochs']
        scheduler = scenario['scheduler']
        num_batches = scenario['num_batches']
        optimizer_name = scenario['optimizer']
        
        # Create a new instance of the model and optimizer with the current hyperparameters
        model = Recommender(num_users=len(lbl_user.classes_), num_isbns=len(lbl_book.classes_), embedding_dim=64)
        if optimizer_name == 'SGD':
            optimizer = optim.SGD(model.parameters(), lr=learning_rate)
        elif optimizer_name == 'Adam':
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        # Create train and validation data loaders based on number of batches
        train_loader = DataLoader(dataset=train_dataset, batch_size=num_batches, shuffle=True, num_workers=4, drop_last=True)
        valid_loader = DataLoader(dataset=valid_dataset, batch_size=num_batches, shuffle=True, num_workers=4, drop_last=True)
        
        # Print the current scenario and number of batches
        output_file.write(f"Scenario {i+1}: Learning Rate = {learning_rate}, Loss Function = {loss_function.__class__.__name__}\n")
        output_file.write(f"Number of Batches: {num_batches}\n")
        if scheduler is not None:
            output_file.write(f"Scheduler: {scheduler.__class__.__name__}\n")
        output_file.write("Epoch\tLoss\tValidation loss\tRMSE\tValidation Accuracy\n")
        
        # Training loop
        train_losses = []  # List to store the training loss values
        valid_losses = []  # List to store the validation RMSE values
        valid_accuracies = []  # List to store the validation accuracies
        rmse_values = []

        for epoch in range(num_epochs):
            model.train()  # Switch to training mode
            running_loss = 0.0
            for batch in train_loader:
                optimizer.zero_grad()
                outputs = model(batch["user_id"], batch["isbn"])
                loss = loss_function(outputs, batch["rating"].unsqueeze(1))
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
            epoch_loss = running_loss / len(train_loader)
            train_losses.append(epoch_loss)
            if scheduler is not None:
                scheduler.step() 
            # Perform validation
            model.eval()  # Switch to evaluation mode
            total_loss = 0.0
            total_correct = 0
            total_samples = 0
            predictions = []
            targets = []
            with torch.no_grad():
                for batch in valid_loader:
                    user_ids, isbns, ratings = batch['user_id'], batch['isbn'], batch['rating']
                    outputs = model(user_ids, isbns)
                    ratings = ratings.view(-1, 1)  # Reshape the target tensor
                    predicted_ratings = torch.round(outputs)  # Round the predicted ratings
                    predictions.extend(predicted_ratings.tolist())
                    targets.extend(ratings.tolist())
                    correct = ((predicted_ratings == ratings) | (predicted_ratings == ratings - 0.5) | (predicted_ratings == ratings + 0.5)).sum().item()
                    total_correct += correct
                    total_samples += ratings.size(0)
                    loss = loss_function(outputs, ratings)
                    total_loss += loss.item()
            valid_loss = total_loss / len(valid_loader)
            valid_losses.append(valid_loss)

            # Calculate RMSE
            rmse = mean_squared_error(targets, predictions, squared=True)
            rmse_values.append(rmse)
            # Calculate accuracy
            accuracy = total_correct / total_samples

            valid_accuracies.append(accuracy)

            # Save the epoch results to the output file
            output_file.write(f"{epoch+1}\t{epoch_loss:.4f}\t{valid_loss:.4f}\t{rmse:.2f}\t{100*accuracy:.2f}\n")

        output_file.write("\n")  # Add a separator between different scenarios

        # Plot the training and validation losses
        plt.plot(train_losses, label='Training Loss')
        plt.plot(valid_losses, label='Validation loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss / RMSE')
        plt.title(f'Training and Validation Loss - Scenario {i+1}')
        plt.legend()
        plot_file = os.path.join("two_fc_layer/Book_plots", f'scenario_{i+1}_loss_plot.png')
        plt.savefig(plot_file)
        plt.close()

        # Plot the validation accuracy
        plt.plot(valid_accuracies)
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.title(f'Validation Accuracy - Scenario {i+1}')
        plot_file = os.path.join("two_fc_layer/Book_plots", f'scenario_{i+1}_accuracy_plot.png')
        plt.savefig(plot_file)
        plt.close()

print('Finished Training')
