In [2]:
import pandas as pd
from model import Recommender
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from dataset import MyDataset

In [3]:
movie_ratings_df = pd.read_csv('dataset/MovieRatings.csv')
movie_ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [4]:
from collections import Counter

lbl_user = preprocessing.LabelEncoder()
lbl_book = preprocessing.LabelEncoder()
movie_ratings_df['userId'] = lbl_user.fit_transform(movie_ratings_df['userId'].values)
movie_ratings_df['movieId'] = lbl_book.fit_transform(movie_ratings_df['movieId'].values)

user_ratings_count = Counter(movie_ratings_df['userId'])

# Find users with less than 4 ratings
users_to_remove = [user_id for user_id, count in user_ratings_count.items() if count < 4]

# Remove users with less than 10 ratings from the dataset
movie_ratings_df = movie_ratings_df[~movie_ratings_df['userId'].isin(users_to_remove)]
movie_ratings_df.head



<bound method NDFrame.head of         userId  movieId  rating   timestamp
0            0        0     4.0   964982703
1            0        2     4.0   964981247
2            0        5     4.0   964982224
3            0       43     5.0   964983815
4            0       46     5.0   964982931
...        ...      ...     ...         ...
100831     609     9416     4.0  1493848402
100832     609     9443     5.0  1493850091
100833     609     9444     5.0  1494273047
100834     609     9445     5.0  1493846352
100835     609     9485     3.0  1493846415

[100836 rows x 4 columns]>

In [5]:
train_df, valid_df = train_test_split(
    movie_ratings_df, test_size=0.1, stratify=movie_ratings_df['rating'].values
)

In [6]:
valid_df.shape

(10084, 4)

In [7]:
# Create train and validation datasets
train_dataset = MyDataset(train_df['userId'].values, train_df['movieId'].values, train_df['rating'].values)
valid_dataset = MyDataset(valid_df['userId'].values, valid_df['movieId'].values, valid_df['rating'].values)

# Create train and validation data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True, num_workers=4, drop_last=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=128, shuffle=True, num_workers=4, drop_last=True)


In [8]:
model = Recommender(num_users=len(lbl_user.classes_),
                        num_isbns=len(lbl_book.classes_),
                        embedding_dim=64)
print(model)

Recommender(
  (user_embedding): Embedding(610, 64)
  (isbn_embedding): Embedding(9724, 64)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=16, bias=True)
  (fc4): Linear(in_features=16, out_features=1, bias=True)
)


In [9]:
import torch.optim as optim
import torch
learning_rate = 0.01
num_epochs = 20
batch_size = 128

# Define loss function and optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler= optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
criterion = torch.nn.SmoothL1Loss()

In [10]:
# from sklearn.metrics import mean_squared_error
# num_epochs = 5

# for epoch in range(num_epochs):
#     model.train()  # Switch to training mode
#     running_loss = 0.0
#     for batch in train_loader:
#         optimizer.zero_grad()
#         outputs = model(batch["user_id"], batch["isbn"])
#         loss = criterion(outputs, batch["rating"].unsqueeze(1))
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#     epoch_loss = running_loss / len(train_loader)

#     # Perform validation
#     model.eval()  # Switch to evaluation mode
#     total_loss = 0.0
#     total_correct = 0
#     total_samples = 0
#     predictions = []
#     targets = []
#     with torch.no_grad():
#         for batch in valid_loader:
#             user_ids, isbns, ratings = batch['user_id'], batch['isbn'], batch['rating']
#             outputs = model(user_ids, isbns)
#             ratings = ratings.view(-1, 1)  # Reshape the target tensor
#             predicted_ratings = torch.round(outputs)  # Round the predicted ratings
#             predictions.extend(predicted_ratings.tolist())
#             targets.extend(ratings.tolist())
#             correct = ((predicted_ratings == ratings) | (predicted_ratings == ratings - 0.5) | (predicted_ratings == ratings + 0.5)).sum().item()
#             total_correct += correct
#             total_samples += ratings.size(0)
#             loss = criterion(outputs, ratings)
#             total_loss += loss.item()
#     valid_loss = total_loss / len(valid_loader)

#     # Calculate RMSE
#     rmse = mean_squared_error(targets, predictions, squared=True)

#     # Calculate accuracy
#     accuracy = total_correct / total_samples

#     # Print the results for the current epoch
#     print(f"Epoch: {epoch+1}")
#     print(f"Train Loss: {epoch_loss:.4f}")
#     print(f"Validation Loss: {valid_loss:.4f}")
#     print(f"RMSE: {rmse:.2f}")
#     print(f"Validation Accuracy: {100*accuracy:.2f}%")
#     print()  # Print an empty line between epochs


In [11]:
import torch.optim as optim
import matplotlib.pyplot as plt
import os
import torch
from sklearn.metrics import mean_squared_error

os.makedirs("four_fc_layer/Movie_plots", exist_ok=True)

scenarios = [
    {
        'optimizer': 'SGD',
        'learning_rate': 0.1,
        'loss_function': torch.nn.MSELoss(),
        'num_epochs': 10,
        'scheduler': None,
        'num_batches': 128
    },
    {
        'optimizer': 'Adam',
        'learning_rate': 0.1,
        'loss_function': torch.nn.MSELoss(),
        'num_epochs': 10,
        'scheduler': None,
        'num_batches': 64
    },
    {
        'optimizer': 'Adam',
        'learning_rate': 0.1,
        'loss_function': torch.nn.MSELoss(),
        'num_epochs': 20,
        'scheduler': None,
        'num_batches': 32
    },
    {
        'optimizer': 'Adam',
        'learning_rate': 0.1,
        'loss_function': torch.nn.MSELoss(),
        'num_epochs': 100,
        'scheduler': None,
        'num_batches': 64
    },
    {
        'optimizer': 'Adam',
        'learning_rate': 0.01,
        'loss_function': torch.nn.SmoothL1Loss(),
        'num_epochs': 20,
        'scheduler': optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1),
        'num_batches': 128
    },
    {
        'optimizer': 'Adam',
        'learning_rate': 0.01,
        'loss_function': torch.nn.MSELoss(),
        'num_epochs': 200,
        'scheduler': optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1),
        'num_batches': 128
    }
]

# Open the output file in append mode
with open("four_fc_layer/movie_results.txt", "a") as output_file:
    for i, scenario in enumerate(scenarios):
        learning_rate = scenario['learning_rate']
        loss_function = scenario['loss_function']
        num_epochs = scenario['num_epochs']
        scheduler = scenario['scheduler']
        num_batches = scenario['num_batches']
        optimizer_name = scenario['optimizer']
        
        # Create a new instance of the model and optimizer with the current hyperparameters
        model = Recommender(num_users=len(lbl_user.classes_), num_isbns=len(lbl_book.classes_), embedding_dim=64)
        if optimizer_name == 'SGD':
            optimizer = optim.SGD(model.parameters(), lr=learning_rate)
        elif optimizer_name == 'Adam':
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        # Create train and validation data loaders based on number of batches
        train_loader = DataLoader(dataset=train_dataset, batch_size=num_batches, shuffle=True, num_workers=4, drop_last=True)
        valid_loader = DataLoader(dataset=valid_dataset, batch_size=num_batches, shuffle=True, num_workers=4, drop_last=True)
        
        # Print the current scenario and number of batches
        output_file.write(f"Scenario {i+1}: Learning Rate = {learning_rate}, Loss Function = {loss_function.__class__.__name__}\n")
        output_file.write(f"Number of Batches: {num_batches}\n")
        if scheduler is not None:
            output_file.write(f"Scheduler: {scheduler.__class__.__name__}\n")
        output_file.write("Epoch\tLoss\tValidation loss\tRMSE\tValidation Accuracy\n")
        
        # Training loop
        train_losses = []  # List to store the training loss values
        valid_losses = []  # List to store the validation RMSE values
        valid_accuracies = []  # List to store the validation accuracies
        rmse_values = []

        for epoch in range(num_epochs):
            model.train()  # Switch to training mode
            running_loss = 0.0
            for batch in train_loader:
                optimizer.zero_grad()
                outputs = model(batch["user_id"], batch["isbn"])
                loss = loss_function(outputs, batch["rating"].unsqueeze(1))
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
            epoch_loss = running_loss / len(train_loader)
            train_losses.append(epoch_loss)
            if scheduler is not None:
                scheduler.step() 
            # Perform validation
            model.eval()  # Switch to evaluation mode
            total_loss = 0.0
            total_correct = 0
            total_samples = 0
            predictions = []
            targets = []
            with torch.no_grad():
                for batch in valid_loader:
                    user_ids, isbns, ratings = batch['user_id'], batch['isbn'], batch['rating']
                    outputs = model(user_ids, isbns)
                    ratings = ratings.view(-1, 1)  # Reshape the target tensor
                    predicted_ratings = torch.round(outputs)  # Round the predicted ratings
                    predictions.extend(predicted_ratings.tolist())
                    targets.extend(ratings.tolist())
                    correct = ((predicted_ratings == ratings) | (predicted_ratings == ratings - 0.5) | (predicted_ratings == ratings + 0.5)).sum().item()
                    total_correct += correct
                    total_samples += ratings.size(0)
                    loss = loss_function(outputs, ratings)
                    total_loss += loss.item()
            valid_loss = total_loss / len(valid_loader)
            valid_losses.append(valid_loss)

            # Calculate RMSE
            rmse = mean_squared_error(targets, predictions, squared=True)
            rmse_values.append(rmse)
            # Calculate accuracy
            accuracy = total_correct / total_samples

            valid_accuracies.append(accuracy)

            # Save the epoch results to the output file
            output_file.write(f"{epoch+1}\t{epoch_loss:.4f}\t{valid_loss:.4f}\t{rmse:.2f}\t{100*accuracy:.2f}\n")

        output_file.write("\n")  # Add a separator between different scenarios

        # Plot the training and validation losses
        plt.plot(train_losses, label='Training Loss')
        plt.plot(valid_losses, label='Validation RMSE')
        plt.xlabel('Epoch')
        plt.ylabel('Loss / RMSE')
        plt.title(f'Loss and RMSE - Scenario {i+1}')
        plt.legend()
        plot_file = os.path.join("four_fc_layer/Movie_plots", f'scenario_{i+1}_loss_plot.png')
        plt.savefig(plot_file)
        plt.close()

        # Plot the validation accuracy
        plt.plot(valid_accuracies)
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.title(f'Validation Accuracy - Scenario {i+1}')
        plot_file = os.path.join("four_fc_layer/Movie_plots", f'scenario_{i+1}_accuracy_plot.png')
        plt.savefig(plot_file)
        plt.close()

print('Finished Training')




Finished Training
