In [None]:
#@title Step 1: Imports

import pandas as pd
import numpy as np
import math
import torch
%matplotlib inline
import matplotlib.pyplot as plt  
import os                        
import sklearn.datasets          
from torch.utils.tensorboard import SummaryWriter
import json
import copy
from sklearn.model_selection import train_test_split

In [None]:
#@title Step 2: Download data
!pip install kaggle

!mkdir ~/.kaggle

kaggle_username = "wmd0701" #@param {type:"string"}
kaggle_api_key = "8f525bc765511d324f8509b938d7f39c" #@param {type:"string"}

assert len(kaggle_username) > 0 and len(kaggle_api_key) > 0

api_token = {"username": kaggle_username,"key": kaggle_api_key}

with open('kaggle.json', 'w') as file:
    json.dump(api_token, file)

!mv kaggle.json ~/.kaggle/kaggle.json

!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c cil-collaborative-filtering-2021

!unzip data_train.csv.zip 
!unzip sampleSubmission.csv.zip
print()

number_of_users, number_of_movies = (10000, 1000)

data_pd = pd.read_csv('data_train.csv')
print(data_pd.head(5))
print('\nShape', data_pd.shape)

submission_pd = pd.read_csv('sampleSubmission.csv.zip')
print("\n\n",submission_pd.head(5))
print("\nShape:", submission_pd.shape)
print("\n\nSummary:", np.unique(data_pd.Prediction.values, return_counts=True))

sparsity = 1.0 - data_pd.shape[0] / (number_of_users * number_of_movies)
print("\nSparsity:", sparsity)

Downloading sampleSubmission.csv.zip to /content
  0% 0.00/2.92M [00:00<?, ?B/s]
100% 2.92M/2.92M [00:00<00:00, 98.7MB/s]
Downloading data_train.csv.zip to /content
  0% 0.00/3.33M [00:00<?, ?B/s]
100% 3.33M/3.33M [00:00<00:00, 107MB/s]
Archive:  data_train.csv.zip
  inflating: data_train.csv          
Archive:  sampleSubmission.csv.zip
  inflating: sampleSubmission.csv    

       Id  Prediction
0  r44_c1           4
1  r61_c1           3
2  r67_c1           4
3  r72_c1           3
4  r86_c1           5

Shape (1176952, 2)


         Id  Prediction
0   r37_c1           3
1   r73_c1           3
2  r156_c1           3
3  r160_c1           3
4  r248_c1           3

Shape: (1176952, 2)


Summary: (array([1, 2, 3, 4, 5]), array([ 43508,  99180, 274327, 324700, 435237]))

Sparsity: 0.8823048


In [None]:
#@title Step 3: Split data

# Fraction of the labelled ratings used for training; the rest is held out.
train_size = 0.9

# A fixed random_state keeps the train/validation split identical across runs.
train_pd, test_pd = train_test_split(data_pd, train_size=train_size, random_state=0)
for split_pd in (train_pd, test_pd):
    print(split_pd.shape)

def extract_users_items_predictions(data_pd):
    """Parse a ratings frame into zero-based index arrays and rating values.

    Args:
        data_pd: DataFrame with an ``Id`` column of strings shaped like
            ``r<user>_c<movie>`` (one-based numbers) and a ``Prediction`` column.

    Returns:
        Tuple ``(users, movies, predictions)``: two 1-D integer arrays holding
        the zero-based user and movie indices, and the ``Prediction`` values.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    # Subtract 1 to turn the one-based ids into zero-based indices.
    indices = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column slicing keeps the result 1-D even for a single-row frame, where a
    # bare np.squeeze would collapse it to a 0-d array.
    users, movies = indices[:, 0], indices[:, 1]
    predictions = data_pd.Prediction.values
    return users, movies, predictions

# Parse the training split, the validation split and the full data set, in that order.
(
    (train_users, train_movies, train_predictions),
    (test_users, test_movies, test_predictions),
    (all_users, all_movies, all_predictions),
) = [extract_users_items_predictions(frame) for frame in (train_pd, test_pd, data_pd)]

(1059256, 2)
(117696, 2)


In [None]:
#@title Step 4: Use GPU if available

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device is {device}.")

Device is cuda.


In [None]:
#@title Step 5: Create data loaders
batch_size = 64


def _as_device_tensors(users, movies, predictions):
    """Return (int user ids, int movie ids, float ratings) as tensors on `device`."""
    return (
        torch.tensor(users, device=device).int(),
        torch.tensor(movies, device=device).int(),
        torch.tensor(predictions, device=device).float(),
    )


# datasets and data loaders for training/validation
train_users_, train_movies_, train_predictions_ = _as_device_tensors(
    train_users, train_movies, train_predictions)
test_users_, test_movies_, test_predictions_ = _as_device_tensors(
    test_users, test_movies, test_predictions)
train_set = torch.utils.data.TensorDataset(train_users_, train_movies_, train_predictions_)
test_set = torch.utils.data.TensorDataset(test_users_, test_movies_, test_predictions_)
# Only the training loader is shuffled; validation keeps the data order.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

# datasets and data loaders for the whole task
all_users_, all_movies_, all_predictions_ = _as_device_tensors(
    all_users, all_movies, all_predictions)
all_set = torch.utils.data.TensorDataset(all_users_, all_movies_, all_predictions_)
all_loader = torch.utils.data.DataLoader(all_set, batch_size=batch_size, shuffle=True)

# Number of batches per loader.
for loader in (train_loader, test_loader, all_loader):
    print(len(loader))

16551
1839
18390


In [None]:
#@title Step 6: scaled sigmoided SVD++ class

class ssSVDpp(torch.nn.Module):
  """Matrix-factorisation rating model with optional biases and a scaled sigmoid.

  The raw score is the dot product of a user embedding and a movie embedding,
  plus (when ``biased``) a user bias, a movie bias and one global bias. The
  score is passed through a sigmoid and mapped to the open interval
  (0.5, 5.5) via ``sigmoid(score) * 5 + 0.5``.

  Args:
    number_of_users: number of rows in the user embedding tables.
    number_of_movies: number of rows in the movie embedding tables.
    rank: dimensionality of the user/movie embeddings.
    biased: whether to add the user, movie and global bias terms.
  """

  def __init__(self, number_of_users=number_of_users, number_of_movies=number_of_movies, rank=20, biased=True):
    super().__init__()
    self.biased = biased
    self.rank = rank
    self.user_matrix   = torch.nn.Embedding(number_of_users , self.rank)
    self.movie_matrix  = torch.nn.Embedding(number_of_movies, self.rank)
    self.user_biases   = torch.nn.Embedding(number_of_users , 1)
    self.movie_biases  = torch.nn.Embedding(number_of_movies, 1)
    self.global_biases = torch.nn.Embedding(1, 1)

    # sigmoid activation
    self.sigmoid       = torch.nn.Sigmoid()

    # Constants live in buffers only, so they follow the module through
    # .to()/.cuda() instead of being pinned to whatever the global `device`
    # was when the model was built. Buffer names are unchanged, which keeps
    # existing state_dicts loadable.
    self.register_buffer('const_five', torch.tensor(5.))
    self.register_buffer('const_half', torch.tensor(.5))
    self.register_buffer('const_zero', torch.tensor(0))

  # Read-only aliases kept for code that still accesses the old attribute names.
  @property
  def five(self):
    return self.const_five

  @property
  def half(self):
    return self.const_half

  @property
  def zero(self):
    return self.const_zero

  def forward(self, user, movie):
    """Predict ratings for paired 1-D index tensors ``user`` and ``movie``.

    Returns a 1-D tensor with one prediction per (user, movie) pair.
    """
    pred = (self.user_matrix(user) * self.movie_matrix(movie)).sum(1, keepdim=True)
    if self.biased:
      # The global bias is a one-row embedding looked up at index 0; its
      # shape-(1,) result broadcasts over the batch.
      pred = pred + self.user_biases(user) + self.movie_biases(movie) + self.global_biases(self.const_zero)

    # squeeze(1) removes only the keepdim axis; a bare squeeze() would also
    # drop the batch axis when the batch holds a single sample.
    return self.sigmoid(pred.squeeze(1)) * self.const_five + self.const_half

In [None]:
#@title Step 7: Instantiate model
model = ssSVDpp(number_of_users, number_of_movies, rank=20, biased=True)

# put the model in the device memory
model = model.to(device)

# count total number of parameters including non trainable
total_params_count = sum(p.numel() for p in model.parameters())
# count total trainable parameters
trainable_params_count = sum(p.numel() for p in model.parameters() if p.requires_grad)

# The first label used to say "trainable" although it reports the total count.
print(f"Total number of parameters: {total_params_count}")
print(f"Number of trainable parameters: {trainable_params_count}")

Total number of trainable parameters: 231001
Number of trainable parameters: 231001


In [None]:
#@title Step 8: Embedding initialization

def weights_init(m):
    """Re-initialise an embedding layer in place with Xavier-uniform weights.

    Intended for ``Module.apply``; modules that are not ``torch.nn.Embedding``
    are left untouched.
    """
    if not isinstance(m, torch.nn.Embedding):
        return
    # xavier_uniform_ fills the tensor in place without recording gradients.
    torch.nn.init.xavier_uniform_(m.weight)

# Seed the global RNG before drawing the initial weights: the only other seed
# call in this notebook sits in the training cell, which runs after
# initialisation, so without this the starting point differs on every run.
torch.manual_seed(0)
model.apply(weights_init)

ssSVDpp(
  (user_matrix): Embedding(10000, 20)
  (movie_matrix): Embedding(1000, 20)
  (user_biases): Embedding(10000, 1)
  (movie_biases): Embedding(1000, 1)
  (global_biases): Embedding(1, 1)
  (sigmoid): Sigmoid()
)

In [None]:
#@title Step 9: Loss and optimizer

# Hyper-parameters shared by all three optimizers.
learning_rate  = 3e-4
regularization = 5e-5

# Mean squared error between predicted and observed ratings.
loss_func = torch.nn.MSELoss()

_adam_options = dict(lr=learning_rate, weight_decay=regularization)

# User-side and movie-side parameter groups get their own optimizer each;
# the global bias is updated together with the movie-side parameters.
_user_side_params = [model.user_matrix.weight, model.user_biases.weight]
_movie_side_params = [model.movie_matrix.weight, model.movie_biases.weight, model.global_biases.weight]

optimizer_user  = torch.optim.Adam(_user_side_params, **_adam_options)
optimizer_movie = torch.optim.Adam(_movie_side_params, **_adam_options)
# Joint optimizer over every parameter of the model.
optimizer_all   = torch.optim.Adam(model.parameters(), **_adam_options)


In [None]:
#@title Step 10: Training and validation

# set seed to make result reproducible
torch.manual_seed(0)

# number of epochs
n_epochs = 30

# whether to train in ALS style (optimize U and V alternatively)
# or in normal style
ALS_train = False

# whether to train with 90% data and validate with 10%,
# or to train with 100% data
validate = True

t_loader = train_loader if validate else all_loader

# In ALS style every batch gets one full forward/backward/update pass per
# optimizer (user side first, then movie side); otherwise a single joint pass.
step_optimizers = (optimizer_user, optimizer_movie) if ALS_train else (optimizer_all,)

# Accuracy must be normalised by the number of samples. len(loader) * batch_size
# overcounts whenever the last batch is smaller than batch_size.
n_train_samples = len(t_loader.dataset)
n_test_samples = len(test_loader.dataset)

step = 0
for epoch in range(n_epochs):

    # train model
    model.train()
    total_loss_train, total_accuracy_train = 0., 0.
    for user, movie, rating in t_loader:

        for optimizer in step_optimizers:
            optimizer.zero_grad()
            output = model(user, movie)
            loss = loss_func(output, rating)
            loss.backward()
            optimizer.step()

        total_loss_train += loss.item()

        # measure accuracy: number of predictions that round to the true rating
        # (.item() keeps the running total a Python number, not a device tensor)
        if validate:
            total_accuracy_train += (output.round() == rating).sum().item()

        if step % 5000 == 0:
            print('[Epoch %03d] - Step %04d> train loss: %.4f' % (epoch, step, loss.item()))
        step += 1

    # validate model
    if validate:
        model.eval()
        total_loss_test, total_accuracy_test = 0., 0.
        with torch.no_grad():
            # No per-batch logging here: the old check was keyed on `step`,
            # which does not change during validation, so it printed either
            # for every batch or for none. The epoch average is reported below.
            for user, movie, rating in test_loader:
                output = model(user, movie)
                loss = loss_func(output, rating)

                total_loss_test += loss.item()
                total_accuracy_test += (output.round() == rating).sum().item()

    # Average of the per-batch mean losses.
    total_loss_train /= len(t_loader)
    print('[Epoch %03d] - > avg train loss: %.4f' % (epoch, total_loss_train))

    if validate:
        total_loss_test /= len(test_loader)
        total_accuracy_train /= n_train_samples
        total_accuracy_test /= n_test_samples

        print('[Epoch %03d] - > avg test  loss: %.4f' % (epoch, total_loss_test))
        print('[Epoch %03d] - > avg train accu: %.4f' % (epoch, total_accuracy_train))
        print('[Epoch %03d] - > avg test  accu: %.4f' % (epoch, total_accuracy_test))

In [None]:
#@title Step 11: Generate predictions

# Parse the ids of the sample submission; its Prediction column is only a placeholder.
submission_users, submission_movies, submission_predictions = extract_users_items_predictions(submission_pd)
for parsed in (submission_users, submission_movies, submission_predictions):
    print(parsed)

submission_users_ = torch.tensor(submission_users, device=device).int()
submission_movies_ = torch.tensor(submission_movies, device=device).int()

# Score every requested (user, movie) pair in a single forward pass, with
# gradients disabled, and bring the result back to host memory.
model.eval()
with torch.no_grad():
    output = model(submission_users_, submission_movies_).to('cpu')

print(output)

In [None]:
#@title Step 12: Clamp output in range 1.0 ~ 5.0

def _print_range(values):
    """Print the minimum and maximum of `values`, followed by a blank line."""
    print(values.min())
    print(values.max())
    print()


# Range before and after restricting predictions to the valid rating interval.
_print_range(output)
output = output.clamp(min=1.0, max=5.0)
_print_range(output)

tensor(1.)
tensor(5.)

tensor(1.)
tensor(5.)



In [None]:
#@title Step 13: Generate .csv submission file

# Rebuild the 'r<user>_c<movie>' ids from the zero-based indices (hence the +1).
# The index arrays are not overwritten, so this cell can be re-run safely.
user_labels = np.char.add('r', (submission_users + 1).astype(str))
movie_labels = np.char.add('_c', (submission_movies + 1).astype(str))

submission_IDs = np.char.add(user_labels, movie_labels)

# The header is 'Id' to match the column name used by data_train.csv, the
# sample submission and extract_users_items_predictions. The predictions are
# converted to a NumPy array explicitly instead of handing pandas a torch tensor.
my_submission = pd.DataFrame({'Id': submission_IDs, 'Prediction': np.asarray(output)})

print(my_submission.head(10))

# index=False: do not write the DataFrame index as an extra column.
my_submission.to_csv("my_submission.csv", index=False)

# re-check
wtf = pd.read_csv('my_submission.csv')
wtf.head(10)

        ID  Prediction
0   r37_c1         3.0
1   r73_c1         3.0
2  r156_c1         4.0
3  r160_c1         3.0
4  r248_c1         3.0
5  r256_c1         3.0
6  r284_c1         3.0
7  r400_c1         3.0
8  r416_c1         4.0
9  r456_c1         3.0


Unnamed: 0,ID,Prediction
0,r37_c1,3.0
1,r73_c1,3.0
2,r156_c1,4.0
3,r160_c1,3.0
4,r248_c1,3.0
5,r256_c1,3.0
6,r284_c1,3.0
7,r400_c1,3.0
8,r416_c1,4.0
9,r456_c1,3.0
