In [None]:
#@title Step 1: Imports

import pandas as pd
import numpy as np
import math
import torch
%matplotlib inline
import matplotlib.pyplot as plt  
import os                       
import sklearn.datasets         
from torch.utils.tensorboard import SummaryWriter
import json
import copy
from sklearn.model_selection import train_test_split

In [None]:
#@title Step 2: Download data
!pip install kaggle

!mkdir ~/.kaggle

kaggle_username = "wmd0701" #@param {type:"string"}
kaggle_api_key = "8f525bc765511d324f8509b938d7f39c" #@param {type:"string"}

assert len(kaggle_username) > 0 and len(kaggle_api_key) > 0

api_token = {"username": kaggle_username,"key": kaggle_api_key}

with open('kaggle.json', 'w') as file:
    json.dump(api_token, file)

!mv kaggle.json ~/.kaggle/kaggle.json

!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c cil-collaborative-filtering-2021

!unzip data_train.csv.zip 
!unzip sampleSubmission.csv.zip
print()

number_of_users, number_of_movies = (10000, 1000)

data_pd = pd.read_csv('data_train.csv')
print(data_pd.head(5))
print('\nShape', data_pd.shape)

submission_pd = pd.read_csv('sampleSubmission.csv.zip')
print("\n\n",submission_pd.head(5))
print("\nShape:", submission_pd.shape)
print("\n\nSummary:", np.unique(data_pd.Prediction.values, return_counts=True))

sparsity = 1.0 - data_pd.shape[0] / (number_of_users * number_of_movies)
print("\nSparsity:", sparsity)

Downloading data_train.csv.zip to /content
  0% 0.00/3.33M [00:00<?, ?B/s]
100% 3.33M/3.33M [00:00<00:00, 112MB/s]
Downloading sampleSubmission.csv.zip to /content
  0% 0.00/2.92M [00:00<?, ?B/s]
100% 2.92M/2.92M [00:00<00:00, 200MB/s]
Archive:  data_train.csv.zip
  inflating: data_train.csv          
Archive:  sampleSubmission.csv.zip
  inflating: sampleSubmission.csv    

       Id  Prediction
0  r44_c1           4
1  r61_c1           3
2  r67_c1           4
3  r72_c1           3
4  r86_c1           5

Shape (1176952, 2)


         Id  Prediction
0   r37_c1           3
1   r73_c1           3
2  r156_c1           3
3  r160_c1           3
4  r248_c1           3

Shape: (1176952, 2)


Summary: (array([1, 2, 3, 4, 5]), array([ 43508,  99180, 274327, 324700, 435237]))

Sparsity: 0.8823048


In [None]:
#@title Step 3: Split data

# fraction of the labelled ratings used for training; the rest is held out
train_size = 0.9

# fixed random_state so every run produces the same split
train_pd, test_pd = train_test_split(
    data_pd,
    train_size=train_size,
    random_state=0,
)
for _split in (train_pd, test_pd):
    print(_split.shape)

def extract_users_items_predictions(data_pd):
    """Turn a ratings frame into parallel index / rating arrays.

    Args:
        data_pd: DataFrame with a string column ``Id`` of the form
            ``r<user>_c<movie>`` (1-based numbers) and a column ``Prediction``.

    Returns:
        Tuple ``(users, movies, predictions)``. ``users`` and ``movies`` are
        1-D integer arrays of zero-based indices (one entry per row);
        ``predictions`` is the ``Prediction`` column as an array.
    """
    # raw string: '\d' inside a plain literal is an invalid escape sequence
    ids = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # column indexing keeps the results 1-D even when the frame has a single row
    users, movies = ids[:, 0], ids[:, 1]
    predictions = data_pd.Prediction.values
    return users, movies, predictions

(1059256, 2)
(117696, 2)


In [None]:
#@title Step 4: Use GPU if available

# prefer CUDA when the runtime exposes it, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device is {device}.")

Device is cuda.


In [None]:
#@title Step 5: Classifier - create data loaders
batch_size = 64

# user indices, movie indices and ratings for each split
train_users, train_movies, train_predictions = extract_users_items_predictions(train_pd)
test_users, test_movies, test_predictions = extract_users_items_predictions(test_pd)
all_users, all_movies, all_predictions = extract_users_items_predictions(data_pd)

# shift ratings 1..5 down to class ids 0..4
train_predictions, test_predictions, all_predictions = (
    ratings - 1 for ratings in (train_predictions, test_predictions, all_predictions)
)

# training / validation tensors: int indices, long class targets
train_users_, train_movies_ = (
    torch.tensor(idx, device=device).int() for idx in (train_users, train_movies)
)
train_predictions_ = torch.tensor(train_predictions, device=device).long()
test_users_, test_movies_ = (
    torch.tensor(idx, device=device).int() for idx in (test_users, test_movies)
)
test_predictions_ = torch.tensor(test_predictions, device=device).long()

train_set = torch.utils.data.TensorDataset(train_users_, train_movies_, train_predictions_)
test_set = torch.utils.data.TensorDataset(test_users_, test_movies_, test_predictions_)
# only the training batches are shuffled; validation order stays fixed
train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=batch_size, shuffle=False)

# the same tensors built from the complete labelled set
all_users_, all_movies_ = (
    torch.tensor(idx, device=device).int() for idx in (all_users, all_movies)
)
all_predictions_ = torch.tensor(all_predictions, device=device).long()
all_set = torch.utils.data.TensorDataset(all_users_, all_movies_, all_predictions_)
all_loader = torch.utils.data.DataLoader(dataset=all_set, batch_size=batch_size, shuffle=True)

# number of batches per loader
for _loader in (train_loader, test_loader, all_loader):
    print(len(_loader))

16551
1839
18390


In [None]:
#@title Step 6: Classifier - MLP class

class MLP(torch.nn.Module):
    """Embedding-based rating classifier.

    A user embedding and a movie embedding are concatenated and passed through
    a stack of Linear/ReLU layers that ends in 5 output logits.

    Args:
        number_of_users: number of rows in the user embedding table.
        number_of_movies: number of rows in the movie embedding table.
        embedding_size: width of each embedding vector.
    """

    def __init__(self, number_of_users, number_of_movies, embedding_size=64):
        super().__init__()
        self.embedding_layer_users = torch.nn.Embedding(number_of_users, embedding_size)
        self.embedding_layer_movies = torch.nn.Embedding(number_of_movies, embedding_size)

        # layer widths: 2E -> E -> E/2 -> E/4, followed by the 5-way output layer
        widths = [
            embedding_size * 2,
            embedding_size,
            int(embedding_size / 2),
            int(embedding_size / 4),
        ]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(torch.nn.Linear(in_features=fan_in, out_features=fan_out))
            layers.append(torch.nn.ReLU())
        layers.append(torch.nn.Linear(in_features=widths[-1], out_features=5))
        self.feed_forward = torch.nn.Sequential(*layers)

    def forward(self, users, movies):
        """Return the (batch, 5) logits for batches of user and movie indices."""
        joint = torch.cat(
            [self.embedding_layer_users(users), self.embedding_layer_movies(movies)],
            dim=1,
        )
        return self.feed_forward(joint)

# helper function for calculating accuracy
def multi_acc(y_pred, y_true):
    """Count how many rows of a batch are classified correctly.

    Args:
        y_pred: per-class scores; the class axis is dim 1.
        y_true: class ids, one per row of ``y_pred``.

    Returns:
        0-d float tensor holding the number of rows whose highest-scoring
        class equals the target (a count, not a ratio).
    """
    # log_softmax is monotonic per row, so the arg-max of the raw scores
    # selects the same class without the extra computation
    y_pred_tags = torch.argmax(y_pred, dim=1)
    return (y_pred_tags == y_true).float().sum()

In [None]:
#@title Step 7: Classifier - initialize

def weights_init(m):
    """Xavier-uniform initialise ``m`` in place when it is an Embedding.

    Meant to be passed to ``Module.apply``; any other module type is left
    untouched.
    """
    if not isinstance(m, torch.nn.Embedding):
        return
    torch.nn.init.xavier_uniform_(m.weight.data)

model_MLP = MLP(number_of_users, number_of_movies, embedding_size=64)

# put the model in the device memory
model_MLP = model_MLP.to(device)

# count total number of parameters including non trainable
total_params_count = sum(p.numel() for p in model_MLP.parameters())
# count total trainable parameters
trainable_params_count = sum(p.numel() for p in model_MLP.parameters() if p.requires_grad)

# the first figure covers every parameter, so it must not be labelled "trainable"
print(f"Total number of parameters: {total_params_count}")
print(f"Number of trainable parameters: {trainable_params_count}")

# re-initialise the embedding tables as an explicit step rather than as a
# side effect hidden inside the print() call
model_MLP.apply(weights_init)
print(model_MLP)

Total number of trainable parameters: 714949
Number of trainable parameters: 714949
MLP(
  (embedding_layer_users): Embedding(10000, 64)
  (embedding_layer_movies): Embedding(1000, 64)
  (feed_forward): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=16, bias=True)
    (5): ReLU()
    (6): Linear(in_features=16, out_features=5, bias=True)
  )
)


In [None]:
#@title Step 8: Classifier - loss and optimizer

# optimisation hyper-parameters
learning_rate  = 3e-4
regularization = 5e-5  # handed to Adam as weight_decay

# classification objective on the network's logits
loss_func = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(
    params=model_MLP.parameters(),
    lr=learning_rate,
    weight_decay=regularization,
)

In [None]:
#@title Step 9: Classifier - training and validation


# set seed to make result reproducible
torch.manual_seed(0)

# number of epochs
n_epochs = 11

# whether to train with 90% data and validate with 10%,
# or to train with 100% data
validate = True

t_loader = train_loader if validate else all_loader


step = 0  # global batch counter across epochs, drives the periodic log line
for epoch in range(n_epochs):

    # train model
    model_MLP.train()
    total_loss_train, total_accuracy_train = 0., 0.
    for i, (user, movie, rating) in enumerate(t_loader):

        optimizer.zero_grad()
        output = model_MLP(user, movie)
        loss = loss_func(output, rating)
        loss.backward()
        optimizer.step()
        total_loss_train += loss.item()

        if validate:
            # .item() keeps the running count a plain Python float
            total_accuracy_train += multi_acc(output, rating).item()

        if step % 5000 == 0:
            print('[Epoch %03d] - Step %04d> train loss: %.4f' % (epoch, step, loss.item()))

        step += 1

    # validate model
    if validate:
        model_MLP.eval()
        total_loss_test, total_accuracy_test = 0., 0.
        with torch.no_grad():
            for i, (user, movie, rating) in enumerate(test_loader):
                output = model_MLP(user, movie)
                loss = loss_func(output, rating)
                total_loss_test += loss.item()
                total_accuracy_test += multi_acc(output, rating).item()

                # NOTE(review): `step` is not advanced during validation, so this
                # condition is the same for every validation batch of an epoch
                if step % 5000 == 0:
                    print('[Epoch %03d] - Step %04d> test  loss: %.4f' % (epoch, step, loss.item()))

    # average of the per-batch losses
    total_loss_train /= len(t_loader)
    print('[Epoch %03d] - > avg train loss: %.4f' % (epoch, total_loss_train))

    if validate:
        total_loss_test /= len(test_loader)
        # divide by the true sample count: len(loader) * batch_size over-counts
        # whenever the last batch is only partially filled
        total_accuracy_train /= len(t_loader.dataset)
        total_accuracy_test /= len(test_loader.dataset)

        print('[Epoch %03d] - > avg test  loss: %.4f' % (epoch, total_loss_test))
        print('[Epoch %03d] - > avg train accu: %.4f' % (epoch, total_accuracy_train))
        print('[Epoch %03d] - > avg test  accu: %.4f' % (epoch, total_accuracy_test))

In [None]:
#@title Step 10: Regressor - create data loaders
batch_size = 64

# user indices, movie indices and ratings for each split
train_users, train_movies, train_predictions = extract_users_items_predictions(train_pd)
test_users, test_movies, test_predictions = extract_users_items_predictions(test_pd)
all_users, all_movies, all_predictions = extract_users_items_predictions(data_pd)

# training / validation tensors: int indices, float regression targets
train_users_, train_movies_ = (
    torch.tensor(idx, device=device).int() for idx in (train_users, train_movies)
)
train_predictions_ = torch.tensor(train_predictions, device=device).float()
test_users_, test_movies_ = (
    torch.tensor(idx, device=device).int() for idx in (test_users, test_movies)
)
test_predictions_ = torch.tensor(test_predictions, device=device).float()

train_set = torch.utils.data.TensorDataset(train_users_, train_movies_, train_predictions_)
test_set = torch.utils.data.TensorDataset(test_users_, test_movies_, test_predictions_)
# only the training batches are shuffled; validation order stays fixed
train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=batch_size, shuffle=False)

# the same tensors built from the complete labelled set
all_users_, all_movies_ = (
    torch.tensor(idx, device=device).int() for idx in (all_users, all_movies)
)
all_predictions_ = torch.tensor(all_predictions, device=device).float()
all_set = torch.utils.data.TensorDataset(all_users_, all_movies_, all_predictions_)
all_loader = torch.utils.data.DataLoader(dataset=all_set, batch_size=batch_size, shuffle=True)

# number of batches per loader
for _loader in (train_loader, test_loader, all_loader):
    print(len(_loader))

16551
1839
18390


In [None]:
#@title Step 11: Regressor - SVD++ class

class SVDpp(torch.nn.Module):
  """Matrix-factorisation rating regressor with optional bias terms.

  The prediction for a (user, movie) pair is the dot product of their
  rank-``rank`` factor vectors; when ``biased`` is true a per-user bias, a
  per-movie bias and one global bias are added.

  Args:
    number_of_users: number of rows in the user factor / bias tables.
    number_of_movies: number of rows in the movie factor / bias tables.
    rank: width of the factor vectors.
    biased: whether the bias terms are added in ``forward``.
  """

  def __init__(self, number_of_users=number_of_users, number_of_movies=number_of_movies, rank=20, biased=True):
    super().__init__()
    self.biased = biased
    self.rank = rank
    self.user_matrix   = torch.nn.Embedding(number_of_users , self.rank)
    self.movie_matrix  = torch.nn.Embedding(number_of_movies, self.rank)
    self.user_biases   = torch.nn.Embedding(number_of_users , 1)
    self.movie_biases  = torch.nn.Embedding(number_of_movies, 1)
    self.global_biases = torch.nn.Embedding(1, 1)

    # Index of the single row of the global-bias table. Kept only as a
    # registered buffer so that it follows the module through .to(device)
    # instead of being pinned to whatever the global `device` was at
    # construction time.
    self.register_buffer('const_zero', torch.tensor(0))

  @property
  def zero(self):
    """Backward-compatible alias for the ``const_zero`` buffer."""
    return self.const_zero

  def forward(self, user, movie):
    """Return one predicted rating per (user, movie) pair, shape (batch,)."""
    pred  = (self.user_matrix(user) * self.movie_matrix(movie)).sum(1, keepdim=True)
    if self.biased:
      pred += self.user_biases(user) + self.movie_biases(movie) + self.global_biases(self.const_zero)

    # drop only the trailing unit axis; a bare squeeze() would also collapse
    # the batch axis for a batch of one and break the loss's shape match
    return pred.squeeze(1)

In [None]:
#@title Step 12: Regressor - initialize
model_SVD = SVDpp(number_of_users, number_of_movies, rank=20, biased=True)

# put the model in the device memory
model_SVD = model_SVD.to(device)

# count total number of parameters including non trainable
total_params_count = sum(p.numel() for p in model_SVD.parameters())
# count total trainable parameters
trainable_params_count = sum(p.numel() for p in model_SVD.parameters() if p.requires_grad)

# the first figure covers every parameter, so it must not be labelled "trainable"
print(f"Total number of parameters: {total_params_count}")
print(f"Number of trainable parameters: {trainable_params_count}")

# Re-initialise explicitly rather than as a side effect of print().
# weights_init acts on every Embedding, which here includes the bias tables.
model_SVD.apply(weights_init)
print(model_SVD)

Total number of trainable parameters: 231001
Number of trainable parameters: 231001
SVDpp(
  (user_matrix): Embedding(10000, 20)
  (movie_matrix): Embedding(1000, 20)
  (user_biases): Embedding(10000, 1)
  (movie_biases): Embedding(1000, 1)
  (global_biases): Embedding(1, 1)
)


In [None]:
#@title Step 13: Regressor - loss and optimizer

# optimisation hyper-parameters
learning_rate  = 3e-4
regularization = 5e-5  # handed to Adam as weight_decay

# squared-error objective on the predicted ratings
loss_func = torch.nn.MSELoss()

optimizer = torch.optim.Adam(
    params=model_SVD.parameters(),
    lr=learning_rate,
    weight_decay=regularization,
)

In [None]:
#@title Step 14: Regressor - training and validation

# set seed to make result reproducible
torch.manual_seed(0)

# number of epochs
n_epochs = 23

# whether to train with 90% data and validate with 10%,
# or to train with 100% data
validate = True

t_loader = train_loader if validate else all_loader


step = 0  # global batch counter across epochs, drives the periodic log line
for epoch in range(n_epochs):

    # train model
    model_SVD.train()
    total_loss_train, total_accuracy_train = 0., 0.
    for i, (user, movie, rating) in enumerate(t_loader):

        optimizer.zero_grad()
        output = model_SVD(user, movie)
        loss = loss_func(output, rating)
        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()

        # accuracy = predictions that round to the exact target rating;
        # .item() keeps the running count a plain Python float
        if validate:
            total_accuracy_train += (output.round() == rating).float().sum().item()

        if step % 5000 == 0:
            print('[Epoch %03d] - Step %04d> train loss: %.4f' % (epoch, step, loss.item()))
        step += 1


    # validate model
    if validate:
        model_SVD.eval()
        total_loss_test, total_accuracy_test = 0., 0.
        with torch.no_grad():
            for i, (user, movie, rating) in enumerate(test_loader):
                output = model_SVD(user, movie)
                loss = loss_func(output, rating)

                total_loss_test += loss.item()
                total_accuracy_test += (output.round() == rating).float().sum().item()

                # NOTE(review): `step` is not advanced during validation, so this
                # condition is the same for every validation batch of an epoch
                if step % 5000 == 0:
                    print('[Epoch %03d] - Step %04d> test  loss: %.4f' % (epoch, step, loss.item()))

    # average of the per-batch losses
    total_loss_train /= len(t_loader)
    print('[Epoch %03d] - > avg train loss: %.4f' % (epoch, total_loss_train))

    if validate:
        total_loss_test /= len(test_loader)
        # divide by the true sample count: len(loader) * batch_size over-counts
        # whenever the last batch is only partially filled
        total_accuracy_train /= len(t_loader.dataset)
        total_accuracy_test /= len(test_loader.dataset)

        print('[Epoch %03d] - > avg test  loss: %.4f' % (epoch, total_loss_test))
        print('[Epoch %03d] - > avg train accu: %.4f' % (epoch, total_accuracy_train))
        print('[Epoch %03d] - > avg test  accu: %.4f' % (epoch, total_accuracy_test))

In [None]:
#@title Step 15: Combo - class

class Combo(torch.nn.Module):
  """Per-movie blend of a frozen regressor and a frozen classifier.

  ``output = (1 - alpha[movie]) * regressor + alpha[movie] * classifier``
  where ``alpha`` is the only trainable table and starts at zero, so the
  initial output equals the regressor's prediction.

  Args:
    regressor: pre-trained module called as ``regressor(user, movie)``.
    classifier: pre-trained module called as ``classifier(user, movie)``
      returning per-class scores along dim 1.
  """

  def __init__(self, regressor=model_SVD, classifier=model_MLP):
    super().__init__()

    # work on private copies so the pre-trained originals stay untouched
    self.regressor  = copy.deepcopy(regressor)
    self.classifier = copy.deepcopy(classifier)

    # freeze pre-trained models
    for param in self.regressor.parameters():
      param.requires_grad = False
    for param in self.classifier.parameters():
      param.requires_grad = False

    # one blending weight per movie, initialised to zero
    self.alpha = torch.nn.Embedding(number_of_movies, 1)
    torch.nn.init.zeros_(self.alpha.weight)

    # Constant 1.0 kept only as a registered buffer so that it moves with the
    # module on .to(device) instead of staying behind on the CPU.
    self.register_buffer('const_one', torch.tensor(1.0))

  @property
  def one(self):
    """Backward-compatible alias for the ``const_one`` buffer."""
    return self.const_one

  def forward(self, user, movie):
    """Return the blended rating for each (user, movie) pair, shape (batch,)."""
    _, classifier_output = torch.max(self.classifier(user, movie), dim=1)

    # classifier outputs are categorical values 0~4, we need 1~5
    classifier_output = classifier_output + self.const_one

    # flatten explicitly: a bare squeeze() would also drop the batch axis for
    # a batch of one
    regressor_output = self.regressor(user, movie).reshape(-1)
    alpha = self.alpha(movie).squeeze(1)

    return (self.const_one - alpha) * regressor_output + alpha * classifier_output

In [None]:
#@title Step 16: Combo - initialize

model_combo = Combo(regressor=model_SVD, classifier=model_MLP)

# put the model in the device memory
model_combo = model_combo.to(device)

# count total number of parameters including non trainable
total_params_count = sum(p.numel() for p in model_combo.parameters())
# count total trainable parameters
trainable_params_count = sum(p.numel() for p in model_combo.parameters() if p.requires_grad)

# the first figure includes the frozen sub-models, so it must not be labelled
# "trainable"
print(f"Total number of parameters: {total_params_count}")
print(f"Number of trainable parameters: {trainable_params_count}")

Total number of trainable parameters: 946950
Number of trainable parameters: 1000


In [None]:
#@title Step 17: Combo - loss and optimizer

# optimisation hyper-parameters
learning_rate  = 1e-5
regularization = 1e-5  # handed to Adam as weight_decay

# squared-error objective on the blended rating
loss_func = torch.nn.MSELoss()

optimizer = torch.optim.Adam(
    params=model_combo.parameters(),
    lr=learning_rate,
    weight_decay=regularization,
)

In [None]:
#@title Step 18: Combo - training and validation

# set seed to make result reproducible
torch.manual_seed(0)

# number of epochs
n_epochs = 30

# whether to train with 90% data and validate with 10%,
# or to train with 100% data
validate = True

t_loader = train_loader if validate else all_loader


step = 0  # global batch counter across epochs, drives the periodic log line
for epoch in range(n_epochs):

    # train model
    model_combo.train()
    total_loss_train, total_accuracy_train = 0., 0.
    for i, (user, movie, rating) in enumerate(t_loader):

        optimizer.zero_grad()
        output = model_combo(user, movie)
        loss = loss_func(output, rating)
        loss.backward()
        optimizer.step()
        total_loss_train += loss.item()

        # accuracy = predictions that round to the exact target rating;
        # .item() keeps the running count a plain Python float
        if validate:
            total_accuracy_train += (output.round() == rating).float().sum().item()

        if step % 5000 == 0:
            print('[Epoch %03d] - Step %04d> train loss: %.4f' % (epoch, step, loss.item()))
        step += 1

    # validate model
    if validate:
        model_combo.eval()
        total_loss_test, total_accuracy_test = 0., 0.
        with torch.no_grad():
            for i, (user, movie, rating) in enumerate(test_loader):

                output = model_combo(user, movie)
                loss = loss_func(output, rating)
                total_loss_test += loss.item()
                total_accuracy_test += (output.round() == rating).float().sum().item()

                # NOTE(review): `step` is not advanced during validation, so this
                # condition is the same for every validation batch of an epoch
                if step % 5000 == 0:
                    print('[Epoch %03d] - Step %04d> test  loss: %.4f' % (epoch, step, loss.item()))

    # average of the per-batch losses
    total_loss_train /= len(t_loader)
    print('[Epoch %03d] - > avg train loss: %.4f' % (epoch, total_loss_train))

    if validate:
        total_loss_test /= len(test_loader)
        # divide by the true sample count: len(loader) * batch_size over-counts
        # whenever the last batch is only partially filled
        total_accuracy_train /= len(t_loader.dataset)
        total_accuracy_test /= len(test_loader.dataset)

        print('[Epoch %03d] - > avg test  loss: %.4f' % (epoch, total_loss_test))
        print('[Epoch %03d] - > avg train accu: %.4f' % (epoch, total_accuracy_train))
        print('[Epoch %03d] - > avg test  accu: %.4f' % (epoch, total_accuracy_test))