In [None]:
#@title Step 1: Imports

import pandas as pd
import numpy as np
import math
import torch
%matplotlib inline
import matplotlib.pyplot as plt  # For plotting graphs
import os                        # For manipulating file paths
import sklearn.datasets          # For regression data
from torch.utils.tensorboard import SummaryWriter
import json
import copy
from sklearn.model_selection import train_test_split

!pip install torchensemble
from torchensemble import BaggingRegressor, GradientBoostingRegressor, FusionRegressor, AdversarialTrainingRegressor, SnapshotEnsembleRegressor
from torchensemble.utils.logging import set_logger

Collecting torchensemble
  Downloading torchensemble-0.1.5-py3-none-any.whl (39 kB)
Collecting scikit-learn>=0.23.0
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.5 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn, torchensemble
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.2 threadpoolctl-2.2.0 torchensemble-0.1.5


In [None]:
#@title Step 2: Download data
!pip install kaggle

!mkdir ~/.kaggle

kaggle_username = "wmd0701" #@param {type:"string"}
kaggle_api_key = "8f525bc765511d324f8509b938d7f39c" #@param {type:"string"}

assert len(kaggle_username) > 0 and len(kaggle_api_key) > 0

api_token = {"username": kaggle_username,"key": kaggle_api_key}

with open('kaggle.json', 'w') as file:
    json.dump(api_token, file)

!mv kaggle.json ~/.kaggle/kaggle.json

!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c cil-collaborative-filtering-2021

!unzip data_train.csv.zip 
!unzip sampleSubmission.csv.zip
print()

# Dimensions of the user x movie rating matrix; reused below for the sparsity
# figure and as the default embedding sizes of the model.
number_of_users = 10000
number_of_movies = 1000

# Labelled ratings: one row per (user, movie) pair with its rating
data_pd = pd.read_csv('data_train.csv')
print(data_pd.head(5))
print('\nShape', data_pd.shape)

# Pairs whose rating has to be predicted (pandas reads the zip archive directly)
submission_pd = pd.read_csv('sampleSubmission.csv.zip')
print("\n\n",submission_pd.head(5))
print("\nShape:", submission_pd.shape)

# Distinct rating values together with how often each occurs
rating_summary = np.unique(data_pd.Prediction.values, return_counts=True)
print("\n\nSummary:", rating_summary)

# Fraction of the user x movie matrix that has no observed rating
matrix_cells = number_of_users * number_of_movies
sparsity = 1.0 - data_pd.shape[0] / matrix_cells
print("\nSparsity:", sparsity)

Downloading sampleSubmission.csv.zip to /content
  0% 0.00/2.92M [00:00<?, ?B/s]
100% 2.92M/2.92M [00:00<00:00, 98.7MB/s]
Downloading data_train.csv.zip to /content
  0% 0.00/3.33M [00:00<?, ?B/s]
100% 3.33M/3.33M [00:00<00:00, 110MB/s]
Archive:  data_train.csv.zip
  inflating: data_train.csv          
Archive:  sampleSubmission.csv.zip
  inflating: sampleSubmission.csv    

       Id  Prediction
0  r44_c1           4
1  r61_c1           3
2  r67_c1           4
3  r72_c1           3
4  r86_c1           5

Shape (1176952, 2)


         Id  Prediction
0   r37_c1           3
1   r73_c1           3
2  r156_c1           3
3  r160_c1           3
4  r248_c1           3

Shape: (1176952, 2)


Summary: (array([1, 2, 3, 4, 5]), array([ 43508,  99180, 274327, 324700, 435237]))

Sparsity: 0.8823048


In [None]:
#@title Step 3: Split to training set and test set

# Fraction of the labelled rows used for training; the remainder is held out.
train_size = 0.9

# random_state is fixed so the same rows land in each split on every run.
train_pd, test_pd = train_test_split(data_pd, random_state=0, train_size=train_size)
for split_pd in (train_pd, test_pd):
    print(split_pd.shape)

def extract_users_items_predictions(data_pd):
    """Decode a ratings frame into parallel index and rating arrays.

    Args:
        data_pd: DataFrame with an ``Id`` column of the form ``r<user>_c<movie>``
            (1-based numbers) and a ``Prediction`` column.

    Returns:
        Tuple ``(users, movies, predictions)`` of 1-D arrays with one entry per
        row of ``data_pd``. ``users`` and ``movies`` are zero-based integers;
        ``predictions`` is the ``Prediction`` column as-is.
    """
    # Raw string: `\d` in a normal string literal is an invalid escape sequence.
    ids = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column slicing (instead of split + squeeze) keeps the result 1-D even when
    # the frame has a single row.
    users, movies = ids[:, 0], ids[:, 1]
    predictions = data_pd.Prediction.values
    return users, movies, predictions

# Decode the training split, the held-out split and the full labelled set (in
# that order) into (users, movies, predictions) array triples.
train_split, test_split, full_split = map(extract_users_items_predictions, (train_pd, test_pd, data_pd))
train_users, train_movies, train_predictions = train_split
test_users, test_movies, test_predictions = test_split
all_users, all_movies, all_predictions = full_split

(1059256, 2)
(117696, 2)


In [None]:
#@title Step 4: Use GPU if available

# Pick CUDA when torch can see a GPU, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Device is {device}.")

Device is cuda.


In [None]:
#@title Step 5: Create data loaders
batch_size = 64

def _to_index_tensor(values):
    """Copy an index array onto `device` as a 32-bit integer tensor."""
    return torch.tensor(values, device=device).int()

def _to_rating_tensor(values):
    """Copy a rating array onto `device` as a 32-bit float tensor."""
    return torch.tensor(values, device=device).float()

# datasets and data loaders for training/validation
train_users_, train_movies_ = _to_index_tensor(train_users), _to_index_tensor(train_movies)
train_predictions_ = _to_rating_tensor(train_predictions)
test_users_, test_movies_ = _to_index_tensor(test_users), _to_index_tensor(test_movies)
test_predictions_ = _to_rating_tensor(test_predictions)

train_set = torch.utils.data.TensorDataset(train_users_, train_movies_, train_predictions_)
test_set  = torch.utils.data.TensorDataset(test_users_, test_movies_, test_predictions_)

# Only the training data is shuffled; the held-out data keeps its order.
train_loader = torch.utils.data.DataLoader(train_set, shuffle=True, batch_size=batch_size)
test_loader  = torch.utils.data.DataLoader(test_set, shuffle=False, batch_size=batch_size)

# datasets and data loaders for the whole task
all_users_, all_movies_ = _to_index_tensor(all_users), _to_index_tensor(all_movies)
all_predictions_ = _to_rating_tensor(all_predictions)
all_set = torch.utils.data.TensorDataset(all_users_, all_movies_, all_predictions_)
all_loader = torch.utils.data.DataLoader(all_set, shuffle=True, batch_size=batch_size)

# Number of batches per epoch for each loader
for loader in (train_loader, test_loader, all_loader):
    print(len(loader))

16551
1839
18390


In [None]:
#@title Step 6: scaled sigmoided SVD++ class

def weights_init(m):
    """Re-initialise embedding weights with Xavier-uniform values.

    Intended for ``Module.apply``: modules that are not ``torch.nn.Embedding``
    are left untouched. Returns ``None``.
    """
    if not isinstance(m, torch.nn.Embedding):
        return
    torch.nn.init.xavier_uniform_(m.weight.data)

class ssSVDpp(torch.nn.Module):
  """Embedding-based rating model with a scaled-sigmoid output.

  The raw score of a (user, movie) pair is the dot product of their embedding
  vectors, optionally plus a user bias, a movie bias and a global bias. The
  score is passed through a sigmoid and mapped to the open interval (0.5, 5.5).

  Args:
    number_of_users: number of rows in the user embedding tables.
    number_of_movies: number of rows in the movie embedding tables.
    rank: dimensionality of the user and movie embedding vectors.
    biased: when True, the three bias terms are added to the raw score.
  """
  def __init__(self, number_of_users=number_of_users, number_of_movies=number_of_movies, rank=20, biased=True):
    super().__init__()
    self.biased = biased
    self.rank = rank
    self.sigmoid       = torch.nn.Sigmoid()
    self.user_matrix   = torch.nn.Embedding(number_of_users , self.rank)
    self.movie_matrix  = torch.nn.Embedding(number_of_movies, self.rank)
    self.user_biases   = torch.nn.Embedding(number_of_users , 1)
    self.movie_biases  = torch.nn.Embedding(number_of_movies, 1)
    self.global_biases = torch.nn.Embedding(1, 1)

    # Non-trainable constants. They are kept ONLY as buffers so that
    # .to()/.cuda() moves them together with the embeddings; forward() therefore
    # never depends on a module-level `device` or on tensors left on the CPU.
    self.register_buffer('const_five', torch.tensor(5.))
    self.register_buffer('const_half', torch.tensor(.5))
    # Index used to look up the single row of `global_biases`.
    self.register_buffer('const_zero', torch.tensor(0))

    # embedding initialization
    self.apply(weights_init)

  def forward(self, user, movie):
    """Predict ratings for a batch of (user, movie) index pairs.

    Args:
      user: 1-D integer tensor of user indices.
      movie: 1-D integer tensor of movie indices, same length as `user`.

    Returns:
      1-D tensor with one value in (0.5, 5.5) per input pair.
    """
    # Row-wise dot product of the two embeddings, kept as a column (batch, 1)
    pred = (self.user_matrix(user) * self.movie_matrix(movie)).sum(1, keepdim=True)
    if self.biased:
      pred = pred + self.user_biases(user) + self.movie_biases(movie) + self.global_biases(self.const_zero)

    # sigmoid(.) lies in (0, 1), so the result lies in (0.5, 5.5): slightly wider
    # than the 1..5 rating scale. squeeze(-1) drops only the trailing column, so
    # a batch of size one still yields a 1-D tensor.
    return self.sigmoid(pred.squeeze(-1)) * self.const_five + self.const_half

In [None]:
#@title Step 7: Ensemble model and optimizer

# Other ensembling strategies imported in Step 1 that can be swapped in for
# BaggingRegressor: FusionRegressor, GradientBoostingRegressor,
# SnapshotEnsembleRegressor.
use_cuda = torch.cuda.is_available()
model_ensemble = BaggingRegressor(estimator=ssSVDpp, n_estimators=50, cuda=use_cuda, n_jobs=10)

# optimizer
optimizer_settings = dict(lr=3e-4, weight_decay=5e-5)
model_ensemble.set_optimizer('Adam', **optimizer_settings)

# logger (run `!rm -r logs` first to discard logs of earlier runs)
# NOTE(review): the log name says "50epochs" while Step 8 sets n_epochs = 30 — confirm which is intended.
logger = set_logger('bagging_ssSVDpp_50estimators_50epochs')

Log will be saved in '/content/logs'.
Create folder 'logs/'
Start logging into file /content/logs/bagging_ssSVDpp_50estimators_50epochs-2021_07_30_15_49.log...


In [None]:
#@title Step 8: Train and validate

# validate=True  -> fit on the 90% split and pass the held-out 10% as test_loader
# validate=False -> fit on 100% of the labelled data, without a test_loader
validate = True

# whether save model
save_model = False

# epochs
n_epochs = 30

# Arguments common to both training modes
shared_fit_args = dict(epochs=n_epochs, log_interval=5000, save_model=save_model)

if not validate:
    model_ensemble.fit(all_loader, **shared_fit_args)
else:
    model_ensemble.fit(train_loader, test_loader=test_loader, **shared_fit_args)

In [None]:
#@title Step 9: Generate predictions

submission_users, submission_movies, submission_predictions = extract_users_items_predictions(submission_pd)
print(submission_users)
print(submission_movies)
print(submission_predictions)

submission_users_  = torch.tensor(submission_users).int()
submission_movies_ = torch.tensor(submission_movies).int()
submission_set     = torch.utils.data.TensorDataset(submission_users_ , submission_movies_)
# shuffle=False keeps the predictions aligned with the rows of submission_pd.
# A bounded batch size avoids pushing the whole submission set through every
# estimator in one forward pass.
submission_loader  = torch.utils.data.DataLoader(submission_set, shuffle=False, batch_size=2**16)

# Collect the prediction of EVERY batch. Assigning `output` inside the loop would
# silently keep only the last batch whenever the data spans more than one batch.
batch_outputs = []
for user, movie in submission_loader:
    # reshape(-1) guarantees a 1-D tensor per batch so the pieces concatenate
    batch_outputs.append(model_ensemble.predict(user, movie).detach().reshape(-1))
output = torch.cat(batch_outputs)

# One prediction per requested (user, movie) pair
assert len(output) == len(submission_users)

[  36   72  155 ... 9977 9981 9995]
[  0   0   0 ... 999 999 999]
[3 3 3 ... 3 3 3]


In [None]:
#@title Step 10: Clamp output in range 1.0 ~ 5.0

# Range of the raw predictions
for extreme in (output.min(), output.max()):
    print(extreme)
print()

# Limit every prediction to the closed interval [1.0, 5.0]
output = output.clamp(min=1.0, max=5.0)

# Range after clamping
for extreme in (output.min(), output.max()):
    print(extreme)
print()

tensor(0.9464)
tensor(5.2638)

tensor(1.)
tensor(5.)



In [None]:
#@title Step 11: Generate submission file

# Rebuild the "r<user>_c<movie>" identifiers (1-based) from the zero-based index
# arrays. The results go into NEW variables: overwriting submission_users /
# submission_movies with strings would make this cell fail when run a second time.
user_labels  = np.char.add('r', (submission_users + 1).astype(str))
movie_labels = np.char.add('_c', (submission_movies + 1).astype(str))
submission_IDs = np.char.add(user_labels, movie_labels)

# Hand pandas a plain NumPy array rather than a torch tensor.
if torch.is_tensor(output):
    predicted_ratings = output.detach().cpu().numpy()
else:
    predicted_ratings = np.asarray(output)
assert len(predicted_ratings) == len(submission_IDs)

# NOTE(review): the sample submission printed in Step 2 names this column 'Id',
# not 'ID' — confirm the competition accepts this header.
my_submission = pd.DataFrame({'ID': submission_IDs, 'Prediction': predicted_ratings})

print(my_submission.head(10))

my_submission.to_csv("my_submission.csv", index=False)

# re-check: read the file back and show its first rows
wtf = pd.read_csv('my_submission.csv')
wtf.head(10)

        ID  Prediction
0   r37_c1    3.257298
1   r73_c1    3.116233
2  r156_c1    3.737209
3  r160_c1    3.335373
4  r248_c1    3.417657
5  r256_c1    3.406450
6  r284_c1    3.230401
7  r400_c1    3.238351
8  r416_c1    3.548342
9  r456_c1    3.286206


Unnamed: 0,ID,Prediction
0,r37_c1,3.257298
1,r73_c1,3.116233
2,r156_c1,3.737209
3,r160_c1,3.335373
4,r248_c1,3.417657
5,r256_c1,3.40645
6,r284_c1,3.230401
7,r400_c1,3.238351
8,r416_c1,3.548342
9,r456_c1,3.286206
