In [None]:
!pip install scikit-surprise
!pip install optuna

Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 5.1MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1617644 sha256=2446f4589d948981b498a34e43c41311e61acdf1a0d411e427c97de9e8161224
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1
Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/1a/18/b49ca91cf592747e19f2d333c2a86cd7c81895b922a5a09adf6335471576/optuna-2.8.0-py3-none-any.whl (301kB)
[K     |█████████████████

In [None]:
# Mount Google Drive and set data paths.
import os
from google.colab import drive
from google.colab import files

# The Drive has to be mounted before any of the paths below can be read.
drive.mount('/content/gdrive')

# Project data folder on the mounted Drive and the two CSV files inside it.
DATA_PATH = "/content/gdrive/My Drive/ETH/Computational Intelligence Lab/CIL-Project/data"
TRAIN_DATA_PATH, TEST_DATA_PATH = (
    os.path.join(DATA_PATH, csv_name) for csv_name in ("data_train.csv", "data_test.csv")
)

Mounted at /content/gdrive


In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset
from sklearn.model_selection import train_test_split
import surprise
from surprise import accuracy
from surprise.model_selection import cross_validate
import optuna

In [None]:
def load_data(file_path: str, full_dataset: bool, train_val_split: bool, random_seed: int = 0, train_size: float = 0):
    """Read a ratings CSV and optionally split it into train/validation frames.

    Args:
        file_path: Path of the CSV file handed to ``pd.read_csv``.
        full_dataset: When False, only the first 10000 rows are kept
            (reduced dataset for quick test runs).
        train_val_split: When True, the frame is split with
            ``train_test_split`` and a ``(train, val)`` pair is returned.
        random_seed: Forwarded as ``random_state`` to the split.
        train_size: Forwarded as ``train_size`` to the split; only used when
            ``train_val_split`` is True.
            NOTE(review): the default of 0 is presumably not a usable split
            size - pass an explicit value when splitting; confirm.

    Returns:
        The loaded DataFrame, or a ``(train, val)`` tuple of DataFrames when
        ``train_val_split`` is True.
    """
    frame = pd.read_csv(file_path)

    # Reduce dataset for testing.
    frame = frame if full_dataset else frame.head(10000)

    if not train_val_split:
        return frame

    train_part, val_part = train_test_split(frame, train_size=train_size, random_state=random_seed)
    return train_part, val_part

def __extract_users_items_ratings(data_pd: pd.DataFrame):
    """Split the ``Id`` column into zero-based user/movie indices and fetch the ratings.

    Each ``Id`` is expected to contain ``r<row>_c<col>`` with one-based numbers;
    the row number becomes the user index and the column number the movie index,
    both shifted down by one.

    Args:
        data_pd: Frame with a string ``Id`` column and a ``Prediction`` column.

    Returns:
        Tuple ``(users, movies, ratings)`` of 1-D arrays with one entry per row.

    Raises:
        ValueError: If an ``Id`` does not match the pattern (the missing value
            produced by ``str.extract`` cannot be cast to int).
    """
    # Raw string literal: "\d" in a normal string is an invalid escape sequence.
    indices = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column slicing (instead of np.squeeze) keeps both results 1-D even for a
    # single-row frame, where squeeze would collapse them to 0-d scalars.
    users, movies = indices[:, 0], indices[:, 1]
    ratings = data_pd.Prediction.values
    return users, movies, ratings

def create_surprise_data(data_pd):
    """Convert a ratings frame into a ``surprise.Dataset``.

    The user, movie and rating arrays extracted from ``data_pd`` are placed in
    a three-column frame (``users``, ``movies``, ``ratings``) and loaded with a
    reader whose rating scale is 1 to 5.

    Args:
        data_pd: Frame with ``Id`` and ``Prediction`` columns.

    Returns:
        The dataset produced by ``surprise.Dataset.load_from_df``.
    """
    column_names = ['users', 'movies', 'ratings']
    column_values = __extract_users_items_ratings(data_pd)
    ratings_frame = pd.DataFrame(dict(zip(column_names, column_values)))

    return surprise.Dataset.load_from_df(
        ratings_frame[column_names],
        reader=surprise.Reader(rating_scale=(1, 5)),
    )

def __get_tensors_from_dataframe(data_pd: pd.DataFrame):
    """Return the users, movies and ratings of ``data_pd`` as int64 tensors.

    Args:
        data_pd: Frame with ``Id`` and ``Prediction`` columns.

    Returns:
        Tuple ``(users, movies, ratings)`` of ``torch.int64`` tensors, in that order.
    """
    return tuple(
        torch.tensor(column, dtype=torch.int64)
        for column in __extract_users_items_ratings(data_pd)
    )


def create_dataset(data_pd: pd.DataFrame, test_dataset: bool = False):
    """Wrap a ratings frame in a ``TensorDataset``.

    Args:
        data_pd: Frame with ``Id`` and ``Prediction`` columns.
        test_dataset: When True, the ratings tensor is left out of the dataset
            and the ``Id`` column is returned alongside it.

    Returns:
        ``TensorDataset(users, movies, ratings)`` by default, or the pair
        ``(ids, TensorDataset(users, movies))`` when ``test_dataset`` is True.
    """
    users_t, movies_t, ratings_t = __get_tensors_from_dataframe(data_pd)

    if test_dataset:
        return data_pd.Id, TensorDataset(users_t, movies_t)
    return TensorDataset(users_t, movies_t, ratings_t)

In [None]:
# Run configuration: seed for NumPy's global RNG and whether to load every row.
random_seed, full_dataset = 42, True
np.random.seed(random_seed)

# Load the training ratings as one frame (no train/validation split at this stage).
train_pd = load_data(TRAIN_DATA_PATH, full_dataset, train_val_split=False)

In [None]:
# Wrap the training frame as a surprise Dataset and hold out 20% for validation.
# The split is seeded explicitly so that re-running this cell reproduces the same
# hold-out set instead of depending on the current state of NumPy's global RNG.
train_data = create_surprise_data(train_pd)
trainset, testset = surprise.model_selection.train_test_split(
    train_data, test_size=0.2, random_state=random_seed
)

In [None]:
from surprise import SVDpp

def objective(trial):
    """Optuna objective: fit SVD++ with sampled hyper-parameters and score it.

    Reads the module-level ``trainset`` / ``testset`` split and ``random_seed``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        RMSE of the fitted model's predictions on ``testset`` (lower is better).
    """
    # The "default" notes below are surprise's SVD defaults, which are also the
    # values enqueued as the baseline trial; SVDpp itself defaults to
    # n_factors=20 and lr_all=0.007.
    n_factors = trial.suggest_int("n_factors", 2, 150)  # default 100
    n_epochs = trial.suggest_int("n_epochs", 10, 30)  # default 20
    # init_mean is not tuned: 0 was best for plain SVD, so the library default is kept.
    init_std_dev = trial.suggest_float("init_std_dev", 0, 1)  # default 0.1
    lr_all = trial.suggest_float("lr_all", 0, 1e-2)  # default 0.005
    reg_all = trial.suggest_float("reg_all", 0, 1e-1)  # default 0.02

    # Seed the factor initialisation so a given parameter set always yields the
    # same RMSE; otherwise trial values are not comparable between runs.
    algo = SVDpp(
        n_factors=n_factors,
        n_epochs=n_epochs,
        init_std_dev=init_std_dev,
        lr_all=lr_all,
        reg_all=reg_all,
        random_state=random_seed,
    )

    algo.fit(trainset)
    predictions = algo.test(testset)
    # verbose=False: Optuna already logs the value of every finished trial, so
    # the extra "RMSE: ..." line would only duplicate it.
    return accuracy.rmse(predictions, verbose=False)

In [None]:
# Seed the sampler so the sequence of suggested hyper-parameters is reproducible
# across runs (TPE is Optuna's default sampler; only the seed is added here).
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=random_seed),
)

[32m[I 2021-06-08 22:59:18,739][0m A new study created in memory with name: no-name-55a52d0b-4373-4479-9eb0-ba4786f7a4db[0m


In [None]:
# Enqueue a baseline trial with the default values noted in `objective`, so the
# search starts from a known reference point.
# Only parameters that `objective` actually suggests are listed: `init_mean` is
# not tuned there, so an enqueued value for it would never be used.
study.enqueue_trial(
    {
        "n_factors": 100,
        "n_epochs": 20,
        "init_std_dev": 0.1,
        "lr_all": 0.005,
        "reg_all": 0.02
    }
)


enqueue_trial is experimental (supported from v1.2.0). The interface can change in the future.


create_trial is experimental (supported from v2.0.0). The interface can change in the future.


add_trial is experimental (supported from v2.0.0). The interface can change in the future.



In [None]:
# Run up to 50 trials of `objective`. This is a long-running cell: in the logged
# run below, individual trials took roughly 20-95 minutes each.
study.optimize(objective, n_trials=50)

[32m[I 2021-06-09 00:14:34,314][0m Trial 0 finished with value: 1.0248801009603785 and parameters: {'n_factors': 100, 'n_epochs': 20, 'init_std_dev': 0.1, 'lr_all': 0.005, 'reg_all': 0.02}. Best is trial 0 with value: 1.0248801009603785.[0m


RMSE: 1.0249


[32m[I 2021-06-09 00:58:06,187][0m Trial 1 finished with value: 1.0540804189426745 and parameters: {'n_factors': 60, 'n_epochs': 17, 'init_std_dev': 0.526530868001093, 'lr_all': 0.006901962261376904, 'reg_all': 0.026166469730685805}. Best is trial 0 with value: 1.0248801009603785.[0m


RMSE: 1.0541


[32m[I 2021-06-09 01:45:40,905][0m Trial 2 finished with value: 1.1041517438689377 and parameters: {'n_factors': 115, 'n_epochs': 11, 'init_std_dev': 0.7083119410181268, 'lr_all': 0.002235927536414383, 'reg_all': 0.026675589874553086}. Best is trial 0 with value: 1.0248801009603785.[0m


RMSE: 1.1042


[32m[I 2021-06-09 02:08:39,026][0m Trial 3 finished with value: 1.0257114041669189 and parameters: {'n_factors': 49, 'n_epochs': 10, 'init_std_dev': 0.75380714922283, 'lr_all': 0.006539941122843741, 'reg_all': 0.08239083024540535}. Best is trial 0 with value: 1.0248801009603785.[0m


RMSE: 1.0257


[32m[I 2021-06-09 03:06:21,208][0m Trial 4 finished with value: 1.1499450133732179 and parameters: {'n_factors': 42, 'n_epochs': 29, 'init_std_dev': 0.02917734343647338, 'lr_all': 0.008972348666968668, 'reg_all': 0.0028482124576205362}. Best is trial 0 with value: 1.0248801009603785.[0m


RMSE: 1.1499


[32m[I 2021-06-09 04:40:52,057][0m Trial 5 finished with value: 0.9996434784878007 and parameters: {'n_factors': 101, 'n_epochs': 24, 'init_std_dev': 0.08495244489981713, 'lr_all': 0.0009125851561880716, 'reg_all': 0.06340961885100015}. Best is trial 5 with value: 0.9996434784878007.[0m


RMSE: 0.9996


In [None]:
# NOTE: We do not prune any trials since surprise does not support partial fit.
# Group the recorded trials by their final state for the summary below.
pruned_trials = list(
    filter(lambda t: t.state == optuna.trial.TrialState.PRUNED, study.trials)
)
complete_trials = list(
    filter(lambda t: t.state == optuna.trial.TrialState.COMPLETE, study.trials)
)

# Summary of how many trials were recorded and how they ended.
print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

# Best trial: its objective value followed by one line per hyper-parameter.
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Re-imported here so this cell does not depend on the tuning cell having been run.
from surprise import SVDpp

# Load the test set; its Id column determines which (user, movie) pairs to predict.
test_pd = load_data(
    file_path=TEST_DATA_PATH,
    full_dataset=full_dataset,
    train_val_split=False
)

# Refit on every available training rating (no hold-out) for the final submission.
train_data = create_surprise_data(train_pd).build_full_trainset()
test_ids, test_data = create_dataset(test_pd, test_dataset=True)
test_ids = test_ids.to_numpy()

# Hyper-parameters of the best trial in the logged study (trial 5, RMSE 0.9996).
n_factors = 101
n_epochs = 24
init_std_dev = 0.08495244489981713
lr_all = 0.0009125851561880716
reg_all = 0.06340961885100015

# random_state makes the submitted predictions reproducible across runs.
algo = SVDpp(
    n_factors=n_factors,
    n_epochs=n_epochs,
    init_std_dev=init_std_dev,
    lr_all=lr_all,
    reg_all=reg_all,
    random_state=random_seed,
)

algo.fit(train_data)

# Convert the index tensors to plain Python ints once, instead of indexing the
# TensorDataset and calling .item() for every single sample.
test_users, test_movies = (tensor.tolist() for tensor in test_data.tensors)
predictions = [
    algo.predict(user, movie).est
    for user, movie in zip(test_users, test_movies)
]

# Build the submission from named columns so ids stay strings and predictions
# stay numeric (no mixed object array), and write it without the row index.
output = pd.DataFrame({"Id": test_ids, "Prediction": predictions})
output.to_csv("svdpp_output.csv", index=False)
files.download("svdpp_output.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>