In [None]:
!pip install scikit-surprise
!pip install optuna

Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 2.8MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1617638 sha256=468db30ab94bd14409adf25735ee296f4da77fe68e2cb3777fcb89adfda4b2f6
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1
Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/1a/18/b49ca91cf592747e19f2d333c2a86cd7c81895b922a5a09adf6335471576/optuna-2.8.0-py3-none-any.whl (301kB)
[K     |█████████████████

In [None]:
# Mount Google Drive and set data paths.
import os
from google.colab import drive
from google.colab import files

drive.mount('/content/gdrive')

# Project data folder on the mounted Drive; both CSV locations are derived from it.
DATA_PATH = "/content/gdrive/My Drive/ETH/Computational Intelligence Lab/CIL-Project/data"
TRAIN_DATA_PATH, TEST_DATA_PATH = (
    os.path.join(DATA_PATH, csv_name) for csv_name in ("data_train.csv", "data_test.csv")
)

Mounted at /content/gdrive


In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset
from sklearn.model_selection import train_test_split
import surprise
from surprise import accuracy
from surprise.model_selection import cross_validate
import optuna

In [None]:
def load_data(file_path: str, full_dataset: bool, train_val_split: bool, random_seed: int = 0, train_size: float = 0,
              subset_rows: int = 10000):
    """Read a ratings CSV and optionally truncate and/or split it.

    Args:
        file_path: Path of the CSV file handed to ``pd.read_csv``.
        full_dataset: If False, only the first ``subset_rows`` rows are kept (quick test runs).
        train_val_split: If True, the rows are split into a training and a validation frame.
        random_seed: ``random_state`` used for the split; ignored when no split is requested.
        train_size: ``train_size`` used for the split; ignored when no split is requested.
            The default of 0 is not a usable split size, so callers must override it
            whenever ``train_val_split`` is True.
        subset_rows: Number of leading rows kept when ``full_dataset`` is False.

    Returns:
        ``(train_pd, val_pd)`` when ``train_val_split`` is True, otherwise the loaded DataFrame.

    Raises:
        ValueError: If a split is requested with a non-positive ``train_size``.
    """
    # Fail fast, before the (potentially large) CSV is read, with a message that names the cause.
    if train_val_split and train_size is not None and train_size <= 0:
        raise ValueError(
            "train_size must be positive when train_val_split=True, got {!r}".format(train_size)
        )

    data_pd = pd.read_csv(file_path)

    # Reduce the dataset for testing.
    if not full_dataset:
        data_pd = data_pd.head(subset_rows)

    if not train_val_split:
        return data_pd

    train_pd, val_pd = train_test_split(data_pd, train_size=train_size, random_state=random_seed)
    return train_pd, val_pd

def __extract_users_items_ratings(data_pd: pd.DataFrame):
    """Split the ``Id`` column into zero-based user/movie indices and return the ratings.

    Ids are matched against ``r<number>_c<number>``; the first number becomes the user index
    and the second the movie index, each shifted by -1 so that they start at 0.

    Args:
        data_pd: Frame with a string ``Id`` column and a ``Prediction`` column.

    Returns:
        ``(users, movies, ratings)``. ``users`` and ``movies`` are always 1-D integer arrays
        with one entry per row (also for a single-row frame); ``ratings`` is
        ``data_pd.Prediction.values``.
    """
    # Raw string: "\d" is a regex character class, not a Python string escape.
    id_pairs = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column slicing keeps the row axis even when there is only one row; squeezing the
    # split halves would collapse that case to 0-d arrays.
    users, movies = id_pairs[:, 0], id_pairs[:, 1]
    ratings = data_pd.Prediction.values
    return users, movies, ratings

def create_surprise_data(data_pd):
    """Wrap the ratings in ``data_pd`` as a ``surprise.Dataset``.

    The user/movie indices and ratings extracted from ``data_pd`` are placed in a
    three-column frame (users, movies, ratings) and loaded with a reader configured
    for a 1-to-5 rating scale.
    """
    column_order = ['users', 'movies', 'ratings']
    user_idx, movie_idx, rating_values = __extract_users_items_ratings(data_pd)

    ratings_frame = pd.DataFrame(dict(zip(column_order, (user_idx, movie_idx, rating_values))))
    return surprise.Dataset.load_from_df(
        ratings_frame[column_order],
        reader=surprise.Reader(rating_scale=(1, 5)),
    )

def __get_tensors_from_dataframe(data_pd: pd.DataFrame):
    """Return the user indices, movie indices and ratings of ``data_pd`` as int64 tensors.

    The three arrays come from ``__extract_users_items_ratings`` and are converted in that
    order; the ratings are cast to ``torch.int64`` just like the indices.
    """
    users_torch, movies_torch, ratings_torch = (
        torch.tensor(column, dtype=torch.int64)
        for column in __extract_users_items_ratings(data_pd)
    )
    return users_torch, movies_torch, ratings_torch


def create_dataset(data_pd: pd.DataFrame, test_dataset: bool = False):
    """Build a ``TensorDataset`` from a ratings frame.

    Args:
        data_pd: Frame with ``Id`` and ``Prediction`` columns.
        test_dataset: Selects the test-time layout (see Returns).

    Returns:
        For ``test_dataset=False``: a ``TensorDataset`` of (user, movie, rating) tensors.
        For ``test_dataset=True``: a tuple of the frame's ``Id`` column and a
        ``TensorDataset`` of (user, movie) tensors, i.e. without the ratings.
    """
    user_t, movie_t, rating_t = __get_tensors_from_dataframe(data_pd)

    if test_dataset:
        # The ids are returned alongside so predictions can be matched back to their rows.
        return data_pd.Id, TensorDataset(user_t, movie_t)

    return TensorDataset(user_t, movie_t, rating_t)

In [None]:
# Experiment settings: the RNG seed, and whether to use every row or only a small subset.
random_seed = 42
full_dataset = True

# Seed NumPy's global random generator.
np.random.seed(random_seed)

# Load the training ratings as a single frame; no train/validation split at this stage.
train_pd = load_data(TRAIN_DATA_PATH, full_dataset=full_dataset, train_val_split=False)

In [None]:
train_data = create_surprise_data(train_pd)

# Hold out 20% of the ratings for validation. The seed is passed explicitly so that the split
# is the same every time this cell runs, instead of depending on how much of NumPy's global
# random stream has already been consumed.
trainset, testset = surprise.model_selection.train_test_split(
    train_data, test_size=0.2, random_state=random_seed
)

In [None]:
from surprise import SlopeOne

def objective(trial):
    """Optuna objective: fit SlopeOne on ``trainset`` and return its RMSE on ``testset``.

    ``trial`` is accepted because Optuna passes it, but no hyperparameters are sampled
    from it. ``accuracy.rmse`` also prints the score.
    """
    model = SlopeOne()
    model.fit(trainset)

    validation_predictions = model.test(testset)
    return accuracy.rmse(validation_predictions)

In [None]:
# The objective returns a validation RMSE (an error), so the study minimizes it.
study = optuna.create_study(direction="minimize")

[32m[I 2021-06-08 19:04:25,110][0m A new study created in memory with name: no-name-bb39e27f-0ff5-4438-bc8e-966e7f79b5a8[0m


In [None]:
# The objective samples no hyperparameters from `trial`, so further trials would only repeat
# the same SlopeOne fit; one trial is enough to record the validation RMSE.
study.optimize(objective, n_trials=1)

[32m[I 2021-06-08 19:05:04,506][0m Trial 0 finished with value: 0.9995171061773653 and parameters: {}. Best is trial 0 with value: 0.9995171061773653.[0m


RMSE: 0.9995


In [None]:
# NOTE: We do not prune any trials since surprise does not support partial fit.
# Group the study's trials by their final state for the summary below.
pruned_trials = list(filter(lambda t: t.state == optuna.trial.TrialState.PRUNED, study.trials))
complete_trials = list(filter(lambda t: t.state == optuna.trial.TrialState.COMPLETE, study.trials))

# Overall counts.
print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

# Details of the best-scoring trial: its objective value and the parameters it used.
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Study statistics: 
  Number of finished trials:  1
  Number of pruned trials:  0
  Number of complete trials:  1
Best trial:
  Value:  0.9995171061773653
  Params: 


In [None]:
# Load the rows to predict with the same loader and subset switch as the training data.
test_pd = load_data(
    file_path=TEST_DATA_PATH,
    full_dataset=full_dataset,
    train_val_split=False
)

# Refit on every training rating (no hold-out) for the final predictions. A separate name is
# used so that `train_data` keeps referring to the surprise Dataset built earlier.
full_trainset = create_surprise_data(train_pd).build_full_trainset()
test_ids, test_data = create_dataset(test_pd, test_dataset=True)
test_ids = test_ids.to_numpy()

from surprise import SlopeOne

algo = SlopeOne()

algo.fit(full_trainset)

# .item() turns each 0-d tensor into a plain Python int before it is passed to predict();
# .est is the estimated rating of the returned prediction.
predictions = [algo.predict(user.item(), movie.item()).est for user, movie in test_data]

# Build the frame column by column so the ids stay strings and the predictions stay floats,
# rather than stacking both into one mixed object array.
output = pd.DataFrame({"Id": test_ids, "Prediction": predictions})

# index=False: write only the two named columns, without the row index.
output.to_csv("slopeone_output.csv", index=False)
files.download("slopeone_output.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>