### TRINH THE SON - 20127617

In [1]:
import datetime
import numpy as np
import os
import pandas as pd
import shutil
import urllib.request
import zipfile

In [2]:
# Settings for each supported MovieLens release, keyed by the size tag that
# also appears in the archive name ("100k", "1m", "20m"). Every entry holds
# the name of the ratings file and the field separator used inside it.
VARIANTS = {
    tag: {"filename": ratings_file, "sep": separator}
    for tag, ratings_file, separator in (
        ("100k", "u.data", "\t"),
        ("1m", "ratings.dat", "::"),
        ("20m", "ratings.csv", ","),
    )
}

In [3]:
# Release to work with; it has to be one of the keys of VARIANTS.
variant = "100k"

# Fail fast with a readable message rather than a bare KeyError further down.
if variant not in VARIANTS:
    raise ValueError(
        f"Invalid variant: {variant}. Valid options are {list(VARIANTS.keys())}"
    )

In [4]:
# Download location of the archive for the selected release. HTTPS is used
# (instead of plain HTTP) so the archive cannot be altered in transit before
# it is unpacked on this machine.
url = f"https://files.grouplens.org/datasets/movielens/ml-{variant}.zip"

In [5]:
# Settings (file name, separator) of the selected release
variant_info = VARIANTS[variant]

# Name of the ratings file to read later on
filename = variant_info["filename"]

# Folder the ratings file is expected in once the archive is unpacked
dirname = f"ml-{variant}"

# The archive itself is stored under the same name with a ".zip" suffix,
# relative to the current working directory
zip_path = dirname + ".zip"

In [6]:
# Stream the remote archive to disk without loading it into memory at once.
with urllib.request.urlopen(url) as response:
    with open(zip_path, mode="wb") as archive_file:
        shutil.copyfileobj(response, archive_file)

# Unpack every member of the archive into the current working directory.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall()

In [7]:
# The archive has been unpacked above, so the zip file can be deleted.
os.unlink(zip_path)

In [8]:
# Location of the ratings file inside the extracted folder
csv_path = os.path.join(dirname, filename)

# Column names applied to the ratings file when it is loaded
names = ["u_id", "i_id", "rating", "timestamp"]

# Explicit dtypes for the id and rating columns; "timestamp" is left for
# pandas to infer
dtype = dict.fromkeys(("u_id", "i_id"), np.uint32)
dtype["rating"] = np.float64

In [9]:
# Load the ratings into a DataFrame using the column names defined above.
#
# Only the CSV release ("ratings.csv") starts with a header line. "u.data"
# and "ratings.dat" begin directly with data, so declaring row 0 as a header
# for them would silently throw away the first rating.
ratings_sep = VARIANTS[variant]["sep"]
header_row = 0 if csv_path.endswith(".csv") else None

df = pd.read_csv(
    csv_path,
    names=names,
    dtype=dtype,
    header=header_row,
    sep=ratings_sep,
    # Separators longer than one character (e.g. "::") are only handled by
    # the Python parser; requesting it explicitly avoids the fallback warning.
    engine="python" if len(ratings_sep) > 1 else "c",
)

# Order the rows by user id and renumber the index so it is contiguous again
# (the split below selects and drops rows by index label).
df.sort_values(by="u_id", inplace=True)
df.reset_index(drop=True, inplace=True)

In [10]:
# 80 % of the rows are sampled for training (fixed seed for repeatability).
train = df.sample(frac=0.8, random_state=7)

# Half of the remaining rows become the validation set ...
val = df.drop(index=train.index).sample(frac=0.5, random_state=8)

# ... and whatever is in neither of the two sets is the test set.
test = df.drop(index=train.index.append(val.index))

In [40]:
def _initialization(n_users, n_items, n_factors):
    """Create the starting parameters of the factorization model.

    Args:
        n_users: Number of user rows to allocate.
        n_items: Number of item rows to allocate.
        n_factors: Number of latent dimensions per user/item.

    Returns:
        Tuple ``(bu, bi, pu, qi)``: zero-filled user and item bias vectors,
        followed by the user and item factor matrices drawn from a normal
        distribution with mean 0 and standard deviation 0.1.
    """
    # The user matrix is drawn before the item matrix; keeping this order
    # keeps results reproducible for a given NumPy random seed.
    user_factors = np.random.normal(loc=0, scale=0.1, size=(n_users, n_factors))
    item_factors = np.random.normal(loc=0, scale=0.1, size=(n_items, n_factors))

    user_biases = np.zeros(n_users)
    item_biases = np.zeros(n_items)

    return user_biases, item_biases, user_factors, item_factors

def _run_epoch(X, bu, bi, pu, qi, global_mean, n_factors, lr, reg):
    """Run one pass of stochastic gradient descent over the rows of ``X``.

    Each row is read as ``(user index, item index, rating)``. For every row
    the current prediction ``global_mean + bu[user] + bi[item] + pu[user].qi[item]``
    is compared with the rating, and the biases and latent factors of that
    user/item pair are moved along the regularised error gradient.

    Args:
        X: 2-D array; columns 0, 1 and 2 are user index, item index, rating.
        bu: User bias vector (updated in place).
        bi: Item bias vector (updated in place).
        pu: User factor matrix (updated in place).
        qi: Item factor matrix (updated in place).
        global_mean: Baseline added to every prediction.
        n_factors: Number of leading latent dimensions to use and update.
        lr: Learning rate.
        reg: Regularisation strength.

    Returns:
        The (mutated) ``bu, bi, pu, qi`` arrays.
    """
    for row in X:
        user, item, rating = int(row[0]), int(row[1]), row[2]

        # Views on the latent vectors of this pair: in-place operations on
        # them write straight through to ``pu`` / ``qi``.
        user_vec = pu[user, :n_factors]
        item_vec = qi[item, :n_factors]

        # Predict current rating and measure the error
        pred = global_mean + bu[user] + bi[item] + np.dot(user_vec, item_vec)
        err = rating - pred

        # Update biases
        bu[user] += lr * (err - reg * bu[user])
        bi[item] += lr * (err - reg * bi[item])

        # Update latent factors. Both updates must be computed from the
        # pre-update values, so snapshot the user vector before changing it.
        user_vec_old = user_vec.copy()
        user_vec += lr * (err * item_vec - reg * user_vec_old)
        item_vec += lr * (err * user_vec_old - reg * item_vec)

    return bu, bi, pu, qi

def _compute_val_metrics(X_val, bu, bi, pu, qi, global_mean, n_factors):
    """Compute loss (MSE), RMSE and MAE of the model on a validation array.

    Rows of ``X_val`` are read as ``(user index, item index, rating)``. An
    index of -1 marks a user or item without learned parameters: its bias is
    skipped, and the latent-factor term is only added when both the user and
    the item are known.

    Args:
        X_val: 2-D array; columns 0, 1 and 2 are user index, item index, rating.
        bu: User bias vector.
        bi: Item bias vector.
        pu: User factor matrix.
        qi: Item factor matrix.
        global_mean: Baseline added to every prediction.
        n_factors: Number of leading latent dimensions used in the prediction.

    Returns:
        Tuple ``(loss, rmse, mae)`` where ``loss`` is the mean squared
        residual, ``rmse`` its square root and ``mae`` the mean absolute
        residual.
    """
    users = X_val[:, 0].astype(int)
    items = X_val[:, 1].astype(int)
    ratings = X_val[:, 2]

    known_user = users > -1
    known_item = items > -1
    known_pair = known_user & known_item

    # Start every prediction at the baseline, then add whichever terms are
    # available for the row.
    preds = np.full(X_val.shape[0], global_mean, dtype=float)
    preds[known_user] += bu[users[known_user]]
    preds[known_item] += bi[items[known_item]]
    preds[known_pair] += (
        pu[users[known_pair], :n_factors] * qi[items[known_pair], :n_factors]
    ).sum(axis=1)

    residuals = ratings - preds
    loss = np.square(residuals).mean()
    rmse = np.sqrt(loss)
    mae = np.absolute(residuals).mean()

    return loss, rmse, mae

In [38]:
import time

from functools import wraps
from math import trunc

In [35]:
def get_version():
    """Return the module-level ``__version__`` value.

    NOTE(review): ``__version__`` is not assigned anywhere in the visible
    source, so calling this raises ``NameError`` unless it is defined
    elsewhere -- confirm.
    """
    version = __version__
    return version


def _timer(text=""):
    """Build a decorator that prints how long each call of a function took.

    The duration is rounded to whole seconds and printed as hours, minutes
    and seconds (leaving out leading units that are zero), prefixed by
    ``text``. The wrapped function's return value is passed through.

    Args:
        text: Prefix placed in front of the printed duration.

    Returns:
        A decorator to apply to the function that should be timed.
    """

    def decorator(func):

        @wraps(func)
        def wrapper(*args, **kwargs):
            start = time.time()
            result = func(*args, **kwargs)
            end = time.time()

            # Round once, then split, so that minutes and seconds always stay
            # in the 0-59 range (no "61 min" after an hour, no "60 sec").
            elapsed = round(end - start)
            hours, remainder = divmod(elapsed, 3600)
            minutes, seconds = divmod(remainder, 60)

            if hours > 1:
                print(
                    text + "{} hours {} min and {} sec".format(hours, minutes, seconds)
                )
            elif hours == 1:
                print(
                    text + "{} hour {} min and {} sec".format(hours, minutes, seconds)
                )
            elif minutes >= 1:
                print(text + "{} min and {} sec".format(minutes, seconds))
            else:
                print(text + "{} sec".format(seconds))

            return result

        return wrapper

    return decorator

In [36]:
class SVD:
    """Biased matrix-factorization rating predictor trained with SGD.

    A rating is predicted as ``global mean + user bias + item bias +
    dot(user factors, item factors)``. Users or items that were not part of
    the training data contribute no bias and no factor term.

    Args:
        lr: Learning rate of the SGD updates.
        reg: Regularisation strength of the SGD updates.
        n_epochs: Maximum number of passes over the training data.
        n_factors: Number of latent dimensions per user/item.
        early_stopping: Stop once the validation RMSE improves by less than
            ``min_delta`` between two epochs (needs a validation set).
        shuffle: Shuffle the training rows before every epoch.
        min_delta: Minimum RMSE improvement required to keep training.
        min_rating: Lower clipping bound for predictions.
        max_rating: Upper clipping bound for predictions.

    Attributes set by ``fit``:
        user_mapping_, item_mapping_: raw id -> row index dictionaries.
        global_mean_: mean training rating.
        bu_, bi_, pu_, qi_: learned biases and factor matrices.
        metrics_: per-epoch validation Loss/RMSE/MAE (only with ``X_val``).
    """

    def __init__(
        self,
        lr=0.005,
        reg=0.02,
        n_epochs=20,
        n_factors=100,
        early_stopping=False,
        shuffle=False,
        min_delta=0.001,
        min_rating=1,
        max_rating=5,
    ):

        self.lr = lr
        self.reg = reg
        self.n_epochs = n_epochs
        self.n_factors = n_factors
        self.early_stopping = early_stopping
        self.shuffle = shuffle
        self.min_delta = min_delta
        self.min_rating = min_rating
        self.max_rating = max_rating

    def fit(self, X, X_val=None):
        """Learn the model parameters from a ratings DataFrame.

        Args:
            X: Training DataFrame with "u_id", "i_id" and "rating" columns.
            X_val: Optional validation DataFrame with the same columns; when
                given, validation metrics are computed after every epoch.

        Returns:
            The fitted estimator itself.
        """
        X = self._preprocess_data(X)

        if X_val is not None:
            X_val = self._preprocess_data(X_val, train=False, verbose=False)
            self._init_metrics()

        self.global_mean_ = np.mean(X[:, 2])
        self._run_sgd(X, X_val)

        return self

    def _preprocess_data(self, X, train=True, verbose=True):
        """Translate raw ids to contiguous indices and return a plain array.

        Args:
            X: DataFrame with "u_id", "i_id" and "rating" columns.
            train: When True the id -> index mappings are (re)built from
                ``X``; otherwise the mappings of the last training call are
                reused.
            verbose: Print a progress message.

        Returns:
            2-D array with the columns user index, item index, rating. Ids
            without a mapping are encoded as -1.
        """
        if verbose:
            print("Preprocessing data...\n")

        X = X.copy()

        if train:  # Mappings have to be created
            user_ids = X["u_id"].unique().tolist()
            item_ids = X["i_id"].unique().tolist()

            self.user_mapping_ = {u_id: ix for ix, u_id in enumerate(user_ids)}
            self.item_mapping_ = {i_id: ix for ix, i_id in enumerate(item_ids)}

        X["u_id"] = X["u_id"].map(self.user_mapping_)
        X["i_id"] = X["i_id"].map(self.item_mapping_)

        # Ids without a mapping come back as NaN; tag them with -1 so the
        # validation metrics can detect unknown users/items.
        X = X.fillna(-1)

        X["u_id"] = X["u_id"].astype(np.int32)
        X["i_id"] = X["i_id"].astype(np.int32)

        return X[["u_id", "i_id", "rating"]].values

    def _init_metrics(self):
        """Allocate the per-epoch validation metrics table, filled with zeros."""
        metrics = np.zeros((self.n_epochs, 3), dtype=float)
        self.metrics_ = pd.DataFrame(metrics, columns=["Loss", "RMSE", "MAE"])

    def _run_sgd(self, X, X_val):
        """Train for up to ``n_epochs`` epochs and store the learned parameters.

        Args:
            X: Preprocessed training array (user index, item index, rating).
            X_val: Preprocessed validation array, or None to skip validation.
        """
        n_users = len(np.unique(X[:, 0]))
        n_items = len(np.unique(X[:, 1]))

        bu, bi, pu, qi = _initialization(n_users, n_items, self.n_factors)

        # Run SGD
        for epoch_ix in range(self.n_epochs):
            start = self._on_epoch_begin(epoch_ix)

            if self.shuffle:
                # Reorders the rows in place; X is the private array built by
                # _preprocess_data, so the caller's DataFrame is untouched.
                np.random.shuffle(X)

            bu, bi, pu, qi = _run_epoch(
                X, bu, bi, pu, qi, self.global_mean_, self.n_factors, self.lr, self.reg
            )

            if X_val is not None:
                self.metrics_.loc[epoch_ix, :] = _compute_val_metrics(
                    X_val, bu, bi, pu, qi, self.global_mean_, self.n_factors
                )
                self._on_epoch_end(
                    start,
                    self.metrics_.loc[epoch_ix, "Loss"],
                    self.metrics_.loc[epoch_ix, "RMSE"],
                    self.metrics_.loc[epoch_ix, "MAE"],
                )

                if self.early_stopping:
                    val_rmse = self.metrics_["RMSE"].tolist()
                    if self._early_stopping(val_rmse, epoch_ix, self.min_delta):
                        break

            else:
                self._on_epoch_end(start)

        self.bu_ = bu
        self.bi_ = bi
        self.pu_ = pu
        self.qi_ = qi

    def predict(self, X, clip=True):
        """Predict a rating for every row of ``X``.

        Args:
            X: DataFrame with "u_id" and "i_id" columns (raw ids).
            clip: Clip predictions to ``[min_rating, max_rating]``.

        Returns:
            List of predictions, one per row and in row order.
        """
        return [
            self.predict_pair(u_id, i_id, clip)
            for u_id, i_id in zip(X["u_id"], X["i_id"])
        ]

    def predict_pair(self, u_id, i_id, clip=True):
        """Predict the rating of a single (user, item) pair of raw ids.

        Starts from the global mean, adds the bias of each id that was seen
        during training and, when both are known, the dot product of their
        latent factors.

        Args:
            u_id: Raw user id.
            i_id: Raw item id.
            clip: Clip the prediction to ``[min_rating, max_rating]``.

        Returns:
            The predicted rating.
        """
        user_known, item_known = False, False
        pred = self.global_mean_

        if u_id in self.user_mapping_:
            user_known = True
            u_ix = self.user_mapping_[u_id]
            pred += self.bu_[u_ix]

        if i_id in self.item_mapping_:
            item_known = True
            i_ix = self.item_mapping_[i_id]
            pred += self.bi_[i_ix]

        if user_known and item_known:
            pred += np.dot(self.pu_[u_ix], self.qi_[i_ix])

        if clip:
            pred = self.max_rating if pred > self.max_rating else pred
            pred = self.min_rating if pred < self.min_rating else pred

        return pred

    def _early_stopping(self, val_rmse, epoch_idx, min_delta):
        """Decide whether training should stop after the given epoch.

        Training stops when the validation RMSE of this epoch is not at
        least ``min_delta`` lower than that of the previous epoch. In that
        case ``metrics_`` is trimmed to the epochs that were actually run.

        Args:
            val_rmse: Validation RMSE per epoch.
            epoch_idx: Zero-based index of the epoch that just finished.
            min_delta: Minimum required improvement.

        Returns:
            True if training should stop, False otherwise.
        """
        if epoch_idx > 0:
            if val_rmse[epoch_idx] + min_delta > val_rmse[epoch_idx - 1]:
                # Positional slice: keeps exactly rows 0..epoch_idx. A label
                # slice with .loc is end-inclusive and would keep one extra,
                # never-filled row.
                self.metrics_ = self.metrics_.iloc[: epoch_idx + 1, :]
                return True
        return False

    def _on_epoch_begin(self, epoch_ix):
        """Print the epoch counter and return the epoch's start time."""
        start = time.time()
        # One extra space for single-digit epoch numbers keeps the log aligned
        end = "  | " if epoch_ix < 9 else " | "
        print("Epoch {}/{}".format(epoch_ix + 1, self.n_epochs), end=end)

        return start

    def _on_epoch_end(self, start, val_loss=None, val_rmse=None, val_mae=None):
        """Finish the epoch's log line with optional metrics and its duration.

        Args:
            start: Value returned by ``_on_epoch_begin`` for this epoch.
            val_loss: Validation loss, or None when there is no validation set.
            val_rmse: Validation RMSE (used only when ``val_loss`` is given).
            val_mae: Validation MAE (used only when ``val_loss`` is given).
        """
        end = time.time()

        if val_loss is not None:
            print(f"val_loss: {val_loss:.2f}", end=" - ")
            print(f"val_rmse: {val_rmse:.2f}", end=" - ")
            print(f"val_mae: {val_mae:.2f}", end=" - ")

        print(f"took {end - start:.1f} sec")

In [30]:
from sklearn.metrics import mean_absolute_error

In [41]:
# Train on the training split while monitoring the validation split; training
# stops early once the validation RMSE stops improving.
svd = SVD(
    lr=0.001,
    reg=0.005,
    n_epochs=100,
    n_factors=15,
    early_stopping=True,
    shuffle=False,
    min_rating=1,
    max_rating=5,
).fit(X=train, X_val=val)

# Score the held-out test split with the mean absolute error.
pred = svd.predict(X=test)
mae = mean_absolute_error(y_true=test["rating"], y_pred=pred)

print(f"Test MAE: {mae:.2f}")

Preprocessing data...

Preprocessing data...

Epoch 1/100  | val_loss: 1.28 - val_rmse: 1.13 - val_mae: 0.95 - took 3.4 sec
Epoch 2/100  | val_loss: 1.28 - val_rmse: 1.13 - val_mae: 0.95 - took 3.2 sec
Test MAE: 0.95
