### TRINH THE SON - 20127617

In [3]:
import datetime
import os
import shutil
import time
import urllib.request
import zipfile

import numpy as np
import pandas as pd

In [4]:
# (variant tag, ratings file inside the extracted archive, field separator)
_RATING_FILES = (
    ("100k", "u.data", "\t"),
    ("1m", "ratings.dat", "::"),
    ("20m", "ratings.csv", ","),
)

# Maps each supported variant tag to the file name and separator of its
# ratings file.
VARIANTS = {
    tag: {"filename": ratings_file, "sep": delimiter}
    for tag, ratings_file, delimiter in _RATING_FILES
}

In [5]:
# Dataset variant to download; it has to be one of the keys of VARIANTS.
variant = "100k"

# Fail fast with the list of supported variants when the name is unknown.
if not (variant in VARIANTS):
    raise ValueError(
        "Invalid variant: {}. Valid options are {}".format(variant, list(VARIANTS))
    )

In [6]:
# Download location of the selected archive. HTTPS is used so the zip that
# gets extracted below cannot be tampered with in transit.
url = f"https://files.grouplens.org/datasets/movielens/ml-{variant}.zip"

In [7]:
# Local names derived from the selected variant: the directory the archive
# unpacks into, the zip written to the working directory, and the ratings
# file inside the extracted directory.
dirname = "ml-" + variant
zip_path = os.path.join("{}.zip".format(dirname))
variant_info = VARIANTS[variant]
filename = variant_info["filename"]

In [8]:
# Stream the remote archive to `zip_path` without loading it into memory.
with urllib.request.urlopen(url) as response:
    with open(zip_path, "wb") as archive_file:
        shutil.copyfileobj(response, archive_file)

# Unpack every member of the archive into the current working directory.
with zipfile.ZipFile(zip_path, "r") as archive:
    archive.extractall()

In [9]:
# The archive has been extracted above, so the downloaded zip is no longer
# needed (optional clean-up step).
os.unlink(zip_path)

In [10]:
# Column names assigned to the ratings file, in file order.
names = ["u_id", "i_id", "rating", "timestamp"]
# Explicit dtypes for the id and rating columns; "timestamp" is left to
# pandas' inference.
dtype = dict(u_id=np.uint32, i_id=np.uint32, rating=np.float64)
# Ratings file inside the extracted archive directory.
csv_path = os.path.join(dirname, filename)

In [11]:
# Only the 20m release ships its ratings file (ratings.csv) with a header
# row. u.data (100k) and ratings.dat (1m) start directly with data, so
# telling pandas that line 0 is a header would silently drop the first rating.
ratings_sep = VARIANTS[variant]["sep"]

df = pd.read_csv(
    csv_path,
    names=names,
    dtype=dtype,
    header=0 if variant == "20m" else None,
    sep=ratings_sep,
    # Separators longer than one character ("::") are only handled by the
    # python parser; select it explicitly instead of relying on the
    # warning-emitting fallback.
    engine="python" if len(ratings_sep) > 1 else "c",
)

# Group each user's ratings together and renumber the rows 0..n-1 so that the
# index-based split below works on a clean, unique index.
df = df.sort_values(by="u_id").reset_index(drop=True)

In [12]:
# 80% of the rows go to training (fixed seed for reproducibility).
train = df.sample(frac=0.8, random_state=7)

# The remaining 20% is split in half: one half for validation, the other for
# testing.
holdout = df.drop(index=train.index)
val = holdout.sample(frac=0.5, random_state=8)
test = holdout.drop(index=val.index)

In [13]:
class SVD:
    """Latent-factor rating predictor with user and item biases.

    A prediction for a known (user, item) pair is
    ``global_mean_ + bu_[u] + bi_[i] + dot(pu_[u], qi_[i])``. The parameters
    are learned epoch by epoch through the module-level helpers
    ``_initialization``, ``_shuffle``, ``_run_epoch`` and
    ``_compute_val_metrics``, which are defined outside this class.

    Parameters
    ----------
    lr : float
        Learning rate forwarded to ``_run_epoch``.
    reg : float
        Regularisation strength forwarded to ``_run_epoch``.
    n_epochs : int
        Maximum number of training epochs.
    n_factors : int
        Number of latent factors forwarded to the training helpers.
    early_stopping : bool
        Stop training when the validation RMSE no longer improves by at
        least ``min_delta`` (only effective when a validation set is given).
    shuffle : bool
        Pass the training data through ``_shuffle`` before every epoch.
    min_delta : float
        Minimum RMSE improvement between two epochs to keep training.
    min_rating, max_rating : number
        Bounds used to clip predictions.

    Attributes set by ``fit``
    -------------------------
    user_mapping_, item_mapping_ : dict
        Raw id -> contiguous index, built from the training data.
    global_mean_ : float
        Mean training rating.
    bu_, bi_, pu_, qi_ :
        Learned biases and factors as returned by ``_run_epoch``.
    metrics_ : pandas.DataFrame
        Per-epoch "Loss", "RMSE" and "MAE" (only when ``X_val`` is given).
    """

    def __init__(
        self,
        lr=0.005,
        reg=0.02,
        n_epochs=20,
        n_factors=100,
        early_stopping=False,
        shuffle=False,
        min_delta=0.001,
        min_rating=1,
        max_rating=5,
    ):
        """Store the hyper-parameters; no work is done until ``fit``."""
        self.lr = lr
        self.reg = reg
        self.n_epochs = n_epochs
        self.n_factors = n_factors
        self.early_stopping = early_stopping
        self.shuffle = shuffle
        self.min_delta = min_delta
        self.min_rating = min_rating
        self.max_rating = max_rating

    def fit(self, X, X_val=None):
        """Learn the model parameters from a ratings frame.

        Parameters
        ----------
        X : pandas.DataFrame
            Training ratings with "u_id", "i_id" and "rating" columns.
        X_val : pandas.DataFrame, optional
            Validation ratings with the same columns; enables per-epoch
            metrics and early stopping.

        Returns
        -------
        SVD
            ``self``, to allow call chaining.
        """
        X = self._preprocess_data(X)

        if X_val is not None:
            X_val = self._preprocess_data(X_val, train=False, verbose=False)
            self._init_metrics()

        self.global_mean_ = np.mean(X[:, 2])
        self._run_sgd(X, X_val)

        return self

    def _preprocess_data(self, X, train=True, verbose=True):
        """Map raw ids to contiguous indices and return a numeric array.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with "u_id", "i_id" and "rating" columns (not modified).
        train : bool
            When True, (re)build ``user_mapping_`` / ``item_mapping_`` from
            ``X``; otherwise reuse the mappings of a previous training call.
        verbose : bool
            Print a progress message.

        Returns
        -------
        numpy.ndarray
            Array with columns (user index, item index, rating).
        """
        if verbose:
            print("Preprocessing data...\n")

        X = X.copy()

        if train:  # Mappings have to be created
            user_ids = X["u_id"].unique().tolist()
            item_ids = X["i_id"].unique().tolist()

            self.user_mapping_ = {u_id: ix for ix, u_id in enumerate(user_ids)}
            self.item_mapping_ = {i_id: ix for ix, i_id in enumerate(item_ids)}

        X["u_id"] = X["u_id"].map(self.user_mapping_)
        X["i_id"] = X["i_id"].map(self.item_mapping_)

        # Ids missing from the mappings come back as NaN; tag them with -1 so
        # unknown users/items of a validation set stay recognisable downstream
        # (presumably by `_compute_val_metrics` -- defined outside this class).
        X = X.fillna(-1)

        X["u_id"] = X["u_id"].astype(np.int32)
        X["i_id"] = X["i_id"].astype(np.int32)

        return X[["u_id", "i_id", "rating"]].values

    def _init_metrics(self):
        """Allocate the zero-filled per-epoch validation metrics table."""
        metrics = np.zeros((self.n_epochs, 3), dtype=float)
        self.metrics_ = pd.DataFrame(metrics, columns=["Loss", "RMSE", "MAE"])

    def _run_sgd(self, X, X_val):
        """Run the training epochs and store the learned parameters.

        Parameters
        ----------
        X : numpy.ndarray
            Preprocessed training data (user index, item index, rating).
        X_val : numpy.ndarray or None
            Preprocessed validation data; when given, metrics are recorded
            after every epoch and early stopping may end training.
        """
        n_users = len(np.unique(X[:, 0]))
        n_items = len(np.unique(X[:, 1]))

        bu, bi, pu, qi = _initialization(n_users, n_items, self.n_factors)

        # Run SGD
        for epoch_ix in range(self.n_epochs):
            start = self._on_epoch_begin(epoch_ix)

            if self.shuffle:
                X = _shuffle(X)

            bu, bi, pu, qi = _run_epoch(
                X, bu, bi, pu, qi, self.global_mean_, self.n_factors, self.lr, self.reg
            )

            if X_val is not None:
                self.metrics_.loc[epoch_ix, :] = _compute_val_metrics(
                    X_val, bu, bi, pu, qi, self.global_mean_, self.n_factors
                )
                self._on_epoch_end(
                    start,
                    self.metrics_.loc[epoch_ix, "Loss"],
                    self.metrics_.loc[epoch_ix, "RMSE"],
                    self.metrics_.loc[epoch_ix, "MAE"],
                )

                if self.early_stopping:
                    val_rmse = self.metrics_["RMSE"].tolist()
                    if self._early_stopping(val_rmse, epoch_ix, self.min_delta):
                        break

            else:
                self._on_epoch_end(start)

        self.bu_ = bu
        self.bi_ = bi
        self.pu_ = pu
        self.qi_ = qi

    def predict(self, X, clip=True):
        """Predict a rating for every ("u_id", "i_id") row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with "u_id" and "i_id" columns holding raw ids.
        clip : bool
            Clip each prediction to ``[min_rating, max_rating]``.

        Returns
        -------
        list
            One prediction per row, in row order.
        """
        return [
            self.predict_pair(u_id, i_id, clip)
            for u_id, i_id in zip(X["u_id"], X["i_id"])
        ]

    def predict_pair(self, u_id, i_id, clip=True):
        """Predict the rating of one raw (user id, item id) pair.

        Unknown users or items contribute nothing, so the prediction falls
        back to the global mean plus whichever bias is available.

        Parameters
        ----------
        u_id, i_id :
            Raw user and item ids (as seen in the training frame).
        clip : bool
            Clip the prediction to ``[min_rating, max_rating]``.

        Returns
        -------
        number
            The predicted rating.
        """
        user_known, item_known = False, False
        pred = self.global_mean_

        if u_id in self.user_mapping_:
            user_known = True
            u_ix = self.user_mapping_[u_id]
            pred += self.bu_[u_ix]

        if i_id in self.item_mapping_:
            item_known = True
            i_ix = self.item_mapping_[i_id]
            pred += self.bi_[i_ix]

        # The factor interaction is only defined when both sides were seen
        # during training.
        if user_known and item_known:
            pred += np.dot(self.pu_[u_ix], self.qi_[i_ix])

        if clip:
            pred = self.max_rating if pred > self.max_rating else pred
            pred = self.min_rating if pred < self.min_rating else pred

        return pred

    def _early_stopping(self, val_rmse, epoch_idx, min_delta):
        """Tell whether training should stop after epoch ``epoch_idx``.

        Training stops when the validation RMSE did not improve by at least
        ``min_delta`` compared with the previous epoch. In that case
        ``metrics_`` is truncated to the epochs that actually ran.

        Parameters
        ----------
        val_rmse : list of float
            Validation RMSE per epoch.
        epoch_idx : int
            Zero-based index of the epoch that just finished.
        min_delta : float
            Minimum required improvement.

        Returns
        -------
        bool
            True when training should stop.
        """
        if epoch_idx > 0:
            if val_rmse[epoch_idx] + min_delta > val_rmse[epoch_idx - 1]:
                # `.loc` slices are label-inclusive: stopping at `epoch_idx`
                # keeps exactly the epochs 0..epoch_idx that were computed and
                # drops the still-zero rows of epochs that never ran.
                self.metrics_ = self.metrics_.loc[:epoch_idx, :]
                return True
        return False

    def _on_epoch_begin(self, epoch_ix):
        """Print the epoch header and return the epoch start time.

        Parameters
        ----------
        epoch_ix : int
            Zero-based epoch index.

        Returns
        -------
        float
            ``time.time()`` at the start of the epoch.
        """
        start = time.time()
        # One extra space for single-digit epoch numbers keeps the log aligned.
        end = "  | " if epoch_ix < 9 else " | "
        print("Epoch {}/{}".format(epoch_ix + 1, self.n_epochs), end=end)

        return start

    def _on_epoch_end(self, start, val_loss=None, val_rmse=None, val_mae=None):
        """Print the validation metrics (if any) and the epoch duration.

        Parameters
        ----------
        start : float
            Value returned by ``_on_epoch_begin`` for this epoch.
        val_loss, val_rmse, val_mae : float, optional
            Validation metrics; printed only when ``val_loss`` is given.
        """
        end = time.time()

        if val_loss is not None:
            print(f"val_loss: {val_loss:.2f}", end=" - ")
            print(f"val_rmse: {val_rmse:.2f}", end=" - ")
            print(f"val_mae: {val_mae:.2f}", end=" - ")

        print(f"took {end - start:.1f} sec")