In [73]:
import numpy as np
import pandas as pd

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

import seaborn as sns

In [74]:
# The two rating files are read with identical options: tab-separated,
# no header row, and the same four column names.
ratings_columns = ['user id', 'item id', 'rating', 'timestamp']

df_train = pd.read_csv(
    "../../data/ml-100k/ub.base", sep="\t", header=None, names=ratings_columns
)
df_test = pd.read_csv(
    "../../data/ml-100k/ub.test", sep="\t", header=None, names=ratings_columns
)

In [75]:
# Column names for the pipe-separated, header-less item file: five descriptive
# fields followed by the genre columns.
item_columns = [
    "movie id", "movie title", "release date", "video release date", "IMDb URL",
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

df_items = pd.read_csv(
    "../../data/ml-100k/u.item",
    sep="|",
    header=None,
    encoding='latin-1',
    names=item_columns,
)


In [76]:
# Wide ratings table built from df_train: one row per 'user id', one column
# per 'item id', NaN wherever that user/item pair has no rating.
user_movies_train = df_train.pivot(index='user id', columns='item id', values='rating')

# Centre each row on that user's own mean rating so the values express
# preference relative to the user's baseline. Missing ratings are then filled
# with 0.0, which after centring is the "neutral" value.
user_mean_train = user_movies_train.mean(axis=1)
train_matrix = user_movies_train.sub(user_mean_train, axis=0).fillna(0.0)

In [77]:
# Wide ratings table built from df_test, same layout as the training one.
user_movies_test = df_test.pivot(index='user id', columns='item id', values='rating')

# Centre the test ratings on each user's mean computed from the TRAINING
# table (not from the test ratings themselves), then fill missing entries
# with the neutral value 0.0.
train_user_means = user_movies_train.mean(axis=1)
test_matrix = user_movies_test.sub(train_user_means, axis=0).fillna(0.0)

In [78]:
# Fit a 250-component truncated SVD on the centred training matrix;
# random_state is fixed so the fit is repeatable.
svd = TruncatedSVD(n_components=250, n_iter=100, random_state=42).fit(train_matrix)

# Sum of the per-component explained-variance ratios.
svd.explained_variance_ratio_.sum()

0.8473552038580683

In [79]:
# Pull the three factors out of the fitted model.
VT = svd.components_                          # one row per component, one column per item
Sigma_matrix = np.diag(svd.singular_values_)  # singular values placed on a diagonal

# transform() projects the training matrix onto the components; dividing each
# resulting column by its singular value removes the scaling from the projection.
U = svd.transform(train_matrix) / svd.singular_values_

U.shape, Sigma_matrix.shape, VT.shape

((943, 250), (250, 250), (250, 1675))

In [80]:
# Attach the id labels to the factor rows so they can be joined onto ratings:
# rows of U follow train_matrix.index, rows of VT.T follow train_matrix.columns.
user_embeddings = pd.DataFrame(U).assign(**{"user id": train_matrix.index})
movies_embeddings = pd.DataFrame(VT.T).assign(**{"item id": train_matrix.columns})

In [81]:
# Build the training feature table.
#
# All joins (user embedding, item embedding, genre flags) are applied to ONE
# frame, and only afterwards is that frame split into features and target.
# X_train and y_train therefore share the same rows by construction and do not
# rely on a later merge preserving row order or row count.
#
# The merges use the default inner join, so a rating whose user or item has no
# embedding row, or whose item is missing from df_items, is dropped.
train_set = (
    df_train
    .merge(user_embeddings, on="user id")
    .merge(movies_embeddings, on="item id")
    .merge(
        # Keep only the id and the genre indicator columns of the item table.
        df_items.drop(
            ['movie title', 'release date', 'video release date', 'IMDb URL', 'unknown'],
            axis=1,
        ),
        left_on="item id",
        right_on="movie id",
        # Fail loudly if an item id appears more than once in df_items,
        # which would otherwise silently duplicate rating rows.
        validate="many_to_one",
    )
)

# Features: everything except identifiers, the timestamp and the target.
X_train = train_set.drop(["user id", "item id", "movie id", "rating", "timestamp"], axis=1)
y_train = train_set['rating']

X_train.shape, y_train.shape

((90570, 518), (90570,))

In [82]:
# Gradient-boosted regressor with library defaults; n_jobs=-1 is passed through
# to LightGBM to control the number of worker threads.
regressor = LGBMRegressor(n_jobs=-1)
regressor = regressor.fit(X_train, y_train)

In [83]:
# Predictions on X_train, i.e. on the training feature table itself,
# so any score computed from them is an in-sample figure.
train_preds = regressor.predict(X_train)

In [84]:
# Range of the training predictions (smallest, largest).
np.min(train_preds), np.max(train_preds)

(0.8104889778346184, 5.02592380976201)

In [85]:
# Training RMSE. The square root is taken explicitly instead of passing
# `squared=False`, which newer scikit-learn releases deprecate and then remove.
np.sqrt(mean_squared_error(y_train, train_preds))

0.7954787702827636

In [86]:
# Build the test feature table.
#
# All joins (user embedding, item embedding, genre flags) are applied to ONE
# frame, and only afterwards is that frame split into features and target.
# X_test and y_test therefore share the same rows by construction and do not
# rely on a later merge preserving row order or row count.
#
# The merges use the default inner join, so a test rating whose user or item
# has no embedding row, or whose item is missing from df_items, is dropped.
# Any score computed from this table covers only the remaining rows.
test_set = (
    df_test
    .merge(user_embeddings, on="user id")
    .merge(movies_embeddings, on="item id")
    .merge(
        # Keep only the id and the genre indicator columns of the item table.
        df_items.drop(
            ['movie title', 'release date', 'video release date', 'IMDb URL', 'unknown'],
            axis=1,
        ),
        left_on="item id",
        right_on="movie id",
        # Fail loudly if an item id appears more than once in df_items,
        # which would otherwise silently duplicate rating rows.
        validate="many_to_one",
    )
)

# Features: everything except identifiers, the timestamp and the target.
X_test = test_set.drop(["user id", "item id", "movie id", "rating", "timestamp"], axis=1)

y_test = test_set['rating']
X_test.shape, y_test.shape

((9423, 518), (9423,))

In [87]:
# Predictions for X_test, the feature table built from df_test.
test_preds = regressor.predict(X_test)

In [88]:
# Range of the test predictions (smallest, largest).
np.min(test_preds), np.max(test_preds)

(1.2322675326825407, 4.833632989510372)

In [89]:
# Test RMSE. The square root is taken explicitly instead of passing
# `squared=False`, which newer scikit-learn releases deprecate and then remove.
np.sqrt(mean_squared_error(y_test, test_preds))

0.9870712212369216