In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import metrics
import myfm
from myfm import RelationBlock
import pandas as pd
from scipy import sparse as sps
from mapper import DefaultMapper

# read movielens 100k data.
from movielens100k_data import MovieLens100kDataManager

In [2]:
# Load the ranking split (fold 3) of MovieLens 100k.
data_manager = MovieLens100kDataManager()
df_train, df_test = data_manager.load_ranking(fold=3)

# Coarsen the user attributes before one-hot encoding them:
# ages are floored to multiples of 5 and zip codes are cut to their first character.
user_info = (
    data_manager.load_userinfo()
    .set_index('user_id')
    .assign(
        age=lambda info: info.age // 5 * 5,
        zipcode=lambda info: info.zipcode.str[0],
    )
)

# One encoder over all user attribute columns; categories unseen at fit time are ignored.
user_info_ohe = OneHotEncoder(handle_unknown='ignore')
user_info_ohe.fit(user_info)

In [3]:
# Mappers from raw ids seen in the training split to column positions:
# `mapper[id]` is used below as a column index and `len(mapper)` as the column count.
# NOTE(review): user ids are passed as an ndarray (.values) while movie ids are passed
# as a Series; presumably DefaultMapper treats both the same way — confirm.
user_to_internal = DefaultMapper(df_train.user_id.values)
movie_to_internal = DefaultMapper(df_train.movie_id)

In [4]:
# One-hot encoder over the calendar dates that occur in the training split.
# Dates unseen at fit time are ignored at transform time (handle_unknown='ignore').
date_be = OneHotEncoder(handle_unknown='ignore')
date_be.fit(df_train.timestamp.dt.date.values.reshape(-1, 1))
def categorize_date(df):
    """One-hot encode the calendar date of each row's ``timestamp`` with ``date_be``.

    ``df`` must have a datetime-like ``timestamp`` column. Returns whatever
    ``date_be.transform`` produces for a single-column array of dates, one row
    per row of ``df``.
    """
    dates_as_column = df.timestamp.dt.date.values.reshape(-1, 1)
    return date_be.transform(dates_as_column)

In [5]:
# Date features for both splits, encoded with the encoder fitted on the training dates.
X_date_train, X_date_test = (
    categorize_date(split) for split in (df_train, df_test)
)

## Implement a flavor of SVD++

We add "all users who have evaluated a movie in the train set" as the movie's feature.

In [6]:
# movie id -> list of users who rated that movie in the training split
# (in training-row order).
movie_vs_watched = {}
for movie_id, user_id in zip(df_train.movie_id, df_train.user_id):
    movie_vs_watched.setdefault(movie_id, []).append(user_id)

In [7]:
# Number of columns contributed by each feature group, in the order the
# columns are laid out: date block, user block, movie block.
feature_group_sizes = [
    len(date_be.categories_[0]),  # date
    len(user_to_internal),  # user ids
    *(len(categories) for categories in user_info_ohe.categories_),  # user attributes
    len(movie_to_internal),  # movie ids
    len(user_to_internal),  # all users who watched the movies
]

# One group index per feature column: group i is repeated feature_group_sizes[i] times.
grouping = [
    group_id
    for group_id, n_columns in enumerate(feature_group_sizes)
    for _ in range(n_columns)
]

In [8]:
# given user/movie ids, add additional infos and return it as sparse
def augment_user_id(user_ids):
    """Build the sparse user feature matrix for ``user_ids``.

    Each row is the one-hot user id (column ``user_to_internal[user_id]``)
    followed by the one-hot encoded attributes looked up in ``user_info``.
    Returns a CSR matrix with one row per entry of ``user_ids``.
    """
    n_rows = len(user_ids)
    id_indicator = sps.lil_matrix((n_rows, len(user_to_internal)))
    for row, uid in enumerate(user_ids):
        id_indicator[row, user_to_internal[uid]] = 1

    # reindex aligns the attribute rows with the order of `user_ids`.
    attributes = user_info_ohe.transform(user_info.reindex(user_ids))
    return sps.hstack([id_indicator.tocsr(), attributes], format='csr')

def augment_movie_id(movie_ids):
    """Build the sparse movie feature matrix for ``movie_ids``.

    The first ``len(movie_to_internal)`` columns hold the one-hot movie id.
    The remaining ``len(user_to_internal)`` columns mark every user recorded
    in ``movie_vs_watched`` for that movie, each weighted by
    ``1 / sqrt(number of such users)``. Returns a CSR matrix with one row per
    entry of ``movie_ids``.
    """
    # Watcher columns start right after the movie-id columns.
    watcher_offset = len(movie_to_internal)
    features = sps.lil_matrix(
        (len(movie_ids), watcher_offset + len(user_to_internal))
    )
    for row, mid in enumerate(movie_ids):
        features[row, movie_to_internal[mid]] = 1
        watchers = movie_vs_watched.get(mid, [])
        # max(..., 1) avoids a division by zero for movies with no recorded watcher.
        weight = 1 / max(len(watchers), 1) ** 0.5
        for watcher in watchers:
            features[row, user_to_internal[watcher] + watcher_offset] = weight
    return features.tocsr()

In [9]:
def get_key_and_index(keys):
    """Return the sorted unique keys and each key's position among them.

    Parameters
    ----------
    keys : 1-D array-like
        The keys to deduplicate (e.g. a column of ids).

    Returns
    -------
    unique_keys : numpy.ndarray
        The distinct values of ``keys`` in sorted order.
    index : numpy.ndarray of integers
        Same length as ``keys``; ``unique_keys[index]`` reproduces ``keys``.
    """
    # np.unique already computes the inverse mapping, so no Python-level dict
    # and loop are needed. Unlike np.asarray([...]) over a list of looked-up
    # positions, the inverse keeps an integer dtype even when `keys` is empty,
    # so the result is always usable as an index array.
    unique_keys, index = np.unique(np.asarray(keys), return_inverse=True)
    return unique_keys, index

## Use Relation Blocks to express the data
See [Rendle 2013](http://www.vldb.org/pvldb/vol6/p337-rendle.pdf) for how the complexity decreases drastically in this case (and in most cases with a bipartite graph structure).

In [10]:
# Create RelationBlock.
def _build_relation_blocks(df):
    """Return ``[user block, movie block]`` for the interactions in ``df``.

    Each block stores one feature row per distinct user (or movie) in ``df``
    together with the index that maps every interaction row to its feature row.
    """
    unique_users, user_map = get_key_and_index(df.user_id)
    unique_movies, movie_map = get_key_and_index(df.movie_id)
    return [
        RelationBlock(user_map, augment_user_id(unique_users)),
        RelationBlock(movie_map, augment_movie_id(unique_movies)),
    ]


train_blocks = _build_relation_blocks(df_train)
test_blocks = _build_relation_blocks(df_test)

In [11]:
# Fit on the date features plus the user/movie relation blocks.
# Trailing ';' suppresses the repr of the value returned by fit.
fm = myfm.MyFMRegressor(rank=8)
fm.fit(
    X_date_train,
    df_train.rating.values,
    X_rel=train_blocks,
    grouping=grouping,
    n_kept_samples=95,
    n_iter=100,
);

alpha = 1.58 w0 = 3.64 : 100%|██████████| 100/100 [00:03<00:00, 28.63it/s]


In [12]:
# Predict ratings for the test split; test_blocks holds the user block and then the
# movie block, in the same order as train_blocks.
test_predictions = fm.predict(X_date_test, test_blocks)

In [13]:
# Root mean squared error of the predictions on the test split.
squared_errors = (test_predictions - df_test.rating.values) ** 2
rmse = squared_errors.mean() ** 0.5

# Mean absolute error of the predictions on the test split.
absolute_errors = np.abs(test_predictions - df_test.rating)
mae = absolute_errors.mean()

# Note the improvement from "id_only" case.
# Compare this with methods like ones in https://paperswithcode.com/sota/collaborative-filtering-on-movielens-100k
print('rmse={}, mae={}'.format(rmse, mae))

rmse=0.880576057570827, mae=0.6919916880160701


In [14]:
# If we use the original (fully expanded) data format, fitting takes much more time!
# Expand every relation block back to one feature row per training interaction.
expanded_blocks = [rel.data[rel.original_to_block] for rel in train_blocks]
X_original_format = sps.hstack([X_date_train, *expanded_blocks])

fm_rawformat = myfm.MyFMRegressor(rank=8).fit(
    X_original_format,
    df_train.rating,
    grouping=grouping,
    n_iter=6,
    n_kept_samples=1,
)

alpha = 1.23 w0 = 3.72 : 100%|██████████| 6/6 [00:05<00:00,  1.14it/s]


In [15]:
# They should be the same up to floating-point artifacts.
V_raw_format = fm_rawformat.predictor_.samples[-1].V
V_relational = fm.predictor_.samples[0].V
V_raw_format - V_relational

array([[ 3.46944695e-16,  1.80411242e-15, -8.04911693e-16, ...,
         1.83186799e-15, -8.84708973e-17,  3.15025783e-15],
       [ 2.02615702e-15,  1.72084569e-15, -4.32986980e-15, ...,
         3.55271368e-15, -2.49800181e-16,  3.83026943e-15],
       [-2.94209102e-15, -2.77555756e-17, -2.10942375e-15, ...,
         1.04083409e-15, -2.59514632e-15,  2.66453526e-15],
       ...,
       [ 7.49400542e-16,  7.49400542e-16,  1.04083409e-15, ...,
         8.88178420e-16, -4.99600361e-16,  6.10622664e-16],
       [-2.10942375e-15,  7.07767178e-16,  5.55111512e-17, ...,
         6.66133815e-16,  2.91433544e-16,  1.94289029e-15],
       [-1.04083409e-15,  2.49800181e-16, -8.88178420e-16, ...,
        -1.22124533e-15,  5.55111512e-16, -5.55111512e-17]])