In [1]:
from collections import defaultdict
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import pandas as pd
from scipy import sparse as sps

# read movielens 100k data.
import myfm
from myfm import RelationBlock
from myfm.utils.benchmark_data import MovieLens100kDataManager
from myfm.utils.encoders import(
    DataFrameEncoder, CategoryValueToSparseEncoder, BinningEncoder, MultipleValuesToSparseEncoder
)

In [2]:
# Implement side information and a flavor of SVD++.
# We add "all users who have evaluated a movie in the train set" or
# "all movies rated by a user" as a feature of the movie/user, respectively.
# Each flag below switches one group of features on or off.
use_date = True # encode the rating date as a categorical feature
use_iu = True # implicit user feature: the movies each user rated in the train set
use_ii = True # implicit item feature: the users who rated each movie in the train set
use_user_info = True # use user information (age, occupation, zipcode)
use_movie_info = True # use movie information (release year, genres)

In [3]:
# Load MovieLens 100k and take one of its predefined train/test splits.
data_manager = MovieLens100kDataManager()
df_train, df_test = data_manager.load_rating_predefined_split(fold=1)  # fold 1 is the toughest one

# User side information keyed by user_id; the zipcode is coarsened to its first character.
user_info = (
    data_manager.load_user_info()
    .set_index('user_id')
    .assign(zipcode=lambda frame: frame['zipcode'].str[0])
)

# The user id itself is always encoded; the demographic columns are optional.
user_encoder = DataFrameEncoder().add_column(
    'user_id', CategoryValueToSparseEncoder(user_info.index)
)
if use_user_info:
    # add_column registers the encoder on `user_encoder` in place, so the
    # columns can be added one after another in the same order as before.
    for column_name, column_encoder in [
        ('age', BinningEncoder(user_info['age'], n_percentiles=10)),
        ('occupation', CategoryValueToSparseEncoder(user_info['occupation'])),
        ('zipcode', CategoryValueToSparseEncoder(user_info['zipcode'])),
    ]:
        user_encoder.add_column(column_name, column_encoder)

In [4]:
# Movie side information keyed by movie_id: only the release year
# (derived from release_date) and the genres are kept.
movie_info = (
    data_manager.load_movie_info()
    .assign(release_year=lambda frame: frame['release_date'].dt.year)
    [['movie_id', 'release_year', 'genres']]
    .set_index('movie_id')
)

# The movie id itself is always encoded; year and genres are optional.
movie_encoder = DataFrameEncoder().add_column(
    'movie_id', CategoryValueToSparseEncoder(movie_info.index)
)
if use_movie_info:
    movie_encoder.add_column('release_year', BinningEncoder(movie_info['release_year']))
    # A movie may have several genres, stored as a single '|'-separated string.
    movie_encoder.add_column(
        'genres', MultipleValuesToSparseEncoder(movie_info['genres'], sep='|')
    )

# The rating date (day resolution) is treated as a categorical feature whose
# categories are taken from the training ratings; no encoder when disabled.
date_encoder = (
    CategoryValueToSparseEncoder(df_train.timestamp.dt.date.values)
    if use_date
    else None
)

In [5]:
def string_agg(int_list) -> str:
    """Convert every element of ``int_list`` with ``str`` and join them with commas.

    An empty iterable yields the empty string.
    """
    return ','.join(map(str, int_list))

if use_iu:
    # Per user: comma-separated ids of the movies they rated in the train set.
    movies_rated_by_user = df_train.groupby('user_id')['movie_id'].agg(string_agg)
    # Users without any training rating get an empty string.
    user_info['user_implicit_feature'] = (
        movies_rated_by_user.reindex(user_info.index).fillna('')
    )
if use_ii:
    # Per movie: comma-separated ids of the users who rated it in the train set.
    users_who_rated_movie = df_train.groupby('movie_id')['user_id'].agg(string_agg)
    # Movies without any training rating get an empty string.
    movie_info['movie_implicit_feature'] = (
        users_who_rated_movie.reindex(movie_info.index).fillna('')
    )

In [6]:
# Preview the user table; the `user_implicit_feature` column exists only when `use_iu` is enabled.
user_info.head()

Unnamed: 0_level_0,age,gender,occupation,zipcode,user_implicit_feature
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,24,M,technician,8,"1,2,3,4,5,7,8,9,11,13,15,16,18,19,21,22,25,26,..."
2,53,F,other,9,"1,10,14,25,100,111,127,237,242,255,258,269,272..."
3,23,M,writer,3,"181,258,260,268,271,288,302,303,317,319,320,32..."
4,24,M,technician,4,"11,210,258,271,300,301,324,327,328,329,358,359..."
5,33,F,other,1,"21,25,29,50,63,66,70,95,99,101,105,121,135,145..."


In [7]:
# Preview the movie table; the `movie_implicit_feature` column exists only when `use_ii` is enabled.
movie_info.head()

Unnamed: 0_level_0,release_year,genres,movie_implicit_feature
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1995.0,Animation|Children's|Comedy,"1,2,6,13,16,21,25,26,42,44,54,57,58,59,62,63,6..."
2,1995.0,Action|Adventure|Thriller,"1,22,30,42,49,72,83,102,110,130,178,197,234,24..."
3,1995.0,Thriller,"1,43,59,63,82,104,207,216,244,268,269,276,280,..."
4,1995.0,Action|Comedy|Drama,"1,7,12,13,16,18,22,43,49,59,83,84,92,94,102,11..."
5,1995.0,Crime|Drama|Thriller,"1,13,21,28,72,92,118,130,135,188,207,234,255,2..."


In [8]:
# Register the implicit-feedback columns with the encoders.
# NOTE(review): no `sep` is passed here, so this relies on the encoder's default
# separator matching the commas written by `string_agg` — confirm.
if use_iu:
    implicit_user_encoder = MultipleValuesToSparseEncoder(
        user_info['user_implicit_feature'], normalize=True
    )
    user_encoder.add_column('user_implicit_feature', implicit_user_encoder)
if use_ii:
    implicit_movie_encoder = MultipleValuesToSparseEncoder(
        movie_info['movie_implicit_feature'], normalize=True
    )
    movie_encoder.add_column('movie_implicit_feature', implicit_movie_encoder)
    

In [9]:
# Given user/movie ids, look up their side information and return it as a sparse matrix.
def augment_user_id(user_ids):
    """Encode the ``user_info`` rows selected by ``user_ids`` with ``user_encoder``.

    The rows are looked up with ``reindex`` (one row per entry of ``user_ids``,
    in that order) and the index is moved back into a column before encoding.
    Returns the result of ``user_encoder.encode_df``.
    """
    selected_users = user_info.reindex(user_ids).reset_index()
    return user_encoder.encode_df(selected_users)

def augment_movie_id(movie_ids):
    """Encode the ``movie_info`` rows selected by ``movie_ids`` with ``movie_encoder``.

    The rows are looked up with ``reindex`` (one row per entry of ``movie_ids``,
    in that order) and the index is moved back into a column before encoding.
    Returns the result of ``movie_encoder.encode_df``.
    """
    selected_movies = movie_info.reindex(movie_ids).reset_index()
    return movie_encoder.encode_df(selected_movies)

## Use Relation Block to express data
See [\[Rendle 2013\]](http://www.vldb.org/pvldb/vol6/p337-rendle.pdf) for how the complexity decreases drastically in this case (and in most cases with a bipartite graph structure).

In [10]:
# Create RelationBlock.
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.unique.html
# For every split, each distinct user (then movie) is encoded only once;
# the inverse index from np.unique maps every rating row to its encoded row.
train_blocks = []
test_blocks = []
for ratings, blocks in ((df_train, train_blocks), (df_test, test_blocks)):
    # Order matters: the user block comes first, then the movie block.
    for id_column, augment in (
        ('user_id', augment_user_id),
        ('movie_id', augment_movie_id),
    ):
        unique_ids, row_to_unique = np.unique(ratings[id_column], return_inverse=True)
        blocks.append(RelationBlock(row_to_unique, augment(unique_ids)))
    
# Encode the rating dates of both splits with the encoder built from the
# training dates; without date features there is no per-row matrix at all.
X_date_train = date_encoder.to_sparse(df_train.timestamp.dt.date.values) if use_date else None
X_date_test = date_encoder.to_sparse(df_test.timestamp.dt.date.values) if use_date else None

## Regression

In [11]:
# One group size per encoder: the optional date block first, then the user
# columns, then the movie columns.
# NOTE(review): presumably this must follow the column order of X_date_train
# followed by the relation blocks — confirm against the myfm documentation.
date_group_shapes = [len(date_encoder)] if use_date else []
group_shapes = date_group_shapes + user_encoder.encoder_shapes + movie_encoder.encoder_shapes

# The test split is passed as well, so that test metrics are reported in the progress bar.
fm = myfm.MyFMRegressor(rank=10)
fm.fit(
    X_date_train,
    df_train.rating.values,
    X_rel=train_blocks,
    group_shapes=group_shapes,
    X_test=X_date_test,
    X_rel_test=test_blocks,
    y_test=df_test.rating.values,
    n_iter=512,
    n_kept_samples=512,
);

alpha = 1.66 w0 = 3.40  rmse_this: 0.97 mae_this: 0.76: 100%|█████████████████████████| 512/512 [00:23<00:00, 22.13it/s]


In [12]:
# Score the regression model on the held-out ratings.
test_predictions = fm.predict(X_date_test, test_blocks)
true_ratings = df_test.rating

squared_error = (test_predictions - true_ratings.values) ** 2
rmse = squared_error.mean() ** 0.5
mae = np.abs(test_predictions - true_ratings).mean()

# Note the improvement from "id_only" case.
# Compare this with methods like ones in https://paperswithcode.com/sota/collaborative-filtering-on-movielens-100k
print('rmse={}, mae={}'.format(rmse, mae))

rmse=0.8833165623495484, mae=0.6940611748122608


## Ordered Probit Regression

In [13]:
# Ratings are shifted down by one before fitting, so the lowest rating becomes class 0.
ordinal_train_labels = df_train.rating.values - 1

fm_probit = myfm.MyFMOrderedProbit(rank=10)
fm_probit.fit(
    X_date_train,
    ordinal_train_labels,
    X_rel=train_blocks,
    group_shapes=group_shapes,
    n_iter=512,
    n_kept_samples=512,
);

w0 = 0.21, cutpoint = ['-2.250', '-1.278', '-0.045', '1.376'] : 100%|█████████████████| 512/512 [00:41<00:00, 12.28it/s]


In [14]:
# Score the ordered probit model on the held-out ratings.
# The class probabilities are collapsed into an expected rating:
# sum_k k * P(class k) over the 5 classes, shifted back to the original scale.
test_prediction_ordered_prob = fm_probit.predict_proba(X_date_test, test_blocks)
test_prediction_ordered_mean = 1 + test_prediction_ordered_prob.dot(np.arange(5)) # class 0 => rating 1 shift

rmse = (
    (test_prediction_ordered_mean - df_test.rating.values) **2
).mean() ** 0.5
mae = np.abs(test_prediction_ordered_mean - df_test.rating).mean()

print('rmse={}, mae={}'.format(rmse, mae))

rmse=0.8808022284735955, mae=0.6920880592029289


In [15]:
# If we use the original data format, it takes much more!
X_original_format = []
if use_date:
    X_original_format.append(X_date_train)

X_original_format.extend(
    [rel.data[rel.original_to_block] for rel in train_blocks]
)

X_original_format = sps.hstack(X_original_format, format='csr')

fm_rawformat = myfm.MyFMRegressor(rank=10).fit(
    X_original_format, df_train.rating,
    group_shapes=group_shapes, n_iter=6, n_kept_samples=1
)

alpha = 0.08 w0 = 3.29 : 100%|████████████████████████████████████████████████████████████| 6/6 [00:13<00:00,  2.18s/it]


In [16]:
# They should be the same up to floating-point artifacts.
# `fm_rawformat` ran 6 iterations and kept a single sample, while `fm` kept 512 samples
# from 512 iterations, so index 5 of `fm` is compared with the only sample of `fm_rawformat`.
# NOTE(review): presumably both are the state after the 6th iteration and both models
# use the same default random seed — confirm against the myfm documentation.
fm_rawformat.predictor_.samples[-1].V - fm.predictor_.samples[5].V

array([[ 1.33920652e-15, -5.23886490e-16,  1.29410371e-15, ...,
         0.00000000e+00,  4.85722573e-17, -2.08166817e-16],
       [ 3.27515792e-15,  2.44249065e-15,  1.52655666e-15, ...,
         1.87350135e-15,  1.91513472e-15, -1.49880108e-15],
       [ 4.44089210e-16, -2.33146835e-15, -3.05311332e-16, ...,
         4.63518113e-15, -1.38777878e-16, -2.27595720e-15],
       ...,
       [-2.77555756e-16, -1.66533454e-16,  8.04911693e-16, ...,
        -5.55111512e-17, -1.94289029e-16,  2.49800181e-16],
       [ 2.08166817e-17, -4.29344060e-16,  8.75167994e-16, ...,
         1.66533454e-16,  4.99600361e-16,  1.11022302e-16],
       [ 8.04911693e-16,  1.80411242e-16, -3.33066907e-16, ...,
        -2.77555756e-16, -1.80411242e-16, -5.55111512e-17]])