In [1]:
from collections import defaultdict
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import myfm
from myfm import RelationBlock
import pandas as pd
from scipy import sparse as sps
# read movielens 100k data.
from myfm.utils.benchmark_data import MovieLens100kDataManager
from myfm.utils.encoders import(
    DataFrameEncoder, CategoryValueToSparseEncoder, BinningEncoder, ManyToManyEncoder
)

In [2]:
# Feature switches: side information plus an SVD++-flavored implicit feedback.
# The implicit features attach "all users who have evaluated a movie in the
# train set" to each movie and "all movies rated by a user" to each user.

# Side information
use_date = True  # use date info or not
use_user_info = True  # use user information
use_movie_info = True  # use movie information

# Implicit feedback (SVD++ flavor)
use_iu = True  # use implicit user feature
use_ii = True  # use implicit item feature

In [3]:
# Load the ratings using one of the predefined train/test splits.
data_manager = MovieLens100kDataManager()
df_train, df_test = data_manager.load_rating_predefined_split(fold=1)  # fold 1 is the toughest one

# User side information indexed by user_id. Only the first character of the
# zipcode is kept, which turns it into a coarse categorical feature.
user_info = (
    data_manager.load_user_info()
    .set_index('user_id')
    .assign(zipcode=lambda frame: frame.zipcode.str[0])
)

# The user id itself is always encoded; demographic columns are optional.
user_encoder = DataFrameEncoder().add_column(
    'user_id', CategoryValueToSparseEncoder(user_info.index)
)
if use_user_info:
    user_encoder.add_column('age', BinningEncoder(user_info.age, n_percentiles=10))
    user_encoder.add_column('occupation', CategoryValueToSparseEncoder(user_info.occupation))
    user_encoder.add_column('zipcode', CategoryValueToSparseEncoder(user_info.zipcode))

In [4]:
# Movie side information: keep only the release year, indexed by movie_id.
# The genres come back as a separate frame (one row per movie/genre pair is
# presumably the layout -- it is joined on 'movie_id' below).
movie_info, movie_genres = data_manager.load_movie_info()
movie_info = (
    movie_info
    .assign(release_year=lambda frame: frame['release_date'].dt.year)
    .set_index('movie_id')[['release_year']]
)

# The movie id itself is always encoded; release year and genres are optional.
movie_encoder = DataFrameEncoder().add_column(
    'movie_id', CategoryValueToSparseEncoder(movie_info.index)
)
if use_movie_info:
    movie_encoder.add_column('release_year', BinningEncoder(movie_info.release_year))
    movie_encoder.add_many_to_many(
        'movie_id', 'genre', ManyToManyEncoder(movie_genres.genre)
    )

# Categorical encoder over the calendar dates observed in the training ratings.
train_dates = df_train.timestamp.dt.date.values
date_encoder = CategoryValueToSparseEncoder(train_dates)

In [5]:
# Peek at the user side information; zipcode now holds only its first character.
user_info.head()

Unnamed: 0_level_0,age,gender,occupation,zipcode
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,8
2,53,F,other,9
3,23,M,writer,3
4,24,M,technician,4
5,33,F,other,1


In [6]:
# Peek at the movie side information; only release_year remains, indexed by movie_id.
movie_info.head()

Unnamed: 0_level_0,release_year
movie_id,Unnamed: 1_level_1
1,1995.0
2,1995.0
3,1995.0
4,1995.0
5,1995.0


In [7]:
# Implicit-feedback (SVD++-flavored) features, built from the training ratings only.
# NOTE(review): this cell mutates the encoders in place, so re-running it
# presumably registers the same many-to-many encoder twice -- restart the
# kernel (or rebuild the encoders) before re-executing.
if use_iu:
    # For every user: the movies that user rated in the train set.
    rated_movies_encoder = ManyToManyEncoder(df_train.movie_id, normalize=True)
    user_encoder.add_many_to_many('user_id', 'movie_id', rated_movies_encoder)

if use_ii:
    # For every movie: the users who rated it in the train set.
    rating_users_encoder = ManyToManyEncoder(df_train.user_id, normalize=True)
    movie_encoder.add_many_to_many('movie_id', 'user_id', rating_users_encoder)
    

In [8]:
# given user/movie ids, add additional infos and return it as sparse
def augment_user_id(user_ids):
    """Encode the given user ids together with their side information.

    Looks up each id in ``user_info`` and passes the resulting frame to
    ``user_encoder.encode_df``. When ``use_iu`` is set, ``df_train`` is also
    supplied as the frame to join for the implicit (many-to-many) feature.

    Returns whatever ``user_encoder.encode_df`` produces (a sparse feature
    matrix, per the comment accompanying this cell).
    """
    user_frame = user_info.reindex(user_ids).reset_index()
    joined_frames = []
    if use_iu:
        joined_frames.append(df_train)
    return user_encoder.encode_df(user_frame, joined_frames)

def augment_movie_id(movie_ids):
    """Encode the given movie ids together with their side information.

    Looks up each id in ``movie_info`` and passes the resulting frame to
    ``movie_encoder.encode_df``. The frames to join are supplied in the same
    order the many-to-many encoders were registered: ``movie_genres`` when
    ``use_movie_info`` is set, then ``df_train`` when ``use_ii`` is set.

    Returns whatever ``movie_encoder.encode_df`` produces (a sparse feature
    matrix, per the comment accompanying this cell).
    """
    genre_frames = [movie_genres] if use_movie_info else []
    implicit_frames = [df_train] if use_ii else []
    movie_frame = movie_info.reindex(movie_ids).reset_index()
    return movie_encoder.encode_df(movie_frame, genre_frames + implicit_frames)

## Use Relation Block to express data
See [\[Rendle 2013\]](http://www.vldb.org/pvldb/vol6/p337-rendle.pdf) for how the complexity decreases drastically in this case (and in most cases with a bipartite graph structure).

In [9]:
# Create RelationBlock.
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.unique.html
# One user block and one movie block per split. np.unique(..., return_inverse=True)
# returns the distinct ids and, for every rating row, the position of its id in
# that distinct array, so side information is encoded once per distinct id.
train_blocks, test_blocks = [], []
for source, target in ((df_train, train_blocks), (df_test, test_blocks)):
    unique_users, user_map = np.unique(source.user_id, return_inverse=True)
    unique_movies, movie_map = np.unique(source.movie_id, return_inverse=True)
    target.extend([
        RelationBlock(user_map, augment_user_id(unique_users)),
        RelationBlock(movie_map, augment_movie_id(unique_movies)),
    ])

# The rating date is the only feature kept in the main (non-relational) matrix.
X_date_train = date_encoder.to_sparse(df_train.timestamp.dt.date.values) if use_date else None
X_date_test = date_encoder.to_sparse(df_test.timestamp.dt.date.values) if use_date else None

## Regression

In [10]:
# Sizes of the feature groups, in the order the model sees the columns: the
# main matrix (date) first, then the user block, then the movie block.
# The date group exists only when `use_date` is set. Otherwise the main matrix
# is None, and listing its size would make group_shapes disagree with the
# actual feature layout.
group_shapes = (
    ([len(date_encoder)] if use_date else [])
    + user_encoder.encoder_shapes
    + movie_encoder.encoder_shapes
)

# Fit the regressor; the test split is passed so the progress bar can report
# test RMSE/MAE while sampling.
fm = myfm.MyFMRegressor(rank=10)
fm.fit(
    X_date_train, df_train.rating.values, X_rel=train_blocks,
    group_shapes=group_shapes,
    X_test=X_date_test, X_rel_test=test_blocks,
    y_test=df_test.rating.values,
    n_iter=512, n_kept_samples=512
);

alpha = 1.64 w0 = 3.62  rmse_this: 0.97 mae_this: 0.76: 100%|██████████| 512/512 [00:24<00:00, 20.90it/s]


In [11]:
# Score the regressor on the held-out split.
test_predictions = fm.predict(X_date_test, test_blocks)

squared_errors = (test_predictions - df_test.rating.values) ** 2
rmse = squared_errors.mean() ** 0.5

absolute_errors = np.abs(test_predictions - df_test.rating)
mae = absolute_errors.mean()

# Note the improvement from "id_only" case.
# Compare this with methods like ones in https://paperswithcode.com/sota/collaborative-filtering-on-movielens-100k
print('rmse={}, mae={}'.format(rmse, mae))

rmse=0.8835596352777759, mae=0.6942521867272365


## Ordered Probit Regression

In [12]:
# The targets passed to the ordered probit model are the ratings shifted down
# by one (so rating 1 becomes class 0).
ordinal_targets = df_train.rating.values - 1

fm_probit = myfm.MyFMOrderedProbit(rank=10)
fm_probit.fit(
    X_date_train,
    ordinal_targets,
    X_rel=train_blocks,
    group_shapes=group_shapes,
    n_iter=512,
    n_kept_samples=512,
);

w0 = -0.11, cutpoint = ['-2.667', '-1.699', '-0.474', '0.955'] : 100%|██████████| 512/512 [00:49<00:00, 10.41it/s]


In [13]:
# Per-class probabilities from the ordered probit model, turned into an
# expected rating: sum_k P(class k) * k, then shifted back by one.
test_prediction_ordered_prob = fm_probit.predict_proba(X_date_test, test_blocks)
test_prediction_ordered_mean = 1 + test_prediction_ordered_prob.dot(np.arange(5)) # class 0 => rating 1 shift

rmse = (
    (test_prediction_ordered_mean - df_test.rating.values) **2
).mean() ** 0.5
mae = np.abs(test_prediction_ordered_mean - df_test.rating).mean()

# Removed a stray `fm.predict(...)` call that recomputed the regressor's
# predictions here; it was never used.
print('rmse={}, mae={}'.format(rmse, mae))

rmse=0.8817068974664438, mae=0.6927534156607223


In [14]:
# If we use the original data format, it takes much more!
# Expand every relation block back to one row per rating and stack the pieces
# horizontally (date columns first, when enabled) into a single design matrix.
expanded_blocks = [rel.data[rel.original_to_block] for rel in train_blocks]
date_part = [X_date_train] if use_date else []
X_original_format = sps.hstack(date_part + expanded_blocks, format='csr')

# Only a handful of iterations: this run exists to compare against the
# relation-block fit, not to produce a usable model.
fm_rawformat = myfm.MyFMRegressor(rank=10).fit(
    X_original_format,
    df_train.rating,
    group_shapes=group_shapes,
    n_iter=6,
    n_kept_samples=1,
)

alpha = 0.07 w0 = 3.67 : 100%|██████████| 6/6 [00:12<00:00,  2.09s/it]


In [19]:
# They should be the same up to floating point artifacts.
# The raw-format run used n_iter=6 and kept one sample, while the relation-block
# run kept all 512, so index 5 there is presumably the matching sixth draw.
raw_format_V = fm_rawformat.predictor_.samples[-1].V
relation_block_V = fm.predictor_.samples[5].V
raw_format_V - relation_block_V

array([[-9.71445147e-16,  4.30211422e-16,  2.77555756e-16, ...,
        -1.38777878e-16,  6.66133815e-16, -2.08166817e-16],
       [ 1.05471187e-15,  5.55111512e-17, -1.27675648e-15, ...,
         5.55111512e-16, -1.11022302e-15,  1.16573418e-15],
       [-1.88737914e-15, -1.88737914e-15,  9.99200722e-16, ...,
         4.66293670e-15, -1.80411242e-15, -3.46944695e-15],
       ...,
       [-1.11022302e-16, -2.08166817e-17,  1.80411242e-16, ...,
        -1.87350135e-16, -9.99200722e-16, -2.84494650e-16],
       [ 5.55111512e-16, -6.24500451e-17,  2.09554596e-15, ...,
         1.43635104e-15,  2.83106871e-15, -6.10622664e-16],
       [-6.52256027e-16,  9.71445147e-16, -4.08006962e-15, ...,
        -3.46944695e-16, -1.40165657e-15,  5.34294831e-16]])