In [None]:
%matplotlib inline
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics

import myfm

# read movielens 100k data.
from movielens100k_loader import load_dataset

# Load the id-only train/test split; note that results depend on the chosen fold.
df_train, df_test = load_dataset(id_only=True, fold=3)

In [2]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,6,5,887431973


In [15]:
def test_myfm(df_train, df_test, rank=8, grouping=None, classification=False, n_iter=100, samples=95):
    """Fit a myFM model on one-hot encoded (user_id, movie_id) pairs and print test metrics.

    Parameters
    ----------
    df_train, df_test : DataFrame
        Must provide ``user_id``, ``movie_id`` and ``rating`` columns.
    rank : int
        Forwarded to the myFM model constructor as ``rank``.
    grouping : object, optional
        Used purely as a flag. When truthy, every one-hot column is labelled
        with the index of the source column it was encoded from (0 for
        ``user_id``, 1 for ``movie_id``) and that list is passed to ``fit``.
        Any falsy value (``None``, ``False``, ...) passes ``grouping=None``.
    classification : bool
        If true, ratings are binarised as ``rating >= 4`` and a
        ``MyFMClassifier`` is fitted; otherwise a ``MyFMRegressor`` is fitted
        on the raw ratings.
    n_iter : int
        Forwarded to ``fit`` as ``n_iter``.
    samples : int
        Forwarded to ``fit`` as ``n_kept_samples``.

    Returns
    -------
    The fitted myFM model. ROC AUC / log loss (classification) or RMSE / MAE
    (regression) on the test split are printed as a side effect.
    """
    explanation_columns = ['user_id', 'movie_id']

    # The encoder is fitted on the training split only; handle_unknown='ignore'
    # keeps transform() from raising on ids that appear only in the test split.
    ohe = OneHotEncoder(handle_unknown='ignore')
    X_train = ohe.fit_transform(df_train[explanation_columns])
    X_test = ohe.transform(df_test[explanation_columns])
    y_train = df_train.rating.values
    y_test = df_test.rating.values

    if classification:
        # Ratings of 4 or more form the positive class.
        y_train = y_train >= 4
        y_test = y_test >= 4
        fm = myfm.MyFMClassifier(rank=rank, random_seed=334)
    else:
        fm = myfm.MyFMRegressor(rank=rank, random_seed=114514)

    # Build the per-column group labels in a separate variable so the caller's
    # flag is not overwritten and a falsy flag never reaches fit() as-is.
    if grouping:
        group_ids = [i for i, category in enumerate(ohe.categories_) for _ in category]
    else:
        group_ids = None

    fm.fit(X_train, y_train, grouping=group_ids, n_iter=n_iter, n_kept_samples=samples)

    if classification:
        prediction = fm.predict_proba(X_test)
        roc = metrics.roc_auc_score(y_test, prediction)
        ll = metrics.log_loss(y_test, prediction)
        print('roc={roc}, ll={ll}'.format(roc=roc, ll=ll))
    else:
        prediction = fm.predict(X_test)
        residual = y_test - prediction
        rmse = (residual ** 2).mean() ** .5
        mae = np.abs(residual).mean()
        print('rmse={rmse}, mae={mae}'.format(rmse=rmse, mae=mae))
    return fm

### Regression

In [16]:
# Plain regression without grouping, run for 300 iterations.
test_myfm(
    df_train,
    df_test,
    rank=8,
    classification=False,
    n_iter=300,
);

alpha = 1.55 w0 = 3.10 : 100%|██████████| 300/300 [00:05<00:00, 55.87it/s]


rmse=0.8993437575803191, mae=0.7040433136140822


In [17]:
# Regression with grouping enabled; the fitted model is kept in `fm`
# because the trace-plot cells further down read from it.
fm = test_myfm(
    df_train,
    df_test,
    rank=8,
    grouping=True,
    classification=False,
)

alpha = 1.56 w0 = 3.33 : 100%|██████████| 100/100 [00:02<00:00, 38.28it/s]


rmse=0.8959382764109612, mae=0.7048050699780434


### Classification

In [None]:
# Binary classification (ratings of 4 or more are the positive class), no grouping.
test_myfm(
    df_train,
    df_test,
    rank=8,
    classification=True,
);

In [None]:
# Same binary classification task, this time with grouping enabled.
test_myfm(
    df_train,
    df_test,
    rank=8,
    classification=True,
    grouping=True,
);

### Trace plot of hyperparameters

In [None]:
# In file order, `fm` is the model returned by the grouped regression run;
# the classification cells call test_myfm without reassigning it.
trace = fm.get_hyper_trace()
# Show the first rows of the trace.
trace.head()

In [None]:
# Plot the 'alpha' column of the trace.
trace.loc[:, ['alpha']].plot();

In [None]:
# Plot every trace column whose name starts with 'mu_w'.
mu_w_mask = trace.columns.str.startswith('mu_w')
trace.loc[:, mu_w_mask].plot();

In [None]:
# Plot every trace column whose name starts with 'lambda_w'.
lambda_w_mask = trace.columns.str.startswith('lambda_w')
trace.loc[:, lambda_w_mask].plot();

In [None]:
# Plot every trace column whose name starts with 'mu_V'.
mu_V_mask = trace.columns.str.startswith('mu_V')
trace.loc[:, mu_V_mask].plot();

In [None]:
# Plot every trace column whose name starts with 'lambda_V'.
lambda_V_mask = trace.columns.str.startswith('lambda_V')
trace.loc[:, lambda_V_mask].plot();