# FastAI Recommender

In [2]:
import sys, os, torch, fastai
import pandas as pd
import numpy as np
import scrapbook as sb

from tempfile import TemporaryDirectory
from fastai.collab import collab_learner, CollabDataBunch, load_learner
from recommenders.utils.constants import (
DEFAULT_USER_COL as USER,
DEFAULT_ITEM_COL as ITEM,
DEFAULT_RATING_COL as RATING,
DEFAULT_TIMESTAMP_COL as TIMESTAMP,
DEFAULT_PREDICTION_COL as PREDICTION)
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.models.fastai.fastai_utils import cartesian_product, score
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.evaluation.python_evaluation import rmse, mae, rsquared, exp_var

# Report the interpreter and library versions used for this run, plus GPU status.
environment_report = [
    ("System version: {}", sys.version),
    ("Pandas version: {}", pd.__version__),
    ("Fast AI version: {}", fastai.__version__),
    ("Torch version: {}", torch.__version__),
    ("Cuda Available: {}", torch.cuda.is_available()),
    ("CuDNN Enabled: {}", torch.backends.cudnn.enabled),
]
for template, value in environment_report:
    print(template.format(value))

System version: 3.7.13 (default, Mar 29 2022, 02:18:16) 
[GCC 7.5.0]
Pandas version: 1.3.5
Fast AI version: 1.0.61
Torch version: 1.11.0+cu102
Cuda Available: True
CuDNN Enabled: True


In [3]:
# Tunable settings for the whole notebook.
Movie_data_size = '100k'  # MovieLens dataset size passed to the loader
N_factors = 40            # n_factors passed to collab_learner
epochs = 5                # number of epochs passed to fit_one_cycle
Top_k = 10                # k used by the ranking metrics

In [4]:
# Load the MovieLens ratings and store the user/item IDs as strings.
id_dtypes = {USER: 'str', ITEM: 'str'}
ratings_df = movielens.load_pandas_df(
    size=Movie_data_size,
    header=[USER, ITEM, RATING, TIMESTAMP],
).astype(id_dtypes)
ratings_df.head()

100%|█████████████████████████████████████| 4.81k/4.81k [00:02<00:00, 1.91kKB/s]


Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [19]:
# Split the ratings 75/25 into a train set and a test set.
train_valid_df, test_df = python_stratified_split(
    ratings_df,
    ratio=0.75,
    min_rating=1,
    filter_by='item',
    col_user=USER,
    col_item=ITEM,
)

# Keep only the test rows whose user also appears in the training split.
# The column is looked up through the USER constant (not a hard-coded `.userID`
# attribute) so the filter stays consistent with the rest of the notebook.
test_df = test_df[test_df[USER].isin(train_valid_df[USER])]

### Training

In [20]:
# Seed NumPy, PyTorch and every CUDA device with the same value.
for set_seed in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    set_seed(27)

In [21]:
# Column mapping for the DataBunch; valid_pct=0 is passed so that (presumably)
# no rows are held out for validation.
databunch_options = dict(user_name=USER, item_name=ITEM, rating_name=RATING, valid_pct=0)

# Time the construction of the DataBunch from the training frame.
with Timer() as preprocess_time:
    data = CollabDataBunch.from_df(train_valid_df, **databunch_options)

In [24]:
# Display a sample batch from the DataBunch as a quick sanity check of the inputs.
data.show_batch()

userID,itemID,target
511,292,5.0
393,940,2.0
655,1012,3.0
170,322,5.0
60,496,4.0


데이터에 대해 *EmbeddingDotBias* 모델을 기본으로 사용하는 `collab_learner`를 만든다. 임베딩 파라미터는 사전에 정의되지 않고 학습 과정에서 모델이 학습한다.   
모델은 최대 learning rate(`max_lr`)를 지정하여 5 에폭 동안 학습시킨다. 1cycle 정책에 따라 learning rate는 학습 초반에 `max_lr`까지 올라간 뒤 점차 낮아진다.

In [25]:
# Build the collaborative-filtering learner, then show its model as the cell output.
learner_options = {'n_factors': N_factors, 'y_range': [0, 5.5], 'wd': 0.1}
learn = collab_learner(data, **learner_options)
learn.model

EmbeddingDotBias(
  (u_weight): Embedding(944, 40)
  (i_weight): Embedding(1683, 40)
  (u_bias): Embedding(944, 1)
  (i_bias): Embedding(1683, 1)
)

In [27]:
# Time the one-cycle training run.
with Timer() as train_time:
    learn.fit_one_cycle(epochs, max_lr=0.005)

training_summary = 'Took {} seconds for training'.format(train_time)
print(training_summary)
    

epoch     train_loss  valid_loss  time    
0         0.959332    #na#        00:02     
1         0.895546    #na#        00:02     
2         0.758320    #na#        00:02     
3         0.655181    #na#        00:02     
4         0.552219    #na#        00:02     
Took 10.8386 seconds for training


In [28]:
# Export the trained learner into a temporary directory so it can be reloaded later.
# `tmp` is kept at module level because the loading cell reads `tmp.name`.
tmp = TemporaryDirectory()
model_dir, model_file = tmp.name, 'FastAI_movielens_model.pkl'
model_path = os.path.join(model_dir, model_file)
learn.export(model_path)

### Generating Recommendations


In [64]:
# Load the exported learner back from the temporary directory
learner = load_learner(tmp.name, 'FastAI_movielens_model.pkl')

# Get all users and items that the model knows.
# The first entry of each class list is dropped (presumably fastai's placeholder
# for unknown values -- confirm against the fastai version in use).
total_users, total_items = learner.data.train_ds.x.classes.values()
total_items = total_items[1:]
total_users = total_users[1:]

# Keep only the test-set users that the model knows
test_users = test_df[USER].unique()
test_users = np.intersect1d(test_users, total_users)

# Build the cartesian product of those test users and all items known to the model
users_items = cartesian_product(np.array(test_users), np.array(total_items))
users_items = pd.DataFrame(users_items, columns=[USER, ITEM])

# Remove the user/item pairs that are already in the training set (anti-join).
# Only the key columns are merged and the merge indicator marks unmatched pairs,
# so the result does not depend on RATING being non-null in the training data
# and the non-key columns are never cast to str.
seen_pairs = train_valid_df[[USER, ITEM]].astype(str).drop_duplicates()
train_removed = pd.merge(users_items, seen_pairs, on=[USER, ITEM], how='left', indicator=True)
train_removed = train_removed.loc[train_removed['_merge'] == 'left_only', [USER, ITEM]]


### Score the model to find the top-K recommendation

In [65]:
# Score every candidate (user, item) pair and time the call.
with Timer() as test_time:
    top_k_scores = score(
        learner,
        test_df=train_removed,
        user_col=USER,
        item_col=ITEM,
        prediction_col=PREDICTION,
    )

n_predictions = len(train_removed)
print('Took {} seconds for {} predictions.'.format(test_time, n_predictions))

Took 0.9122 seconds for 1511060 predictions.


In [69]:
# Keyword arguments shared by every ranking metric below.
ranking_args = dict(
    col_user=USER,
    col_item=ITEM,
    col_rating=RATING,
    col_prediction=PREDICTION,
    relevancy_method='top_k',
    k=Top_k,
)

# Each metric compares the held-out test ratings with the scored candidates.
eval_map = map_at_k(test_df, top_k_scores, **ranking_args)
eval_ndcg = ndcg_at_k(test_df, top_k_scores, **ranking_args)
eval_precision = precision_at_k(test_df, top_k_scores, **ranking_args)
eval_recall = recall_at_k(test_df, top_k_scores, **ranking_args)

In [71]:
# Assemble the ranking-metric report line by line, then print it in one call.
ranking_report = [
    "Model:\t" + learn.__class__.__name__,
    "Top K:\t%d" % Top_k,
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]
print(*ranking_report, sep='\n')

Model:	CollabLearner
Top K:	10
MAP:	0.027923
NDCG:	0.163131
Precision@K:	0.142842
Recall@K:	0.056960


In [75]:
# Predict a rating for every (user, item) pair in the test set; a copy is passed
# so the scorer works on its own frame.
scores = score(
    learner,
    test_df=test_df.copy(),
    user_col=USER,
    item_col=ITEM,
    prediction_col=PREDICTION,
)

# Keyword arguments shared by every rating metric below.
rating_args = dict(col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)

eval_r2 = rsquared(test_df, scores, **rating_args)
eval_rmse = rmse(test_df, scores, **rating_args)
eval_mae = mae(test_df, scores, **rating_args)
eval_exp_var = exp_var(test_df, scores, **rating_args)

# Assemble the rating-metric report line by line, then print it in one call.
rating_report = [
    "Model:\t" + learn.__class__.__name__,
    "RMSE:\t%f" % eval_rmse,
    "MAE:\t%f" % eval_mae,
    "Explained variance:\t%f" % eval_exp_var,
    "R squared:\t%f" % eval_r2,
]
print(*rating_report, sep='\n')


Model:	CollabLearner
RMSE:	0.902726
MAE:	0.713194
Explained variance:	0.345878
R squared:	0.345169
