# Factorization Machines
論文リンク：https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf   
今回は[fastFM](https://github.com/ibayer/fastFM)というライブラリを使用して実装

In [None]:
#!pip install fastFM

In [None]:
import os
from time import time, gmtime, strftime
from configparser import ConfigParser
import numpy as np
import pandas as pd
import pickle
from fastFM import sgd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from utils import export_result


In [None]:
# Load the model settings from the INI config file.
config_filename = './config/FM_config.ini'

config = ConfigParser()
# NOTE: ConfigParser.read() silently skips files it cannot open; a missing
# file therefore only surfaces as a KeyError on config['model'] below.
config.read(config_filename)

# Echo every option of the [model] section as a quick sanity check.
model_section = config['model']
for option, raw_value in model_section.items():
    print(option, raw_value)


In [None]:
def config2paramDict(config):
    """Return the ``model`` section of *config* as a dict of numeric values.

    The options ``n_iter`` and ``rank`` are converted with ``int``; every
    other option is converted with ``float``. Key order follows the section.
    """
    integer_options = ('n_iter', 'rank')
    model_section = config['model']
    return {
        name: (int if name in integer_options else float)(model_section[name])
        for name in model_section.keys()
    }


In [None]:
# Convert the [model] section of the config into typed hyperparameters and show them.
param_dict = config2paramDict(config)
print(param_dict)

## データの入力
fastFMで二値分類をする場合ラベルを{+1, -1}にする必要があるためデータセットのラベルを変換しなくてはいけない

In [None]:
# Helper that simply converts a rating into the {+1, -1} label encoding.
def map_rating(x):
    """Return -1 when *x* equals 0, otherwise 1."""
    return -1 if x == 0 else 1
    

In [None]:
# If the data does not fit in memory, reduce the number of samples.
# Original author's note: the DictVectorizer used later needs the full data,
# otherwise prediction/evaluation fails, so every split is concatenated first.

train_df = pd.read_csv('../data/MovieLens20M/classification/train20m.csv')
eval_df = pd.read_csv('../data/MovieLens20M/classification/eval20m.csv')
test_df = pd.read_csv('../data/MovieLens20M/classification/test20m.csv')

# Remember how many trailing rows belong to the test split.
test_size = len(test_df)
print('Test data size: {}'.format(test_size))

# Original author's note: train and eval stay merged from here on because
# train_test_split is applied again when fastFM is fitted.
dataset = pd.concat([train_df, eval_df, test_df])

# The individual frames are no longer needed once they are concatenated.
del train_df, eval_df, test_df

# Re-encode the rating column as {-1, +1} labels.
dataset.rating = dataset.rating.map(map_rating)
dataset.head()


In [None]:
# The target variable is not fed to DictVectorizer, so it is split directly.
# y must stay one-dimensional (the original note says .reshape(-1, 1) fails at runtime).
labels = np.array(dataset.iloc[:, -1]).reshape(-1)
y_train = labels[:-test_size]
y_test = labels[-test_size:]

# Build the list of dicts for DictVectorizer: one {"user", "item"} dict per row,
# with the first two columns stringified so they are treated as categories.
t1 = time()
X_list = [
    {"user": str(int(fields[0])), "item": str(int(fields[1]))}
    for fields in dataset.iloc[:, :3].itertuples(index=False, name=None)
]
t2 = time()
print('Finished in {:.4f} seconds'.format(t2 - t1))

# The DataFrame is not used past this point; drop the reference.
del dataset


In [None]:
# Sanity check: number of feature dicts built for the vectorizer.
print(len(X_list))

In [None]:
# Vectorize the user/item dicts into a single feature matrix.
v = DictVectorizer()
X = v.fit_transform(X_list)

# The dict list is no longer needed once the matrix exists.
del X_list

# The last `test_size` rows form the test split; everything before is training data.
X_train, X_test = X[:-test_size], X[-test_size:]


## 学習

In [None]:
# Execution timestamp (from gmtime) used to tag this run.
raw_execute_time = gmtime()
execute_time = strftime("%Y%m%d_%H%M%S", raw_execute_time)

# Collect the hyperparameters read from the config. The config key is
# 'l2_reg_v' while the estimator keyword is 'l2_reg_V'.
fm_params = {
    name: param_dict[name]
    for name in ('n_iter', 'init_stdev', 'l2_reg_w', 'rank', 'step_size')
}
fm_params['l2_reg_V'] = param_dict['l2_reg_v']

fm = sgd.FMClassification(**fm_params)
fm.fit(X_train, y_train)


##### モデルファイルの保存

In [None]:
# Persist the trained model under a timestamped name. The context manager
# guarantees the file handle is flushed and closed even if pickling fails
# (the bare open() call previously left the handle open).
filename = '../models/' + execute_time + '_FM.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(fm, model_file)

## 予測・評価

In [None]:
# predict() yields hard class labels ({-1, +1} per the fastFM classifier API),
# which is what accuracy needs; rounding them is unnecessary.
y_pred = fm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# ROC AUC is a ranking metric and needs continuous scores. Feeding it the hard
# labels collapses the ROC curve to a single operating point, so use the
# positive-class probabilities from predict_proba() instead.
y_score = fm.predict_proba(X_test)
auc = roc_auc_score(y_test, y_score)

rankname = 'rank-' + str(param_dict['rank']) + ':'
print('Accuracy:')
print(rankname, '{:.4f}'.format(accuracy))


In [None]:
# Export the results.
# NOTE(review): export_result comes from the project-local utils module; presumably
# it records the metrics together with the config file and run timestamp — confirm.
model_name = 'FactorizationMachines'
export_result(model_name, auc, accuracy, config_filename, execute_time)
