# NeuralFM
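Paper: Xiangnan He and Tat-Seng Chua, "Neural Factorization Machines for Sparse Predictive Analytics" (SIGIR 2017) — https://www.comp.nus.edu.sg/~xiangnan/papers/sigir17-nfm.pdf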

In [None]:
import os
import itertools
import shutil
import math
from configparser import ConfigParser
from datetime import datetime
from time import time, gmtime, strftime

import numpy as np
import pandas as pd
import tensorflow as tf
from utils import EarlyStoppingHook, export_result
from sklearn.metrics import roc_auc_score, accuracy_score

In [None]:
# Load the settings from the INI config file.
config_filename = './config/NeuralFM_config.ini'

config = ConfigParser()
config.read(config_filename)

# Echo every entry of the [model] section as a quick sanity check.
for key, value in config['model'].items():
    print(key, value)


In [None]:
# Input data schema.
TARGET = 'rating'
TARGET_LABELS = [0, 1]

# CSV column names, in file order, and the per-column defaults handed to the
# CSV decoder.
HEADER = ['user_id', 'item_id', 'rating']
HEADER_DEFAULTS = [['0'], ['0'], [0]]

FEATURE_NAMES = ['user_id', 'item_id']
USED_FEATURE_NAMES = ['user_id', 'item_id', 'rating']

# Hash-bucket size for each categorical feature, read from the [model] section.
CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE = {
    feature_name: int(config['model'][option_name])
    for feature_name, option_name in (
        ('user_id', 'user_bucket_size'),
        ('item_id', 'item_bucket_size'),
    )
}
CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE)


In [None]:
# Embedding sizes for the two ID features, read from the [model] section.
user_embedding_dim = int(config['model']['user_embedding_dim'])
item_embedding_dim = int(config['model']['item_embedding_dim'])

# Column definitions: hash-bucketed categorical columns for the raw IDs.
categorical_hash_user_raw = tf.feature_column.categorical_column_with_hash_bucket(
    key='user_id',
    hash_bucket_size=CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE['user_id'])
categorical_hash_item_raw = tf.feature_column.categorical_column_with_hash_bucket(
    key='item_id',
    hash_bucket_size=CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE['item_id'])

# Indicator (one-hot style) wrappers, grouped as the "linear" feature list.
categorical_hash_user = tf.feature_column.indicator_column(categorical_hash_user_raw)
categorical_hash_item = tf.feature_column.indicator_column(categorical_hash_item_raw)
categorical_feature_linear = [categorical_hash_user, categorical_hash_item]

# Embedding columns over the same hashed IDs, grouped as the "emb" feature list.
categorical_feature_user_emb = tf.feature_column.embedding_column(
    categorical_hash_user_raw, dimension=user_embedding_dim)
categorical_feature_item_emb = tf.feature_column.embedding_column(
    categorical_hash_item_raw, dimension=item_embedding_dim)
categorical_feature_emb = [categorical_feature_user_emb, categorical_feature_item_emb]


In [None]:
# Hyper-parameters and feature columns handed to the model_fn via `params`.
params = dict(
    categorical_feature_linear=categorical_feature_linear,
    categorical_feature_emb=categorical_feature_emb,
    hidden_units=[64, 32],
    dropout_prob=float(config['model']['dropout_prob']),
    n_classes=len(TARGET_LABELS),
)

In [None]:
def NeuralFM_fn(features, labels, mode, params):
    """Estimator ``model_fn``: pairwise cross term -> MLP -> softmax classifier.

    Args:
        features: dict of input tensors keyed by feature name; read through the
            feature columns supplied in ``params``.
        labels: class-id tensor used for the loss and accuracy (unused in
            PREDICT mode).
        mode: a ``tf.estimator.ModeKeys`` value.
        params: dict with the keys ``categorical_feature_linear``,
            ``categorical_feature_emb``, ``hidden_units``, ``dropout_prob``
            and ``n_classes``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the requested mode.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # NOTE(review): the linear (indicator) input is built but never added to
    # the logits below -- confirm whether a first-order term was intended.
    input_features_linear = tf.feature_column.input_layer(features, params['categorical_feature_linear'])
    input_features_dnn = tf.feature_column.input_layer(features, params['categorical_feature_emb'])

    # 0.5 * ((sum x)^2 - sum(x^2)), reduced over axis 1 of the embedding input.
    # NOTE(review): this reduces to a single value per example; the NFM paper's
    # bi-interaction pooling keeps the embedding dimension -- confirm intent.
    feature_sq_sum = tf.square(tf.reduce_sum(input_features_dnn, axis=1, keepdims=True))
    feature_sum_sq = tf.reduce_sum(tf.square(input_features_dnn), axis=1, keepdims=True)
    cross_term = 0.5 * tf.subtract(feature_sq_sum, feature_sum_sq)

    # Stack the hidden layers: each layer consumes the previous layer's output.
    # Starting from cross_term also covers an empty ``hidden_units`` list.
    layer = cross_term
    for hidden_unit in params['hidden_units']:
        layer = tf.layers.dense(layer, units=hidden_unit, activation=tf.nn.relu)
    # Dropout must only be active while training; it is a no-op otherwise.
    layer = tf.layers.dropout(layer, rate=params['dropout_prob'], training=is_training)
    logits = tf.layers.dense(layer, params['n_classes'], activation=None)

    predicted_classes = tf.argmax(logits, 1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class_ids': predicted_classes[:, tf.newaxis],
            'probabilities': tf.nn.softmax(logits),
            'logits': logits,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    accuracy = tf.metrics.accuracy(labels=labels,
                                   predictions=predicted_classes,
                                   name='acc_op')

    metrics = {'accuracy': accuracy}
    # accuracy is a (value, update_op) pair; log the update op's result.
    tf.summary.scalar('accuracy', accuracy[1])

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=metrics)

    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)


In [None]:
def parse_input_config(config, phase):
    '''
    Parse the input settings of one phase from the config (internal helper).

    Config values arrive as strings, so the numeric settings (batch_size,
    num_epochs, skip_header_lines) are converted to int here.

    Args:
        config: the usual settings object, indexable as config[phase][option].
        phase: section name; one of {train, eval, predict}.

    Returns:
        Tuple (filename_pattern, batch_size, num_epochs, skip_header_lines).

    Raises:
        KeyError: if a required option other than num_epochs is missing.
        ValueError: if batch_size or skip_header_lines is not an integer.
    '''
    filename_pattern = config[phase]['filename_pattern']
    batch_size = int(config[phase]['batch_size'])
    try:
        num_epochs = int(config[phase]['num_epochs'])
    except (KeyError, ValueError, TypeError):
        # Missing or non-integer num_epochs falls back to a single pass;
        # None would make the dataset repeat forever.
        num_epochs = 1
    # Ideally this option would be shared across all phases.
    skip_header_lines = int(config[phase]['skip_header_lines'])

    return filename_pattern, batch_size, num_epochs, skip_header_lines



In [None]:
def parse_csv_row(csv_row):
    '''
    Decode CSV text into a (features, target) pair.

    Relies on the module-level HEADER, HEADER_DEFAULTS and TARGET; they cannot
    be passed as arguments because this function is used as a dataset map
    callback.
    '''
    decoded = tf.decode_csv(csv_row, record_defaults=HEADER_DEFAULTS)
    features = {name: tensor for name, tensor in zip(HEADER, decoded)}

    # Split the label column off from the feature dict.
    target = features.pop(TARGET)
    return features, target


In [None]:
def csv_input_fn(config, phase, mode=tf.estimator.ModeKeys.EVAL):
    '''
    Build the (features, target) input tensors for one phase from CSV files.

    The file pattern and the batching settings are read from config[phase]
    through parse_input_config.

    Args:
        config: settings object, indexable as config[phase][option].
        phase: section name; one of {train, eval, predict}.
        mode: a tf.estimator.ModeKeys value; rows are shuffled only in TRAIN.

    Returns:
        Tuple (features, target) as produced by parse_csv_row for one batch.
    '''
    shuffle = mode == tf.estimator.ModeKeys.TRAIN
    filename_pattern, batch_size, num_epochs, skip_header_lines = parse_input_config(config, phase)

    # Read every file matching the pattern; the header lines are skipped per
    # file so that later files do not leak their header row into the data.
    file_names = tf.matching_files(filename_pattern)
    dataset = tf.data.Dataset.from_tensor_slices(file_names).flat_map(
        lambda file_name: tf.data.TextLineDataset(file_name).skip(skip_header_lines))

    # Shuffle individual rows (before batching) while training.
    if shuffle:
        dataset = dataset.shuffle(buffer_size=2 * batch_size + 1)

    # Cut the stream into batches and decode each batch of CSV lines.
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(parse_csv_row)
    dataset = dataset.repeat(num_epochs)
    iterator = dataset.make_one_shot_iterator()

    features, target = iterator.get_next()

    return features, target

In [None]:
def json_serving_input_fn():
    '''
    Serving input receiver: one 1-D placeholder per name in USED_FEATURE_NAMES.

    Relies on the module-level USED_FEATURE_NAMES and TARGET. The TARGET
    placeholder is float32, every other one is a string.
    '''
    # NOTE(review): USED_FEATURE_NAMES contains TARGET, so the label is part of
    # the serving signature -- confirm that clients are expected to send it.
    receiver_tensor = {
        feature_name: tf.placeholder(
            shape=[None],
            dtype=tf.float32 if feature_name == TARGET else tf.string)
        for feature_name in USED_FEATURE_NAMES
    }

    # The same dict is used as both the features and the receiver tensors.
    return tf.estimator.export.ServingInputReceiver(receiver_tensor, receiver_tensor)


In [None]:
def train_input_fn():
    """Zero-argument input_fn reading the 'train' section in TRAIN mode."""
    return csv_input_fn(config=config,
                        phase='train',
                        mode=tf.estimator.ModeKeys.TRAIN)


def eval_input_fn():
    """Zero-argument input_fn reading the 'eval' section in EVAL mode."""
    return csv_input_fn(config=config,
                        phase='eval',
                        mode=tf.estimator.ModeKeys.EVAL)


In [None]:
# Training: capped at max_steps, with the project's early-stopping hook attached.
train_spec = tf.estimator.TrainSpec(
    input_fn=train_input_fn,
    max_steps=int(config['train']['max_steps']),
    hooks=[EarlyStoppingHook(int(config['model']['early_stop']))],
)

# Evaluation: consume the whole eval input (steps=None) and export a serving
# model named "estimate" after each evaluation.
eval_spec = tf.estimator.EvalSpec(
    input_fn=eval_input_fn,
    exporters=[
        tf.estimator.LatestExporter(
            name="estimate",
            serving_input_receiver_fn=json_serving_input_fn),
    ],
    steps=None,
    throttle_secs=15,
)

In [None]:
# Stamp this run with the current UTC time and use it as the model sub-directory.
raw_execute_time = gmtime()
execute_time = strftime("%Y%m%d_%H%M%S", raw_execute_time)

model_root = config['path']['model_dir']
model_dir = os.path.join(model_root, execute_time)
print(model_dir)

In [None]:
# Default run configuration, pointed at this run's directory and set to
# checkpoint every 300 seconds.
run_config = tf.estimator.RunConfig()
run_config = run_config.replace(model_dir=model_dir, save_checkpoints_secs=300)


In [None]:
# Wrap the custom model_fn in an Estimator with the params and run config above.
estimator = tf.estimator.Estimator(
    model_fn=NeuralFM_fn,
    config=run_config,
    params=params,
)

In [None]:
# Run the interleaved train / evaluate loop.
tf.estimator.train_and_evaluate(
    estimator=estimator,
    train_spec=train_spec,
    eval_spec=eval_spec,
)

## 予測&評価

In [None]:
# Load the prediction file with pandas to get the ground truth and row count.
# NOTE(review): read_csv receives the 'filename_pattern' option as a path --
# presumably it names a single file here; confirm.
predict_path = config['predict']['filename_pattern']
test_data = pd.read_csv(predict_path)
test_size = test_data.shape[0]


In [None]:
def predict_input_fn():
    """Zero-argument input_fn reading the 'predict' section in PREDICT mode."""
    return csv_input_fn(config=config,
                        phase='predict',
                        mode=tf.estimator.ModeKeys.PREDICT)


In [None]:
# Lazy generator of per-example prediction dicts.
predictions = estimator.predict(predict_input_fn)

In [None]:
# Take the predicted class id of the first test_size predictions.
values = [
    item["class_ids"][0]
    for item in itertools.islice(predictions, test_size)
]


In [None]:
# Ground-truth labels are read from the third column of the test file.
test_value = np.array(test_data.iloc[:, 2])
pred_value = np.array(values)
pred_value_binary = np.round(pred_value)

# ROC-AUC needs a continuous score: ranking by hard 0/1 class ids collapses the
# curve to a single operating point. The earlier predictions generator has
# already been consumed, so run prediction again and keep the class-1
# probability of each example.
pred_proba = np.array([
    item['probabilities'][1]
    for item in itertools.islice(estimator.predict(input_fn=predict_input_fn), test_size)
])

auc = roc_auc_score(test_value, pred_proba)
accuracy = accuracy_score(test_value, pred_value_binary)
print('AUC: {:.4f}\nAccuracy: {:.4f}'.format(auc, accuracy))


In [None]:
# Write out the results of this run.
model_name = 'NeuralFM'
export_result(
    model_name,
    auc,
    accuracy,
    config_filename,
    execute_time,
)
