# Wide and Deep
論文リンク：https://arxiv.org/pdf/1606.07792.pdf

In [None]:
import os
import itertools
from configparser import ConfigParser
from time import time, gmtime, strftime

import numpy as np
import pandas as pd
import tensorflow as tf
from utils import EarlyStoppingHook

from sklearn.metrics import roc_auc_score, accuracy_score

In [None]:
# Load the configuration file
config_filename = './config/WideAndDeep_config.ini'

config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with a clear message instead of with
# an opaque KeyError at the first config[...] lookup further down.
if not config.read(config_filename):
    raise FileNotFoundError(
        'Could not read config file: {}'.format(config_filename))


In [None]:
# Display the training-data filename pattern read from the config
# (handy as a check that the ini file was actually loaded).
config['train']['filename_pattern']

In [None]:
# Column definitions
HEADER = ['user_id', 'item_id', 'rating']
# One default per csv column, in HEADER order. parse_csv_row passes this to
# tf.decode_csv as `record_defaults`.
HEADER_DEFAULTS = [['0'] for _ in HEADER]

FEATURE_NAMES = ['user_id', 'item_id']
# Hash-bucket size of each categorical feature, read from the [model] section.
CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE = {
    'user_id': int(config['model']['user_bucket_size']),
    'item_id': int(config['model']['item_bucket_size']),
}

# Names for which json_serving_input_fn creates placeholders
# (this list includes the target column).
USED_FEATURE_NAMES = ['user_id', 'item_id', 'rating']

CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE)
TARGET = 'rating'
TARGET_LABELS = ['0', '1']


In [None]:
def parse_input_config(config, phase):
    '''Read the input-pipeline settings of one phase from the parsed ini file.

    configparser hands every value back as a str, so the numeric settings are
    converted to int here.

    Parameters
    --------------------
    config: dict
        Mapping from phase name to the settings of that phase
    phase: str
        Learning phase; one of {'train', 'eval', 'predict'}

    Returns
    --------------------
    filename_pattern: string
    batch_size: int
        Batch size used for the phase
    num_epochs: int
        Maximum number of epochs to run
    skip_header_lines: int
        Number of leading lines of the csv file to skip
    '''
    section = config[phase]
    filename_pattern = section['filename_pattern']
    # NOTE (from the original author): num_epochs must be a number; with None
    # an evaluation run never returns.
    batch_size, num_epochs, skip_header_lines = (
        int(section[key])
        for key in ('batch_size', 'num_epochs', 'skip_header_lines')
    )

    return filename_pattern, batch_size, num_epochs, skip_header_lines


In [None]:
def parse_csv_row(csv_row):
    '''Decode csv text into a feature dict and a target tensor.

    Used by csv_input_fn as the map function of the dataset.

    Parameters
    --------------------
    csv_row: Tensor
        string input holding csv-formatted text

    Returns
    --------------------
    features, target: dict of Tensor, Tensor
        Decoded columns keyed by the names in HEADER, with the TARGET column
        removed from the dict and returned separately. The data types follow
        the defaults given in HEADER_DEFAULTS.
    '''
    decoded_columns = tf.decode_csv(csv_row, record_defaults=HEADER_DEFAULTS)
    features = {name: column for name, column in zip(HEADER, decoded_columns)}

    # Split the label off so that only model inputs stay in the dict
    target = features.pop(TARGET)
    return features, target


In [None]:
def csv_input_fn(config, phase, mode=tf.estimator.ModeKeys.EVAL):
    '''Build the csv input pipeline and return the next-batch tensors.

    Parameters
    --------------------
    config: dict
        Mapping from phase name to the settings of that phase
    phase: str
        Learning phase; one of {'train', 'eval', 'predict'}
    mode: tf.estimator.ModeKeys
        Nearly the same information as phase (kept separate by the original
        author). Only TRAIN changes the pipeline: it enables shuffling.

    Returns
    --------------------
    features, target: dict of Tensor, Tensor
        Output of the dataset iterator's get_next(), i.e. one batch of
        features and targets as produced by parse_csv_row
    '''
    filename_pattern, batch_size, num_epochs, skip_header_lines = parse_input_config(config, phase)

    # Read every file matching the pattern. The header is skipped per file:
    # skip() on a single TextLineDataset built from all files only drops the
    # first lines of the first file, so the header rows of any further file
    # would be parsed as data.
    file_names = tf.matching_files(filename_pattern)
    dataset = tf.data.Dataset.from_tensor_slices(file_names).flat_map(
        lambda file_name: tf.data.TextLineDataset(file_name).skip(skip_header_lines))

    # Shuffle only while training
    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(batch_size * 2,
                                  seed=0,
                                  reshuffle_each_iteration=True)

    # Batch first, then parse: parse_csv_row receives a whole batch of csv lines
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(parse_csv_row)
    dataset = dataset.repeat(num_epochs)
    iterator = dataset.make_one_shot_iterator()

    features, target = iterator.get_next()

    return features, target

In [None]:
def json_serving_input_fn():
    '''Input receiver function used when exporting the model for serving.

    Returns
    --------------------
    tf.estimator.export.ServingInputReceiver
        Receiver whose features and receiver tensors are the same dict of
        placeholders, one per name in USED_FEATURE_NAMES
    '''
    # The target column gets a float32 placeholder, every other name a string one.
    # NOTE(review): USED_FEATURE_NAMES contains TARGET, so the exported
    # signature also expects the label as an input - confirm this is intended.
    placeholders = {
        name: tf.placeholder(
            shape=[None],
            dtype=tf.float32 if name == TARGET else tf.string)
        for name in USED_FEATURE_NAMES
    }

    return tf.estimator.export.ServingInputReceiver(placeholders, placeholders)


In [None]:
def train_input_fn():
    '''Input function for training: 'train' settings, TRAIN mode.'''
    return csv_input_fn(config=config,
                        phase='train',
                        mode=tf.estimator.ModeKeys.TRAIN)


def eval_input_fn():
    '''Input function for evaluation: 'eval' settings, EVAL mode.'''
    return csv_input_fn(config=config,
                        phase='eval',
                        mode=tf.estimator.ModeKeys.EVAL)


In [None]:
# Training spec: upper bound on the number of steps comes from [train] max_steps;
# EarlyStoppingHook (from utils) is configured with [model] early_stop.
train_spec = tf.estimator.TrainSpec(
    input_fn=train_input_fn,
    max_steps=int(config['train']['max_steps']),
    hooks=[EarlyStoppingHook(int(config['model']['early_stop']))],
)

# Evaluation spec: no step limit (steps=None), a LatestExporter named
# "estimate" that uses json_serving_input_fn, and throttle_secs of 15.
eval_spec = tf.estimator.EvalSpec(
    input_fn=eval_input_fn,
    exporters=[
        tf.estimator.LatestExporter(
            name="estimate",
            serving_input_receiver_fn=json_serving_input_fn,
        ),
    ],
    steps=None,
    throttle_secs=15,
)


In [None]:
# Each run gets its own model directory, named after the UTC time at which
# this cell was executed (YYYYmmdd_HHMMSS).
raw_execute_time = gmtime()
execute_time = strftime("%Y%m%d_%H%M%S", raw_execute_time)
model_dir = os.path.join(
    config['path']['model_dir'],
    execute_time,
)


In [None]:
# Start from the default RunConfig, then point it at this run's model
# directory and save a checkpoint every 300 seconds.
run_config = tf.estimator.RunConfig()
run_config = run_config.replace(model_dir=model_dir,
                                save_checkpoints_secs=300)


In [None]:
# Sizes read from the [model] section of the config
wide_feature_dim = int(config['model']['wide_feature_dim'])
user_embedding_dim = int(config['model']['user_embedding_dim'])
item_embedding_dim = int(config['model']['item_embedding_dim'])

# Hashed categorical columns for the raw user / item ids
categorical_hash_user = tf.feature_column.categorical_column_with_hash_bucket(
    key='user_id',
    hash_bucket_size=CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE['user_id'])
categorical_hash_item = tf.feature_column.categorical_column_with_hash_bucket(
    key='item_id',
    hash_bucket_size=CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE['item_id'])

# Wide side: user x item cross, hashed into wide_feature_dim buckets
categorical_feature_user_x_categorical_feature_item = tf.feature_column.crossed_column(
    keys=['user_id', 'item_id'],
    hash_bucket_size=wide_feature_dim)

# Deep side: one embedding per hashed id column
categorical_feature_user_emb = tf.feature_column.embedding_column(
    categorical_hash_user,
    dimension=user_embedding_dim)
categorical_feature_item_emb = tf.feature_column.embedding_column(
    categorical_hash_item,
    dimension=item_embedding_dim)

wide_feature_columns = [categorical_feature_user_x_categorical_feature_item]
deep_feature_columns = [
    categorical_feature_user_emb,
    categorical_feature_item_emb,
]


In [None]:
dropout_prob = float(config['model']['dropout_prob'])
# Sizes of the hidden layers of the deep part (hard-coded, not read from the config)
hidden_units = [64, 32]

estimator = tf.estimator.DNNLinearCombinedClassifier(
    # Where and how to run
    model_dir=model_dir,
    config=run_config,
    # Labels: the strings listed in TARGET_LABELS
    n_classes=len(TARGET_LABELS),
    label_vocabulary=TARGET_LABELS,
    # Wide (linear) part
    linear_feature_columns=wide_feature_columns,
    # Deep (DNN) part
    dnn_feature_columns=deep_feature_columns,
    dnn_hidden_units=hidden_units,
    dnn_optimizer=tf.train.AdamOptimizer(),
    dnn_activation_fn=tf.nn.relu,
    dnn_dropout=dropout_prob,
)


In [None]:
# Run training and evaluation with the estimator and the train/eval specs defined above.
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

## 予測&評価

In [None]:
# Load the prediction data with pandas as well: its row count bounds the number
# of predictions collected, and its third column is used as the true labels.
# NOTE(review): read_csv treats the first line as a header by default and needs
# a single concrete path - confirm this matches [predict] skip_header_lines and
# filename_pattern in the config.
test_data = pd.read_csv(config['predict']['filename_pattern'])
test_size = len(test_data)


def predict_input_fn():
    '''Input function for prediction: 'predict' settings, PREDICT mode.'''
    return csv_input_fn(config=config,
                        phase='predict',
                        mode=tf.estimator.ModeKeys.PREDICT)


In [None]:
predictions = estimator.predict(input_fn=predict_input_fn)
# Take at most test_size results and keep the first element of each
# prediction's "logistic" entry.
values = [
    prediction["logistic"][0]
    for prediction in itertools.islice(predictions, test_size)
]


In [None]:
# True labels: third column of the test data
test_value = np.array(test_data.iloc[:, 2])
# Predicted scores and their rounded (0/1) counterparts
pred_value = np.array(values)
pred_value_binary = pred_value.round()

# AUC is computed on the raw scores, accuracy on the rounded ones
auc = roc_auc_score(test_value, pred_value)
accuracy = accuracy_score(test_value, pred_value_binary)
print('AUC: {:.4f}\nAccuracy: {:.4f}'.format(auc, accuracy))
