# dev002

NMFによるレコメンド

In [9]:
import os
import sys
import itertools
import datetime
from dateutil.relativedelta import relativedelta
from dotenv import load_dotenv
load_dotenv()
sys.path.append(os.getenv('UTILS_PATH'))

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

import line_notify

In [10]:
from scipy import sparse
from sklearn.decomposition import NMF

In [11]:
import builtins
import types

def imports():
    for name, val in globals().items():
        # module imports
        if isinstance(val, types.ModuleType):
            yield name, val

            # functions / callables
        if hasattr(val, '__call__'):
            yield name, val


def noglobal(f):
    '''
    ref: https://gist.github.com/raven38/4e4c3c7a179283c441f575d6e375510c
    '''
    return types.FunctionType(f.__code__,
                              dict(imports()),
                              f.__name__,
                              f.__defaults__,
                              f.__closure__
                              )

In [12]:
def apk(y_true, y_pred, K=12):
    assert(len(y_true) == len(y_pred))
    apks = []
    for idx in range(len(y_true)):
        y_i_true = y_true[idx]
        y_i_pred = y_pred[idx]

        # 予測値の数と重複の確認
        assert(len(y_i_pred) <= K)
        assert(len(np.unique(y_i_pred)) == len(y_i_pred))

        sum_precision = 0.0
        num_hits = 0.0

        for i, p in enumerate(y_i_pred):
            if p in y_i_true:
                num_hits += 1
                precision = num_hits / (i+1)
                sum_precision += precision
        apk = sum_precision / min(len(y_i_true), K)
        apks.append(apk)
    return apks

In [13]:
SEED = 42

ディレクトリ設定

In [14]:
INPUT_DIR = os.getenv('INPUT_DIR')
OUTPUT_DIR = os.getenv('OUTPUT_DIR')
#exp_name = os.path.dirname(__file__).split('/')[-1]
exp_name = 'dev002'
#os.makedirs(OUTPUT_DIR + exp_name, exist_ok=True)

データ読み込み

In [15]:
articles = pd.read_csv(INPUT_DIR + 'articles.csv', dtype='object')
customers = pd.read_csv(INPUT_DIR + 'customers.csv')
transactions = pd.read_csv(INPUT_DIR + 'transactions_train.csv', dtype={'article_id':'str'}, parse_dates=['t_dat'])
sample = pd.read_csv(INPUT_DIR + 'sample_submission.csv')

In [16]:
timedelta = relativedelta(years=1)
valid_start = datetime.datetime(2020,9,16)
train = transactions[transactions['t_dat'] < valid_start].copy()
valid = transactions[transactions['t_dat'] >= valid_start].copy()


In [17]:
ALL_USERS = customers['customer_id'].unique().tolist()
ALL_ITEMS = articles['article_id'].unique().tolist()

user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

In [19]:
target_customer_id = valid['customer_id'].unique().tolist()

In [24]:
target_id = [user_map[id_] for id_ in target_customer_id]

In [25]:
target_id

[330,
 349,
 356,
 412,
 487,
 497,
 601,
 1124,
 1195,
 1270,
 1340,
 1483,
 1930,
 1947,
 1979,
 2030,
 2093,
 2104,
 2193,
 2337,
 2370,
 2817,
 2913,
 2981,
 3016,
 3097,
 3104,
 3119,
 3129,
 3192,
 3221,
 3243,
 3608,
 3662,
 3776,
 3844,
 3857,
 3892,
 3943,
 4087,
 4723,
 4724,
 4812,
 4927,
 5131,
 5554,
 5763,
 5874,
 5974,
 5993,
 6039,
 6123,
 6130,
 6281,
 6421,
 6715,
 6817,
 6896,
 7115,
 7187,
 7218,
 7355,
 7409,
 7983,
 8009,
 8051,
 8066,
 8243,
 8421,
 8470,
 8488,
 8680,
 8771,
 8833,
 9403,
 9551,
 9668,
 9843,
 9936,
 10442,
 10549,
 10650,
 10901,
 10905,
 11314,
 11315,
 11525,
 12022,
 12106,
 12835,
 12946,
 13210,
 13423,
 13438,
 13536,
 13732,
 13967,
 14224,
 14355,
 14580,
 14792,
 14800,
 14963,
 14971,
 14997,
 15316,
 15356,
 15369,
 15394,
 15449,
 15654,
 15820,
 15832,
 15911,
 16118,
 16644,
 16669,
 16744,
 16856,
 16899,
 17146,
 17160,
 17621,
 18218,
 18257,
 18322,
 18466,
 18494,
 18526,
 18673,
 18679,
 18737,
 18791,
 18876,
 19026,
 19047

In [8]:
timedelta = relativedelta(years=1)
valid_start = datetime.datetime(2020,9,16)
train = transactions[transactions['t_dat'] < valid_start].copy()
valid = transactions[transactions['t_dat'] >= valid_start].copy()

if timedelta is not None:
    st_date = train['t_dat'].max() - timedelta
    train = train[train['t_dat']>=st_date].copy()

ALL_USERS = customers['customer_id'].unique().tolist()
ALL_ITEMS = articles['article_id'].unique().tolist()

user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

train['user_id'] = train['customer_id'].map(user_map)
train['item_id'] = train['article_id'].map(item_map)

train = train[['user_id', 'item_id']].drop_duplicates()

row = train['user_id'].values
col = train['item_id'].values
data = np.ones(train.shape[0])
csr_train = sparse.csr_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))

valid = valid[['customer_id', 'article_id']].drop_duplicates()
valid['user_id'] = valid['customer_id'].map(user_map)
target_id = valid['user_id'].unique().tolist()
# 正解データ作成
valid_true = valid.groupby('customer_id')['article_id'].apply(list).reset_index()
valid_true = valid_true.sort_values('customer_id').reset_index(drop=True)

for n_comp in [2,3,4,5,6,7,8,9]:
    model = NMF(n_components=n_comp, init='random', random_state=SEED, max_iter=1000)
    W = model.fit_transform(csr_train)
    H = model.components_

    R = np.dot(W[target_id], H)
    R = np.argsort(R)[:,::-1]
    R = R[:,:12]
    result = pd.DataFrame(R).stack().reset_index()
    result['customer_id'] = result['level_0'].map(user_ids)
    result['article_id'] = result[0].map(item_ids)

    # 2値分類の出力を元に12個選定
    valid_pred = result.groupby('customer_id')['article_id'].apply(list).reset_index()
    valid_pred = valid_pred.sort_values('customer_id').reset_index(drop=True)

    score = np.mean(apk(valid_true['article_id'].tolist(), valid_pred['article_id'].tolist()))
    print(f'{score}')
    message = f'{exp_name}_{n_comp} is finished!\nn_iter : {model.n_iter_}\nvalid_score : {score}'
    line_notify.send(message)

0.0029104912738429785
0.002154358407532727
0.0020641356934154123
0.002036791577549044
0.002564125601260985
0.002523231476778935
0.00246345953890304
0.002420186088398113
