# ens001

In [1]:
import os
import sys
import gc
import itertools
import pickle
import pathlib
import datetime
from dateutil.relativedelta import relativedelta
from dotenv import load_dotenv
load_dotenv()
sys.path.append(os.getenv('UTILS_PATH'))

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.optimize import minimize

import line_notify

In [2]:
import builtins
import types

def imports():
    for name, val in globals().items():
        # module imports
        if isinstance(val, types.ModuleType):
            yield name, val

            # functions / callables
        if hasattr(val, '__call__'):
            yield name, val


def noglobal(f):
    '''
    ref: https://gist.github.com/raven38/4e4c3c7a179283c441f575d6e375510c
    '''
    return types.FunctionType(f.__code__,
                              dict(imports()),
                              f.__name__,
                              f.__defaults__,
                              f.__closure__
                              )

ディレクトリ設定

In [3]:
INPUT_DIR = os.getenv('INPUT_DIR')
OUTPUT_DIR = os.getenv('OUTPUT_DIR')
#exp_name = os.path.dirname(__file__).split('/')[-1]
exp_name = 'ens001'
os.makedirs(OUTPUT_DIR + exp_name, exist_ok=True)

In [4]:
SEED = 48
exps = ['exp035', 'exp038', 'exp043', 'exp044', 'exp045']

データセット準備

In [5]:
articles = pd.read_csv(INPUT_DIR + 'articles.csv', dtype='object')
customers = pd.read_csv(INPUT_DIR + 'customers.csv')
transactions = pd.read_csv(INPUT_DIR + 'transactions_train.csv', dtype={'article_id':'str'}, parse_dates=['t_dat'])
sample = pd.read_csv(INPUT_DIR + 'sample_submission.csv')

ALL_CUSTOMER = customers['customer_id'].unique().tolist()
ALL_ARTICLE = articles['article_id'].unique().tolist()

customer_ids = dict(list(enumerate(ALL_CUSTOMER)))
article_ids = dict(list(enumerate(ALL_ARTICLE)))

customer_map = {u: uidx for uidx, u in customer_ids.items()}
article_map = {i: iidx for iidx, i in article_ids.items()}

articles['article_id'] = articles['article_id'].map(article_map)
customers['customer_id'] = customers['customer_id'].map(customer_map)
transactions['article_id'] = transactions['article_id'].map(article_map)
transactions['customer_id'] = transactions['customer_id'].map(customer_map)
sample['customer_id'] = sample['customer_id'].map(customer_map)

In [6]:
# 正解データ
valid_start = '2020-09-16'
valid_end = '2020-09-22'
valid = transactions[(transactions['t_dat'] >= valid_start) & (transactions['t_dat'] <= valid_end)].copy()
valid = valid[['customer_id', 'article_id']].drop_duplicates()
valid = valid.groupby('customer_id')['article_id'].apply(list).reset_index()
valid = valid.sort_values('customer_id').reset_index(drop=True)

In [7]:
# OOFデータ
oof = pd.DataFrame()
for exp in exps:
    exp_oof = pd.read_csv(OUTPUT_DIR + f'{exp}/{exp}_oof.csv', usecols=['customer_id', 'article_id', 'pred'])
    exp_oof = exp_oof.sort_values(['customer_id', 'pred'], ascending=False)
    exp_oof = exp_oof.groupby('customer_id').head(12)
    exp_oof = exp_oof.sort_values(['customer_id', 'pred'])
    exp_oof['exp'] = exp
    exp_oof['rank'] = exp_oof.groupby('customer_id').cumcount() + 1
    oof = pd.concat([oof, exp_oof[['customer_id', 'article_id', 'exp', 'rank']]])

In [8]:
# TESTデータ
def func(s, idx):
    s = s.split()
    return s[idx]

test = pd.DataFrame()
for exp in exps:
    exp_test = pd.read_csv(OUTPUT_DIR + f'{exp}/{exp}_sub.csv')
    prediction = exp_test['prediction']

    for i in range(12):
        exp_test_tmp = exp_test[['customer_id']]
        exp_test_tmp['exp'] = exp
        exp_test_tmp['article_id'] = prediction.apply(func, idx=i)
        exp_test_tmp['rank'] = 12 - i
        test = pd.concat([test, exp_test_tmp])

In [9]:
def apk(y_true, y_pred, K=12):
    assert(len(y_true) == len(y_pred))
    apks = []
    for idx in range(len(y_true)):
        y_i_true = y_true[idx]
        y_i_pred = y_pred[idx]

        # 予測値の数と重複の確認
        assert(len(y_i_pred) <= K)
        assert(len(np.unique(y_i_pred)) == len(y_i_pred))

        sum_precision = 0.0
        num_hits = 0.0

        for i, p in enumerate(y_i_pred):
            if p in y_i_true:
                num_hits += 1
                precision = num_hits / (i+1)
                sum_precision += precision
        apk = sum_precision / min(len(y_i_true), K)
        apks.append(apk)
    return apks

In [10]:
def f(x):
    oof_ = oof.copy()
    ws = x + [1 - sum(x)]
    oof_['w'] = 0
    for exp, w in zip(exps, ws):
        oof_.loc[oof_['exp']==exp, 'w'] = w
    oof_['rank'] = oof_['rank'] * oof_['w']
    oof_agg = oof_.groupby(['customer_id', 'article_id'])['rank'].sum().reset_index()
    oof_agg = oof_agg.sort_values(['customer_id', 'rank'], ascending=False)
    oof_agg = oof_agg.groupby('customer_id').head(12)
    oof_agg = oof_agg.groupby('customer_id')['article_id'].apply(list).reset_index()
    oof_agg = oof_agg.sort_values('customer_id').reset_index(drop=True)
    score = np.mean(apk(valid['article_id'].tolist(), oof_agg['article_id'].tolist()))
    print(-score)
    return -score

In [11]:
init_state = [round(1 / len(exps), 3) for _ in range(len(exps) - 1)]
result = minimize(f, init_state, method='Nelder-Mead')

-0.03275620872970546
-0.032749882532147306
-0.0327768476119052
-0.03283934499477748
-0.03281684554931684
-0.03282405545180976
-0.03282408318672374
-0.03281673811842985
-0.03281459093804206
-0.032809340058103674
-0.032808337260732984
-0.03280465038480665
-0.032786500154900956
-0.03283453907975187
-0.03284129259792004
-0.032825576824232074
-0.032837315294629375
-0.032834645427842465
-0.032836930195815656
-0.03282558487181878
-0.03283742014567792
-0.03284209640988969
-0.0328084630923106
-0.03284128000940264
-0.032808488017888986
-0.0328374506758728
-0.03283834293368419
-0.0328384425814014
-0.03283877560207764
-0.03283718625890185
-0.032838721765839234
-0.032839885123148324
-0.03284070003530913
-0.032840648278411065
-0.0328346144390306
-0.03283862220403488
-0.03283760396227034
-0.032841826741443
-0.03284018729986096
-0.032840393602796884
-0.032839154599832845
-0.03283933881693693
-0.032836402489531985
-0.03284086033377359
-0.032840677249723234
-0.03283755663839309
-0.03284086033377359
-0.0

In [12]:
print('optimized CV: ', -result['fun'])
print('w: ', result['x'])

optimized CV:  0.03284221953921152
w:  [0.19667888 0.19909511 0.20999136 0.2055299 ]


# sub作成

In [13]:
ws = list(result['x']) + [1 - sum(result['x'])]
test['w'] = 0
for exp, w in zip(exps, ws):
    test.loc[test['exp']==exp, 'w'] = w
    test['rank'] = test['rank'] * test['w']

test = test.groupby(['customer_id', 'article_id'])['rank'].sum().reset_index()
test = test.sort_values(['customer_id', 'rank'], ascending=False)
test = test.groupby('customer_id').head(12)
test = test.groupby('customer_id')['article_id'].apply(list).reset_index()

sub = sample['customer_id'].map(customer_ids).to_frame()
sub = sub.merge(test, on=['customer_id'], how='left')
sub = sub.rename(columns={'article_id':'prediction'})
assert(sub['prediction'].apply(len).min()==12)
sub['prediction'] = sub['prediction'].apply(lambda x: ' '.join(x))
sub.to_csv(OUTPUT_DIR + f'{exp_name}/{exp_name}_sub.csv', index=False)

In [14]:
sub

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601006 0568601043 0915529003 0924243002 06...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0714790020 0448509014 0915529003 0706016001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0901666001 0924243001 0794321008 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0751471043 0918522001 0918292001 0791587001 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0896152002 0730683050 0924243002 0791587015 06...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0557599022 0866731001 0720125039 0918292001 08...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0762846008 0706016001 0448509014 0706016003 07...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0762846027 0706016002 0866731001 0918292001 07...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0714790020 0448509014 0706016002 0706016001 07...
