# exp060_1st_stage_rate_check

In [1]:
import os
import sys
import gc
import itertools
import pickle
import pathlib
import datetime
from dateutil.relativedelta import relativedelta
from dotenv import load_dotenv
load_dotenv()
sys.path.append(os.getenv('UTILS_PATH'))

import pandas as pd
import numpy as np
import cudf
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

import line_notify

In [2]:
import builtins
import types

def imports():
    for name, val in globals().items():
        # module imports
        if isinstance(val, types.ModuleType):
            yield name, val

            # functions / callables
        if hasattr(val, '__call__'):
            yield name, val


def noglobal(f):
    '''
    ref: https://gist.github.com/raven38/4e4c3c7a179283c441f575d6e375510c
    '''
    return types.FunctionType(f.__code__,
                              dict(imports()),
                              f.__name__,
                              f.__defaults__,
                              f.__closure__
                              )

In [3]:
SEED = 42
RUN_INF = True # 推論処理を行うか
BATCH_SIZE = int(1e5)
N_ITER = 20 # 学習データのローリング数
RUN_US = True # アンダーサンプリング実施有無
N_SEED = 10 # seed avgの回数

In [4]:
INIT_N = 12

In [5]:
Ns = {}
Ns['cf_a'] = INIT_N
Ns['ctf_a'] = INIT_N
Ns['atfd_a'] = INIT_N
Ns['atfp_a'] = INIT_N
Ns['pa_a'] = INIT_N

Ns['cf_w'] = INIT_N
Ns['ctf_w'] = INIT_N
Ns['atfd_w'] = INIT_N
Ns['atfp_w'] = INIT_N
Ns['pa_w'] = INIT_N

Ns['cf_m'] = INIT_N
Ns['ctf_m'] = INIT_N
Ns['atfd_m'] = INIT_N
Ns['atfp_m'] = INIT_N
Ns['pa_m'] = INIT_N

Ns['cf_y'] = INIT_N
Ns['ctf_y'] = INIT_N
Ns['atfd_y'] = INIT_N
Ns['atfp_y'] = INIT_N
Ns['pa_y'] = INIT_N

ディレクトリ設定

In [6]:
INPUT_DIR = os.getenv('INPUT_DIR')
OUTPUT_DIR = os.getenv('OUTPUT_DIR')
#exp_name = os.path.dirname(__file__).split('/')[-1]
#exp_name = 'exp060'
#os.makedirs(OUTPUT_DIR + exp_name, exist_ok=True)

データ読み込み

In [7]:
articles = pd.read_csv(INPUT_DIR + 'articles.csv', dtype='object')
customers = pd.read_csv(INPUT_DIR + 'customers.csv')
transactions = pd.read_csv(INPUT_DIR + 'transactions_train.csv', dtype={'article_id':'str'}, parse_dates=['t_dat'])
sample = pd.read_csv(INPUT_DIR + 'sample_submission.csv')

In [8]:
first_week_sales_pred = pd.read_csv(OUTPUT_DIR + '1st_week_sales_pred_v004/result.csv', dtype={'article_id':'str'},  parse_dates=['1st_week_sales_dat'])

# 前処理

In [9]:
ALL_CUSTOMER = customers['customer_id'].unique().tolist()
ALL_ARTICLE = articles['article_id'].unique().tolist()

customer_ids = dict(list(enumerate(ALL_CUSTOMER)))
article_ids = dict(list(enumerate(ALL_ARTICLE)))

customer_map = {u: uidx for uidx, u in customer_ids.items()}
article_map = {i: iidx for iidx, i in article_ids.items()}

articles['article_id'] = articles['article_id'].map(article_map)
customers['customer_id'] = customers['customer_id'].map(customer_map)
transactions['article_id'] = transactions['article_id'].map(article_map)
transactions['customer_id'] = transactions['customer_id'].map(customer_map)
sample['customer_id'] = sample['customer_id'].map(customer_map)
first_week_sales_pred['article_id'] = first_week_sales_pred['article_id'].map(article_map) 

In [10]:
# 名寄せ
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].str.replace('None','NONE')

In [11]:
customers['age10'] = str((customers['age'] // 10) * 10)
customers.loc[customers['age'].isnull(), 'age10'] = np.nan

In [12]:
# label_encoding
le_cols = ['product_type_name', 'product_group_name', 'graphical_appearance_name',
            'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'department_name',
            'index_name', 'section_name', 'garment_group_name']
for c in le_cols:
    le = LabelEncoder()
    articles[c] = le.fit_transform(articles[c].fillna(''))


le_cols = ['club_member_status', 'fashion_news_frequency', 'postal_code', 'age10']
for c in le_cols:
    le = LabelEncoder()
    customers[c] = le.fit_transform(customers[c].fillna(''))

In [13]:
ALL_INDEX_GROUP_NAME = articles['index_group_name'].unique().tolist()
index_group_name_ids = dict(list(enumerate(ALL_INDEX_GROUP_NAME)))
index_group_name_map = {u: uidx for uidx, u in index_group_name_ids.items()}
articles['index_group_name'] = articles['index_group_name'].map(index_group_name_map)

In [14]:
customers['customer_type'] = customers['FN'].fillna(0).astype(int).astype(str) + \
                             customers['Active'].fillna(0).astype(int).astype(str) + \
                             customers['club_member_status'].fillna(0).astype(int).astype(str) + \
                             customers['fashion_news_frequency'].fillna(0).astype(int).astype(str) + \
                             customers['age10'].fillna(0).astype(int).astype(str)

le = LabelEncoder()
customers['customer_type'] = le.fit_transform(customers['customer_type'])

In [15]:
# transactionに紐づけ
transactions = transactions.merge(customers, on='customer_id', how='left')
transactions = transactions.merge(articles, on='article_id', how='left')

# データセット作成（レコメンド→対象データセット作成→特徴量エンジニアリング）

In [16]:
@noglobal
def get_customer_frequent(history, n=12, timedelta=None):
    """顧客ごと商品の購入数をカウントし上位の商品を抽出

    Args:
        history (dataframe): 集計対象の実績データ
        n (int): レコメンド対象とする数
        timedelta (dateutil.relativedelta): 指定された場合、実績データの終端からtimedelta分のデータを取得する

    Returns:
        dataframe: 抽出結果
    """
    if timedelta is not None:
        st_date = history['t_dat'].max() - timedelta
        history = history[history['t_dat']>=st_date].copy()
        
    customer_agg = history.groupby(['customer_id', 'article_id'])['t_dat'].count().reset_index()
    customer_agg = customer_agg.rename(columns={'t_dat':'cnt'})
    customer_agg = customer_agg.sort_values(['customer_id', 'cnt'], ascending=False)
    result = customer_agg.groupby('customer_id').head(n)
    return result[['customer_id', 'article_id']]

@noglobal
def get_popular_article(history, n=12, timedelta=None):
    """全体の購入数をカウントし上位の商品を抽出

    Args:
        history (dataframe): 集計対象の実績データ
        n (int): レコメンド対象とする数
        timedelta (dateutil.relativedelta): 指定された場合、実績データの終端からtimedelta分のデータを取得する

    Returns:
        list: 抽出結果
    """
    # 全体の購入数量
    if timedelta is not None:
        st_date = history['t_dat'].max() - timedelta
        history = history[history['t_dat']>=st_date].copy()

    total_agg = history.groupby('article_id')['t_dat'].count().reset_index()
    total_agg = total_agg.rename(columns={'t_dat':'cnt'})
    total_agg = total_agg.sort_values(['cnt'], ascending=False)
    total_agg = total_agg.head(n)
    result = list(total_agg['article_id'].values)
    return result

@noglobal
def get_customer_type_frequent(history, n=12, timedelta=None):
    if timedelta is not None:
        st_date = history['t_dat'].max() - timedelta
        history = history[history['t_dat']>=st_date].copy()

    result = history[['customer_id', 'customer_type']].drop_duplicates().copy()
    agg = history.groupby(['customer_type', 'article_id'])['t_dat'].count().reset_index()
    agg = agg.rename(columns={'t_dat':'cnt'})
    agg = agg.sort_values(['customer_type', 'cnt'], ascending=False)
    agg = agg.groupby('customer_type').head(n)
    result = result.merge(agg[['customer_type', 'article_id']], on='customer_type', how='left')
    return result[['customer_id', 'article_id']]

@noglobal
def get_article_type_frequent(history, col, n=12, timedelta=None):
    if timedelta is not None:
        st_date = history['t_dat'].max() - timedelta
        history = history[history['t_dat']>=st_date].copy()

    result = history.groupby(['customer_id', col])['t_dat'].count().reset_index()
    result = result.rename(columns={'t_dat':'cnt'})
    result = result.sort_values(['customer_id', 'cnt'], ascending=False)
    result = result.groupby(['customer_id']).head(1)[['customer_id', col]]

    agg = history.groupby([col, 'article_id'])['t_dat'].count().reset_index()
    agg = agg.rename(columns={'t_dat':'cnt'})
    agg = agg.sort_values([col, 'cnt'], ascending=False)
    agg = agg.groupby(col).head(n)
    result = result.merge(agg[[col, 'article_id']], on=col, how='left')
    return result[['customer_id', 'article_id']]

@noglobal
def get_popular_new_article(first_week_sales_pred, n=12):
    """新商品の初週売り上げ予測が高い商品を抽出
    """
    first_week_sales_pred = first_week_sales_pred.sort_values(['1st_week_sales_pred'], ascending=False)
    first_week_sales_pred = first_week_sales_pred.head(n)
    result = list(first_week_sales_pred['article_id'].values)
    return result

@noglobal
def calc_pair(history):
    df = history[['article_id', 't_dat', 'customer_id']].copy()
    df = cudf.from_pandas(df)
    df['t_dat'] = df['t_dat'].factorize()[0].astype('int16')
    dt = df.groupby(['customer_id','t_dat'])['article_id'].agg(list).rename('pair').reset_index()
    df = df[['customer_id', 't_dat', 'article_id']].merge(dt, on=['customer_id', 't_dat'], how='left')
    del dt
    gc.collect()

    # Explode the rows vs list of articles
    df = df[['article_id', 'pair']].explode(column='pair')
    gc.collect()
        
    # Discard duplicates
    df = df.loc[df['article_id']!=df['pair']].reset_index(drop=True)
    gc.collect()

    # Count how many times each pair combination happens
    df = df.groupby(['article_id', 'pair']).size().rename('count').reset_index()
    gc.collect()
        
    # Sort by frequency
    df = df.sort_values(['article_id' ,'count'], ascending=False).reset_index(drop=True)
    gc.collect()

    # pick only top1 most frequent pair
    df = df.groupby('article_id').nth(0).reset_index()
    pair = dict(zip(df['article_id'].to_arrow().to_pylist(), df['pair'].to_arrow().to_pylist()))

    return pair

In [17]:
@noglobal
def add_labels(recom_result, history):
    """レコメンドしたデータが学習期間で購入されたかどうかのフラグを付与する

    Args:
        recom_result (_type_): レコメンド結果
        train_tran (_type_): 学習期間のトランザクションデータ

    Returns:
        _type_: 学習期間での購入フラグを付与したレコメンド結果
    """
    history = history[['customer_id', 'article_id']].drop_duplicates()
    history['buy'] = 1
    recom_result = recom_result.merge(history, on=['customer_id', 'article_id'], how='left')
    recom_result['buy'] = recom_result['buy'].fillna(0)
    return recom_result


In [32]:
@noglobal
def calc_recall(result, target_tran, recom_id):
    result = add_labels(result, target_tran)
    best_recall = 0.0
    best_n = 12
    for i in range(12):
        n = 12 - i
        recall = result.groupby('customer_id').head(n)['buy'].mean()
        if recall >= best_recall:
            best_recall = recall
            best_n = n
    print(f'[{recom_id}]  best_n: {best_n} | best_recall:' + '{:.3f}'.format(best_recall*100))


In [19]:
# 学習データの作成
# 1週ずつローリングして学習データを生成
train_start = '2020-09-09'
valid_start = '2020-09-16'
valid_end = '2020-09-22'

hist_st = train_start
target_st = valid_start

history_tran = transactions[transactions['t_dat'] < hist_st].copy()
target_tran = transactions[(transactions['t_dat'] >= hist_st) & (transactions['t_dat'] < target_st)].copy()
first_week_sales_pred_tmp = first_week_sales_pred[(first_week_sales_pred['1st_week_sales_dat'] >= target_tran['t_dat'].min())&(first_week_sales_pred['1st_week_sales_dat'] <= target_tran['t_dat'].max())]
target_id = target_tran['customer_id'].unique().tolist()

In [20]:
target_customer_id = target_id
history = history_tran
first_week_sales_pred = first_week_sales_pred_tmp

In [33]:
td = None
result = get_customer_frequent(history, Ns['cf_a'], td)
calc_recall(result, target_tran, 'cf_a')
result = get_customer_type_frequent(history, Ns['ctf_a'], td)
calc_recall(result, target_tran, 'ctf_a')
result = get_article_type_frequent(history, 'department_name', Ns['atfd_a'], td)
calc_recall(result, target_tran, 'atfd_a')
result = get_article_type_frequent(history, 'perceived_colour_master_name', Ns['atfp_a'], td)
calc_recall(result, target_tran, 'atfp_a')
popular_article = get_popular_article(history, Ns['pa_a'], td)
# customerとpopular articleの全組み合わせでdataframe作成
result = pd.DataFrame(itertools.product(target_customer_id, popular_article), columns=['customer_id', 'article_id'])
calc_recall(result, target_tran, 'pa_a')

td = relativedelta(weeks=1)
result = result.append(get_customer_frequent(history, Ns['cf_w'], td))
calc_recall(result, target_tran, 'cf_w')
result = get_customer_type_frequent(history, Ns['ctf_w'], td)
calc_recall(result, target_tran, 'ctf_w')
result = get_article_type_frequent(history, 'department_name', Ns['atfd_w'], td)
calc_recall(result, target_tran, 'atfd_w')
result = get_article_type_frequent(history, 'perceived_colour_master_name', Ns['atfp_w'], td)
calc_recall(result, target_tran, 'atfp_w')
popular_article = get_popular_article(history, Ns['pa_w'], td)
# customerとpopular articleの全組み合わせでdataframe作成
result = pd.DataFrame(itertools.product(target_customer_id, popular_article), columns=['customer_id', 'article_id'])
calc_recall(result, target_tran, 'pa_w')

td = relativedelta(months=1)
result = result.append(get_customer_frequent(history, Ns['cf_m'], td))
calc_recall(result, target_tran, 'cf_m')
result = get_customer_type_frequent(history, Ns['ctf_m'], td)
calc_recall(result, target_tran, 'ctf_m')
result = get_article_type_frequent(history, 'department_name', Ns['atfd_m'], td)
calc_recall(result, target_tran, 'atfd_m')
result = get_article_type_frequent(history, 'perceived_colour_master_name', Ns['atfp_m'], td)
calc_recall(result, target_tran, 'atfp_m')
popular_article = get_popular_article(history, Ns['pa_m'], td)
# customerとpopular articleの全組み合わせでdataframe作成
result = pd.DataFrame(itertools.product(target_customer_id, popular_article), columns=['customer_id', 'article_id'])
calc_recall(result, target_tran, 'pa_m')

td = relativedelta(years=1)
result = result.append(get_customer_frequent(history, Ns['cf_y'], td))
calc_recall(result, target_tran, 'cf_y')
result = get_customer_type_frequent(history, Ns['ctf_y'], td)
calc_recall(result, target_tran, 'ctf_y')
result = get_article_type_frequent(history, 'department_name', Ns['atfd_y'], td)
calc_recall(result, target_tran, 'atfd_y')
result = get_article_type_frequent(history, 'perceived_colour_master_name', Ns['atfp_y'], td)
calc_recall(result, target_tran, 'atfp_y')
popular_article = get_popular_article(history, Ns['pa_y'], td)
# customerとpopular articleの全組み合わせでdataframe作成
result = pd.DataFrame(itertools.product(target_customer_id, popular_article), columns=['customer_id', 'article_id'])
calc_recall(result, target_tran, 'pa_y')

[cf_a]  best_n: 1 | best_recall:0.049
[ctf_a]  best_n: 1 | best_recall:0.025
[atfd_a]  best_n: 1 | best_recall:0.019
[atfp_a]  best_n: 1 | best_recall:0.028
[pa_a]  best_n: 1 | best_recall:0.515
[cf_w]  best_n: 2 | best_recall:0.264
[ctf_w]  best_n: 1 | best_recall:0.114
[atfd_w]  best_n: 1 | best_recall:0.280
[atfp_w]  best_n: 1 | best_recall:0.151
[pa_w]  best_n: 2 | best_recall:0.634
[cf_m]  best_n: 11 | best_recall:0.261
[ctf_m]  best_n: 3 | best_recall:0.066
[atfd_m]  best_n: 1 | best_recall:0.121
[atfp_m]  best_n: 1 | best_recall:0.090
[pa_m]  best_n: 1 | best_recall:0.691
[cf_y]  best_n: 12 | best_recall:0.058
[ctf_y]  best_n: 1 | best_recall:0.033
[atfd_y]  best_n: 1 | best_recall:0.037
[atfp_y]  best_n: 1 | best_recall:0.034
[pa_y]  best_n: 1 | best_recall:0.515
