In [8]:
import os
import datetime
from dateutil.relativedelta import relativedelta
from unittest import result
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from functools import reduce
from cuml.experimental.preprocessing import MinMaxScaler
from cuml.preprocessing import Normalizer
import itertools

import cudf
import cuml

import builtins
import types

def imports():
    """Yield ``(name, object)`` pairs for module-level globals.

    A global is yielded once if it is a module and once if it exposes
    ``__call__``; an object matching both checks is therefore yielded twice
    (harmless when the pairs are collected into a dict).
    """
    predicates = (
        # module imports
        lambda obj: isinstance(obj, types.ModuleType),
        # functions / callables
        lambda obj: hasattr(obj, '__call__'),
    )
    for key, obj in globals().items():
        for matches in predicates:
            if matches(obj):
                yield key, obj


def noglobal(f):
    '''
    Rebuild ``f`` so that its global namespace only contains the modules and
    callables returned by ``imports()`` at decoration time.

    Any other module-level variable (e.g. a DataFrame defined in another
    notebook cell) is invisible to the returned function, so accidentally
    relying on it raises ``NameError`` instead of silently using hidden state.

    ref: https://gist.github.com/raven38/4e4c3c7a179283c441f575d6e375510c

    Args:
        f: the function to isolate.

    Returns:
        A new function sharing ``f``'s code, positional defaults and closure,
        but bound to the restricted globals snapshot.
    '''
    isolated = types.FunctionType(f.__code__,
                                  dict(imports()),
                                  f.__name__,
                                  f.__defaults__,
                                  f.__closure__
                                  )
    # FunctionType() takes no keyword-only defaults, so without this a
    # signature such as ``def g(a, *, b=2)`` would require ``b`` after wrapping.
    if f.__kwdefaults__ is not None:
        isolated.__kwdefaults__ = dict(f.__kwdefaults__)
    # The restricted globals carry no ``__name__``, so the module would
    # otherwise be lost; keep the remaining metadata in sync with ``f`` too.
    isolated.__module__ = f.__module__
    isolated.__qualname__ = f.__qualname__
    isolated.__doc__ = f.__doc__
    isolated.__dict__.update(f.__dict__)
    return isolated

In [3]:
INPUT_DIR = 'dataset/'

# Read the three parquet tables from INPUT_DIR into cudf DataFrames.
transactions, customers, articles = (
    cudf.read_parquet(INPUT_DIR + file_name)
    for file_name in ('transactions.parquet', 'customers.parquet', 'articles.parquet')
)

In [4]:
# Label-encode the categorical customer columns; each column is cast to str
# and its nulls replaced by '' before fitting a fresh encoder.
label_encode_column = [
    'FN',
    'Active',
    'fashion_news_frequency',
    'club_member_status',
    'postal_code',
]
for c in label_encode_column:
    le = cuml.preprocessing.LabelEncoder()
    customers[c] = le.fit_transform(customers[c].astype(str).fillna(''))

# Missing ages are replaced by the mean age truncated to an integer.
customers['age'] = customers['age'].fillna(int(customers['age'].mean()))

# Null-value counts noted for the raw table: FN 895050, Active 907576.
# Neither of those two columns is kept in the selection below.
select_column = [
    'customer_id',
    'club_member_status',
    'fashion_news_frequency',
    'age',
    'postal_code',
]
customers = customers[select_column]

In [5]:
# Categorical article attributes to label-encode.
label_encode_column = [
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]

# Each column is cast to str, nulls become '', and a fresh encoder is fitted.
for c in label_encode_column:
    le = cuml.preprocessing.LabelEncoder()
    articles[c] = le.fit_transform(articles[c].astype(str).fillna(''))

# Keep only the key plus the encoded columns, with article_id first.
label_encode_column = ['article_id'] + label_encode_column
articles = articles[label_encode_column]

## Generate candidates

In [6]:
## Recommend Most Often Previously Purchased Items
def get_customer_frequent(history, n=12, timedelta=None):
    """Return, per customer, up to ``n`` of their most often purchased articles.

    Args:
        history: cudf DataFrame of transactions with ``customer_id``,
            ``article_id`` and ``t_dat`` columns.
        n: maximum number of articles kept for each customer.
        timedelta: optional look-back window; when given, only transactions
            with ``t_dat`` on or after ``max(t_dat) - timedelta`` are counted.

    Returns:
        cudf DataFrame with ``customer_id`` and ``article_id`` columns.
    """
    if timedelta is not None:
        cutoff = history['t_dat'].max() - timedelta
        history = history[history['t_dat'] >= cutoff].copy()

    # Number of purchases for every (customer, article) pair.
    pair_counts = (
        history.groupby(['customer_id', 'article_id'])['t_dat']
        .agg('count')
        .reset_index()
    )
    pair_counts.columns = ['customer_id', 'article_id', 'ct']

    ranked = pair_counts.sort_values(['customer_id', 'ct'], ascending=False)
    # The per-customer head(n) is taken in pandas, then moved back to cudf.
    top_pairs = ranked.to_pandas().groupby('customer_id').head(n)
    return cudf.DataFrame.from_pandas(top_pairs[['customer_id', 'article_id']])

In [7]:
@noglobal
def get_popular_article(history, n=12, timedelta=None):
    """Return the ``n`` articles with the most transactions.

    Args:
        history: transactions with ``article_id`` and ``t_dat`` columns.
        n: number of articles to return.
        timedelta: optional look-back window; when given, only transactions
            with ``t_dat`` on or after ``max(t_dat) - timedelta`` are counted.

    Returns:
        list of article ids, ordered by descending transaction count.
    """
    if timedelta is not None:
        cutoff = history['t_dat'].max() - timedelta
        history = history[history['t_dat'] >= cutoff].copy()

    top_articles = (
        history.groupby('article_id')['t_dat']
        .count()
        .reset_index()
        .rename(columns={'t_dat': 'cnt'})
        .sort_values(['cnt'], ascending=False)
        .head(n)
    )
    return list(top_articles['article_id'].values)

In [26]:
@noglobal
def add_labels(recom_result, history):
    """Flag each recommended (customer, article) pair that appears in ``history``.

    The original note describes ``history`` as the transactions of the
    training (label) period.

    Args:
        recom_result: recommendation candidates with ``customer_id`` and
            ``article_id`` columns.
        history: transactions with ``customer_id`` and ``article_id`` columns.

    Returns:
        ``recom_result`` left-joined with a ``buy`` column: 1 where the pair
        occurs in ``history``, 0 otherwise.
    """
    pair_keys = ['customer_id', 'article_id']
    purchased = history[pair_keys].drop_duplicates().assign(buy=1)
    labeled = recom_result.merge(purchased, on=pair_keys, how='left')
    # Pairs without a match in `history` come back null from the left join.
    labeled['buy'] = labeled['buy'].fillna(0)
    return labeled

In [27]:
@noglobal
def get_reccomend(target_customer_id, history, Ns, first_week_sales_pred):
    """Build the candidate (customer_id, article_id) pairs for the target customers.

    For each look-back window (full history, 7, 30 and 365 days) two candidate
    sources are combined:

    * each customer's own most frequently purchased articles
      (``get_customer_frequent``), and
    * the overall most popular articles (``get_popular_article``) crossed with
      every target customer.

    Args:
        target_customer_id: list of customer ids to generate candidates for.
        history: cudf DataFrame of past transactions.
        Ns: mapping giving the number of candidates per source and window.
            Keys read here: ``cf_a``, ``cf_w``, ``cf_m``, ``cf_y`` (customer
            frequent) and ``pa_a``, ``pa_w``, ``pa_m``, ``pa_y`` (popular
            articles); the suffix is all / week / month / year.
        first_week_sales_pred: currently unused; kept so existing callers
            keep working.

    Returns:
        cudf DataFrame of de-duplicated ``customer_id`` / ``article_id`` pairs
        restricted to ``target_customer_id``.
    """
    # (suffix of the Ns keys, look-back window); None means the whole history.
    windows = (
        ('a', None),
        ('w', np.timedelta64(7, 'D')),
        ('m', np.timedelta64(30, 'D')),
        ('y', np.timedelta64(365, 'D')),
    )

    # NOTE: further candidate sources (customer-type frequent, article-type
    # frequent, popular new articles, purchased-together pairs) were disabled
    # in the original implementation and are not generated here.
    result = None
    for suffix, td in windows:
        frequent = get_customer_frequent(history, Ns['cf_' + suffix], td)

        # Cross every target customer with the popular articles of this window.
        popular_ids = get_popular_article(history, Ns['pa_' + suffix], td)
        popular = pd.DataFrame(
            itertools.product(target_customer_id, popular_ids),
            columns=['customer_id', 'article_id'],
        )
        popular['article_id'] = popular['article_id'].astype('int32')

        parts = [frequent, cudf.DataFrame.from_pandas(popular)]
        if result is not None:
            parts.insert(0, result)
        # De-duplicate after every window so the frame does not keep growing
        # with pairs that earlier windows already produced.
        result = cudf.concat(parts).drop_duplicates()

    # get_customer_frequent covers every customer in `history`; keep targets only.
    return result[result['customer_id'].isin(target_customer_id)].copy()

In [28]:
@noglobal
def add_features(df, history, customers, articles, first_week_sales_pred=None, text_svd_df=None):
    """Attach customer and article attributes to the candidate pairs.

    Args:
        df: candidate pairs with ``customer_id`` and ``article_id`` columns.
        history: not used by the currently active code.
        customers: customer attribute table, joined on ``customer_id``.
        articles: article attribute table, joined on ``article_id``.
        first_week_sales_pred: not used by the currently active code.
        text_svd_df: not used by the currently active code.

    Returns:
        ``df`` left-joined with ``customers`` and then with ``articles``.
    """
    # NOTE: feature sources that were commented out in the original and are
    # therefore not applied: make_article_tran_features,
    # make_customer_features, make_customer_tran_features,
    # make_customer_article_features, make_new_article_features, the
    # text_svd_df merge, and add_same_article_type_rate per article attribute.
    with_customer = df.merge(customers, on=['customer_id'], how='left')
    return with_customer.merge(articles, on=['article_id'], how='left')
    

In [29]:
# Training parameters; the (currently commented-out) training loop passes
# this mapping to lgb.train.
params = dict(
    objective='binary',
    boosting='gbdt',
    learning_rate=0.01,
    metric='binary_logloss',
    device='gpu',
    seed=42,
)

NameError: name 'SEED' is not defined

In [30]:
def apk(y_true, y_pred, K=12):
    """Compute average precision at K for every (truth, prediction) pair.

    Args:
        y_true: sequence of ground-truth item collections, one per sample.
        y_pred: sequence of ranked prediction lists, one per sample; each
            list must hold at most ``K`` items and no duplicates.
        K: cut-off rank.

    Returns:
        list of floats, one AP@K score per sample. A sample whose ground
        truth is empty scores 0.0 instead of raising ZeroDivisionError.

    Raises:
        AssertionError: if the two inputs differ in length, or a prediction
            list is longer than ``K`` or contains duplicates.
    """
    assert(len(y_true) == len(y_pred))
    scores = []
    for y_i_true, y_i_pred in zip(y_true, y_pred):
        # Check the number of predictions and that none is repeated.
        assert(len(y_i_pred) <= K)
        assert(len(np.unique(y_i_pred)) == len(y_i_pred))

        denominator = min(len(y_i_true), K)
        if denominator <= 0:
            # Nothing could have been hit, so the score is defined as 0.
            scores.append(0.0)
            continue

        # Set membership keeps each lookup O(1); fall back to the original
        # container if its items are not hashable.
        try:
            truth = set(y_i_true)
        except TypeError:
            truth = y_i_true

        sum_precision = 0.0
        num_hits = 0.0
        for rank, p in enumerate(y_i_pred, start=1):
            if p in truth:
                num_hits += 1
                sum_precision += num_hits / rank
        scores.append(sum_precision / denominator)
    return scores

In [None]:
# # 学習データの作成
# # 1週ずつローリングして学習データを生成
# train_start = np.datetime64('2020-09-09')
# valid_start = np.datetime64('2020-09-16')
# valid_end = np.datetime64('2020-09-22')


# # hist_st = np.datetime64(train_start)
# # target_st = np.datetime64(valid_start) 
# # ml_train = pd.DataFrame()

# fi = pd.DataFrame()
# # vl_pred = np.zeros(len(ml_valid))
# scores = []

# for i in range(N_ITER):
#     print(i)
#     history_tran = transactions[transactions['t_dat'] < train_start].copy()
#     target_tran = transactions[(transactions['t_dat'] >= train_start) & (transactions['t_dat'] < valid_start)].copy()
# #     first_week_sales_pred_tmp = first_week_sales_pred[(first_week_sales_pred['1st_week_sales_dat'] >= target_tran['t_dat'].min())&(first_week_sales_pred['1st_week_sales_dat'] <= target_tran['t_dat'].max())]
#     target_id= target_tran['customer_id'].unique().to_pandas().tolist()
#     print(f'train target_id count: {len(target_id)}')
#     ########## todo
#     first_week_sales_pred_tmp = None
#     ##########
#     recom = get_reccomend(target_id, history_tran, Ns, first_week_sales_pred_tmp)
#     ml_train = add_labels(recom, target_tran)
#     ml_train = add_features(ml_train, history_tran, customers, articles).fillna(0).to_pandas()

# #     # 評価データの作成
#     history_tran = transactions[transactions['t_dat'] < valid_start].copy()
#     target_tran = transactions[(transactions['t_dat'] >= valid_start) & (transactions['t_dat'] <= valid_end)].copy()
#     target_id = target_tran['customer_id'].unique().to_pandas().tolist()
#     print(f'val target_id count: {len(target_id)}')
# #     ########## todo
# #     first_week_sales_pred=None
# #     ########## 
#     recom = get_reccomend(target_id, history_tran, Ns, first_week_sales_pred_tmp)
#     ml_valid = add_labels(recom, target_tran)
#     ml_valid = add_features(ml_valid, history_tran, customers, articles).fillna(0).to_pandas()

#     target = 'buy'
#     not_use_cols = ['customer_id', 'article_id','t_dat', target]
#     features = [c for c in ml_train.columns if c not in not_use_cols]
    
#     # 学習
#     tr_x, tr_y = ml_train[features], ml_train[target]
#     vl_x, vl_y = ml_valid[features], ml_valid[target]
#     tr_data = lgb.Dataset(tr_x, label=tr_y)
#     vl_data = lgb.Dataset(vl_x, label=vl_y)
#     model = lgb.train(params, tr_data, valid_sets=[tr_data, vl_data],
#                     num_boost_round=20000, callbacks=[early_stopping(100), log_evaluation(1000)])
    
#     # 特徴量重要度
#     fi_tmp = pd.DataFrame()
#     fi_tmp['iter'] = N_ITER
#     fi_tmp['feature'] = model.feature_name()
#     fi_tmp['importance'] = model.feature_importance(importance_type='gain')
#     fi = fi.append(fi_tmp)

# #     # cv
#     vl_pred = model.predict(vl_x, num_iteration=model.best_iteration)
# #     # 正解データ作成
#     valid = transactions[(transactions['t_dat'] >= valid_start) & (transactions['t_dat'] <= valid_end)].copy()
#     valid = valid[['customer_id', 'article_id']].drop_duplicates().to_pandas()
#     valid = valid.groupby('customer_id')['article_id'].apply(list).reset_index()
#     valid = valid.sort_values('customer_id').reset_index(drop=True)
    
#     # 2値分類の出力を元に12個選定
#     valid_pred = ml_valid[['customer_id', 'article_id']].copy()
#     valid_pred['prob'] = vl_pred
#     valid_pred = valid_pred.sort_values(['customer_id', 'prob'], ascending=False)
#     valid_pred = valid_pred.groupby('customer_id').head(12)
#     valid_pred = valid_pred.groupby('customer_id')['article_id'].apply(list).reset_index()
#     valid_pred = valid_pred.sort_values('customer_id').reset_index(drop=True)
#     assert(valid['customer_id'].tolist() == valid_pred['customer_id'].tolist())
#     # MAP@12
#     score = np.mean(apk(valid['article_id'].tolist(), valid_pred['article_id'].tolist()))
#     print('{:.5f}'.format(score))
# #     print(f'{valid_start.strftime("%Y-%m-%d")} - {valid_end.strftime("%Y-%m-%d")} : ' + '{:.5f}'.format(score))
#     scores.append(score)
# ##################
# #     ml_valid['pred'] = vl_pred
# #     ml_valid.to_csv(OUTPUT_DIR + f'{exp_name}/{exp_name}_oof{i}.csv', index=False)

# #     with open(OUTPUT_DIR + f'{exp_name}/model{i}.pickle', 'wb') as f:
# #         pickle.dump(model, f)

# #     train_start = train_start - relativedelta(days=7)
# #     valid_start = valid_start - relativedelta(days=7)
# #     valid_end = valid_end - relativedelta(days=7)
#     break
# # ml_train = ml_train.reset_index(drop=True)
# # ml_train.to_csv(OUTPUT_DIR + f'{exp_name}/ml_train.csv', index=False)

In [None]:
# fi_n = fi['feature'].nunique()
# order = list(fi.groupby("feature").mean().sort_values("importance", ascending=False).index)
# plt.figure(figsize=(10, fi_n*0.2))
# sns.barplot(x="importance", y="feature", data=fi, order=order)
# plt.title(f"LGBM importance")
# plt.tight_layout()
# # plt.savefig(OUTPUT_DIR + f'{exp_name}/lgbm_importance.png')

In [31]:
# del history_tran, target_tran, target_id, recom
# del ml_train, ml_valid, tr_x, tr_y, vl_x, vl_y, tr_data, vl_data, valid, valid_pred
# import gc
# gc.collect()