# exp001

In [1]:
import os
import sys
from dotenv import load_dotenv
load_dotenv()
sys.path.append(os.getenv('UTILS_PATH'))

import pandas as pd
import numpy as np
import line_notify

ディレクトリ設定

In [2]:
# Input and output locations are taken from environment variables
# (load_dotenv() in the import cell populates them from a .env file).
# NOTE(review): both values are later combined with file names by plain string
# concatenation, so they presumably need to end with a path separator - confirm.
INPUT_DIR = os.getenv('INPUT_DIR')
OUTPUT_DIR = os.getenv('OUTPUT_DIR')
# Disabled alternative that would derive the experiment name from __file__;
# the name is hard-coded on the next line instead.
#exp_name = os.path.dirname(__file__).split('/')[-1]
exp_name = 'exp001'
# Create the per-experiment output directory; an already existing directory is not an error.
os.makedirs(OUTPUT_DIR + exp_name, exist_ok=True)

データ読み込み

In [3]:
# Article master data: every column is read as object (no numeric inference).
articles = pd.read_csv(INPUT_DIR + 'articles.csv', dtype='object')
# Customer master data, read with pandas' default dtype inference.
customers = pd.read_csv(INPUT_DIR + 'customers.csv')
# Purchase history: article_id is kept as a string and t_dat is parsed as a datetime,
# which the date-based train/validation split relies on.
transactions = pd.read_csv(INPUT_DIR + 'transactions_train.csv', dtype={'article_id':'str'}, parse_dates=['t_dat'])
# Sample submission; its customer_id column lists the customers to predict for.
sample = pd.read_csv(INPUT_DIR + 'sample_submission.csv')

データ分割

In [4]:
# Time-based hold-out: everything before `valid_start` is used for training,
# everything from `valid_start` onwards is used for validation.
valid_start = '2020-09-16'
is_train_period = transactions['t_dat'] < valid_start
is_valid_period = transactions['t_dat'] >= valid_start
train = transactions.loc[is_train_period].copy()
valid = transactions.loc[is_valid_period].copy()

# バリデーション

validの正解データ作成

In [5]:
# Ground truth for validation: one row per customer holding the list of
# distinct articles that customer bought during the validation period.
valid_unique = valid.loc[:, ['customer_id', 'article_id']].drop_duplicates()
valid_true = (
    valid_unique
    .groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
)

予測値作成（trainの最頻値で予測値を作る）

In [6]:
def customer_frequent_reccomend(df, n=12):
    """Recommend, for each customer, the articles that customer bought most often.

    Purchases are counted per (customer_id, article_id) pair and the `n` pairs
    with the highest count are kept for every customer.

    Args:
        df (dataframe): transaction data with `customer_id`, `article_id` and
            `t_dat` columns (rows with a missing `t_dat` are not counted).
        n (int): maximum number of articles to return per customer.

    Returns:
        dataframe: one row per customer found in `df`, with columns
            `customer_id` and `article_id` (a list of at most `n` article ids,
            most frequently bought first).
    """
    customer_agg = (
        df.groupby(['customer_id', 'article_id'])['t_dat']
        .count()
        .reset_index()
        .rename(columns={'t_dat': 'cnt'})
    )
    # Descending sort puts each customer's most frequently bought articles first.
    customer_agg = customer_agg.sort_values(['customer_id', 'cnt'], ascending=False)
    # Honour `n` here; this limit used to be hard-coded to 12 regardless of the argument.
    customer_agg = customer_agg.groupby('customer_id').head(n)
    result = customer_agg.groupby('customer_id')['article_id'].apply(list).reset_index()
    return result

def popular_article_reccomend(df, n=12):
    """Return the overall best-selling articles.

    Counts the purchases of every article across all customers and keeps
    the `n` articles with the highest count.

    Args:
        df (dataframe): transaction data to aggregate.
        n (int): number of articles to return.

    Returns:
        list: article ids, most purchased first.
    """
    # Number of purchases per article over the whole data set.
    purchase_counts = (
        df.groupby('article_id')['t_dat']
        .count()
        .reset_index()
        .rename(columns={'t_dat': 'cnt'})
    )
    top_articles = purchase_counts.sort_values(['cnt'], ascending=False).head(n)
    return list(top_articles['article_id'].values)

def get_reccomend(target_customer_id, train, n=12):
    """Return the recommended articles for each target customer.

    A customer's own most frequently bought articles come first. If there are
    fewer than `n` of them, the list is topped up with the overall most popular
    articles (duplicates removed, order preserved). Customers without any
    purchase in `train` receive the overall most popular articles.

    Args:
        target_customer_id (list): customer ids to produce recommendations for.
        train (dataframe): transaction data the recommendations are built from.
        n (int): number of articles to recommend per customer.

    Returns:
        dataframe: one row per entry of `target_customer_id`, in the same order,
            with columns `customer_id` and `article_id` (a list of at most `n`
            article ids).
    """
    result = pd.DataFrame()
    result['customer_id'] = target_customer_id

    customer_freq = customer_frequent_reccomend(train, n)
    popular_article = popular_article_reccomend(train, n)

    result = result.merge(customer_freq, on='customer_id', how='left')

    def _complete(own_articles):
        """Turn one merged cell into the final recommendation list."""
        # Customers absent from `train` come out of the left merge as NaN, not a list.
        if not isinstance(own_articles, list):
            # Return a copy so that rows never share one mutable list object.
            return list(popular_article)
        if len(own_articles) >= n:
            # Slice defensively in case the per-customer list is longer than n.
            return own_articles[:n]
        # dict.fromkeys removes duplicates while keeping first-seen order.
        return list(dict.fromkeys(own_articles + popular_article))[:n]

    # Build the column in a single pass; the explicit object dtype keeps each
    # cell a Python list instead of letting pandas interpret the nested lists.
    result['article_id'] = pd.Series(
        [_complete(articles) for articles in result['article_id']],
        index=result.index,
        dtype=object,
    )
    return result

In [7]:
# Predict only for customers that appear in the validation ground truth,
# using the training-period transactions as the only source of information.
# NOTE(review): the scoring cell compares valid_true and valid_pred row by row,
# so it assumes get_reccomend returns the customers in the order given here - confirm.
target_id = valid_true['customer_id'].tolist()
valid_pred = get_reccomend(target_id, train)

In [8]:
def apk(y_true, y_pred, K=12):
    """Compute the average precision at K for every (truth, prediction) pair.

    For each pair, the precision at every rank where a predicted item is in
    the truth is summed and divided by min(number of true items, K).

    Args:
        y_true (list): one collection of relevant items per sample.
        y_pred (list): one ranked list of predicted items per sample; each list
            must hold at most K items and no duplicates.
        K (int): cut-off rank.

    Returns:
        list: AP@K value (float) per sample, in input order. A sample without
            any true item (or K == 0) scores 0.0 instead of raising
            ZeroDivisionError.

    Raises:
        AssertionError: if the inputs differ in length, or a prediction list is
            longer than K or contains duplicates.
    """
    assert len(y_true) == len(y_pred)
    scores = []
    # zip pairs the samples by position, independent of how the containers are indexed.
    for y_i_true, y_i_pred in zip(y_true, y_pred):
        # Check the number of predictions and that they contain no duplicates.
        assert len(y_i_pred) <= K
        assert len(set(y_i_pred)) == len(y_i_pred)

        denominator = min(len(y_i_true), K)
        if denominator <= 0:
            # Nothing can be hit; avoid dividing by zero.
            scores.append(0.0)
            continue

        # Set membership is O(1) per lookup instead of scanning the truth list.
        relevant = set(y_i_true)
        sum_precision = 0.0
        num_hits = 0.0
        for rank, item in enumerate(y_i_pred, start=1):
            if item in relevant:
                num_hits += 1
                sum_precision += num_hits / rank
        scores.append(sum_precision / denominator)
    return scores

In [9]:
# MAP@12: mean of the per-customer AP@12 values over the validation customers.
# valid_true and valid_pred are compared position by position, so both frames
# must list the customers in the same order.
mapa12 = np.mean(apk(valid_true['article_id'].tolist(), valid_pred['article_id'].tolist()))
# A single f-string replaces the placeholder-less f-string + str.format pair; the printed text is unchanged.
print(f'MAP@12 : {mapa12:.5f}')

MAP@12 : 0.00904


# sub（提出ファイルの作成）

In [10]:
# Build the submission: recommend for every customer listed in the sample
# submission, this time using the complete transaction history
# (training and validation periods together).
target_id = sample['customer_id'].tolist()
sub = get_reccomend(target_id, transactions)

In [11]:
# Rename the list column to `prediction` and turn each list of article ids
# into one space-separated string.
sub = sub.rename(columns={'article_id':'prediction'})
# Guard for re-running this cell: once `prediction` already holds strings,
# joining again would insert a space between every character, so strings are
# left untouched.
sub['prediction'] = sub['prediction'].apply(
    lambda x: x if isinstance(x, str) else ' '.join(x)
)

In [12]:
# Write the submission into the experiment's output directory, without the index column.
sub.to_csv(OUTPUT_DIR + f'{exp_name}/{exp_name}_sub.csv', index=False)

In [13]:
# Preview the first rows of the submission as a sanity check.
sub.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601006 0797065001 0176209023 0568601043 06...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0811835004 0351484002 0689898002 0723529001 05...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0351484002 0663713001 0750424014 0870304002 05...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0732413001 0742079001 0706016001 0706016002 03...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0399061015 0589440005 0634249005 0677049001 06...


In [14]:
# Send a completion notice containing the experiment name and the (unrounded)
# validation MAP@12 through line_notify, which is imported in the first cell;
# its implementation is not part of this notebook.
message = f'{exp_name} is finished!\nvalid_score : {mapa12}'
line_notify.send(message)