# 1. 데이터 로드

### 주요 import 할 것들 미리 해두기 (나머지는 그때그때)

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime as dt
from datetime import timedelta as td

from tqdm.notebook import tqdm

In [2]:
import utils

### 경로 등 global config 설정

In [3]:
# RAW_ROOT is the directory handed to utils.load_raw; DATA_ROOT is the
# directory the preprocessed parquet files are read from.
RAW_ROOT, DATA_ROOT = 'raw', 'data'

### 각종 데이터 불러오기

In [4]:
%%time
# utils.load_raw returns four objects; only the last one is kept here.
# df_sub_raw is later copied as the template for the LB popularity resource.
_, _, _, df_sub_raw = utils.load_raw(RAW_ROOT)

CPU times: user 13.3 s, sys: 8 s, total: 21.3 s
Wall time: 20.2 s


In [5]:
%%time
# User and item tables (the "aggregated" parquet files) for the CV and LB splits.
df_user_CV, df_user_LB = (
    pd.read_parquet(f'{DATA_ROOT}/df_user_aggregated_CV.pq'),
    pd.read_parquet(f'{DATA_ROOT}/df_user_aggregated_LB.pq'),
)
df_item_CV, df_item_LB = (
    pd.read_parquet(f'{DATA_ROOT}/df_item_aggregated_CV.pq'),
    pd.read_parquet(f'{DATA_ROOT}/df_item_aggregated_LB.pq'),
)

# Full preprocessed log, then the per-split log derived from it (CV first, LB second).
df_log_all = pd.read_parquet(f'{DATA_ROOT}/df_log_preprocessed.pq')
df_log_CV, df_log_LB = (
    utils.get_df_log_of(df_log_all, split) for split in ('CV', 'LB')
)

CPU times: user 26.2 s, sys: 26.4 s, total: 52.6 s
Wall time: 47.7 s


In [6]:
# The full log is no longer needed once the CV and LB logs have been derived.
del df_log_all

---

# 2. CV 테스트 환경 및 서브밋 환경

### CV 테스트 정답셋

In [7]:
%%time
# Split the CV log on its `target` column: rows flagged 'test' form the
# answer set, every other row is kept as train/valid.
_is_test = df_log_CV['target'] == 'test'
df_log_CV_test = df_log_CV[_is_test]
df_log_CV_train_valid = df_log_CV[~_is_test]

CPU times: user 4.4 s, sys: 1.5 s, total: 5.9 s
Wall time: 5.83 s


In [8]:
# Maps customer_id -> set of article_id values seen in the CV test rows
# (filled in by the next cell).
uid2aiidset_CV = {}

In [9]:
%%time
# Collect, per customer, the set of articles appearing in the CV test rows.
test_pairs = zip(df_log_CV_test['customer_id'], df_log_CV_test['article_id'])
for uid, iid in test_pairs:
    uid2aiidset_CV.setdefault(uid, set()).add(iid)

CPU times: user 1.12 s, sys: 539 ms, total: 1.65 s
Wall time: 1.59 s


# 3. Recent Week Popular

### CV 버전

4-week best

In [10]:
# Width of the trailing `week` window used for the CV popularity count.
best_week_cut = 4

In [11]:
# Largest `week` value in the CV train/valid log
# (presumably the most recent week -- confirm the week numbering direction).
last_week = df_log_CV_train_valid['week'].max()

In [12]:
# Keep only rows whose week is greater than `last_week - best_week_cut`.
recent_mask = df_log_CV_train_valid['week'] > last_week - best_week_cut
df_log_temp = df_log_CV_train_valid[recent_mask]

# Per-article row count (non-null customer_id) inside that window.
df_iid_count_temp = (
    df_log_temp[['article_id', 'customer_id']]
    .groupby('article_id')
    .count()
    .rename(columns={'customer_id': 'count'})
)

# The 12 article ids with the highest count, highest first.
top12 = df_iid_count_temp.sort_values('count', ascending=False).head(12)
local_popular_iids = list(top12.index)

In [13]:
# Drop the intermediate frames; only local_popular_iids is needed from here on.
del df_iid_count_temp, df_log_temp

In [14]:
%%time
# Every CV test user receives the same space-separated popular-article list.
prediction = ' '.join(local_popular_iids)
resource_rows = [(uid, prediction) for uid in uid2aiidset_CV]
# Naming the columns in the constructor keeps this working when there are no
# rows; assigning `.columns` afterwards raises a length mismatch on an empty frame.
df_resource_CV = pd.DataFrame(resource_rows, columns=['customer_id', 'prediction'])

CPU times: user 125 ms, sys: 8.63 ms, total: 133 ms
Wall time: 133 ms


In [15]:
# The row list has been materialised into df_resource_CV; release it.
del resource_rows

In [16]:
# Make sure the output folder exists (no error if it is already there).
# NOTE(review): the path is spelled out literally rather than built from DATA_ROOT.
os.makedirs('data/resources', exist_ok=True)

In [17]:
%%time
# Write the CV popularity resource; the week cut is embedded in the file name
# and the DataFrame index is left out of the CSV.
df_resource_CV.to_csv(f'data/resources/RWP_WC{best_week_cut}_CV.csv', index=False)

CPU times: user 433 ms, sys: 28.8 ms, total: 462 ms
Wall time: 461 ms


In [18]:
# Already written to disk; release the frame.
del df_resource_CV

### LB 버전

1-week best

In [19]:
# Width of the trailing `week` window used for the LB popularity count.
best_week_cut = 1

In [20]:
# Largest `week` value in the LB log
# (presumably the most recent week -- confirm the week numbering direction).
last_week = df_log_LB['week'].max()

In [21]:
# Keep only rows whose week is greater than `last_week - best_week_cut`.
recent_mask = df_log_LB['week'] > last_week - best_week_cut
df_log_temp = df_log_LB[recent_mask]

# Per-article row count (non-null customer_id) inside that window.
df_iid_count_temp = (
    df_log_temp[['article_id', 'customer_id']]
    .groupby('article_id')
    .count()
    .rename(columns={'customer_id': 'count'})
)

# The 12 article ids with the highest count, highest first.
top12 = df_iid_count_temp.sort_values('count', ascending=False).head(12)
local_popular_iids = list(top12.index)

In [22]:
# Drop the intermediate frames; only local_popular_iids is needed from here on.
del df_iid_count_temp, df_log_temp

In [23]:
%%time
# Copy of df_sub_raw with every row's `prediction` set to the same
# space-separated popular-article list (df_sub_raw itself is left untouched).
df_resource_LB = df_sub_raw.assign(prediction=' '.join(local_popular_iids))

CPU times: user 41.9 ms, sys: 182 ms, total: 223 ms
Wall time: 222 ms


In [24]:
# Make sure the output folder exists (no error if it is already there).
# NOTE(review): the path is spelled out literally rather than built from DATA_ROOT.
os.makedirs('data/resources', exist_ok=True)

In [25]:
%%time
# Write the LB popularity resource; the week cut is embedded in the file name
# and the DataFrame index is left out of the CSV.
df_resource_LB.to_csv(f'data/resources/RWP_WC{best_week_cut}_LB.csv', index=False)

CPU times: user 6.05 s, sys: 226 ms, total: 6.28 s
Wall time: 6.34 s


In [26]:
# Already written to disk; release the frame.
del df_resource_LB

---

# 4. Recent Bought

### CV 버전

8-week recent bought

In [27]:
# Width of the trailing `week` window used for the CV recent-bought resource.
week_cut = 8

In [28]:
# Largest `week` value in the CV train/valid log
# (presumably the most recent week -- confirm the week numbering direction).
last_week = df_log_CV_train_valid['week'].max()

In [29]:
%%time
# Keep only rows whose week is greater than `last_week - week_cut`.
in_window = df_log_CV_train_valid['week'] > (last_week - week_cut)
df_log_temp = df_log_CV_train_valid[in_window]

# Per-article row count (non-null customer_id) inside that window.
df_iid_count_temp = (
    df_log_temp[['article_id', 'customer_id']]
    .groupby('article_id')
    .count()
    .rename(columns={'customer_id': 'count'})
)

# Attach the count to every log row, then order by t_dat descending with the
# article count (descending) as the tie-breaker.
df_log_temp = df_log_temp.merge(
    df_iid_count_temp, how='left', left_on='article_id', right_index=True,
).sort_values(['t_dat', 'count'], ascending=[False, False])

# First 12 article ids per customer in that order (repeats are not removed).
df_recent_boughts = (
    df_log_temp
    .groupby('customer_id')
    .agg({'article_id': lambda x: list(x)[:12]})
    .rename(columns={'article_id': 'recent_boughts'})
)

CPU times: user 5.82 s, sys: 1.72 s, total: 7.53 s
Wall time: 7.83 s


In [30]:
# Drop the intermediate frames; only df_recent_boughts is needed from here on.
del df_iid_count_temp, df_log_temp

In [31]:
%%time
# One row per CV test user that has a recent-bought list; users without one
# are skipped, exactly as before.
# A plain dict lookup replaces the per-user `.loc` call wrapped in
# try/except KeyError, so missing users no longer cost a raised exception.
recent_lookup = df_recent_boughts['recent_boughts'].to_dict()
resource_rows = [
    (uid, ' '.join(recent_lookup[uid]))
    for uid in tqdm(uid2aiidset_CV)
    if uid in recent_lookup
]
# Naming the columns in the constructor keeps this working when no user
# matched; assigning `.columns` afterwards raises on an empty frame.
df_resource_CV = pd.DataFrame(resource_rows, columns=['customer_id', 'prediction'])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=99089.0), HTML(value='')))


CPU times: user 2.03 s, sys: 45.1 ms, total: 2.07 s
Wall time: 2.08 s


In [32]:
# Both have been folded into df_resource_CV; release them.
del resource_rows, df_recent_boughts

In [33]:
# Make sure the output folder exists (no error if it is already there).
# NOTE(review): the path is spelled out literally rather than built from DATA_ROOT.
os.makedirs('data/resources', exist_ok=True)

In [34]:
%%time
# Write the CV recent-bought resource; the week cut is embedded in the file
# name and the DataFrame index is left out of the CSV.
df_resource_CV.to_csv(f'data/resources/RB_WC{week_cut}_CV.csv', index=False)

CPU times: user 207 ms, sys: 3.48 ms, total: 211 ms
Wall time: 210 ms


In [35]:
# Already written to disk; release the frame.
del df_resource_CV

### LB 버전

8-week recent bought

In [36]:
# Width of the trailing `week` window used for the LB recent-bought resource.
week_cut = 8

In [37]:
# Largest `week` value in the LB log
# (presumably the most recent week -- confirm the week numbering direction).
last_week = df_log_LB['week'].max()

In [38]:
%%time
# Keep only rows whose week is greater than `last_week - week_cut`.
in_window = df_log_LB['week'] > (last_week - week_cut)
df_log_temp = df_log_LB[in_window]

# Per-article row count (non-null customer_id) inside that window.
df_iid_count_temp = (
    df_log_temp[['article_id', 'customer_id']]
    .groupby('article_id')
    .count()
    .rename(columns={'customer_id': 'count'})
)

# Attach the count to every log row, then order by t_dat descending with the
# article count (descending) as the tie-breaker.
df_log_temp = df_log_temp.merge(
    df_iid_count_temp, how='left', left_on='article_id', right_index=True,
).sort_values(['t_dat', 'count'], ascending=[False, False])

# First 12 article ids per customer in that order (repeats are not removed).
df_recent_boughts = (
    df_log_temp
    .groupby('customer_id')
    .agg({'article_id': lambda x: list(x)[:12]})
    .rename(columns={'article_id': 'recent_boughts'})
)

CPU times: user 6.17 s, sys: 1.59 s, total: 7.76 s
Wall time: 7.72 s


In [39]:
# Drop the intermediate frames; only df_recent_boughts is needed from here on.
del df_iid_count_temp, df_log_temp

In [40]:
%%time
# One row per customer in df_recent_boughts: (customer_id, space-joined articles).
# The index and the column are walked together directly; iterrows() would
# build a full Series object for every row just to read one field.
resource_rows = [
    (uid, ' '.join(boughts))
    for uid, boughts in tqdm(
        zip(df_recent_boughts.index, df_recent_boughts['recent_boughts']),
        total=len(df_recent_boughts),
    )
]
# Naming the columns in the constructor keeps this working when there are no
# rows; assigning `.columns` afterwards raises on an empty frame.
df_resource_LB = pd.DataFrame(resource_rows, columns=['customer_id', 'prediction'])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=378688.0), HTML(value='')))


CPU times: user 21.3 s, sys: 394 ms, total: 21.7 s
Wall time: 21.5 s


In [41]:
# Both have been folded into df_resource_LB; release them.
del resource_rows, df_recent_boughts

In [42]:
# Make sure the output folder exists (no error if it is already there).
# NOTE(review): the path is spelled out literally rather than built from DATA_ROOT.
os.makedirs('data/resources', exist_ok=True)

In [43]:
%%time
# Write the LB recent-bought resource; the week cut is embedded in the file
# name and the DataFrame index is left out of the CSV.
df_resource_LB.to_csv(f'data/resources/RB_WC{week_cut}_LB.csv', index=False)

CPU times: user 1.16 s, sys: 48.6 ms, total: 1.21 s
Wall time: 1.21 s


In [44]:
# Already written to disk; release the frame.
del df_resource_LB

---

In [45]:
# Final cleanup of the frames loaded at the top of the notebook.
del df_user_CV, df_user_LB, df_item_CV, df_item_LB
del df_log_CV, df_log_LB