# train_split
学習データからテストデータを模した検証用データを生成する
* trainデータを1weekごとに分割
* 各セッションをランダムな位置で前半(train)と後半(test)に分割する（1レコードしかないセッションは除外）
* testからlabelデータを生成する

In [1]:
import os
import sys
import subprocess
from dotenv import load_dotenv
# Actually call load_dotenv() so that variables defined in a .env file are
# placed in the process environment before the os.getenv() lookups in this
# notebook (a bare `load_dotenv` expression does nothing).
load_dotenv()
# Make the project's utility modules importable from the path in UTILS_PATH.
sys.path.append(os.getenv('UTILS_PATH'))
from tqdm import tqdm
import multiprocessing
import random

import pandas as pd

In [2]:
# Seed Python's `random` module so the random draws made later in this
# notebook are reproducible from run to run.
SEED = 42
random.seed(SEED)

In [3]:
# Directory settings come from environment variables; os.getenv returns None
# for any variable that is not set.
INPUT_DIR = os.getenv('INPUT_DIR')
OUTPUT_DIR = os.getenv('OUTPUT_DIR')
# PREP_DIR is joined to file names with plain `+` in this notebook, so its
# value has to end with a path separator.
PREP_DIR = os.getenv("PREP_DIR")

In [4]:
# Load the pickled session DataFrame that the rest of the notebook splits.
# The code below reads its "ts", "session", "type" and "aid" columns.
org_sessions = pd.read_pickle(PREP_DIR + "train_sessions.pkl")


## 1weekごとに分割

In [5]:
# Week boundaries. w1_start is defined for reference only: week 1 takes every
# row earlier than w2_start, and week 4 takes every row from w4_start onwards.
w1_start = "2022-07-31 22:00:00"
w2_start = "2022-08-07 22:00:00"
w3_start = "2022-08-14 22:00:00"
w4_start = "2022-08-21 22:00:00"

# "ts" holds millisecond timestamps; parse them once and reuse for all masks.
dt = pd.to_datetime(org_sessions["ts"], unit="ms")

# One boolean mask per week; each interval is closed on the left, open on the right.
in_week1 = dt < w2_start
in_week2 = (dt >= w2_start) & (dt < w3_start)
in_week3 = (dt >= w3_start) & (dt < w4_start)
in_week4 = dt >= w4_start

# Materialise each week as an independent frame with a fresh 0..n-1 index.
week1_sessions = org_sessions[in_week1].copy().reset_index(drop=True)
week2_sessions = org_sessions[in_week2].copy().reset_index(drop=True)
week3_sessions = org_sessions[in_week3].copy().reset_index(drop=True)
week4_sessions = org_sessions[in_week4].copy().reset_index(drop=True)

In [6]:
# Sanity check: print the earliest and latest timestamp found in each weekly frame.
for label, weekly in (
    ("week1 : ", week1_sessions),
    ("week2 : ", week2_sessions),
    ("week3 : ", week3_sessions),
    ("week4 : ", week4_sessions),
):
    first_ts = pd.to_datetime(weekly["ts"].min(), unit="ms")
    last_ts = pd.to_datetime(weekly["ts"].max(), unit="ms")
    print(label, first_ts, "-", last_ts)

week1 :  2022-07-31 22:00:00.025000 - 2022-08-07 21:59:59.968000
week2 :  2022-08-07 22:00:00.009000 - 2022-08-14 21:59:59.998000
week3 :  2022-08-14 22:00:00.009000 - 2022-08-21 21:59:59.994000
week4 :  2022-08-21 22:00:00.002000 - 2022-08-28 21:59:59.984000


## train,test,labelsを作成し出力

In [7]:
def split_train_test(sessions):
    """
    Cut every session at a random position into a train part and a test part.

    Rows are grouped by the "session" column. For each group a cut position
    is drawn with ``random.randint(1, n - 1)`` (``n`` = rows in the group);
    the rows before the cut go to train and the remaining rows go to test, so
    both parts receive at least one row. Groups with exactly one row are left
    out of both outputs.

    Parameters
    ----------
    sessions : pandas.DataFrame
        Must contain a "session" column. Rows are looked up again by index
        label, so labels are expected to be unique (e.g. a frame produced by
        ``reset_index(drop=True)``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_sessions, test_sessions)``, each with a fresh 0..n-1 index.
    """
    head_labels = []
    tail_labels = []
    for _, group in tqdm(sessions.groupby("session")):
        n_rows = len(group)
        # A one-row session cannot feed both train and test, so drop it.
        if n_rows == 1:
            continue

        cut = random.randint(1, n_rows - 1)
        row_labels = group.index
        head_labels.extend(row_labels[:cut].tolist())
        tail_labels.extend(row_labels[cut:].tolist())

    train_sessions = sessions.loc[head_labels].copy().reset_index(drop=True)
    test_sessions = sessions.loc[tail_labels].copy().reset_index(drop=True)
    return train_sessions, test_sessions

def make_labels(sessions):
    """
    Build one row of ground-truth labels per session.

    Parameters
    ----------
    sessions : pandas.DataFrame
        Needs the columns "session", "type" and "aid". The "type" values
        handled here are "clicks", "carts" and "orders".

    Returns
    -------
    pandas.DataFrame
        Columns "session", "clicks_labels", "carts_labels", "orders_labels",
        with one row per distinct session value:

        * ``clicks_labels`` - aid of the first "clicks" row of the session.
        * ``carts_labels`` / ``orders_labels`` - list of the distinct aids of
          that type, in order of first appearance.

        A session with no row of a given type gets NaN in that column.
    """
    labels = pd.DataFrame(sessions["session"].unique(), columns=["session"])

    # Clicks: only the first click of each session becomes the label.
    is_click = sessions["type"] == "clicks"
    first_clicks = sessions[is_click].groupby("session").head(1)
    first_clicks = first_clicks[["session", "aid"]].rename(
        columns={"aid": "clicks_labels"}
    )
    labels = labels.merge(first_clicks, how="left", on="session")

    # Carts and orders: every distinct aid of the session, gathered into a list.
    for event_type, label_column in (
        ("carts", "carts_labels"),
        ("orders", "orders_labels"),
    ):
        of_type = sessions[sessions["type"] == event_type]
        unique_pairs = of_type.drop_duplicates(subset=["session", "aid"])
        aid_lists = unique_pairs.groupby("session")["aid"].apply(list).reset_index()
        aid_lists = aid_lists.rename(columns={"aid": label_column})
        labels = labels.merge(aid_lists, how="left", on="session")

    return labels

def make_train_test_labels(session, suffix):
    """
    Split a session frame, derive labels from its test part, and pickle all three.

    Parameters
    ----------
    session : pandas.DataFrame
        Frame handed to ``split_train_test``.
    suffix : str
        Tag inserted into the output file names.

    Writes ``train_sessions_{suffix}.pkl``, ``test_sessions_{suffix}.pkl`` and
    ``labels_{suffix}.pkl``, each path formed by prefixing ``PREP_DIR``.
    Returns None.
    """
    train_part, test_part = split_train_test(session)
    # Labels are derived from the held-out (test) rows only.
    label_frame = make_labels(test_part)

    outputs = (
        (train_part, PREP_DIR + f"train_sessions_{suffix}.pkl"),
        (test_part, PREP_DIR + f"test_sessions_{suffix}.pkl"),
        (label_frame, PREP_DIR + f"labels_{suffix}.pkl"),
    )
    for frame, path in outputs:
        frame.to_pickle(path)


In [8]:
# Produce the train / test / labels pickles for each weekly frame, in week order.
for week_suffix, weekly_sessions in (
    ("week1", week1_sessions),
    ("week2", week2_sessions),
    ("week3", week3_sessions),
    ("week4", week4_sessions),
):
    make_train_test_labels(weekly_sessions, week_suffix)

100%|██████████| 5348783/5348783 [02:57<00:00, 30092.81it/s]
100%|██████████| 5683320/5683320 [03:02<00:00, 31187.29it/s]
100%|██████████| 5704612/5704612 [03:01<00:00, 31348.02it/s]
100%|██████████| 5323084/5323084 [02:52<00:00, 30832.20it/s]
