<a href="https://colab.research.google.com/github/coffeemountain/kaggle_otto/blob/main/notebooks/create_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so that paths under /content/drive/ can be used by the cells below.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# FIXME: make this a relative path so it does not have to be edited for each dataset.
# Root directory on the mounted Drive; the cells below read from its `out/` subdirectory
# and write parquet files to its `datasets/` subdirectory.
DATASET_DIR = '/content/drive/MyDrive/kaggle/2022/OTTO/share/example_train2'

## 1. 公式リポジトリのコードからデータセットを作成

In [None]:
!   pip install pipenv \
&& cd recsys-dataset \
&& pipenv sync \
&& pipenv install numpy

!   cd recsys-dataset \
&& pipenv run python -m src.testset --train-set train.jsonl --days 2 --output-path 'out/' --seed 42

!   cd recsys-dataset/out \
&& ls \
&& cp -r recsys-dataset/out {DATASET_DIR}


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pipenv
  Downloading pipenv-2022.12.19-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 7.1 MB/s 
[?25hCollecting virtualenv
  Downloading virtualenv-20.17.1-py3-none-any.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 53.5 MB/s 
Collecting virtualenv-clone>=0.2.5
  Downloading virtualenv_clone-0.5.7-py3-none-any.whl (6.6 kB)
Collecting distlib<1,>=0.3.6
  Downloading distlib-0.3.6-py2.py3-none-any.whl (468 kB)
[K     |████████████████████████████████| 468 kB 58.1 MB/s 
Installing collected packages: distlib, virtualenv-clone, virtualenv, pipenv
Successfully installed distlib-0.3.6 pipenv-2022.12.19 virtualenv-20.17.1 virtualenv-clone-0.5.7
[1mCreating a virtualenv for this project...[0m
Pipfile: [33m[1m/content/recsys-dataset/Pipfile[0m
[1mUsing[0m [33m[1m/usr/local/bin/python[0m [32m(3.8.16)[0m [1mto create virtualenv..

## 2. データセットファイルを作成

### データセットの概要
- 4週目の最後の2日を使用(↑のコマンドで作成したもの)
- 4週目の全sessionを、並び順の前半・後半（おおよそ1日目・2日目）で、trainとtestに分ける。

### 出力ファイル
- train_full.parquet: 特徴量作成等で利用 (各sessionの全データ)
  - columns: session, aid, ts, type
- train.parquet: モデル学習に入力データとして利用
  - columns: session, aid, ts, type
- train_labels.parquet: モデル学習時の正解データとして利用
  - columns: session, type, ground_truth
- test.parquet: CV計算時に、予測出力のために入力するデータ
  - columns: session, aid, ts, type
- test_labels.parquet: CV計算時に、正解データとして利用
  - columns: session, type, ground_truth

In [None]:
from tqdm import tqdm
import pandas as pd

# Register tqdm with pandas, which adds the `progress_apply` family of methods.
tqdm.pandas()

### train_full.parquet

In [None]:
# Load the generated test_sessions_full.jsonl (JSON Lines: one record per line).
# As the displayed output shows, each row has a `session` ID and an `events` list of dicts.
session_data_full = pd.read_json(DATASET_DIR + '/out/test_sessions_full.jsonl', lines=True)
session_data_full

Unnamed: 0,session,events
0,12383433,"[{'aid': 1542913, 'ts': 1661551200081, 'type':..."
1,12383434,"[{'aid': 8211, 'ts': 1661551200511, 'type': 'c..."
2,12383435,"[{'aid': 940546, 'ts': 1661551201055, 'type': ..."
3,12383436,"[{'aid': 525172, 'ts': 1661551201158, 'type': ..."
4,12383437,"[{'aid': 822481, 'ts': 1661551201268, 'type': ..."
...,...,...
515697,12899774,"[{'aid': 33035, 'ts': 1661723968869, 'type': '..."
515698,12899775,"[{'aid': 1743151, 'ts': 1661723970935, 'type':..."
515699,12899776,"[{'aid': 548599, 'ts': 1661723972537, 'type': ..."
515700,12899777,"[{'aid': 384045, 'ts': 1661723976974, 'type': ..."


In [None]:
def sessions_train_test_split(df, rate=0.5):
  """
  Partition the session IDs of `df` into a train group and a test group.

  The first int(len(df) * rate) entries of df['session'] become the train
  group and the remaining entries become the test group. Row order is kept
  as-is; nothing is shuffled.

  Returns a pair (train session IDs, test session IDs); both are slices of
  the df['session'] column.
  """
  session_ids = df['session']
  cut = int(len(df) * rate)
  return session_ids[:cut], session_ids[cut:]

# Split with the default rate (0.5): the first half of the rows goes to train, the rest to test.
train_session_list, test_session_list = sessions_train_test_split(session_data_full)

In [None]:
# Integer codes written to the `type` column for each event type name.
type_labels = {'clicks': 0, 'carts': 1, 'orders': 2}

def explode_events_list(df):
  """
  Convert session rows into the commonly used one-row-per-event format.

  Args:
    df: DataFrame with a 'session' column and an 'events' column holding,
      per session, a list of dicts with the keys 'aid', 'ts' and 'type'.

  Returns:
    DataFrame with the columns session, aid, ts, type. Each row keeps the
    index label of the session row it came from. `ts` is divided by 1000
    and stored as int32; `type` is mapped through `type_labels` (names
    that are not in the mapping become NaN).

  Sessions whose event list is empty contribute no rows.
  """
  # explode() yields NaN for an empty event list; drop those placeholder rows.
  df_exploded = df.explode('events').dropna(subset=['events'])

  # Build all event columns in a single pass. Applying pd.Series to every row
  # is orders of magnitude slower, and selecting the keys by name keeps the
  # result independent of the key order inside each event dict.
  events = pd.DataFrame(df_exploded['events'].tolist(), columns=['aid', 'ts', 'type'])

  # Values are assigned positionally (to_numpy) because the exploded index
  # contains one repeated label per event of the same session.
  result = df_exploded[['session']].copy()
  result['aid'] = events['aid'].to_numpy()
  # Floor division equals the previous divide-then-truncate for non-negative timestamps.
  result['ts'] = (events['ts'] // 1000).astype('int32').to_numpy()
  result['type'] = events['type'].map(type_labels).to_numpy()

  return result[['session', 'aid', 'ts', 'type']]

# Keep only the sessions assigned to the train group and flatten them to one row per event.
train_session_data_full = session_data_full[session_data_full['session'].isin(train_session_list)]
train_full = explode_events_list(train_session_data_full)

# Save the flattened sessions as train_full.parquet, then display them.
train_full.to_parquet(DATASET_DIR + '/datasets/train_full.parquet')
train_full

100%|██████████| 2171512/2171512 [12:39<00:00, 2857.71it/s]


Unnamed: 0,session,aid,ts,type
0,12383433,1542913,1661551200,0
0,12383433,1131993,1661551247,0
0,12383433,1131993,1661551290,0
0,12383433,1131993,1661551306,1
0,12383433,504821,1661551332,0
...,...,...,...,...
257849,12641633,498982,1661667237,0
257849,12641633,268689,1661667332,0
257849,12641633,924881,1661667436,0
257850,12641634,1498684,1661667238,0


### train.parquet | test.parquet


In [None]:
# Load the generated test_sessions.jsonl (JSON Lines), i.e. the non-"_full" sessions file.
# NOTE(review): presumably these are the truncated sessions used as model input -- confirm
# against the script that generates them.
sessions = pd.read_json(DATASET_DIR + '/out/test_sessions.jsonl', lines=True)
sessions

Unnamed: 0,session,events
0,12383433,"[{'aid': 1542913, 'ts': 1661551200081, 'type':..."
1,12383434,"[{'aid': 8211, 'ts': 1661551200511, 'type': 'c..."
2,12383435,"[{'aid': 940546, 'ts': 1661551201055, 'type': ..."
3,12383436,"[{'aid': 525172, 'ts': 1661551201158, 'type': ..."
4,12383437,"[{'aid': 822481, 'ts': 1661551201268, 'type': ..."
...,...,...
515697,12899774,"[{'aid': 33035, 'ts': 1661723968869, 'type': '..."
515698,12899775,"[{'aid': 1743151, 'ts': 1661723970935, 'type':..."
515699,12899776,"[{'aid': 548599, 'ts': 1661723972537, 'type': ..."
515700,12899777,"[{'aid': 384045, 'ts': 1661723976974, 'type': ..."


In [None]:
# Divide the sessions with the train/test session-ID groups computed earlier.
train_sessions = sessions[sessions['session'].isin(train_session_list)]
test_sessions = sessions[sessions['session'].isin(test_session_list)]

# Flatten both groups to one row per event.
train = explode_events_list(train_sessions)
test = explode_events_list(test_sessions)

# Save both groups as parquet, then display the train part.
train.to_parquet(DATASET_DIR + '/datasets/train.parquet')
test.to_parquet(DATASET_DIR + '/datasets/test.parquet')
train

100%|██████████| 1088724/1088724 [06:15<00:00, 2897.13it/s]
100%|██████████| 1008064/1008064 [06:06<00:00, 2751.03it/s]


Unnamed: 0,session,aid,ts,type
0,12383433,1542913,1661551200,0
1,12383434,8211,1661551200,0
2,12383435,940546,1661551201,1
2,12383435,45443,1661551213,0
2,12383435,1769360,1661551246,0
...,...,...,...,...
257848,12641632,694417,1661667237,0
257848,12641632,694417,1661667345,0
257849,12641633,498982,1661667237,0
257849,12641633,268689,1661667332,0


### train_labels.parquet | test_labels.parquet

In [None]:
# Read the labels from the copy under DATASET_DIR, like the other generated files.
# The relative 'recsys-dataset/out/...' path only resolves while the local checkout
# is present in the current working directory.
labels_data = pd.read_json(DATASET_DIR + '/out/test_labels.jsonl', lines=True)
labels_data

ValueError: ignored

In [None]:
def explode_labels_dict(df):
  """
  Flatten per-session label dicts into one row per (session, type).

  Args:
    df: DataFrame with a 'session' column and a 'labels' column whose
      values are dicts keyed by event type. Only the 'clicks' entry is a
      single value instead of a list; it is wrapped in a one-element list
      so that every ground_truth value is a list.

  Returns:
    DataFrame with the columns session, type, ground_truth. Each row keeps
    the index label of the row in `df` it came from. Types missing from a
    session's dict produce no row for that type.

  Neither `df` nor the dicts stored in it are modified, so calling this
  again on the same data gives the same result.
  """

  def _clicks_int2list(labels):
    # Return a new dict instead of editing in place: the dicts are shared with
    # the caller's frame, and an in-place edit would wrap `clicks` again on
    # every re-run.
    if 'clicks' in labels and not isinstance(labels['clicks'], list):
      return {**labels, 'clicks': [labels['clicks']]}
    return labels

  labels = df['labels'].apply(_clicks_int2list)

  # One column per type -> long format. The explicit dropna() removes the
  # (row, type) pairs a session has no label for, independent of whether
  # stack() itself drops missing values.
  stacked = pd.DataFrame([*labels], df.index).stack().dropna()
  exploded = stacked.rename_axis([None, 'type']).reset_index(1, name='ground_truth')

  return df[['session']].join(exploded)


# Divide the label rows with the same session-ID groups used for the train/test sessions.
train_labels_data = labels_data[labels_data['session'].isin(train_session_list)]
test_labels_data = labels_data[labels_data['session'].isin(test_session_list)]

# Flatten the filtered label frames. The train call must receive
# train_labels_data (the label rows), not train_session_list (the session IDs).
train_labels = explode_labels_dict(train_labels_data)
test_labels = explode_labels_dict(test_labels_data)

train_labels.to_parquet(DATASET_DIR + '/datasets/train_labels.parquet')
test_labels.to_parquet(DATASET_DIR + '/datasets/test_labels.parquet')

train_labels