# README

データセット
- ./data/input/ubiquant-market-prediction-half-precision-pickle/ に配置
- https://www.kaggle.com/datasets/lonnieqin/ubiquant-market-prediction-half-precision-pickle


サンプルコード
- https://www.kaggle.com/code/lonnieqin/ump-tf-record-combinatorialpurgedgroupkfold

# Library

In [1]:
import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import tensorflow as tf
import json
import numpy as np
from scipy.special import comb
from itertools import combinations
from tqdm import tqdm

2022-05-05 03:20:25.129102: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/usr/local/lib:/usr/lib:
2022-05-05 03:20:25.129133: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Def

In [2]:
class CombinatorialPurgedGroupKFold():
    def __init__(self, n_splits = 6, n_test_splits = 2, purge = 1, pctEmbargo = 0.01, **kwargs):
        self.n_splits = n_splits
        self.n_test_splits = n_test_splits
        self.purge = purge
        self.pctEmbargo = pctEmbargo
        
    def split(self, X, y = None, groups = None):
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
            
        u, ind = np.unique(groups, return_index = True)
        unique_groups = u[np.argsort(ind)]
        n_groups = len(unique_groups)
        group_dict = {}
        for idx in range(len(X)):
            if groups[idx] in group_dict:
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
                
        n_folds = comb(self.n_splits, self.n_test_splits, exact = True)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))
            
        mbrg = int(n_groups * self.pctEmbargo)
        if mbrg < 0:
            raise ValueError(
                "The number of 'embargoed' groups should not be negative")
        
        split_dict = {}
        group_test_size = n_groups // self.n_splits
        for split in range(self.n_splits):
            if split == self.n_splits - 1:
                split_dict[split] = unique_groups[int(split * group_test_size):].tolist()
            else:
                split_dict[split] = unique_groups[int(split * group_test_size):int((split + 1) * group_test_size)].tolist()
        
        for test_splits in combinations(range(self.n_splits), self.n_test_splits):
            test_groups = []
            banned_groups = []
            for split in test_splits:
                test_groups += split_dict[split]
                banned_groups += unique_groups[split_dict[split][0] - self.purge:split_dict[split][0]].tolist()
                banned_groups += unique_groups[split_dict[split][-1] + 1:split_dict[split][-1] + self.purge + mbrg + 1].tolist()
            train_groups = [i for i in unique_groups if (i not in banned_groups) and (i not in test_groups)]

            train_idx = []
            test_idx = []
            for train_group in train_groups:
                train_idx += group_dict[train_group]
            for test_group in test_groups:
                test_idx += group_dict[test_group]
            yield train_idx, test_idx

## test

In [3]:
n_splits = 6
n_test_splits = 1
elements = list(range(10 * (n_splits + n_test_splits)))
groups = [element // n_splits for element in elements]
data = pd.DataFrame({"group": groups, "element": elements})
kfold = CombinatorialPurgedGroupKFold(n_splits, n_test_splits)
for index, (train_indices, test_indices) in enumerate(kfold.split(data, groups=data["group"])):
    print("=" * 100)
    print(f"Fold {index}")
    print("=" * 100)
    print("Train indices:", train_indices, "Length:", len(train_indices))
    print("Test Indices:", test_indices, "Length:", len(test_indices))

Fold 0
Train indices: [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69] Length: 52
Test Indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] Length: 12
Fold 1
Train indices: [0, 1, 2, 3, 4, 5, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69] Length: 46
Test Indices: [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23] Length: 12
Fold 2
Train indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69] Length: 46
Test Indices: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35] Length: 12
Fold 3
Train indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 2

# Practice

## Import dataset

In [25]:
%%time
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
# train = pd.read_pickle('../data/input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
train = pd.read_parquet('../data/input/ubiquant-market-prediction-half-precision-pickle/train_low_mem.parquet').drop(["row_id"], axis=1)
train.head()

CPU times: user 6.97 s, sys: 8.73 s, total: 15.7 s
Wall time: 10.5 s


Unnamed: 0,time_id,investment_id,target,f_0,f_1,f_2,f_3,f_4,f_5,f_6,...,f_290,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299
0,0,1,-0.300875,0.932573,0.113691,-0.402206,0.378386,-0.203938,-0.413469,0.965623,...,0.366028,-1.09562,0.200075,0.819155,0.941183,-0.086764,-1.087009,-1.044826,-0.287605,0.321566
1,0,2,-0.23104,0.810802,-0.514115,0.742368,-0.616673,-0.194255,1.77121,1.428127,...,-0.154193,0.912726,-0.734579,0.819155,0.941183,-0.387617,-1.087009,-0.929529,-0.97406,-0.343624
2,0,6,0.568807,0.393974,0.615937,0.567806,-0.607963,0.068883,-1.083155,0.979656,...,-0.13802,0.912726,-0.551904,-1.220772,-1.060166,-0.219097,-1.087009,-0.612428,-0.113944,0.243608
3,0,7,-1.06478,-2.343535,-0.01187,1.874606,-0.606346,-0.586827,-0.815737,0.778096,...,0.382201,0.912726,-0.266359,-1.220772,0.941183,-0.609113,0.104928,-0.783423,1.15173,-0.773309
4,0,8,-0.53194,0.842057,-0.262993,2.33003,-0.583422,-0.618392,-0.742814,-0.946789,...,-0.170365,0.912726,-0.741355,-1.220772,0.941183,-0.588445,0.104928,0.753279,1.345611,-0.737624


In [26]:
train.count()

time_id          3141410
investment_id    3141410
target           3141410
f_0              3141410
f_1              3141410
                  ...   
f_295            3141410
f_296            3141410
f_297            3141410
f_298            3141410
f_299            3141410
Length: 303, dtype: int64

In [27]:
investment_id = train.pop("investment_id")
investment_id.head()

0    1
1    2
2    6
3    7
4    8
Name: investment_id, dtype: uint16

In [28]:
time_id = train.pop("time_id")

In [29]:
y = train.pop("target")
y.head()

0   -0.300875
1   -0.231040
2    0.568807
3   -1.064780
4   -0.531940
Name: target, dtype: float32

## Create TF-Record

In [30]:
def create_record(i):
    dic = {}
    dic[f"features"] = tf.train.Feature(float_list=tf.train.FloatList(value=list(train.iloc[i])))
    dic["time_id"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[time_id.iloc[i]]))
    dic["investment_id"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[investment_id.iloc[i]]))
    dic["target"] = tf.train.Feature(float_list=tf.train.FloatList(value=[y.iloc[i]]))
    record_bytes = tf.train.Example(features=tf.train.Features(feature=dic)).SerializeToString()
    return record_bytes
    
def decode_function(record_bytes):
  return tf.io.parse_single_example(
      # Data
      record_bytes,
      # Schema
      {
          "features": tf.io.FixedLenFeature([300], dtype=tf.float32),
          "time_id": tf.io.FixedLenFeature([], dtype=tf.int64),
          "investment_id": tf.io.FixedLenFeature([], dtype=tf.int64),
          "target": tf.io.FixedLenFeature([], dtype=tf.float32)
      }
  )

In [31]:
%%time
import time
n_splits = 5
n_test_splits = 1
kfold = CombinatorialPurgedGroupKFold(n_splits, n_test_splits)
for fold, (train_indices, test_indices) in tqdm(enumerate(kfold.split(train, groups=time_id))):
    print("=" * 100)
    print(f"Fold {index}")
    print("=" * 100)
    print("Train Sample size:", len(train_indices))
    print("Test Sample size:", len(test_indices))
    train_save_path = f"../data/input/CombinatorialPurgedGroupKFold_tf_record/fold_{fold}_train.tfrecords"
    begin = time.time()
    print(f"Creating {train_save_path}")
    with tf.io.TFRecordWriter(train_save_path) as file_writer:
        for i in train_indices:
            file_writer.write(create_record(i))
    print("Elapsed time: %.2f"%(time.time() - begin))
    begin = time.time()
    print(f"Creating {train_save_path}")
    test_save_path = f"../data/input/CombinatorialPurgedGroupKFold_tf_record/fold_{fold}_test.tfrecords"
    with tf.io.TFRecordWriter(test_save_path) as file_writer:
        for i in test_indices:
            file_writer.write(create_record(i))
    print("Elapsed time: %.2f"%(time.time() - begin))

0it [00:00, ?it/s]

Fold 5
Train Sample size: 2567952
Test Sample size: 544192
Creating ../data/input/CombinatorialPurgedGroupKFold_tf_record/fold_0_train.tfrecords
Elapsed time: 1269.88
Creating ../data/input/CombinatorialPurgedGroupKFold_tf_record/fold_0_train.tfrecords


1it [25:50, 1550.22s/it]

Elapsed time: 266.17
Fold 5
Train Sample size: 2608404
Test Sample size: 500873
Creating ../data/input/CombinatorialPurgedGroupKFold_tf_record/fold_1_train.tfrecords
Elapsed time: 1256.12
Creating ../data/input/CombinatorialPurgedGroupKFold_tf_record/fold_1_train.tfrecords


2it [50:46, 1518.76s/it]

Elapsed time: 240.29
Fold 5
Train Sample size: 2499599
Test Sample size: 605976
Creating ../data/input/CombinatorialPurgedGroupKFold_tf_record/fold_2_train.tfrecords
Elapsed time: 1209.28
Creating ../data/input/CombinatorialPurgedGroupKFold_tf_record/fold_2_train.tfrecords


3it [1:15:49, 1511.20s/it]

Elapsed time: 292.55
Fold 5
Train Sample size: 2390442
Test Sample size: 710108
Creating ../data/input/CombinatorialPurgedGroupKFold_tf_record/fold_3_train.tfrecords
Elapsed time: 1149.68
Creating ../data/input/CombinatorialPurgedGroupKFold_tf_record/fold_3_train.tfrecords


4it [1:40:40, 1503.23s/it]

Elapsed time: 341.00
Fold 5
Train Sample size: 2361149
Test Sample size: 780261
Creating ../data/input/CombinatorialPurgedGroupKFold_tf_record/fold_4_train.tfrecords
Elapsed time: 1150.17
Creating ../data/input/CombinatorialPurgedGroupKFold_tf_record/fold_4_train.tfrecords


5it [2:06:09, 1513.92s/it]

Elapsed time: 378.88
CPU times: user 2h 5min 16s, sys: 51.9 s, total: 2h 6min 8s
Wall time: 2h 6min 9s





## Write unique Investment Ids

In [32]:
investment_ids = investment_id.unique()
investment_id_df = pd.DataFrame({"investment_id": investment_ids})
investment_id_df.to_csv("../data/input/CombinatorialPurgedGroupKFold_tf_record/investment_ids.csv", index=False)