In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict

# Raise pandas' display limits so wide frames and long cell values are shown in full.
for option_name, option_value in (
    ('display.max_columns', 300),
    ('display.max_rows', 300),
    ('display.max_colwidth', 5000),
):
    pd.set_option(option_name, option_value)

In [21]:
# Relative directory prefixes. Each ends with a slash so that a file name can be
# appended by plain string concatenation (e.g. RAW_DATA_DIR_NAME + 'train.csv').
FEATURE_DIR_NAME = '../data/features/'
RAW_DATA_DIR_NAME = '../data/raw/'

In [26]:
# Only the chip_id column is needed here, so ask the parser for just that column
# instead of loading the whole file and discarding the rest.
train = pd.read_csv(RAW_DATA_DIR_NAME + 'train.csv', usecols=['chip_id'])

In [29]:
# Show the distinct chip ids as a plain Python list.
train['chip_id'].unique().tolist()

['79ad4647da6de6425abf',
 'c695a1e61e002b34e556',
 '6718e7f83c824b1e436d',
 '0b9dbf13f938efd5717f',
 '84b788fdc5e779f8a0df',
 '118c70535bd753a86615']

In [3]:
# Reload the full training table, replace chip_id by the integer codes that
# pd.factorize assigns, and keep the target column as its own Series.
train = pd.read_csv(RAW_DATA_DIR_NAME + 'train.csv')
chip_codes, _ = pd.factorize(train['chip_id'])
train['chip_id'] = chip_codes
y = train['target']

In [4]:
# Display the loaded frame; chip_id now holds the integer codes from pd.factorize.
train

Unnamed: 0,spectrum_id,spectrum_filename,chip_id,exc_wl,layout_a,layout_x,layout_y,pos_x,target
0,000da4633378740f1ee8,b2e223339f4abce9b400.dat,0,850,2,36,140,1313.0810,0
1,000ed1a5a9fe0ad2b7dd,e2f150a503244145e7ce.dat,0,780,3,0,168,159.4150,0
2,0016e3322c4ce0700f9a,3d58b7ccaee157979cf0.dat,1,780,1,34,29,-610.7688,0
3,00256bd0f8c6cf5f59c8,ed3641184d3b7c0ae703.dat,1,780,2,32,139,1214.6180,0
4,003483ee5ae313d37590,4c63418d39f86dfab9bb.dat,1,780,0,45,85,-257.6161,0
...,...,...,...,...,...,...,...,...,...
7431,ffcc2d0e80130bcd1f66,677582af16aeb72c01df.dat,3,850,0,8,56,-1441.3620,0
7432,ffd86d57b9d44f10c7d0,6f23369fb8e0d1fde118.dat,4,850,3,12,2,543.2881,0
7433,ffdc78c1ca0a8c5a689f,825df3fcf8ce0570f0be.dat,2,780,0,41,102,-383.0251,0
7434,ffe1a53afdbab5ebddeb,5a2bd19c41cb6da70b33.dat,4,850,3,3,124,259.5428,0


In [5]:
# Display the target column taken from train.
y

0       0
1       0
2       0
3       0
4       0
       ..
7431    0
7432    0
7433    0
7434    0
7435    0
Name: target, Length: 7436, dtype: int64

In [6]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """Yield ``k`` (train_indices, test_indices) splits that never split a group.

    Every distinct value of ``groups`` is assigned to exactly one fold. Groups are
    shuffled with ``seed``, processed in order of decreasing standard deviation of
    their per-label counts, and each one is placed greedily in the fold that gives
    the lowest mean (over labels) standard deviation of per-fold label shares.

    Parameters
    ----------
    X : object
        Not used by this function.
    y : sequence of int
        Class labels. They are used directly as array indices, so they are
        expected to be integers in ``0..max(y)``.
    groups : iterable
        Hashable group id for every sample, aligned with ``y``.
    k : int
        Number of folds to produce.
    seed : optional
        Seed passed to ``random.Random`` for the initial shuffle of the groups.

    Yields
    ------
    tuple(list of int, list of int)
        Positional indices of the training samples and of the held-out samples
        for one fold. A fold that received no group yields an empty test list.
    """
    # Materialise once: groups is iterated several times below.
    groups = list(groups)

    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    # Only score labels that actually occur. A label value in 0..max(y) with no
    # samples would give a 0/0 share, turn every fold score into NaN and make
    # the greedy search put every group into fold 0.
    present_labels = [label for label in range(labels_num) if y_distr[label] > 0]

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        # Tentatively add the group to `fold`, score the resulting label
        # balance across all folds, then undo the tentative assignment.
        y_counts_per_fold[fold] += y_counts
        std_per_label = [
            np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            for label in present_labels
        ]
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    # sorted() is stable, so groups with equal spread keep their shuffled order.
    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for fold in range(k):
        test_groups = groups_per_fold[fold]
        train_groups = all_groups - test_groups

        train_indices = [idx for idx, g in enumerate(groups) if g in train_groups]
        test_indices = [idx for idx, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

In [10]:
# stratified_group_k_fold is a generator that yields one (train, validation)
# index pair per fold, so it cannot be unpacked into two names directly.
# Take the first fold explicitly.
trn_idx, val_idx = next(stratified_group_k_fold(train, y, train.chip_id, k=5, seed=42))

In [12]:
# Materialise every fold's (train, validation) index pair and print a short
# summary of each split: size, chip ids present, and share of positive targets.
stratified_group_k_list = []
for fold, (trn_idx, val_idx) in enumerate(stratified_group_k_fold(train, y, train.chip_id, k=5, seed=42)):
    trn_x, trn_y = train.iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train.iloc[val_idx], y.iloc[val_idx]

    stratified_group_k_list.append((trn_idx, val_idx))

    # The indices are positional, so read chip ids from the .iloc slices above
    # rather than using the indices as index labels on train.chip_id.
    print(f'fold{fold}', '-'*50)
    print(f'train samples: {len(trn_x)} \t\t| chip id: {trn_x.chip_id.unique()} \t\t| positive ratio:{trn_y.mean():.3}')
    print(f'valid samples: {len(val_x)} \t\t| chip id: {val_x.chip_id.unique()} \t\t\t| positive ratio:{val_y.mean():.3}')

fold0 --------------------------------------------------
train samples: 5631 		| chip id: [0 1 2 3 4] 		| positive ratio:0.0297
valid samples: 1805 		| chip id: [5] 			| positive ratio:0.0382
fold1 --------------------------------------------------
train samples: 5615 		| chip id: [1 2 3 4 5] 		| positive ratio:0.0203
valid samples: 1821 		| chip id: [0] 			| positive ratio:0.067
fold2 --------------------------------------------------
train samples: 6242 		| chip id: [0 1 2 3 5] 		| positive ratio:0.0356
valid samples: 1194 		| chip id: [4] 			| positive ratio:0.0117
fold3 --------------------------------------------------
train samples: 6288 		| chip id: [0 1 3 4 5] 		| positive ratio:0.0356
valid samples: 1148 		| chip id: [2] 			| positive ratio:0.0105
fold4 --------------------------------------------------
train samples: 5968 		| chip id: [0 2 4 5] 		| positive ratio:0.0364
valid samples: 1468 		| chip id: [1 3] 			| positive ratio:0.0129


In [15]:
# Unpack the first stored fold: id1 = training indices, id2 = validation indices.
id1, id2 = stratified_group_k_list[0]

In [20]:
# Number of training indices in the first stored fold.
len(id1)

5631

In [19]:
# Number of validation indices in the first stored fold.
len(id2)

1805