In [25]:
import numpy as np
import pandas as pd
from io import StringIO

from datastore.data import RandomData, RandomMultiTaskData

In [2]:
# Instantiate RandomData with 10 samples and 2 classes; this object is reused
# by the later cells (dataframe export and the Subset-based splits).
dataset = RandomData(num_samples=10, num_classes=2)

In [8]:
# Export the single-task dataset through its dataframe() method
# (the displayed result has a `data` and a `labels` column).
single_df = dataset.dataframe()

In [9]:
# Display the single-task frame for inspection.
single_df

Unnamed: 0,data,labels
0,-0.712391,1
1,0.753766,0
2,-0.044503,0
3,0.451812,0
4,1.345102,1
5,0.532338,0
6,1.350188,0
7,0.861211,0
8,1.478686,0
9,-1.045377,0


In [13]:
# Instantiate RandomMultiTaskData with 10 samples, 3 tasks and 2 classes.
multi = RandomMultiTaskData(num_samples=10, num_tasks=3, num_classes=2)

In [14]:
# Export the multi-task dataset through its dataframe() method
# (the displayed result has a `data` column plus one column per task).
multi_df = multi.dataframe()

In [21]:
# Display the multi-task frame for inspection.
multi_df

Unnamed: 0,data,task0,task1,task2
0,-0.712391,1,0,1
1,0.753766,0,1,0
2,-0.044503,0,1,1
3,0.451812,0,1,1
4,1.345102,1,1,1
5,0.532338,0,0,1
6,1.350188,0,1,1
7,0.861211,0,0,1
8,1.478686,0,0,0
9,-1.045377,0,1,1


In [27]:
# Persist the single-task frame to disk, writing the row index as a column
# named 'label_idx'.
# `index` is a boolean flag in DataFrame.to_csv; the previous value 'labels'
# only worked because a non-empty string is truthy, so pass True explicitly.
# NOTE(review): this cell originally used `df`, which no visible cell defines
# (it fails on a fresh kernel). The CSV read back in the following cells has
# the same `data`/`labels` values as `single_df`, so that frame is used here —
# confirm this was the intended source.
single_df.to_csv('tmpdata.csv', index=True, index_label='label_idx')

In [28]:
# Read the CSV back with default options. No index_col is given, so the saved
# 'label_idx' column is loaded as an ordinary column next to a fresh default index.
read_df = pd.read_csv('tmpdata.csv')

In [29]:
# Display the round-tripped frame to check the columns that were written.
read_df

Unnamed: 0,label_idx,data,labels
0,0,-0.712391,1
1,1,0.753766,0
2,2,-0.044503,0
3,3,0.451812,0
4,4,1.345102,1
5,5,0.532338,0
6,6,1.350188,0
7,7,0.861211,0
8,8,1.478686,0
9,9,-1.045377,0


In [32]:
# Load only the 'labels' column from the CSV. Because usecols is a list, the
# result is a one-column DataFrame; it is passed as the stratification target
# to StratifiedKFold.split in a later cell.
labels = pd.read_csv('tmpdata.csv', usecols=['labels'])

In [33]:
# Display the labels-only frame.
labels

Unnamed: 0,labels
0,1
1,0
2,0
3,0
4,1
5,0
6,0
7,0
8,0
9,0


In [34]:
from collections import namedtuple
from sklearn.model_selection import StratifiedKFold

from datastore.api.data import Subset


def stratified_split(dataset, num_splits, seed=42):
    """ Create stratified k-fold train/validation splits of a dataset.

    Parameters
    ----------
    dataset : datastore.dataset
        Dataset whose ``load_data()`` returns a ``(data, labels)`` pair; the
        labels are used as the stratification target.
    num_splits : int
        Number of splits of the data (usually denoted by `k` folds)
    seed : int
        Random seed controlling how samples are shuffled before being
        assigned to folds.

    Returns
    -------
    splits : list(namedtuple<Subset, Subset>)
        One ``Split(train, valid)`` per fold, each field a ``Subset`` of
        `dataset` built from that fold's train / validation indices.
    """
    # random_state is only honoured when shuffle=True. Without it the seed is
    # ignored by older scikit-learn releases and rejected with a ValueError by
    # newer ones, so shuffling is enabled to make `seed` actually control the
    # splits as documented.
    skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)
    data, labels = dataset.load_data()

    Split = namedtuple('Split', 'train valid')

    return [
        Split(
            train=Subset(dataset, train_idx),
            valid=Subset(dataset, valid_idx),
        )
        for train_idx, valid_idx in skf.split(data, labels)
    ]

In [36]:
# Number of folds (k) used by the StratifiedKFold cell that follows.
num_splits = 3

In [38]:
# random_state only takes effect when shuffle=True; without it the seed is
# ignored (older scikit-learn) or raises a ValueError (newer releases).
skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=13)

# StratifiedKFold builds folds from the labels alone; the feature argument only
# has to have one entry per sample, so a zero-filled placeholder is used
# instead of uninitialised memory.
data = np.zeros(len(labels))
Split = namedtuple('Split', 'train valid')

# One Split(train, valid) per fold, each wrapping `dataset` with the fold's indices.
splits = [
    Split(
        train=Subset(dataset, train_idx),
        valid=Subset(dataset, valid_idx),
    )
    for train_idx, valid_idx in skf.split(data, labels)
]



In [39]:
# Display the list of Split(train, valid) tuples, one per fold.
splits

[Split(train=<datastore.api.data.Subset object at 0x1a2c28d2b0>, valid=<datastore.api.data.Subset object at 0x1a2c28db70>),
 Split(train=<datastore.api.data.Subset object at 0x1a2c28dbe0>, valid=<datastore.api.data.Subset object at 0x129792f98>),
 Split(train=<datastore.api.data.Subset object at 0x1297a5400>, valid=<datastore.api.data.Subset object at 0x1a2c29c2b0>)]