In [29]:
from sklearn.model_selection import RepeatedKFold
import os
import pandas as pd
import glob
from pathlib import Path
from tqdm import tqdm
import random
from collections import defaultdict
from itertools import combinations

In [14]:
def get_ids_dataframe(corpus_path: str)-> pd.DataFrame:
    '''
        Collect the ids of all non-empty annotation files in a corpus folder.

        in: 
            corpus_path: contains the path with the sub-folders named after the 
                         document types
        out: pandas.DataFrame indexed by id with a single column 'group'
             (the name of the folder the file was found in). The frame is
             empty, but still has the 'group' column, when no file qualifies.
    '''

    ids = []
    for subdir, _folders, files in os.walk(corpus_path):
        # Files that sit directly in the corpus root belong to no document type.
        if subdir == corpus_path:
            continue
        # basename/normpath instead of splitting on "\\": the latter only
        # yields the folder name on Windows.
        group = os.path.basename(os.path.normpath(subdir))
        # `files` from os.walk already lists the regular files of `subdir`;
        # a second os.listdir call would also return directories.
        for file in tqdm(files):
            # Require the real extension; a substring test would also accept
            # names such as "x.ann.bak".
            if not file.endswith(".ann"):
                continue
            # Zero-byte annotation files are skipped.
            if os.path.getsize(os.path.join(subdir, file)) == 0:
                continue
            ids.append({
                "id": file.split(".")[0],
                "group": group
            })
    # Explicit columns keep set_index from raising KeyError on an empty corpus.
    return pd.DataFrame(ids, columns=["id", "group"]).set_index('id')

# Intra-group folds

In [18]:
def get_intra_group_folds(ids_df: pd.DataFrame, 
                          rnd_state: int=1524513,
                          n_repeats: int=100,
                          n_splits: int=10)-> dict:
    '''
        Build repeated k-fold train/test splits separately within every group.

        in: 
            ids_df: pandas.DataFrame[id, group], indexed by id
            rnd_state: seed for random number generator
            n_repeats: number of times the k-fold split is repeated, default=100
            n_splits: number of folds per repetition, default=10
        
        out: dict {lettertype: [(fold_train_0, fold_test_0), (fold_train_1, fold_test_1), (..)]}
             where every fold is a list of ids that all belong to that lettertype
        
    '''

    intra_group_folds = defaultdict(list)
    groups = ids_df['group'].unique()
    for document_type in groups:
        # A fresh splitter per group, seeded identically each time.
        repeated_k_folder = RepeatedKFold(n_repeats=n_repeats, n_splits=n_splits, random_state=rnd_state)
        df = ids_df[ids_df['group']==document_type]
        for train_indcs, test_indcs in tqdm(repeated_k_folder.split(df)):
            # The split yields positions relative to `df`, so they must be
            # resolved against `df`; looking them up in the full `ids_df`
            # would return ids of other groups.
            intra_group_folds[document_type].append((list(df.index[train_indcs]), 
                                                     list(df.index[test_indcs])))
    return intra_group_folds

# Inter-group folds

In [21]:
def get_inter_group_splits(ids_df: pd.DataFrame,
                     p: int, 
                     rnd_state: int=1524513,
                     train_frac: float=1) -> dict:
    '''
        Build train/test splits in which whole groups are assigned to either side.

        in: 
            ids_df: pandas.DataFrame[id, group], indexed by id
            p: number of lettertypes in train set
            rnd_state: seed for random number generator
            train_frac: fraction of training data used for model, default=1
        
        out: dict {split_number: {'groups_train': tuple of p groups,
                                  'groups_test': tuple of the remaining groups,
                                  'train_fold': list of ids (sampled, hence shuffled),
                                  'test_fold': list of ids}}
             with one entry for every combination of p groups.
    
    '''

    # Order of first appearance; used as the canonical group order everywhere.
    unique_groups = list(ids_df['group'].unique())

    group_splits = dict()
    for idx, gtrain in enumerate(combinations(unique_groups, p)):
        # Keep the labels in their original type: casting them to str makes
        # isin() miss every row when the labels are not strings.
        train_set = set(gtrain)
        # Filtering the ordered list (instead of tuple(set(..))) keeps the
        # order of the test groups stable between runs.
        gtest = tuple(g for g in unique_groups if g not in train_set)

        train_rows = ids_df.loc[ids_df['group'].isin(list(gtrain))]
        test_rows = ids_df.loc[ids_df['group'].isin(list(gtest))]
        group_splits[idx] = {'groups_train': gtrain, 
                             'groups_test': gtest,
                             'train_fold': train_rows.sample(frac=train_frac, 
                                                             random_state=rnd_state).index.tolist(), 
                             'test_fold': test_rows.index.tolist()
                          }
    return group_splits

In [5]:
# Root folder of the corpus (judging by the folder name, the EMC Dutch Clinical
# Corpus). The default is a machine-specific network path; set the
# DCC_CORPUS_PATH environment variable to point somewhere else without editing
# the notebook.
dcc_path = os.environ.get(
    "DCC_CORPUS_PATH",
    "T:\\laupodteam\\AIOS\\Bram\\data\\Argus\\text_data\\external_text\\corpora\\EMCDutchClinicalCorpus",
)

In [15]:
# Scan the corpus once for the id -> group table, then derive both fold sets from it.
ids_df = get_ids_dataframe(corpus_path=dcc_path)
# Repeated k-fold splits inside each group.
intra_folds = get_intra_group_folds(ids_df)
# Splits that train on 3 groups and test on the remaining ones.
inter_folds = get_inter_group_splits(ids_df, p=3)

100%|█████████████████████████████████████| 2999/2999 [00:04<00:00, 668.31it/s]
100%|█████████████████████████████████████| 4000/4000 [00:06<00:00, 632.85it/s]
100%|█████████████████████████████████████| 4001/4001 [00:06<00:00, 656.37it/s]
100%|█████████████████████████████████████| 3983/3983 [00:07<00:00, 566.19it/s]
