In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [9]:
df = pd.read_csv("../../dataset/train.csv")
df.head()

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score
0,0005f7aaab2800f6170c399693a96917,karolinska,0,0+0
1,000920ad0b612851f8e01bcc880d9b3d,karolinska,0,0+0
2,0018ae58b01bdadc8e347995b69f99aa,radboud,4,4+4
3,001c62abd11fa4b57bf7a6c603a11bb9,karolinska,4,4+4
4,001d865e65ef5d2579c190a0e0350d8f,karolinska,0,0+0


In [10]:
SEED=42
n_folds = 5
shuffle = True

In [11]:
train_df, val_test_df = train_test_split(df, test_size=0.3, random_state=SEED, shuffle=True, stratify=df["isup_grade"])
test_df, val_df = train_test_split(val_test_df, test_size=0.5, random_state=SEED, shuffle=True, stratify=val_test_df["isup_grade"])

In [12]:
(train_df.shape, val_df.shape, test_df.shape)

((7431, 4), (1593, 4), (1592, 4))

In [13]:
train_df.to_csv("train.csv", index=False)
val_df.to_csv("val.csv", index=False)
test_df.to_csv("test.csv", index=False)

In [14]:
train_val_df = pd.concat([train_df, val_df], axis=0)
train_val_df.to_csv("train_val.csv", index=False)

In [16]:
train_val_df.columns = train_val_df.columns.str.strip()
train_val_df = train_val_df.reset_index(drop=True)
stratified_k_fold = StratifiedKFold(n_folds, shuffle = shuffle, random_state=SEED)
train_val_df['fold'] = -1

train_val_df.head()
for i, (train_indexes, valid_indexes) in enumerate(stratified_k_fold.split(train_val_df, train_val_df['isup_grade'])):
    train_val_df.loc[valid_indexes, 'fold'] = i
train_val_df.to_csv(f'train_{n_folds}_fold.csv', index=False)

## Sample

In [19]:
N = 1000

df_sampled = (
    df.groupby("isup_grade", group_keys=False)
      .apply(
          lambda x: x.sample(frac=N/len(df), random_state=SEED),
          include_groups=False
      )
      .reset_index(drop=True)
)

train_df, val_test_df = train_test_split(
    df_sampled,
    test_size=0.3,          # 70% treino, 30% (val+test)
    random_state=SEED,
    shuffle=True,
    stratify=df_sampled["isup_grade"]
)

val_df, test_df = train_test_split(
    val_test_df,
    test_size=0.5,          # 15% val, 15% test
    random_state=SEED,
    shuffle=True,
    stratify=val_test_df["isup_grade"]
)

train_df.to_csv("sample/train.csv", index=False)
val_df.to_csv("sample/val.csv", index=False)
test_df.to_csv("sample/test.csv", index=False)

print(f"Tamanho final -> train: {len(train_df)}, val: {len(val_df)}, test: {len(test_df)}")
print(train_df["isup_grade"].value_counts(normalize=True))
print(val_df["isup_grade"].value_counts(normalize=True))
print(test_df["isup_grade"].value_counts(normalize=True))

KeyError: 'isup_grade'