In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold,StratifiedKFold
import glob

def transfer(x):
    """Translate a 4-element indicator list into its integer class code.

    Args:
        x: Value compared with ``==`` against each known indicator list
            (e.g. ``[0, 1, 1, 0]``).

    Returns:
        The integer code (0-5) of the first pattern equal to ``x``, or
        ``None`` when ``x`` matches none of the known patterns.
    """
    # (pattern, code) pairs, checked in order.
    known_patterns = (
        ([1, 0, 0, 0], 0),
        ([0, 1, 0, 0], 1),
        ([0, 0, 1, 0], 2),
        ([0, 1, 1, 0], 3),
        ([0, 0, 0, 1], 4),
        ([0, 0, 1, 1], 5),
    )
    for pattern, code in known_patterns:
        if x == pattern:
            return code
    return None


# Build patient-level stratified train/val/test folds for SICAPv2.
# Reads ./dataset_csv/sicap.csv (columns used: patient_id, image_id, gleason_score)
# and writes one ./dataset_csv/SICAPv2/fold{k}.csv per outer fold.
df = pd.read_csv('./dataset_csv/sicap.csv')
ohe2label = {'0+0': 0, '3+3': 1, '3+4': 2, '4+3': 2, '4+4': 3, '3+5': 3, '5+3': 3, '4+5': 4, '5+4': 4, '5+5': 4}
# Index the dict inside the lambda so an unknown score raises KeyError
# instead of silently turning into NaN.
df.loc[:, 'gleason_score'] = df.loc[:, 'gleason_score'].map(lambda x: ohe2label[x])

#---->get patient data
# One row per patient (the first row seen for that patient); splitting is done
# on patients so that all images of a patient land in the same split.
patient_df = df.drop_duplicates(['patient_id']).set_index(keys='patient_id', drop=True)
df = df.set_index(keys='patient_id', drop=True)

#---->construct link between patient_df and data
# Indexing .loc with a list always returns a Series, so patients with a single
# image (and image ids that are not strings) need no special-casing.
patient_dict = {
    patient: df.loc[[patient], 'image_id'].to_numpy()
    for patient in patient_df.index
}

#----> 0.6: 0.15: 0.25
# Outer 4-fold gives 75% train+val / 25% test; the first split of an inner
# 5-fold then divides train+val 80/20, i.e. 60% / 15% of all patients.
train_test_folder = StratifiedKFold(n_splits=4, random_state=0, shuffle=True)
for fold, (train_val, test) in enumerate(train_test_folder.split(list(patient_df.index), list(patient_df.loc[:, 'gleason_score']))):
    train_val_patient_df = patient_df.iloc[list(train_val), :]
    test_patient_df = patient_df.iloc[list(test), :]

    # Only the first inner split is used.
    train_val_folder = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
    train, val = next(train_val_folder.split(list(train_val_patient_df.index), list(train_val_patient_df.loc[:, 'gleason_score'])))
    train_patient_df = train_val_patient_df.iloc[list(train), :]
    val_patient_df = train_val_patient_df.iloc[list(val), :]

    # Expand each split from patients to images. Rows are collected as records
    # and turned into a frame once: DataFrame.append no longer exists in
    # pandas >= 2.0, and growing a frame row by row is quadratic anyway.
    split_frames = []
    for split_name, split_patient_df in (('train', train_patient_df), ('val', val_patient_df), ('test', test_patient_df)):
        id_col = f'{split_name}_patient_id'
        img_col = f'{split_name}_image_id'
        score_col = f'{split_name}_gleason_score'
        split_patient_ids = split_patient_df.index.values.tolist()
        split_scores = split_patient_df.loc[:, 'gleason_score'].values.tolist()
        records = [
            {id_col: str(patient), img_col: img, score_col: str(score)}
            for patient, score in zip(split_patient_ids, split_scores)
            for img in patient_dict[patient]
        ]
        # Passing `columns` keeps the three columns present even if a split is empty.
        split_frames.append(pd.DataFrame(records, columns=[id_col, img_col, score_col]))

    # Side-by-side layout: the splits have different lengths, so the shorter
    # column groups are padded with NaN by the axis=1 concat.
    df_fold = pd.concat(split_frames, axis=1)

    #---->save
    df_fold.to_csv(f'./dataset_csv/SICAPv2/fold{fold}.csv')




In [None]:


# Build stratified train/val/test folds for the TMA image folders
# (normal / pancreatitis / pdac) and write one ./dataset_csv/TMA/fold{k}.csv
# per outer fold.
excep_list = ['/data114_2/shaozc/TMA/images/pdac/HPan-Qde120Sur_A-3']

# The excluded path lives under a different root than the globbed folders, so
# comparing full path strings can never match. Compare on the trailing
# "<class folder>/<sample name>" components instead.
# NOTE(review): this makes the exclusion effective; confirm the sample is
# really meant to be dropped from the pdac set.
excluded_keys = {tuple(path.rstrip('/').split('/')[-2:]) for path in excep_list}


def _list_sample_names(pattern, excluded):
    """Return the sorted sample names matched by a glob pattern.

    Args:
        pattern: Glob pattern whose matches are sample paths.
        excluded: Set of ``(parent_folder, sample_name)`` tuples to leave out.

    Returns:
        Sorted list of the last path component of every non-excluded match.
        Sorting keeps the row order, and therefore the seeded splits below,
        identical from run to run (set/glob ordering is not stable).
    """
    return sorted(
        path.split('/')[-1]
        for path in glob.glob(pattern)
        if tuple(path.split('/')[-2:]) not in excluded
    )


# Each sample is used both as patient id and as image id.
normal_list = _list_sample_names('/data112/shaozc/TMA/images/normal/*', excluded_keys)
df_normal = pd.DataFrame({'patient_id': normal_list, 'image_id': normal_list, 'type_label': 0})

pancreatitis_list = _list_sample_names('/data112/shaozc/TMA/images/pancreatitis/*', excluded_keys)
df_pancreatitis = pd.DataFrame({'patient_id': pancreatitis_list, 'image_id': pancreatitis_list, 'type_label': 1})

pdac_list = _list_sample_names('/data112/shaozc/TMA/images/pdac/*', excluded_keys)
df_pdac = pd.DataFrame({'patient_id': pdac_list, 'image_id': pdac_list, 'type_label': 2})

df = pd.concat([df_normal, df_pancreatitis, df_pdac], ignore_index=True)

#---->get patient data
# One row per patient id (the first row seen for that id).
patient_df = df.drop_duplicates(['patient_id']).set_index(keys='patient_id', drop=True)
df = df.set_index(keys='patient_id', drop=True)

#---->construct link between patient_df and data
# Indexing .loc with a list always returns a Series, so ids that occur once
# need no special-casing.
patient_dict = {
    patient: df.loc[[patient], 'image_id'].to_numpy()
    for patient in patient_df.index
}

#----> 0.6: 0.15: 0.25
# Outer 4-fold gives 75% train+val / 25% test; the first split of an inner
# 5-fold then divides train+val 80/20, i.e. 60% / 15% of all patients.
train_test_folder = StratifiedKFold(n_splits=4, random_state=0, shuffle=True)
for fold, (train_val, test) in enumerate(train_test_folder.split(list(patient_df.index), list(patient_df.loc[:, 'type_label']))):
    train_val_patient_df = patient_df.iloc[list(train_val), :]
    test_patient_df = patient_df.iloc[list(test), :]

    # Only the first inner split is used.
    train_val_folder = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
    train, val = next(train_val_folder.split(list(train_val_patient_df.index), list(train_val_patient_df.loc[:, 'type_label'])))
    train_patient_df = train_val_patient_df.iloc[list(train), :]
    val_patient_df = train_val_patient_df.iloc[list(val), :]

    # Expand each split from patients to images. Rows are collected as records
    # and turned into a frame once: DataFrame.append no longer exists in
    # pandas >= 2.0, and growing a frame row by row is quadratic anyway.
    split_frames = []
    for split_name, split_patient_df in (('train', train_patient_df), ('val', val_patient_df), ('test', test_patient_df)):
        id_col = f'{split_name}_patient_id'
        img_col = f'{split_name}_image_id'
        label_col = f'{split_name}_type_label'
        split_patient_ids = split_patient_df.index.values.tolist()
        split_labels = split_patient_df.loc[:, 'type_label'].values.tolist()
        records = [
            {id_col: str(patient), img_col: img, label_col: str(label)}
            for patient, label in zip(split_patient_ids, split_labels)
            for img in patient_dict[patient]
        ]
        # Passing `columns` keeps the three columns present even if a split is empty.
        split_frames.append(pd.DataFrame(records, columns=[id_col, img_col, label_col]))

    # Side-by-side layout: the splits have different lengths, so the shorter
    # column groups are padded with NaN by the axis=1 concat.
    df_fold = pd.concat(split_frames, axis=1)

    #---->save
    df_fold.to_csv(f'./dataset_csv/TMA/fold{fold}.csv')