The training, validation, and testing datasets were labeled manually, by batch.

In [43]:
import sys
sys.path.append('/workspace/Documents')
import os
import numpy as np
import pandas as pd
import nibabel as nb
import Diffusion_models.functions_collection as ff

# UC dataset

### build list for MVF (voxelmorph)

In [2]:
# Read the final UC patient selection and separate it by the 'patient_class' label.
main_path = '/mnt/camca_NAS/4DCT/Patient_lists/'
patient_list = pd.read_excel(
    io=os.path.join(main_path, 'patient_list_final_selection.xlsx')
)

# Rows whose class is neither 'Normal' nor 'Abnormal' end up in neither subset.
normal_cases = patient_list.loc[patient_list['patient_class'].eq('Normal')]
abnormal_cases = patient_list.loc[patient_list['patient_class'].eq('Abnormal')]

# Report the size of each class.
print(normal_cases.shape[0], abnormal_cases.shape[0])

In [4]:
# Shuffle each class with a fixed seed so batch membership is random but repeatable.
normal_cases = normal_cases.sample(frac=1, random_state=42).reset_index(drop=True)
abnormal_cases = abnormal_cases.sample(frac=1, random_state=42).reset_index(drop=True)

# Divide each class into `num_batches` near-equal batches.
# The row *positions* are split and then looked up with .iloc: calling
# np.array_split directly on a DataFrame goes through the deprecated
# DataFrame.swapaxes. Batch sizes and row order are the same as before.
num_batches = 6
normal_batches = [
    normal_cases.iloc[rows]
    for rows in np.array_split(np.arange(len(normal_cases)), num_batches)
]
abnormal_batches = [
    abnormal_cases.iloc[rows]
    for rows in np.array_split(np.arange(len(abnormal_cases)), num_batches)
]

# Pair the i-th normal batch with the i-th abnormal batch so that every
# combined batch contains both classes.
combined_batches = []
for i in range(num_batches):
    combined_batch = pd.concat([normal_batches[i], abnormal_batches[i]], ignore_index=True)
    combined_batch['batch'] = i + 1  # batch numbers are 1-based in this list
    combined_batches.append(combined_batch)

# Display results
for i, batch in enumerate(combined_batches):
    print(f"Batch {i + 1} contains {len(batch)} patients.")

Batch 1 contains 66 patients.
Batch 2 contains 64 patients.
Batch 3 contains 64 patients.
Batch 4 contains 64 patients.
Batch 5 contains 64 patients.


In [5]:
# Stack all batches into one table; the 'batch' column keeps each row's batch number.
final_dataframe = pd.concat(objs=combined_batches, axis=0, ignore_index=True)

# Write the combined list to an Excel file (row index is not written).
final_dataframe.to_excel(
    excel_writer=os.path.join(main_path, 'patient_list_train_test.xlsx'),
    index=False,
)

### build list for MVF diffusion

In [4]:
# Root folder of the 4DCT data; both tables below live underneath it.
main_path = '/mnt/camca_NAS/4DCT/'

# Patient list with batch labels.
# NOTE(review): presumably a manually reordered copy of patient_list_train_test.xlsx;
# this notebook does not create the "_reorder" file - confirm its origin.
patient_list = pd.read_excel(
    io=os.path.join(main_path, 'Patient_lists/patient_list_train_test_reorder.xlsx')
)

# Per-patient table with 'max' and 'min' columns of the predicted MVF.
mvf_predict_list = pd.read_excel(
    io=os.path.join(main_path, 'mvf_warp0_onecase/check_mvf_max_min.xlsx')
)

In [6]:
# Remove patients whose predicted MVF leaves the range [-MVF_ABS_LIMIT, +MVF_ABS_LIMIT].
MVF_ABS_LIMIT = 20.6  # a case is dropped if its 'max' > +limit or its 'min' < -limit

# One MVF row per patient, keyed by patient_id. When an ID occurs more than once
# the first row is used.
mvf_range = (
    mvf_predict_list
    .drop_duplicates(subset='patient_id', keep='first')
    .set_index('patient_id')
)

# Fail with a readable message instead of an IndexError when a patient has no
# entry in the MVF table.
missing_mvf = patient_list.loc[
    ~patient_list['patient_id'].isin(mvf_range.index), 'patient_id'
].tolist()
if missing_mvf:
    raise ValueError(f"patients without an entry in the MVF max/min table: {missing_mvf}")

# Look up each patient's extreme values and flag the out-of-range cases.
# NaN extremes compare as False, so such patients are kept.
mvf_max = patient_list['patient_id'].map(mvf_range['max'])
mvf_min = patient_list['patient_id'].map(mvf_range['min'])
out_of_range = (mvf_max > MVF_ABS_LIMIT) | (mvf_min < -MVF_ABS_LIMIT)

# Index labels of the rows to remove (taken from the frame itself, so this does
# not depend on the index being 0..n-1).
row_indices_to_delete = out_of_range[out_of_range].index.tolist()

patient_list_drop = patient_list.drop(row_indices_to_delete)
print(patient_list_drop.shape[0], mvf_predict_list.shape[0])

317 322


In [7]:
# Save the filtered UC list used for MVF-diffusion training/testing (row index is not written).
patient_list_drop.to_excel(
    excel_writer=os.path.join(main_path, 'Patient_lists/uc/patient_list_MVF_diffusion_train_test.xlsx'),
    index=False,
)

# MGH dataset

### build list for diffusion

In [47]:
# Root folder of the 4DCT data; all MGH tables are read from Patient_lists/mgh/.
main_path = '/mnt/camca_NAS/4DCT/'

# Selected MGH patients.
patient_list = pd.read_excel(
    io=os.path.join(main_path, 'Patient_lists/mgh/patient_list_selected.xlsx')
)

# Table that provides the 'EF' value for each patient_id.
ef_list = pd.read_excel(
    io=os.path.join(main_path, 'Patient_lists/mgh/patient_list_final_selection_timeframes.xlsx')
)

# Per-patient table with 'max' and 'min' columns of the predicted MVF.
mvf_predict_list = pd.read_excel(
    io=os.path.join(main_path, 'Patient_lists/mgh/check_mvf_max_min.xlsx')
)

In [57]:
# Drop patients whose predicted MVF is too large, then attach each patient's EF.
MVF_ABS_LIMIT = 20.6  # a case is dropped if its 'max' > +limit or its 'min' < -limit

# One MVF row per patient, keyed by patient_id. When an ID occurs more than once
# the first row is used.
mvf_range = (
    mvf_predict_list
    .drop_duplicates(subset='patient_id', keep='first')
    .set_index('patient_id')
)

# Fail with a readable message instead of an IndexError when a patient has no
# entry in the MVF table.
missing_mvf = patient_list.loc[
    ~patient_list['patient_id'].isin(mvf_range.index), 'patient_id'
].tolist()
if missing_mvf:
    raise ValueError(f"patients without an entry in the MVF max/min table: {missing_mvf}")

# Flag the out-of-range cases. NaN extremes compare as False, so such patients are kept.
mvf_max = patient_list['patient_id'].map(mvf_range['max'])
mvf_min = patient_list['patient_id'].map(mvf_range['min'])
out_of_range = (mvf_max > MVF_ABS_LIMIT) | (mvf_min < -MVF_ABS_LIMIT)

# Index labels of the rows to remove (taken from the frame itself, so this does
# not depend on the index being 0..n-1).
row_indices_to_delete = out_of_range[out_of_range].index.tolist()
print('row indices to delete:', row_indices_to_delete)

# Remove the flagged rows and renumber the remaining ones from 0.
patient_list_drop = patient_list.drop(row_indices_to_delete).reset_index(drop=True)

# EF per patient, keyed by patient_id (first row wins on duplicate IDs).
ef_lookup = (
    ef_list
    .drop_duplicates(subset='patient_id', keep='first')
    .set_index('patient_id')['EF']
)
missing_ef = patient_list_drop.loc[
    ~patient_list_drop['patient_id'].isin(ef_lookup.index), 'patient_id'
].tolist()
if missing_ef:
    raise ValueError(f"patients without an entry in the EF table: {missing_ef}")

patient_list_drop['EF'] = patient_list_drop['patient_id'].map(ef_lookup)
print(patient_list_drop.shape[0], mvf_predict_list.shape[0])

row indices to delete: [36, 114]
208 210
208 210


In [58]:
# Partition the patients at EF = 0.40: strictly below goes to the low group,
# 0.40 and above to the high group. Rows with a missing EF match neither test
# and therefore appear in neither group.
ef_low_group = patient_list_drop.loc[patient_list_drop['EF'].lt(0.40)]
ef_high_group = patient_list_drop.loc[patient_list_drop['EF'].ge(0.40)]

# Report both group sizes.
print('low group number:', len(ef_low_group), ' high group number:', len(ef_high_group))

low group number: 15  high group number: 193


In [59]:
import random

# Shuffle the EF-high cases with a fixed seed so the split is repeatable.
normal_cases = ef_high_group.sample(frac=1, random_state=42).reset_index(drop=True)

# Divide the EF-high cases into 6 near-equal batches.
# The row *positions* are split and then looked up with .iloc: calling
# np.array_split directly on a DataFrame goes through the deprecated
# DataFrame.swapaxes. Batch sizes and row order are the same as before.
num_batches = 6
normal_batches = [
    normal_cases.iloc[rows]
    for rows in np.array_split(np.arange(len(normal_cases)), num_batches)
]


# for low EF group
def generate_valid_partition(num_parts=5, total=7, rng=None):
    """Return `num_parts` sizes, each 1 or 2, in random order, summing to `total`.

    Parameters
    ----------
    num_parts : int
        Number of sizes to produce (default 5).
    total : int
        Required sum of the sizes (default 7).
    rng : random.Random, optional
        Random source. When omitted the module-level `random` functions are
        used, which makes the result differ between runs.

    Returns
    -------
    list of int
        A list of length `num_parts` containing only 1s and 2s.

    Raises
    ------
    ValueError
        If `total` cannot be reached with `num_parts` values of 1 or 2.
    """
    if not num_parts <= total <= 2 * num_parts:
        raise ValueError(
            f"cannot split {total} into {num_parts} parts of size 1 or 2"
        )
    source = random if rng is None else rng
    # Start with all 1s and promote randomly chosen positions to 2. Every valid
    # arrangement is equally likely, and no retry loop is needed.
    split = [1] * num_parts
    for position in source.sample(range(num_parts), total - num_parts):
        split[position] = 2
    return split


# Seeded generator: without it the low-EF batch sizes change on every run,
# unlike the seeded shuffle above.
split_sizes = generate_valid_partition(num_parts=num_batches - 1, total=7, rng=random.Random(42))

# Hand out consecutive slices of the low-EF group to the first num_batches - 1 batches.
# NOTE(review): ef_low_group is sliced in its existing row order (it is not shuffled) - confirm this is intended.
abnormal_batches = []
start = 0
for size in split_sizes:
    abnormal_batches.append(ef_low_group.iloc[start:start + size])
    start += size

# All remaining low-EF cases go into the last batch.
abnormal_batches.append(ef_low_group.iloc[start:])
for i, batch in enumerate(abnormal_batches):
    print('batch index:', i, 'batch size:', len(batch))


# Combine EF-high cases with EF-low cases: each of the first num_batches - 1
# batches gets 1 or 2 low-EF cases, and the last batch gets the rest.
combined_batches = []
for i in range(num_batches):
    combined_batch = pd.concat([normal_batches[i], abnormal_batches[i]], ignore_index=True)
    # NOTE(review): the stored batch number is 0-based here, while the message
    # printed below is 1-based - confirm which numbering downstream code expects.
    combined_batch['batch'] = i  # Add a column to indicate the batch number
    combined_batches.append(combined_batch)

# Display results
for i, batch in enumerate(combined_batches):
    print(f"Batch {i + 1} contains {len(batch)} patients.")

batch index: 0 batch size: 2
batch index: 1 batch size: 1
batch index: 2 batch size: 2
batch index: 3 batch size: 1
batch index: 4 batch size: 1
batch index: 5 batch size: 8
Batch 1 contains 35 patients.
Batch 2 contains 33 patients.
Batch 3 contains 34 patients.
Batch 4 contains 33 patients.
Batch 5 contains 33 patients.
Batch 6 contains 40 patients.


In [60]:
# Stack all batches into one table; the 'batch' column keeps each row's batch number.
final_dataframe = pd.concat(objs=combined_batches, axis=0, ignore_index=True)

# Write the MGH train/test list to an Excel file (row index is not written).
final_dataframe.to_excel(
    excel_writer=os.path.join(main_path, 'Patient_lists/mgh/patient_list_MVF_diffusion_train_test.xlsx'),
    index=False,
)