# Creating a table with paths

Created a table with the following columns:
- cts_path, masks_path, fold.

Data directory:
/home/yerkyn/tumor_seg/data

In [1]:
import os
from natsort import natsorted
import pandas as pd
import random
import numpy as np
import glob
from sklearn.model_selection import train_test_split


In [2]:
# Input folders; each one is searched for *.nii.gz files in the next cell.
masks_dir = "/home/yerkyn/tumor_seg/data/seg/"
cts_dir = "/home/yerkyn/tumor_seg/data/img/"

In [3]:
# Build one glob pattern per folder, then list the matching files in
# natural sort order.
cts_pattern = os.path.join(cts_dir, "*.nii.gz")
masks_pattern = os.path.join(masks_dir, "*.nii.gz")

cts_files = natsorted(glob.glob(cts_pattern))
masks_files = natsorted(glob.glob(masks_pattern))

In [4]:
# Sanity check: both folders should contain the same number of files.
len(masks_files) == len(cts_files)

True

In [None]:
# Index the mask paths by bare file name so they can be looked up by name.
masks_dict = dict(zip(map(os.path.basename, masks_files), masks_files))

In [6]:
# Pair every file in cts_files with the mask that has the same file name.
data = []
unmatched_cts = []  # entries of cts_files with no same-named mask

for cts_path in cts_files:
    # Single lookup; masks_dict values are path strings, so None means "missing".
    mask_path = masks_dict.get(os.path.basename(cts_path))
    if mask_path is None:
        unmatched_cts.append(cts_path)
        continue

    data.append({
        "cts_path": cts_path,
        "masks_path": mask_path,
        "fold": np.nan,  # placeholder; the fold is assigned after splitting
    })

# Unpaired files are still skipped, but no longer silently: the basename
# comparison further down only sees rows that were already paired.
if unmatched_cts:
    print(
        f"Warning: {len(unmatched_cts)} file(s) in cts_files have no matching "
        f"mask and were skipped, e.g. {unmatched_cts[0]}"
    )

df = pd.DataFrame(data, columns=["cts_path", "masks_path", "fold"])

In [8]:
# Count the rows whose two paths share (or do not share) the same file name.
df['cts_path'].map(os.path.basename).eq(df['masks_path'].map(os.path.basename)).value_counts()

True    2240
dtype: int64

In [9]:
# Hold out 10% of the rows for testing, then split the remainder so that
# validation is also roughly 10% of the full table (0.1111 * 0.9 ≈ 0.1).
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1111, random_state=42)

# Fold labels: 0 = train, 1 = validation, 2 = test.
# assign() returns new frames, so nothing is written into the frames handed
# back by train_test_split (selections of df); in-place column assignment on
# those can raise pandas' SettingWithCopyWarning.
train_df = train_df.assign(fold=0)
val_df = val_df.assign(fold=1)
test_df = test_df.assign(fold=2)

final_df = pd.concat([train_df, val_df, test_df]).reset_index(drop=True)

In [12]:
# Destination files: folds 0 and 1 share one CSV, fold 2 gets its own.
csv_path_fold_01 = '/home/yerkyn/tumor_seg/data/tumor_train_dataset.csv'
csv_path_fold_2 = '/home/yerkyn/tumor_seg/data/tumor_test_dataset.csv'

# Select the rows for each output file by fold label.
in_fold_01 = final_df['fold'].isin([0, 1])
in_fold_2 = final_df['fold'] == 2
fold_01_df = final_df[in_fold_01]
fold_2_df = final_df[in_fold_2]

# Write both tables without the index column.
for frame, destination in ((fold_01_df, csv_path_fold_01), (fold_2_df, csv_path_fold_2)):
    frame.to_csv(destination, index=False)