In [1]:
import os
import cv2
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [2]:
image_folder = r'C:\Users\S.SAI\OneDrive\Desktop\LiTS\train_images\train_images'
mask_folder  = r'C:\Users\S.SAI\OneDrive\Desktop\LiTS\train_masks\train_masks'

In [3]:
tumor_pairs = []
non_tumor_pairs = []
for fname in os.listdir(mask_folder):
    if not fname.endswith('.jpg'):
        continue

    mask_path = os.path.join(mask_folder, fname)
    image_path = os.path.join(image_folder, fname)

    mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
    if mask is None:
        continue

    if np.any(mask):
        tumor_pairs.append((image_path, mask_path))
    else:
        non_tumor_pairs.append((image_path, mask_path))

print("Available tumor slices:", len(tumor_pairs))
print("Available non-tumor slices:", len(non_tumor_pairs))

Available tumor slices: 18834
Available non-tumor slices: 39804


In [8]:
tumor_sample = random.sample(tumor_pairs, 10000)
non_tumor_sample = random.sample(non_tumor_pairs, 10000)

combined = tumor_sample + non_tumor_sample
random.shuffle(combined)

train_pairs, val_pairs = train_test_split(combined, test_size=0.2, random_state=42)
print("Training pairs:", len(train_pairs))
print("Validation pairs:", len(val_pairs))

Training pairs: 16000
Validation pairs: 4000


In [10]:
def save_to_txt(pairs, filename):
    with open(filename, 'w') as f:
        for img_path, mask_path in pairs:
            f.write(f"{img_path},{mask_path}\n")

save_to_txt(train_pairs, 'train_balanced.txt')
save_to_txt(val_pairs, 'val_balanced.txt')

print("Saved: train_balanced.txt and val_balanced.txt")

Saved: train_balanced.txt and val_balanced.txt


In [12]:
with open("train_balanced.txt", "r") as f:
    all_lines = f.readlines()
random.shuffle(all_lines)

# Split into 10 chunks of 2,000
chunk_size = 2000
for i in range(10):
    chunk_lines = all_lines[i * chunk_size: (i + 1) * chunk_size]
    with open(f"chunk_{i}.txt", "w") as f:
        f.writelines(chunk_lines)

print("Split train_balanced.txt into 10 chunks of 2000 each.")

Split train_balanced.txt into 10 chunks of 2000 each.
