# Copy filter: copy the sampled images from the source dataset to a local folder

In [3]:
import os
import pandas as pd
import shutil
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [4]:
# Source and destination paths
source_dir = "D:/dataset/rsna-intracranial-hemorrhage-detection/stage_2_train"     # change to match the external SSD location
target_dir = "C:/Users/Lenovo/Documents/malaysia/FYP/PROJECT/data/raw_data55k"         # change to match the local location
# Create the destination folder up front; exist_ok=True makes this a no-op when it already exists.
os.makedirs(target_dir, exist_ok=True)

In [5]:
# Load the sampled rows, then collect their image IDs as a set of strings
# (the set removes duplicate IDs, so each one is listed only once).
df_sampled = pd.read_csv("data_55k.csv")
image_id_strings = df_sampled["ImageID"].astype(str)
ids_to_copy = set(image_id_strings)

In [6]:
# File extension appended to each image ID to build the file name (change to match your dataset)
ext = ".dcm"

In [7]:
def copy_file(img_id, src_dir=None, dst_dir=None, file_ext=None):
    """Copy one image file from the source folder into the target folder.

    The file name is ``img_id`` followed by the extension. Metadata is
    preserved because the copy is done with ``shutil.copy2``.

    Args:
        img_id: Image identifier (file name without extension).
        src_dir: Folder to copy from. Defaults to the notebook-level ``source_dir``.
        dst_dir: Folder to copy into. Defaults to the notebook-level ``target_dir``.
        file_ext: Extension including the leading dot. Defaults to the
            notebook-level ``ext``.

    Returns:
        ``True`` when the file was copied; otherwise ``img_id`` itself, so the
        caller can collect the IDs whose source file is missing.

    Raises:
        OSError: If the source file exists but the copy itself fails
            (for example, the destination is not writable).
    """
    # Fall back to the notebook-level configuration when no override is given.
    src_dir = source_dir if src_dir is None else src_dir
    dst_dir = target_dir if dst_dir is None else dst_dir
    file_ext = ext if file_ext is None else file_ext

    file_name = img_id + file_ext
    src_path = os.path.join(src_dir, file_name)
    dst_path = os.path.join(dst_dir, file_name)

    # isfile() rather than exists(): a directory that happens to carry the
    # expected name cannot be copied by copy2 and must count as "not found".
    if not os.path.isfile(src_path):
        return img_id

    shutil.copy2(src_path, dst_path)
    return True


# IDs that could not be copied; filled in by the copy loop.
missing = []


In [8]:
# Start from an empty list so that re-running this cell does not append
# duplicates to results left over from a previous run.
missing = []

with ThreadPoolExecutor(max_workers=8) as executor:  # 8 threads is usually optimal for an SSD
    # Map every future back to its image ID so a failure can be attributed.
    futures = {executor.submit(copy_file, img_id): img_id for img_id in ids_to_copy}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Copying sampled images"):
        img_id = futures[future]
        try:
            result = future.result()
        except OSError as exc:
            # One failed copy must not abort the whole batch: report it
            # (tqdm.write keeps the progress bar intact) and record the ID
            # alongside the missing ones so it can be retried later.
            tqdm.write(f"Failed to copy {img_id}: {exc}")
            missing.append(img_id)
            continue
        # copy_file returns True on success and the image ID otherwise.
        if result is not True:
            missing.append(result)

Copying sampled images: 100%|██████████| 55297/55297 [06:21<00:00, 144.84it/s]


In [9]:
# Report the outcome; when some files were not found, save their IDs
# (one per line) to a text file.
if not missing:
    print("All sampled files copied successfully.")
else:
    with open("missing_files.txt", "w") as report:
        report.write("\n".join(missing))
    print(f"{len(missing)} files not found. Saved to missing_files.txt.")

All sampled files copied successfully.
