In [3]:
!pip install tensorflow albumentations numpy pillow scikit-learn


Collecting pydantic>=2.9.2 (from albumentations)
  Using cached pydantic-2.12.5-py3-none-any.whl.metadata (90 kB)
Collecting albucore==0.0.24 (from albumentations)
  Using cached albucore-0.0.24-py3-none-any.whl.metadata (5.3 kB)
Collecting eval-type-backport (from albumentations)
  Using cached eval_type_backport-0.3.1-py3-none-any.whl.metadata (2.4 kB)
Collecting opencv-python-headless>=4.9.0.80 (from albumentations)
  Using cached opencv_python_headless-4.12.0.88-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting annotated-types>=0.6.0 (from pydantic>=2.9.2->albumentations)
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Using cached albucore-0.0.24-py3-none-any.whl (15 kB)
Using cached opencv_python_headless-4.12.0.88-cp37-abi3-win_amd64.whl (38.9 MB)
Using cached pydantic-2.12.5-py3-none-any.whl (463 kB)
Using cached annotated_types-0.7.0-py3-none-any.whl (13 kB)
Using cached eval_type_backport-0.3.1-py3-none-any.whl (6.1 kB)
Installing collected packages: 

In [4]:
import albumentations as A
import cv2
import numpy as np
import os
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
import shutil

In [5]:
# Load the source label table that the balancing steps below start from.
train_df = pd.read_csv("data_train_u.csv")

In [6]:
# Names of the label columns read from the training table.
label_cols = [
    "any",
    "epidural",
    "intraparenchymal",
    "intraventricular",
    "subarachnoid",
    "subdural",
]

In [7]:
# Number of positive rows a label must have before it is left un-augmented;
# labels below this are topped up with augmented images.
max_count = 8_000

In [8]:
# Source images are read from src_dir; augmented images (and later the copied
# originals) are written to aug_dir, which is created if it does not exist yet.
src_dir, aug_dir = "data_train_u", "data/data_train"
os.makedirs(aug_dir, exist_ok=True)

In [9]:
# Basic geometric augmentation pipeline: a random horizontal flip, an occasional
# 90-degree rotation, and a small shift / scale / rotate jitter.
_augment_steps = [
    A.HorizontalFlip(p=0.5),
    A.RandomRotate90(p=0.05),
    A.ShiftScaleRotate(
        shift_limit=0.05,
        scale_limit=0.05,
        rotate_limit=5,
        p=0.7,
    ),
]
augment = A.Compose(_augment_steps)

  original_init(self, **validated_kwargs)


In [10]:
# Sum each label column of the training table; these per-label totals are
# what the balancing step compares against the target count.
label_counts = dict(
    (label, train_df[label].sum())
    for label in label_cols
)

In [12]:
# Show the starting number of positives for every label.
print("Jumlah awal per label:")
for name, n_positive in label_counts.items():
    print("  {}: {}".format(name, int(n_positive)))

Jumlah awal per label:
  any: 36237
  epidural: 2500
  intraparenchymal: 14100
  intraventricular: 11650
  subarachnoid: 13589
  subdural: 14294


In [13]:
# For every label, report how many rows are positive (1) and negative (0)
# and what percentage of all rows is positive.
for col in label_cols:
    counts = train_df[col].value_counts()
    # .get() falls back to 0 when a value never occurs in the column.
    positive, negative = counts.get(1, 0), counts.get(0, 0)
    total = counts.sum()
    balance_ratio = positive / total * 100
    print(f"{col:17} | Positif: {positive:5d} | Negatif: {negative:5d} | Rasio positif: {balance_ratio:6.2f}%")


any               | Positif: 36237 | Negatif:  8000 | Rasio positif:  81.92%
epidural          | Positif:  2500 | Negatif: 41737 | Rasio positif:   5.65%
intraparenchymal  | Positif: 14100 | Negatif: 30137 | Rasio positif:  31.87%
intraventricular  | Positif: 11650 | Negatif: 32587 | Rasio positif:  26.34%
subarachnoid      | Positif: 13589 | Negatif: 30648 | Rasio positif:  30.72%
subdural          | Positif: 14294 | Negatif: 29943 | Rasio positif:  32.31%


In [14]:
# Oversample under-represented labels.
# For every label with fewer than `max_count` positive rows, randomly pick
# positive source images, write an augmented copy of each into `aug_dir`, and
# collect the copied label values in `aug_records` (one dict per written image).
SEED = 42  # makes the choice of source images repeatable; the transforms' own randomness is not seeded here
rng = np.random.default_rng(SEED)

aug_records = []

for label in label_cols:
    subset = train_df[train_df[label] == 1]
    needed = int(max_count - len(subset))
    if needed <= 0:
        continue
    if subset.empty:
        # Sampling from an empty frame raises, and there is nothing to augment from.
        print(f"\nLabel '{label}' has no positive samples to augment from; skipping.")
        continue

    print(f"\nAugmenting label '{label}' sebanyak {needed} gambar...")

    # Draw all source rows in one call (with replacement) instead of one
    # sample() call per iteration.
    picks = subset.sample(n=needed, replace=True, random_state=rng)

    skipped = 0
    for i in tqdm(range(needed)):
        sample = picks.iloc[i]
        img_path = os.path.join(src_dir, f"{sample['ImageID']}.png")

        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None for a missing or unreadable file.
            skipped += 1
            continue

        aug_img = augment(image=img)["image"]
        new_name = f"{sample['ImageID']}_{label}_aug{i}"
        new_path = os.path.join(aug_dir, f"{new_name}.png")
        if not cv2.imwrite(new_path, aug_img):
            # Do not record labels for an image that was never written to disk.
            skipped += 1
            continue

        # Copy the labels of the source row to the augmented image.
        aug_records.append({
            "ImageID": new_name,
            **{col: sample[col] for col in label_cols},
        })

    if skipped:
        # Make shortfalls visible: the label ends up below max_count when this fires.
        print(f"  {skipped} of {needed} augmentations for '{label}' were skipped (unreadable source or failed write).")


Augmenting label 'epidural' sebanyak 5500 gambar...


100%|██████████| 5500/5500 [01:00<00:00, 90.77it/s] 


In [15]:
# Copy every entry of the source directory into the output directory so the
# originals sit next to the augmented images.
files = os.listdir(src_dir)

copy_progress = tqdm(files, desc="Copying files", unit="file")
for filename in copy_progress:
    source_path = os.path.join(src_dir, filename)
    target_path = os.path.join(aug_dir, filename)
    shutil.copy2(source_path, target_path)

print("Balancing selesai. Semua data tersimpan di:", aug_dir)

Copying files: 100%|██████████| 44237/44237 [10:13<00:00, 72.05file/s]

Balancing selesai. Semua data tersimpan di: data/data_train





In [None]:
# Append the rows describing the augmented images to the original table.
aug_df = pd.DataFrame(aug_records)
train_balanced_df = pd.concat((train_df, aug_df), ignore_index=True)

# Write the combined table to a new CSV (no index column).
csv_path = "data_train.csv"
train_balanced_df.to_csv(csv_path, index=False)

print(f"Total data setelah balancing: {len(train_balanced_df)}")

Total data setelah balancing: 49737


In [17]:
# Label distribution after balancing.
# Read back "data_train.csv", the file the balancing step of this notebook
# writes. The previous "train_labels.csv" is never produced here, so on a fresh
# Restart & Run All the cell would fail or report numbers from an unrelated file.
train_labels = pd.read_csv("data_train.csv")

# Count positive (1) and negative (0) rows for every label and report the
# percentage of rows that are positive.
for col in label_cols:
    counts = train_labels[col].value_counts()
    total = counts.sum()
    # .get() falls back to 0 when a value never occurs in the column.
    positive = counts.get(1, 0)
    negative = counts.get(0, 0)
    balance_ratio = positive / total * 100
    print(f"{col:17} | Positif: {positive:5d} | Negatif: {negative:5d} | Rasio positif: {balance_ratio:6.2f}%")


any               | Positif: 41737 | Negatif:  8000 | Rasio positif:  83.92%
epidural          | Positif:  8000 | Negatif: 41737 | Rasio positif:  16.08%
intraparenchymal  | Positif: 15130 | Negatif: 34607 | Rasio positif:  30.42%
intraventricular  | Positif: 12010 | Negatif: 37727 | Rasio positif:  24.15%
subarachnoid      | Positif: 14569 | Negatif: 35168 | Rasio positif:  29.29%
subdural          | Positif: 15576 | Negatif: 34161 | Rasio positif:  31.32%
