# 76

In [None]:
import os
import shutil
import pandas as pd

# --- 1. Path configuration ---
# Adjust this path to the location of your dataset directory.
base_path = './'
train_dir = os.path.join(base_path, 'train')
test_dir = os.path.join(base_path, 'test')

# Lower-case file extensions that are counted as images.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')


def _count_images(directory):
    """Return the number of entries in *directory* with an image extension.

    The match is case-insensitive and looks only at the direct children of
    *directory* (no recursion).
    """
    return sum(
        1 for entry in os.listdir(directory)
        if entry.lower().endswith(IMAGE_EXTENSIONS)
    )


# Verify the folder structure.
print(f"Train directory: {train_dir}")
print(f"Test directory: {test_dir}")
print(f"Train exists: {os.path.exists(train_dir)}")
print(f"Test exists: {os.path.exists(test_dir)}")

# Every sub-directory of the train folder is treated as one class
# (the original note expects 5 classes).
if not os.path.exists(train_dir):
    print("Error: Folder train tidak ditemukan!")
    classes = []
else:
    classes = [
        entry for entry in os.listdir(train_dir)
        if os.path.isdir(os.path.join(train_dir, entry))
    ]
    print(f"Kelas yang terdeteksi: {classes}")
    print(f"Jumlah kelas: {len(classes)}")

    # Show the number of image files in each class folder.
    for cls in classes:
        class_path = os.path.join(train_dir, cls)
        if not os.path.exists(class_path):
            continue
        file_count = _count_images(class_path)
        print(f"  - {cls}: {file_count} gambar")

# --- 2. Data structure verification ---
print("\n=== VERIFIKASI STRUKTUR DATA ===")
print("Struktur data siap digunakan:")
print(f"- Direktori Train: {train_dir}")
print(f"- Direktori Test: {test_dir}")

if classes:
    print("\nJumlah gambar per kelas:")
    total_images = 0
    for cls in classes:
        class_path = os.path.join(train_dir, cls)
        if not os.path.exists(class_path):
            continue
        file_count = _count_images(class_path)
        print(f"  - {cls}: {file_count} gambar")
        total_images += file_count

    print(f"\nTotal gambar untuk training: {total_images}")


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
import timm # Pustaka untuk model SOTA seperti ConvNeXt
from tqdm.auto import tqdm
import torch.nn.functional as F
import numpy as np
from torchmetrics.classification import MulticlassF1Score

# --- Initial setup (CUDA, seed) ---
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

# Seed torch's global RNG so random operations are repeatable across runs.
torch.manual_seed(42)

# --- Point 5: Focal Loss implementation ---
# PyTorch has no built-in focal loss, so it is defined here.
class FocalLoss(nn.Module):
    """Focal loss computed on top of per-sample cross entropy.

    Each sample's cross entropy is scaled by ``alpha * (1 - p_t) ** gamma``,
    where ``p_t`` is recovered as ``exp(-cross_entropy)``.

    Args:
        alpha: Constant multiplier applied to every sample's loss.
        gamma: Exponent of the ``(1 - p_t)`` modulating factor.
        reduction: ``'mean'`` or ``'sum'`` to reduce over samples; any other
            value returns the unreduced per-sample losses.
    """

    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss of logits ``inputs`` against ``targets``."""
        per_sample_ce = F.cross_entropy(inputs, targets, reduction='none')
        # exp(-CE) is the probability the model gave to the target class.
        target_prob = torch.exp(-per_sample_ce)
        per_sample_focal = (
            self.alpha * (1 - target_prob) ** self.gamma * per_sample_ce
        )

        if self.reduction == 'mean':
            return per_sample_focal.mean()
        if self.reduction == 'sum':
            return per_sample_focal.sum()
        return per_sample_focal

# --- Points 1, 2, 3, 7: augmentation definitions ---
# Channel statistics used to normalise inputs (the standard ImageNet values).
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]
IMAGE_SIZE = 224

# Training pipeline: geometric and colour augmentation on the image, then
# tensor conversion, normalisation and random erasing on the tensor.
_train_steps = [
    transforms.RandomResizedCrop(size=IMAGE_SIZE, scale=(0.8, 1.0)),  # Point 1
    transforms.RandomRotation(degrees=15),  # Point 2
    transforms.RandomPerspective(distortion_scale=0.2, p=0.5),  # Point 2
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),  # Point 3
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    transforms.RandomErasing(p=0.5, scale=(0.02, 0.2)),  # Point 7
]
train_transforms = transforms.Compose(_train_steps)

# Evaluation pipeline: deterministic resize + centre crop, no augmentation.
_eval_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
]
val_test_transforms = transforms.Compose(_eval_steps)

# --- Load the data ---
DATA_DIR = './train'  # Adjust to your folder layout

# Two views of the same folder: one with training augmentation and one with
# the deterministic evaluation transform. Both subsets returned by
# random_split share a single underlying dataset object, so overwriting
# `subset.dataset.transform` would also replace the transform used for
# training and silently disable every augmentation. ImageFolder enumerates
# classes and files in sorted order, so the same index refers to the same
# image in both views.
full_dataset = datasets.ImageFolder(DATA_DIR, transform=train_transforms)
eval_dataset = datasets.ImageFolder(DATA_DIR, transform=val_test_transforms)

# 85% / 15% train / validation split (uses torch's global RNG).
train_size = int(0.85 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_subset, val_split = random_split(full_dataset, [train_size, val_size])

# Validation reads the held-out indices through the non-augmented view.
val_subset_aug = torch.utils.data.Subset(eval_dataset, val_split.indices)

BATCH_SIZE = 32
train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_subset_aug, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
NUM_CLASSES = len(full_dataset.classes)

print(f"\nDataset information:")
print(f"Total images: {len(full_dataset)}")
print(f"Training images: {len(train_subset)}")
print(f"Validation images: {len(val_subset_aug)}")
print(f"Number of classes: {NUM_CLASSES}")
print(f"Class names: {full_dataset.classes}")

# --- Point 4: ConvNeXt model definition ---
# ConvNeXt Tiny checkpoint name as registered in timm (a fine-tuned variant).
MODEL_NAME = 'convnext_tiny.in12k_ft_in1k'

# Build the pretrained network with a classifier sized for this dataset,
# then move it to the selected device.
model = timm.create_model(MODEL_NAME, pretrained=True, num_classes=NUM_CLASSES)
model = model.to(device)

# --- Point 6: gradual unfreezing & differential learning rates ---

# --- STAGE 1: train only the classification head ---
print("\n--- TAHAP 1: Melatih Kepala Klasifikasi ---")
# Freeze every parameter, then re-enable gradients for the head only.
model.requires_grad_(False)
model.head.requires_grad_(True)

optimizer = optim.AdamW(model.head.parameters(), lr=1e-3)
criterion = FocalLoss().to(device)
f1_metric = MulticlassF1Score(num_classes=NUM_CLASSES, average='macro').to(device)

# Short warm-up loop for the head: 3 epochs is enough.
for epoch in range(3):
    model.train()
    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/3"):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()
print("Pelatihan kepala selesai.\n")

# --- STAGE 2: fine-tune the whole model with differential learning rates ---
print("--- TAHAP 2: Fine-tuning Seluruh Model ---")
# Unfreeze every layer.
for param in model.parameters():
    param.requires_grad = True

# Number of fine-tuning epochs; also the length of the cosine schedule.
NUM_EPOCHS = 15

# Parameter groups for differential learning rates. The backbone is selected
# by parameter identity rather than by looking for the substring 'head' in
# parameter names, so an unrelated parameter whose name happens to contain
# 'head' cannot be dropped from the optimizer.
head_param_ids = {id(p) for p in model.head.parameters()}
backbone_params = [p for p in model.parameters() if id(p) not in head_param_ids]
optimizer = optim.AdamW([
    {'params': model.head.parameters(), 'lr': 1e-4},  # higher LR for the head
    {'params': backbone_params, 'lr': 1e-5},  # lower LR for the backbone
], weight_decay=1e-3)

# The keyword is `eta_min` (the previous `T_eta_min` raised TypeError).
# T_max is tied to NUM_EPOCHS so the learning rate decays over the whole run;
# with a shorter T_max the cosine curve would turn around and rise again.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS, eta_min=1e-6)

# --- Main training loop ---
best_f1 = 0.0
for epoch in range(NUM_EPOCHS):
    # ---- Training pass ----
    model.train()
    train_loss = 0.0
    for inputs, labels in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{NUM_EPOCHS}"):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # ---- Validation pass (no gradients) ----
    model.eval()
    val_loss = 0.0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for inputs, labels in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}/{NUM_EPOCHS}"):
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            # Predicted class = index of the largest logit per sample.
            all_preds.append(outputs.argmax(dim=1))
            all_labels.append(labels)

    # F1-score for this epoch over the whole validation set.
    val_f1 = f1_metric(torch.cat(all_preds), torch.cat(all_labels))
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}, Val F1-Score: {val_f1:.4f}")

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    # Keep a checkpoint whenever the validation F1-score strictly improves.
    if val_f1 > best_f1:
        best_f1 = val_f1
        torch.save(model.state_dict(), 'best_model_convnext.pth')
        print(f"Model terbaik disimpan dengan F1-score: {best_f1:.4f}")

print("\nPelatihan selesai!")
print(f"F1-score terbaik di set validasi: {best_f1:.4f}")

# --- Point 8: Test Time Augmentation (TTA) ---
def predict_with_tta(model, dataloader, device):
    """Predict class indices by averaging original and flipped views.

    For every batch the model is run on the images as given and on their
    horizontal mirror; the two softmax distributions are averaged and the
    arg-max is taken as the prediction.

    The augmentation is a deterministic flip only. The batches arriving here
    are already normalised tensors, and torchvision's colour-jitter ops clamp
    float tensors to [0, 1], which would corrupt normalised inputs (and make
    predictions non-reproducible), so no colour jitter is applied.

    Args:
        model: Network mapping an image batch to per-class logits.
        dataloader: Iterable yielding ``(images, labels)`` batches; the
            labels are ignored. Assumes image batches are laid out with
            width as the last dimension (as produced by ``ToTensor``).
        device: Device the batches are moved to before the forward pass.

    Returns:
        1-D NumPy array of predicted class indices in dataloader order;
        an empty int64 array when the dataloader yields no batches.
    """
    model.eval()
    all_final_preds = []

    with torch.no_grad():
        for images, _ in tqdm(dataloader, desc="Predicting with TTA"):
            images = images.to(device)

            # 1. Prediction on the original view.
            original_probs = F.softmax(model(images), dim=1)

            # 2. Prediction on the horizontally mirrored view
            #    (flip along the last, i.e. width, dimension).
            flipped_images = torch.flip(images, dims=[-1])
            flipped_probs = F.softmax(model(flipped_images), dim=1)

            # Average the two probability distributions.
            avg_probs = (original_probs + flipped_probs) / 2.0
            final_preds = torch.argmax(avg_probs, dim=1)
            all_final_preds.append(final_preds.cpu().numpy())

    # np.concatenate raises on an empty list; return an empty result instead.
    if not all_final_preds:
        return np.empty(0, dtype=np.int64)
    return np.concatenate(all_final_preds)


import pandas as pd
import os
from torchvision.datasets import ImageFolder

# Every name from the training part above must already be defined
# (model, FocalLoss, predict_with_tta, val_test_transforms, device, ...).

# --- 1. Load the best saved model ---
# The model must have been built exactly as in the training part.
best_state_dict = torch.load('best_model_convnext.pth', map_location=device)
model.load_state_dict(best_state_dict)
print("Model terbaik (best_model_convnext.pth) berhasil dimuat.")

# --- 2. Prepare the DataLoader for the test data ---
# Point this at the actual test folder.
TEST_DIR = './test/' 

# ImageFolder gives easy access to file paths (its labels are dummies here).
# The same transform as for validation is used.
test_dataset = ImageFolder(root=TEST_DIR, transform=val_test_transforms)

# IMPORTANT: shuffle=False so predictions stay aligned with the file order.
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
)

print(f"Data tes dimuat dari: {TEST_DIR}")
print(f"Jumlah gambar tes: {len(test_dataset)}\n")

# --- 3. Run prediction with TTA ---
predictions_indices = predict_with_tta(model, test_loader, device)
print(f"Prediksi dengan TTA selesai. Total prediksi: {len(predictions_indices)}")

# --- 4. Format the output as CSV ---
# Class names come from the dataset used for training ('full_dataset'),
# e.g. ['balinese', 'batak', 'dayak', 'javanese', 'minangkabau'].
class_names = full_dataset.classes

# test_dataset.samples holds full paths; keep the bare file name and derive
# the submission ID by dropping the extension (e.g. '.jpg').
test_filenames = []
test_ids = []
for sample_path, _ in test_dataset.samples:
    filename = os.path.basename(sample_path)
    test_filenames.append(filename)
    test_ids.append(os.path.splitext(filename)[0])

# Map each predicted class index to its class name.
predicted_styles = [class_names[class_idx] for class_idx in predictions_indices]

# Assemble the submission table: one row per test image.
submission_columns = {
    'id': test_ids,
    'style': predicted_styles,
}
submission_df = pd.DataFrame(submission_columns)

# Write the CSV without the DataFrame index column.
submission_df.to_csv('submission.csv', index=False)

print("\nFile 'submission.csv' telah berhasil dibuat dan siap diunggah!")
print("Contoh isi file:")
print(submission_df.head())

Kelas yang terdeteksi: ['dayak', 'minangkabau', 'balinese', 'javanese', 'batak']

Memulai proses penggabungan data 'val' ke 'train'...
Proses selesai. Sebanyak 0 file telah dipindahkan dari 'val' ke 'train'.
Direktori './dataset-logika-resize/val' sekarang kosong dan bisa dihapus.

Membaca '/kaggle/input/delete/delete_file.csv'. Terdapat 240 file untuk dihapus.
Proses pembersihan selesai. Sebanyak 240 file telah dihapus dari direktori 'train'.

Struktur data akhir siap digunakan:
- Direktori Train: ./dataset-logika-resize/train
- Direktori Test: ./dataset-logika-resize/test
