In [6]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
import os

In [7]:
# Input/output locations (relative paths).
output_dir = '../data/folds/'  # directory that receives the per-fold CSV files
input_path = '../data/tokenized/tokenized_data.csv'  # CSV read into the DataFrame

In [8]:
# Read the tokenized CSV into memory.
df = pd.read_csv(input_path)

# Stop early when any expected column is absent from the file.
required_cols = ['text', 'aspect', 'sentimen', 'stemmed_text', 'input_ids', 'attention_mask', 'token_type_ids']
present_cols = set(df.columns)
missing_cols = [name for name in required_cols if name not in present_cols]
if len(missing_cols) > 0:
    raise ValueError(f"Kolom berikut tidak ditemukan: {missing_cols}")

# Normalize the label column to strings; it is used as the stratification target.
df = df.assign(sentimen=df['sentimen'].astype(str))


In [11]:
# Build 10 stratified folds. In each fold the held-out ~10% becomes the test set,
# and the remaining ~90% is split once more into train and validation sets.

# DataFrame.to_csv does not create missing directories, so make sure the
# destination exists before writing anything.
os.makedirs(output_dir, exist_ok=True)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(skf.split(df, df['sentimen'])):
    train_val_df = df.iloc[train_idx]  # ~90% of the rows
    test_df = df.iloc[test_idx]        # ~10% of the rows, held out as this fold's test set

    # Take 10% of the train+val portion (~9% of all rows) as the validation set,
    # stratified on the label so class proportions are preserved.
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=0.1,
        stratify=train_val_df['sentimen'],
        random_state=42
    )

    # Sanity check: the three splits together account for every row exactly once.
    assert len(train_df) + len(val_df) + len(test_df) == len(df)

    # Persist the three splits; os.path.join avoids relying on a trailing slash
    # in output_dir.
    train_df.to_csv(os.path.join(output_dir, f"train_fold_{fold}.csv"), index=False)
    val_df.to_csv(os.path.join(output_dir, f"val_fold_{fold}.csv"), index=False)
    test_df.to_csv(os.path.join(output_dir, f"test_fold_{fold}.csv"), index=False)

    print(f"✅ Fold-{fold} selesai → Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")


✅ Fold-0 selesai → Train: 4786 | Val: 532 | Test: 591
✅ Fold-1 selesai → Train: 4786 | Val: 532 | Test: 591
✅ Fold-2 selesai → Train: 4786 | Val: 532 | Test: 591
✅ Fold-3 selesai → Train: 4786 | Val: 532 | Test: 591
✅ Fold-4 selesai → Train: 4786 | Val: 532 | Test: 591
✅ Fold-5 selesai → Train: 4786 | Val: 532 | Test: 591
✅ Fold-6 selesai → Train: 4786 | Val: 532 | Test: 591
✅ Fold-7 selesai → Train: 4786 | Val: 532 | Test: 591
✅ Fold-8 selesai → Train: 4786 | Val: 532 | Test: 591
✅ Fold-9 selesai → Train: 4787 | Val: 532 | Test: 590
