Imports and constants


In [None]:
import pandas as pd
import polars as pl

# Read both raw transaction tables with polars. The file names are relative to
# the notebook's working directory.
train_transaction, test_transaction = (
    pl.read_csv(csv_name)
    for csv_name in ("train_transaction.csv", "test_transaction.csv")
)

# Sanity check: print the (rows, columns) shape of each table.
for loaded_frame in (train_transaction, test_transaction):
    print(loaded_frame.shape)


Import libraries selesai.


Data Loading dan Optimasi Memori

In [None]:
def reduce_mem_usage(df):
    """
    Downcast the columns of ``df`` in place to smaller dtypes and print the saving.

    Rules applied per column:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned NumPy integer columns are cast to the smallest integer
      type of the same signedness whose (inclusive) range holds the column's
      min and max.
    * NumPy float columns wider than 32 bits are cast to ``float32`` when their
      finite range fits. ``float16`` is deliberately not used: it only has an
      11-bit significand, so values such as 2049.0 would be silently rounded.
    * Every other dtype (bool, datetime, category, pandas extension dtypes) is
      left untouched, because ``min``/``max`` or a NumPy cast is not meaningful
      or not safe for them.

    Args:
        df: pandas DataFrame to optimise. It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Penggunaan memori awal DataFrame: {start_mem:.2f} MB')

    signed_candidates = (np.int8, np.int16, np.int32, np.int64)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / unsigned / float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue
        # min()/max() of an empty column is NaN; there is nothing to gain anyway.
        if df[col].empty:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # Comparisons with NaN/inf are False, so all-NaN or infinite columns
            # keep their current dtype.
            if (col_type.itemsize > 4
                    and float32_limits.min <= c_min
                    and c_max <= float32_limits.max):
                df[col] = df[col].astype(np.float32)
            continue

        # Integer column: compare as Python ints so the bounds check is exact.
        c_min, c_max = int(c_min), int(c_max)
        candidates = unsigned_candidates if col_type.kind == 'u' else signed_candidates
        for candidate in candidates:
            limits = np.iinfo(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                # Never widen a column that is already small enough.
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Penggunaan memori setelah optimasi: {end_mem:.2f} MB')
    if start_mem > 0:
        print(f'Memori berkurang sebesar {100 * (start_mem - end_mem) / start_mem:.1f}%')
    return df

# Explicit dtypes for the key columns so pandas does not default them to 64-bit.
data_types = {
    'TransactionID': 'int32',
    'isFraud': 'int8',
    'TransactionDT': 'int32',
}

# --- DATA LOADING ---
try:
    # 1. Read the CSV with the explicit dtypes above.
    print("Memulai pemuatan data...")
    df_train = pd.read_csv(
        'train_transaction.csv',
        dtype=data_types,
        index_col='TransactionID',  # use the transaction id as the row index
    )

    # 2. Downcast the remaining columns once the frame is in memory.
    df_train = reduce_mem_usage(df_train)

    # 3. Ask the garbage collector to release temporaries created while loading.
    gc.collect()
    print("Data loading SUCCESS. Lanjutkan dengan perhitungan...")

except Exception as e:
    print(f"Error saat loading data: {e}")
    print("FATAL ERROR: Gagal memuat data. Mohon tingkatkan RAM atau gunakan mode chunking.")
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down and hides the traceback, while re-raising stops this cell (and a
    # "Run All") with the real error visible.
    raise

# Compute scale_pos_weight = (#non-fraud / #fraud).
# value_counts() is computed once; .get() avoids a KeyError when a class is absent.
label_counts = df_train['isFraud'].value_counts()
fraud_count = int(label_counts.get(1, 0))
non_fraud_count = int(label_counts.get(0, 0))
if fraud_count == 0:
    raise ValueError("Kolom isFraud tidak memiliki sampel fraud (1); SCALE_POS_WEIGHT tidak dapat dihitung.")
SCALE_POS_WEIGHT = non_fraud_count / fraud_count

print(f"Jumlah Fraud: {fraud_count}")
print(f"Jumlah Non-Fraud: {non_fraud_count}")
print(f"SCALE_POS_WEIGHT: {SCALE_POS_WEIGHT:.2f}")

Data Preprocessing, Scaling, dan Split

In [None]:
print("\n--- Data Preprocessing ---")

# Separate the features (X) from the target (y).
X = df_train.drop('isFraud', axis=1)
y = df_train['isFraud']

# Release the original frame; note this makes the cell non-re-runnable without
# re-running the loading cell first.
del df_train
gc.collect()

# a. Identify column types.
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include=['category']).columns.tolist()

# b. Categorical columns: fill missing values with an explicit 'MISSING' level,
#    drop high-cardinality columns and label-encode the rest.
high_cardinality_cols = []
for col in categorical_features:
    # add_categories raises if the level already exists, so only add it once.
    if 'MISSING' not in X[col].cat.categories:
        X[col] = X[col].cat.add_categories('MISSING')
    X[col] = X[col].fillna('MISSING')

    if X[col].nunique() > 50:
        print(f"Menghapus kolom: {col} (High Cardinality)")
        high_cardinality_cols.append(col)
    else:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))

# Drop all high-cardinality columns in one pass instead of copying X per column.
if high_cardinality_cols:
    X = X.drop(columns=high_cardinality_cols)

# c. Decide the train/test rows BEFORE fitting any statistics, so the medians and
#    the scaler never see the hold-out rows (avoids train/test leakage).
#    Splitting row positions with the same random_state/stratify selects the same
#    rows as splitting X and y directly.
row_positions = np.arange(len(X))
train_pos, test_pos = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=y
)

# d. Numeric imputation with medians learned on the training rows only.
#    A column that is entirely NaN in the training rows has no median; use 0.
train_medians = X[numerical_features].iloc[train_pos].median().fillna(0)
X[numerical_features] = X[numerical_features].fillna(train_medians)

# e. Feature scaling (important for DL models): fit on training rows, apply to all.
all_features = X.columns.tolist()
scaler = StandardScaler()
scaler.fit(X[all_features].iloc[train_pos])
X[all_features] = scaler.transform(X[all_features])

# f. Materialise the split.
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

print(f"Preprocessing selesai. Training set shape: {X_train.shape}")

# Dictionary that later cells fill with per-model results for the comparison table.
results_dict = {}

Menghapus kolom: P_emaildomain (High Cardinality)
Menghapus kolom: R_emaildomain (High Cardinality)

Ukuran Training Set: (472432, 391)
Ukuran Testing Set: (118108, 391)


Perbandingan dan Analisis Hasil

In [23]:
## 8. Model performance comparison

# results_dict is created (empty) at the end of the preprocessing cell and is
# expected to be filled by the model-training cells; that cell must have been
# run in this kernel before this one.
if not results_dict:
    # An empty table would make idxmax() raise; report the situation instead.
    print("results_dict masih kosong: jalankan sel training model sebelum sel perbandingan ini.")
else:
    # Rows = models, columns = the two recorded metrics.
    results = pd.DataFrame(results_dict, index=['ROC AUC Score', 'Training Time (s)']).T

    print("\n### Tabel Perbandingan Kinerja Model (Tugas Fraud Detection) ###")
    print(results.sort_values(by='ROC AUC Score', ascending=False).to_markdown())

    print("\n--- Analisis Kinerja ---")
    best_model = results['ROC AUC Score'].idxmax()
    print(f"Model terbaik berdasarkan ROC AUC Score adalah: **{best_model}**")


NameError: name 'results_dict' is not defined

5.  EDA functions

In [22]:
def basic_eda(df, label_col="isFraud", n_rows=5):
    """Print and display a quick overview of ``df``.

    Shows the shape, the first ``n_rows`` rows, the 20 columns with the most
    missing values, the normalised distribution of ``label_col`` (only when
    that column exists) and the transposed ``describe()`` summary.

    Args:
        df: DataFrame to inspect.
        label_col: name of the target column.
        n_rows: number of leading rows to display.
    """
    print("----- Basic EDA -----")
    print("Shape:", df.shape)
    display(df.head(n_rows))

    print("\nMissing values (top 20):")
    missing_per_column = df.isnull().sum()
    print(missing_per_column.sort_values(ascending=False).head(20))

    label_present = label_col in df.columns
    if label_present:
        print("\nLabel distribution:")
        label_share = df[label_col].value_counts(normalize=True)
        print(label_share)

    print("\nNumeric summary:")
    numeric_summary = df.describe().T
    display(numeric_summary)

def plot_label_distribution(df, label_col="isFraud"):
    """Draw a count plot of ``label_col`` and print its value counts.

    Does nothing (returns ``None``) when ``label_col`` is not a column of ``df``.
    """
    if label_col in df.columns:
        sns.countplot(x=label_col, data=df)
        plt.title("Label Distribution")
        plt.show()
        label_counts = df[label_col].value_counts()
        print(label_counts)

6.  Preprocessing pipeline


In [7]:
def preprocess(df, categorical_threshold=10, drop_cols=None, scaler=None, fit_scaler=False):
    """Impute, label-encode and optionally scale a feature frame.

    The input is copied, so the caller's frame is never modified.

    Args:
        df: feature DataFrame.
        categorical_threshold: accepted for backward compatibility; it is not
            used by the current implementation.
        drop_cols: optional iterable of column names to remove; names that are
            not present are ignored.
        scaler: an already fitted scaler to apply to the numeric columns, or
            ``None``.
        fit_scaler: when ``True`` and ``scaler`` is ``None``, a new
            ``StandardScaler`` is fitted on the numeric columns.

    Returns:
        ``(df, label_encoders, scaler)`` where ``label_encoders`` maps each
        categorical column to its fitted ``LabelEncoder`` (or ``None`` when the
        manual fallback mapping was used) and ``scaler`` is the scaler that was
        applied (``None`` when no scaling happened).

    Note:
        Imputation values and label encoders are learned from the frame passed
        in on every call; they are not reused between a training and a test
        call.
    """
    df = df.copy()
    if drop_cols:
        df = df.drop(columns=[c for c in drop_cols if c in df.columns])

    # Separate column types. 'category' columns are treated as categorical too;
    # otherwise they would be neither imputed nor encoded.
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Numeric imputation with the column median. Done with pandas so that a
    # column with no observed value keeps its place (filled with 0) instead of
    # being dropped by the imputer, which would break the assignment back.
    if num_cols:
        medians = df[num_cols].median()
        df[num_cols] = df[num_cols].fillna(medians).fillna(0)

    # Categorical imputation with the most frequent value of each column.
    for c in cat_cols:
        # Work on object dtype so a fill value outside the existing categories
        # cannot raise for 'category' columns.
        values = df[c].astype(object)
        modes = values.mode(dropna=True)
        fill_value = modes.iloc[0] if len(modes) > 0 else "missing"
        df[c] = values.fillna(fill_value)

    # Encode categorical columns with LabelEncoder for simplicity.
    label_encoders = {}
    for c in cat_cols:
        le = LabelEncoder()
        try:
            df[c] = le.fit_transform(df[c].astype(str))
            label_encoders[c] = le
        except Exception:
            # Fallback: map the unique string values to consecutive integers.
            uniq = {v: i for i, v in enumerate(df[c].astype(str).unique())}
            df[c] = df[c].astype(str).map(uniq)
            label_encoders[c] = None

    # Scaling is applied to the originally numeric columns only; skipped when
    # there are none, because a scaler cannot be fitted on zero features.
    if scaler is None and fit_scaler:
        scaler = StandardScaler()
        if num_cols:
            df[num_cols] = scaler.fit_transform(df[num_cols])
    elif scaler is not None and num_cols:
        df[num_cols] = scaler.transform(df[num_cols])
    return df, label_encoders, scaler

7. Handling imbalance


In [8]:
def handle_imbalance(X, y, method="smote"):
    """Rebalance a binary training set.

    ``method`` selects the strategy:

    * ``"smote"`` - SMOTE oversampling (returns the data unchanged, with a
      message, when imblearn is not available);
    * ``"undersample"`` - randomly sample the majority class (label 0) down to
      the size of the minority class (label 1);
    * ``"oversample"`` - sample the minority class with replacement up to the
      size of the majority class;
    * anything else (e.g. ``"class_weight"``) - return ``X, y`` unchanged.

    Returns:
        ``(X_res, y_res)``.
    """
    if method == "smote":
        if IMBLEARN_AVAILABLE:
            sampler = SMOTE(random_state=RANDOM_STATE)
            X_res, y_res = sampler.fit_resample(X, y)
            print("SMOTE applied. New distribution:", np.bincount(y_res))
            return X_res, y_res
        print("imblearn not available. Try pip install imbalanced-learn or choose another method.")
        return X, y

    if method in ("undersample", "oversample"):
        from sklearn.utils import resample

        # Glue the label onto the features so rows are resampled together.
        combined = pd.concat([X, y.rename('target')], axis=1)
        majority = combined[combined.target == 0]
        minority = combined[combined.target == 1]

        if method == "undersample":
            # Simple random undersampling of the majority class.
            majority_down = resample(majority, replace=False, n_samples=len(minority), random_state=RANDOM_STATE)
            balanced = pd.concat([majority_down, minority])
            done_message = "Undersampling done. New distribution:"
        else:
            minority_up = resample(minority, replace=True, n_samples=len(majority), random_state=RANDOM_STATE)
            balanced = pd.concat([majority, minority_up])
            done_message = "Oversampling done. New distribution:"

        y_res = balanced.target
        X_res = balanced.drop(columns='target')
        print(done_message, np.bincount(y_res))
        return X_res, y_res

    print("Using class_weight in model or no sampling. Returning original data.")
    return X, y
    
# The earlier indentation error was removed here: plot_feature_importance is now
# defined at module level, at the same level as handle_imbalance.

def plot_feature_importance(model, X_train, title="Feature Importance"):
    """Plot the 20 most important features of ``model`` as a horizontal bar chart.

    Importances come from ``model.feature_importances_`` when present, otherwise
    from the absolute values of the first row of ``model.coef_``.

    Args:
        model: fitted estimator.
        X_train: DataFrame whose columns name the features, in the order the
            model was trained on.
        title: chart title.

    Returns:
        A Series with the five most important features, or ``None`` when the
        model exposes neither ``feature_importances_`` nor ``coef_``.
    """
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
    elif hasattr(model, 'coef_'):
        # atleast_2d makes a 1-D coef_ behave like the usual (1, n_features) shape.
        importances = np.abs(np.atleast_2d(model.coef_)[0])
    else:
        print("Model tidak memiliki atribut feature_importances_ atau coef_.")
        return None

    feat_imp = pd.Series(importances, index=X_train.columns).sort_values(ascending=False).head(20)

    plt.figure(figsize=(10, 7))
    feat_imp.plot(kind='barh')
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.title(title)
    plt.show()

    # The summary sentence names up to three features; with fewer than three
    # features available it lists only those that exist instead of raising.
    top_names = [str(name) for name in feat_imp.index[:3]]
    if len(top_names) == 3:
        print(f"\nInterpretasi Singkat (Top 3): Fitur {top_names[0]}, {top_names[1]}, dan {top_names[2]} memiliki pengaruh paling besar terhadap prediksi.")
    elif top_names:
        print(f"\nInterpretasi Singkat: Fitur {', '.join(top_names)} memiliki pengaruh paling besar terhadap prediksi.")
    return feat_imp.head(5)

# plot_shap is likewise defined at module level.
def plot_shap(model, X_val):
    """Show a SHAP summary plot for ``model`` on the first 100 rows of ``X_val``.

    Prints a message and returns when SHAP is not available. Any exception
    raised while building the explainer or the plot is caught and reported
    with a message, so the caller is never interrupted.
    """
    if SHAP_AVAILABLE:
        try:
            # Only the first 100 rows are used, to keep the computation fast.
            explainer = shap.Explainer(model, X_val.head(100))
            shap_values = explainer(X_val.head(100))

            print("\nSHAP Summary Plot (Nilai Bonus Tinggi!):")
            shap.summary_plot(shap_values, X_val.head(100))
        except Exception as e:
            print(f"Gagal membuat plot SHAP: {e}")
    else:
        print("SHAP library tidak terinstal. Melewatkan plot SHAP.")

8.  Modeling helpers

In [8]:
def _positive_class_scores(model, X_val):
    """Return one score in [0, 1]-compatible form per row for the positive class.

    Preference order: ``predict_proba`` (column 1), then ``decision_function``
    squashed through a logistic function, then the hard ``predict`` labels.
    The logistic squashing is monotonic, so ranking metrics such as ROC AUC are
    unchanged, while a probability threshold of 0.5 lines up with the
    decision boundary at margin 0.
    """
    predict_proba = getattr(model, "predict_proba", None)
    if callable(predict_proba):
        try:
            return np.asarray(predict_proba(X_val))[:, 1]
        except (AttributeError, NotImplementedError, IndexError):
            # The model advertises predict_proba but cannot provide a usable
            # positive-class column; fall through to the other options.
            pass
    if hasattr(model, "decision_function"):
        margins = np.asarray(model.decision_function(X_val), dtype=float)
        # Numerically stable sigmoid: exp(-log(1 + exp(-m))).
        return np.exp(-np.logaddexp(0.0, -margins))
    return np.asarray(model.predict(X_val), dtype=float)


def evaluate_model(model, X_val, y_val, threshold=0.5):
    """Print and return the key evaluation metrics of ``model`` on a validation set.

    Args:
        model: fitted classifier.
        X_val: validation features.
        y_val: true binary labels.
        threshold: score at or above which a row is predicted as positive.

    Returns:
        dict with keys ``auc``, ``precision``, ``recall``, ``f1``,
        ``confusion_matrix`` and ``report``.
    """
    proba = _positive_class_scores(model, X_val)
    auc = roc_auc_score(y_val, proba)
    preds = (proba >= threshold).astype(int)
    cm = confusion_matrix(y_val, preds)
    report = classification_report(y_val, preds, digits=4)
    precision = precision_score(y_val, preds, zero_division=0)
    recall = recall_score(y_val, preds, zero_division=0)
    f1 = f1_score(y_val, preds, zero_division=0)
    print(f"AUC: {auc:.4f}")
    print("Confusion Matrix:")
    print(cm)
    print("Classification Report:")
    print(report)
    return {"auc":auc, "precision":precision, "recall":recall, "f1":f1, "confusion_matrix":cm, "report":report}

def plot_roc(model, X_val, y_val):
    """Plot the ROC curve of ``model`` on the validation set, with its AUC in the legend."""
    proba = _positive_class_scores(model, X_val)
    fpr, tpr, _ = roc_curve(y_val, proba)
    plt.plot(fpr, tpr, label=f"AUC = {roc_auc_score(y_val, proba):.4f}")
    plt.plot([0,1],[0,1],'k--')  # chance-level diagonal
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.show()

9. Train/evaluate pipeline example

In [9]:
def train_pipeline_full(train_df, identity_df=None, target_col="isFraud", sample_method=None):
    """Train, tune and evaluate the candidate fraud models end to end.

    Steps, in order: optionally left-merge the identity table on
    ``TransactionID``, preprocess the features, make a stratified
    train/validation split, optionally resample the training split, then fit
    Logistic Regression, a randomized-search Random Forest and (only when
    ``LGB_INSTALLED`` is true) a randomized-search LightGBM classifier. Every
    model is scored with ``evaluate_model`` and plotted with ``plot_roc``;
    finally a comparison table and a written conclusion are produced.

    Args:
        train_df: DataFrame containing the features and ``target_col``.
        identity_df: optional DataFrame merged on ``TransactionID``.
        target_col: name of the label column.
        sample_method: passed to ``handle_imbalance`` as ``method``; ``None``
            skips resampling.

    Returns:
        dict with keys ``"models"``, ``"results"``, ``"scaler"``,
        ``"encoders"`` and ``"best_model"``.
    """
    
    # 1. Merge the identity table (only when TransactionID is a regular column).
    if identity_df is not None and 'TransactionID' in train_df.columns:
        train_df = train_df.merge(identity_df, on='TransactionID', how='left')
    
    X = train_df.drop(columns=[target_col])
    y = train_df[target_col]
    
    # TransactionID is an identifier, not a feature: remove it from X.
    if 'TransactionID' in X.columns:
        X = X.drop(columns=['TransactionID'])
    
    # 2. Preprocess. `_` receives the label-encoder dict and is returned under
    #    the "encoders" key at the end of the function.
    # NOTE(review): preprocessing runs on all rows before the split below, so
    # the imputation values and the scaler are fitted on validation rows too.
    X_proc, _, scaler = preprocess(X, fit_scaler=True)
    
    # 3. Stratified train/validation split.
    X_train, X_val, y_train, y_val = train_test_split(X_proc, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)
    
    # 4. Handle class imbalance (training split only).
    X_train_res, y_train_res = X_train, y_train # default: no resampling
    if sample_method is not None:
        X_train_res, y_train_res = handle_imbalance(X_train, y_train, method=sample_method)

    all_results = {}
    all_models = {}

    # --- 5. MODELING & TUNING ---
    
    # 5.1. Logistic Regression (baseline)
    print("\n" + "="*50)
    print("5.1. TRAINING BASELINE: LOGISTIC REGRESSION")
    print("="*50)
    # liblinear is chosen here as the solver for the baseline.
    lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE, solver='liblinear')
    lr.fit(X_train_res, y_train_res)
    lr_eval = evaluate_model(lr, X_val, y_val)
    plot_roc(lr, X_val, y_val)
    all_results["Logistic Regression"] = lr_eval
    all_models["lr"] = lr
    
    # 5.2. Random Forest (light tuning)
    print("\n" + "="*50)
    print("5.2. TRAINING MODEL: RANDOM FOREST (Tuning Ringan)")
    print("="*50)
    rf_base = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE, class_weight='balanced')
    
    # 2 x 2 x 2 = 8 candidate settings; the search below samples 5 of them.
    param_grid_rf = {
        'n_estimators': [100, 200],
        'max_depth': [10, 20],
        'min_samples_split': [5, 10]
    }
    rf_rs = RandomizedSearchCV(estimator=rf_base, param_distributions=param_grid_rf, 
                               n_iter=5, scoring='roc_auc', n_jobs=-1, cv=3, verbose=0, random_state=RANDOM_STATE)
    rf_rs.fit(X_train_res, y_train_res)
    rf_model = rf_rs.best_estimator_
    print(f"Best parameters (RF): {rf_rs.best_params_}")

    rf_eval = evaluate_model(rf_model, X_val, y_val)
    plot_roc(rf_model, X_val, y_val)
    all_results["Random Forest"] = rf_eval
    all_models["rf"] = rf_model
    
    # 5.3. LightGBM (light tuning) - trained only when LGB_INSTALLED is true.
    lgb_model = None
    if LGB_INSTALLED:
        print("\n" + "="*50)
        print("5.3. TRAINING MODEL: LIGHTGBM (Tuning Ringan)")
        print("="*50)
        lgb_base = lgb.LGBMClassifier(random_state=RANDOM_STATE, n_jobs=-1, class_weight='balanced', metric='auc')
        
        # 2 x 2 x 2 = 8 candidate settings; the search below samples 5 of them.
        param_grid_lgb = {
            'n_estimators': [200, 300],
            'learning_rate': [0.01, 0.05],
            'num_leaves': [20, 31],
        }
        lgb_rs = RandomizedSearchCV(estimator=lgb_base, param_distributions=param_grid_lgb, 
                                    n_iter=5, scoring='roc_auc', n_jobs=-1, cv=3, verbose=0, random_state=RANDOM_STATE)
        lgb_rs.fit(X_train_res, y_train_res)
        lgb_model = lgb_rs.best_estimator_
        print(f"Best parameters (LGBM): {lgb_rs.best_params_}")

        lgb_eval = evaluate_model(lgb_model, X_val, y_val)
        plot_roc(lgb_model, X_val, y_val)
        all_results["LightGBM"] = lgb_eval
        all_models["lgb"] = lgb_model
    
    # --- 6. BONUS ANALYSIS ---
    
    # Feature importance: LightGBM when it was trained, otherwise Random Forest.
    importance_model = lgb_model if LGB_INSTALLED and lgb_model is not None else rf_model
    if importance_model:
        plot_feature_importance(importance_model, X_train, title=f"{importance_model.__class__.__name__} Feature Importance")
        plot_shap(importance_model, X_val) # SHAP plot

    # Bonus point 1: comparison table.
    # NOTE(review): create_comparison_table is not defined in the visible part
    # of this notebook; confirm it is defined before this function is called.
    comparison_table = create_comparison_table(all_results)
    print("\n" + "="*50)
    print("‚≠ê 1. PERBANDINGAN HASIL MODEL (AUC, Recall, F1-Score)")
    print("="*50)
    display(comparison_table)
    
    # Bonus point 4: written conclusion.
    # NOTE(review): generate_conclusion is likewise not visible here; it is
    # expected to return (best_model_key, conclusion_text) - confirm.
    best_model_key, conclusion = generate_conclusion(comparison_table, sample_method)
    print("\n" + "="*50)
    print("‚≠ê 4. KESIMPULAN AKHIR & REKOMENDASI")
    print("="*50)
    print(conclusion)
    
    # Fall back to the feature-importance model when no best key was returned.
    best_model = all_models.get(best_model_key) if best_model_key else importance_model

    return {"models": all_models, "results": all_results, "scaler": scaler, "encoders": _, "best_model": best_model}

10. Submission helper

In [10]:
def create_submission(model, test_df, submission_path="submission/uts_fraud_submission.csv", id_col="TransactionID", scaler=None, encoders=None):
    """Preprocess the test set, predict fraud scores and write a submission CSV.

    Args:
        model: fitted estimator; ``predict_proba`` is used when available,
            otherwise ``predict``.
        test_df: test DataFrame; must contain ``id_col``. It is not modified.
        submission_path: destination CSV path. Its parent directory is created
            when missing.
        id_col: name of the identifier column copied into the submission.
        scaler: fitted scaler forwarded to ``preprocess``.
        encoders: accepted for interface compatibility. It is currently NOT
            applied: ``preprocess`` has no parameter to receive encoders and
            fits new ones on the test frame.
            NOTE(review): category codes may therefore differ between training
            and test data - confirm and extend ``preprocess`` if needed.

    Returns:
        The submission DataFrame with columns ``id_col`` and ``"isFraud"``.

    Raises:
        ValueError: when ``id_col`` is not a column of ``test_df``.
    """
    if id_col not in test_df.columns:
        raise ValueError(f"Test data must contain {id_col} for submission.")

    ids = test_df[id_col]
    # drop() returns a new frame and preprocess() copies its input, so the
    # caller's frame stays untouched without an extra full copy here.
    X_test = test_df.drop(columns=[id_col])

    X_test_proc, _, _ = preprocess(X_test, fit_scaler=False, scaler=scaler)

    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X_test_proc)[:,1]
    else:
        # Fallback for models without predict_proba: use the hard predictions.
        proba = model.predict(X_test_proc)

    sub = pd.DataFrame({id_col: ids, "isFraud": proba})

    # Create the directory that actually contains submission_path (the default
    # path yields "submission/"), not a hard-coded folder name.
    Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
    sub.to_csv(submission_path, index=False)
    print("Submission saved to", submission_path)
    return sub

11. Putting it all together

In [12]:
def reduce_mem_usage(df, verbose=True):
    """Reduce memory usage by safely downcasting the numeric columns of ``df`` in place.

    Rules applied per column:

    * Only plain NumPy integer, unsigned and float columns are touched; object,
      bool, category, datetime and pandas extension dtypes are skipped.
    * A column whose values are all finite whole numbers (integer columns, or
      float columns without NaN/inf and without fractional part) is cast to the
      smallest integer type whose inclusive range holds BOTH the minimum and
      the maximum: unsigned types when the minimum is >= 0, signed otherwise.
    * Any other float column (fractions, NaN or inf present, or too large for
      64-bit integers) is downcast with ``pd.to_numeric(..., downcast="float")``.

    Args:
        df: pandas DataFrame to optimise. It is modified in place.
        verbose: when true, print the memory usage before and after.

    Returns:
        The same DataFrame object.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        series = df[col]
        col_type = series.dtype

        # Skip everything that is not a plain NumPy int / uint / float column.
        if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
            continue
        # min()/max() are undefined for an empty column.
        if series.empty:
            continue

        values = series.to_numpy()
        if col_type.kind == "f":
            # NaN and inf fail isfinite(), which routes the column to the float
            # branch (integers cannot represent them).
            integer_like = bool(np.isfinite(values).all()) and bool((values % 1 == 0).all())
        else:
            integer_like = True

        target = None
        if integer_like:
            # Python ints make the range comparison exact for every width.
            c_min, c_max = int(values.min()), int(values.max())
            candidates = unsigned_types if c_min >= 0 else signed_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break

        if target is not None:
            df[col] = series.astype(target)
        elif col_type.kind == "f":
            df[col] = pd.to_numeric(series, downcast="float")
        # Integer columns that fit no candidate type are left unchanged.

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        print(f"Memory usage decreased from {start_mem:.2f} MB to {end_mem:.2f} MB")

    return df


# 🧠 **Fraud Detection — Machine Learning Pipeline**

Laporan ini menjelaskan seluruh proses membangun model Machine Learning untuk mendeteksi transaksi penipuan (*Fraud Detection*) menggunakan dataset yang berisi transaksi elektronik dan informasi identitas pengguna.

Tujuan utama sistem ini adalah **memprediksi apakah suatu transaksi merupakan fraud (1) atau bukan fraud (0)** menggunakan model *supervised classification*.


# 1. 🎯 **Tujuan Proyek**

Tujuan dari proyek ini adalah:

* Membangun model yang mampu mendeteksi transaksi penipuan.
* Mengatasi data *imbalanced* yang umum pada kasus fraud.
* Membandingkan beberapa model (Logistic Regression, Random Forest, LightGBM).
* Menentukan model terbaik berdasarkan *ROC-AUC*.
* Menghasilkan berkas submission CSV untuk prediksi dataset test.


# 2. 📁 **1. Loading Dataset**

Dataset terdiri dari:

* `train_transaction.csv`
* `train_identity.csv` (opsional)
* `test_transaction.csv`

Tahap awal yang dilakukan:

1. Memuat data menggunakan `pandas`.
2. Menggabungkan **transaction** dan **identity** menggunakan `TransactionID`.
3. Melakukan optimasi memori menggunakan fungsi `reduce_mem_usage`.

> Tujuannya: memastikan data siap diproses dan efisien dalam penggunaan memori.


# 3. 🔍 **2. Exploratory Data Analysis (EDA)**

### **3.1 Melihat Struktur Data**

* `df.head()` untuk melihat contoh baris.
* Menampilkan jumlah missing values.
* Menampilkan ringkasan statistik (`describe()`).

### **3.2 Distribusi Label (Fraud vs Non-fraud)**

Hasil eksplorasi menunjukkan bahwa:

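* Data **sangat tidak seimbang** (fraud hanya sekitar 3,5% dari seluruh transaksi; lihat keluaran `Jumlah Fraud` dan `Jumlah Non-Fraud` pada sel pemuatan data).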
* Ini merupakan karakteristik umum dataset fraud detection.
* Oleh karena itu, teknik handling imbalance menjadi wajib.


# 4. 🧹 **3. Data Cleaning & Preprocessing**

Tahap preprocessing mencakup:

### **4.1 Missing Value Handling**

* Numerical: median imputation
* Categorical: most frequent imputation

Ini strategi aman dan cepat untuk dataset besar.

### **4.2 Encoding Kategori**

* Menggunakan `LabelEncoder` untuk fitur kategorikal.
* Jika gagal → fallback ke mapping manual.

### **4.3 Scaling (Opsional)**

* Menggunakan `StandardScaler` untuk fitur numerik.
* Dibutuhkan untuk model linear seperti Logistic Regression.

### **4.4 Membuang Kolom Tidak Relevan**

* Kalau ada ID unik atau kolom yang tidak informatif.


# 5. ⚖️ **4. Handling Imbalance**

Karena fraud sangat jarang, dataset harus diseimbangkan agar model tidak hanya memprediksi `0` terus.

Beberapa metode yang digunakan dalam pipeline:

### ✔ **SMOTE (Synthetic Minority Over-sampling Technique)**

Menambahkan sampel minoritas baru secara sintetis.

### ✔ Oversampling / Undersampling

* Oversample minoritas.
* Undersample mayoritas.

### ✔ `class_weight='balanced'`

Dipakai di Logistic Regression dan Random Forest.

> SMOTE digunakan sebagai metode utama karena paling stabil dan menghasilkan peningkatan signifikan dalam recall / AUC.


# 6. 🤖 **5. Machine Learning Models**

Model utama yang dilatih:


## **6.1 Logistic Regression (Baseline)**

* Memberikan baseline yang transparan dan cepat.
* Menggunakan `class_weight='balanced'`.

Metode ini membantu kita memahami performa dasar dataset sebelum mencoba model yang lebih kompleks.

## **6.2 Random Forest**

* Cocok untuk dataset besar dan banyak fitur.
* Mampu menangani missing & non-linear relationships.
* Menyediakan feature importance.

Parameter utama:

```
n_estimators=200
class_weight='balanced'
```

## **6.3 LightGBM (jika tersedia)**

* Salah satu model terbaik untuk data tabular.
* Cepat dan sangat akurat.
* Tidak perlu scaling.

LightGBM biasanya memberikan score tinggi pada dataset fraud.

# 7. 🧪 **6. Evaluation Metrics**

Model dievaluasi menggunakan:

✔ **ROC-AUC (utama)**

* Mengukur kemampuan model memisahkan fraud vs non-fraud.
* Semakin tinggi semakin baik (target > 0.85 pada dataset ini).

✔ Confusion Matrix

Menampilkan:

* TP (fraud terdeteksi)
* FN (fraud lolos) — **paling berbahaya**
* FP (false fraud)
* TN (normal)

✔ Precision

Berapa banyak prediksi fraud yang benar.

✔ Recall (penting untuk fraud detection!)

Berapa banyak fraud yang berhasil ditangkap.

✔ F1-score

Keseimbangan antara precision & recall.

# 8. 📊 **7. Hasil Evaluasi Model**

Hasil umum (akan berbeda tergantung dataset dan sampling):

🔹 Logistic Regression — Baseline

* AUC: moderat
* Recall: rendah hingga sedang
* Baik sebagai pembanding

🔹 Random Forest

* AUC lebih tinggi daripada LR
* Recall meningkat
* F1 lebih stabil

🔹 LightGBM (jika aktif)

* Biasanya **model terbaik**
* AUC tertinggi
* Kecepatan training sangat cepat

# 9. 🧩 **8. Feature Importance (Random Forest)**

* Model menghasilkan ranking fitur berdasarkan kontribusi terhadap prediksi.
* Fitur dengan importance tertinggi biasanya terkait:

  * amount/value
  * device info
  * transactional behavior
  * time patterns

Visualisasi top 20 fitur juga ditampilkan dalam bar chart.

# 10. 📝 **9. Inference & Submission**

Untuk data test:

1. Lakukan preprocessing yang sama.
2. Ambil model terbaik (misal RandomForest atau LGBM).
3. Prediksi `proba`.
4. Generate file `submission.csv` dengan format:

TransactionID, isFraud

> Ini format yang umum untuk kompetisi fraud detection (Kaggle-style).


# 11. **Kesimpulan**

* Pipeline Fraud Detection berhasil dibangun dari awal hingga akhir secara terstruktur.
* Telah dilakukan EDA, preprocessing, imbalance handling, training model, dan evaluasi.
* Model terbaik dapat dipilih berdasarkan nilai AUC.
* Submission final telah dibuat dan siap dinilai.