In [None]:
# Standard library
import os
import warnings
from datetime import datetime
from math import radians, sin, cos, sqrt, asin  # For geographic distance

# Numerics, data handling, plotting, persistence
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from scipy.stats import spearmanr

warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.experimental import enable_iterative_imputer  # must be imported before IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.utils.class_weight import compute_class_weight

# Feature selection
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

# Model
import lightgbm as lgb

In [None]:
# 1. LOAD DATA
# Read the train and test CSV files from the Kaggle input directory.
print("Loading data...")
train_csv_path = '/kaggle/input/data-anv-v1/train.csv'
test_csv_path = '/kaggle/input/data-anv-v1/test.csv'
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [None]:
def fix_data_types(df):
    """Normalise the dtypes of a raw trip DataFrame in place and return it.

    Steps, in order:
      1. Parse ``Timestamp`` to datetime; unparseable values become NaT.
      2. If ``Battery_Level`` is not already numeric, strip ``%`` signs and
         convert it to numbers; unparseable values become NaN.
      3. Downcast every float64 column to float32.
      4. Cast ``Device_FP`` to string while keeping missing values as NaN.

    Args:
        df: DataFrame containing the columns ``Timestamp``, ``Battery_Level``
            and ``Device_FP``. It is modified in place.

    Returns:
        The same DataFrame object, after modification.

    Raises:
        KeyError: if one of the three required columns is absent.
    """
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')

    # Test for "not numeric" rather than "dtype == 'object'": text columns can
    # also carry a pandas string dtype, which the equality check would skip and
    # leave values such as '85%' unparsed.
    if not pd.api.types.is_numeric_dtype(df['Battery_Level']):
        battery_text = df['Battery_Level'].astype(str).str.replace('%', '', regex=False)
        df['Battery_Level'] = pd.to_numeric(battery_text, errors='coerce')

    # Halve the memory footprint of floating-point columns.
    float_cols = df.select_dtypes(include=['float64']).columns
    for col in float_cols:
        df[col] = df[col].astype('float32')

    # astype(str) can turn NaN into the literal 'nan'; map that back to a real NaN.
    df['Device_FP'] = df['Device_FP'].astype(str).replace('nan', np.nan)
    return df

# Run the dtype clean-up on both splits, train first and then test.
train_df, test_df = (fix_data_types(frame) for frame in (train_df, test_df))

In [None]:
# 2. SAMPLING
def stratified_sampling(df, target_col, sample_frac=0.20, random_state=42):
    """Draw the same fraction of rows from every class of ``target_col``.

    Each class is sampled independently with the same ``random_state``, and
    the per-class samples are concatenated in sorted class order.

    The groups are iterated explicitly instead of using ``groupby.apply``:
    newer pandas versions deprecate, and then stop, passing the grouping
    column to the applied function, which would silently drop ``target_col``
    from the result.

    Args:
        df: Source DataFrame.
        target_col: Name of the column holding the class label. Rows whose
            label is missing are not sampled (default groupby behaviour).
        sample_frac: Fraction of each class to keep.
        random_state: Seed passed to ``DataFrame.sample`` for every class.

    Returns:
        A new DataFrame with a fresh 0..n-1 index that still contains
        ``target_col``. An input without any group yields an empty frame
        with the original columns.
    """
    sampled_parts = [
        group.sample(frac=sample_frac, random_state=random_state)
        for _, group in df.groupby(target_col)
    ]
    if not sampled_parts:
        # pd.concat raises on an empty list; return an empty frame instead.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(sampled_parts, axis=0).reset_index(drop=True)

In [None]:
# Draw the 20% stratified training sample, then move the target column out of
# the feature frame into its own Series.
train_sampled = stratified_sampling(train_df, 'Trip_Label', sample_frac=0.20)
train_labels = train_sampled.pop('Trip_Label')

In [None]:
# Flag which rows come from train (1) and which from test (0) so the combined
# frame can be split again later.
train_sampled['is_train'], test_df['is_train'] = 1, 0

# Stack both splits into one frame with a fresh index.
frames_to_stack = [train_sampled, test_df]
df_all = pd.concat(frames_to_stack, axis=0, ignore_index=True)

# Feature Engineering

In [None]:
# 3. FEATURE ENGINEERING
print("Generating features...")

# --- A. Temporal features ---
df_all['Timestamp'] = pd.to_datetime(df_all['Timestamp'])
trip_hour = df_all['Timestamp'].dt.hour
trip_weekday = df_all['Timestamp'].dt.dayofweek

df_all['hour'] = trip_hour
df_all['day_of_week'] = trip_weekday
# dayofweek is 0 for Monday, so values >= 5 are Saturday and Sunday.
df_all['is_weekend'] = trip_weekday.ge(5).astype(int)

# Rush hour: 07-09 or 17-19, both ends inclusive.
morning_rush = trip_hour.between(7, 9)
evening_rush = trip_hour.between(17, 19)
df_all['is_rush_hour'] = (morning_rush | evening_rush).astype(int)

# Late night: 23:00 through 05:59.
df_all['is_late_night'] = (trip_hour.ge(23) | trip_hour.le(5)).astype(int)

In [None]:
# --- B. Fitur Spasial (Haversine) ---
def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371):
    """Great-circle distance between two points, vectorised with NumPy.

    Args:
        lat1, lon1: Latitude / longitude of the first point(s), in degrees.
            Scalars, NumPy arrays and pandas Series are accepted.
        lat2, lon2: Latitude / longitude of the second point(s), in degrees.
        radius_km: Sphere radius in kilometres (default 6371, mean Earth radius).

    Returns:
        Distance in the same unit as ``radius_km``, with the same shape as the
        (broadcast) inputs. NaN inputs produce NaN.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding (especially with float32 inputs) can push `a` marginally outside
    # [0, 1], which would make sqrt/arcsin return NaN; clamp it to the valid range.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

# Straight-line (great-circle) distance between pickup and drop-off.
pickup_lat, pickup_lon = df_all['Pickup_Lat'], df_all['Pickup_Long']
dropoff_lat, dropoff_lon = df_all['Dropoff_Lat'], df_all['Dropoff_Long']
df_all['haversine_dist'] = haversine_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

In [None]:


# --- C. Missing sensor handling (SOGP Logic) ---
sensor_cols = ['Accel_X', 'Accel_Y', 'Accel_Z', 'Gyro_X', 'Gyro_Y', 'Gyro_Z']
available_sensors = [c for c in sensor_cols if c in df_all.columns]

if not available_sensors:
    # No sensor columns at all: emit constant indicator features.
    df_all['missing_sensor_count'] = 0
    df_all['is_sensor_data_missing'] = 0
else:
    # Record the missingness pattern before imputation overwrites it.
    sensor_gaps = df_all[available_sensors].isnull()
    df_all['missing_sensor_count'] = sensor_gaps.sum(axis=1)
    df_all['is_sensor_data_missing'] = df_all['missing_sensor_count'].gt(0).astype(int)

    # Fill the gaps with a multivariate iterative imputer fitted on these columns.
    mice_imputer = IterativeImputer(max_iter=5, random_state=42)
    imputed_sensor_values = mice_imputer.fit_transform(df_all[available_sensors])
    df_all[available_sensors] = imputed_sensor_values

# --- D. Telematics Standard ---
accel_x_abs = df_all['Accel_X'].abs()
accel_y_abs = df_all['Accel_Y'].abs()

# Euclidean norm of the three accelerometer axes.
df_all['accel_magnitude'] = np.sqrt(df_all['Accel_X']**2 + df_all['Accel_Y']**2 + df_all['Accel_Z']**2)
# Absolute deviation of the Z reading from 9.8 (presumably gravity in m/s^2 -- confirm sensor units).
df_all['vertical_jerk'] = np.abs(df_all['Accel_Z'] - 9.8)

# Number of axes (0-3) showing a "harsh" reading. The X/Y thresholds are the
# 95th percentile of the *magnitudes*: comparing |value| with a percentile of
# the signed values mixes two different distributions (and flags every row
# when that signed percentile is negative).
df_all['harsh_accel_count'] = (
    (accel_x_abs > accel_x_abs.quantile(0.95)).astype(int) +
    (accel_y_abs > accel_y_abs.quantile(0.95)).astype(int) +
    (df_all['vertical_jerk'] > 2.0).astype(int)
)

# --- E. Separator Logic (Nav vs Service) ---
battery_level = df_all['Battery_Level']
df_all['is_battery_saver'] = battery_level.le(20).astype(int)
df_all['is_critical_battery'] = battery_level.le(10).astype(int)

# Ordinal score per network type; unmapped or missing values fall back to 3.
signal_map = {'No Signal': 0.1, '2G': 1, 'Edge': 2, '3G': 3, '4G': 4, '5G': 5}
if 'Signal_Strength' not in df_all.columns:
    df_all['signal_score'] = 3
else:
    df_all['signal_score'] = df_all['Signal_Strength'].map(signal_map).fillna(3)

# Reliability grows with signal score and shrinks with the reported GPS error;
# rows flagged as battery saver are scaled down to 20%.
inverse_gps_error = 1 / (df_all['GPS_Accuracy_M'] + 1)
battery_saver_penalty = 1 - (df_all['is_battery_saver'] * 0.8)
df_all['gps_reliability'] = inverse_gps_error * df_all['signal_score'] * battery_saver_penalty

# Gap between the straight-line distance and the recorded trip distance.
df_all['dist_deviation'] = (df_all['haversine_dist'] - df_all['Distance_KM']).abs()
df_all['dist_deviation_ratio'] = df_all['dist_deviation'] / (df_all['haversine_dist'] + 0.01)

# The same deviation, divided by vs. multiplied by GPS reliability.
deviation_ratio = df_all['dist_deviation_ratio']
df_all['prob_nav_issue'] = deviation_ratio / (df_all['gps_reliability'] + 0.001)
df_all['prob_service_fraud'] = deviation_ratio * df_all['gps_reliability']

# --- F. Other features ---
df_all['has_promo'] = df_all['Promo_Code'].notna().astype(int)
# The small epsilon avoids division by zero for zero-distance trips.
df_all['price_per_km'] = df_all['Est_Price_IDR'] / (df_all['Distance_KM'] + 1e-5)

df_all['Device_FP'] = df_all['Device_FP'].fillna('Unknown-Unknown-0-v0.0')

# Brand -> model names that are considered a consistent pairing.
valid_device_map = {"Apple": ["iPhone"], "Samsung": ["Galaxy"], "Oppo": ["Reno"], "Vivo": ["Y_Series"], "Xiaomi": ["Redmi"], "Infinix": ["Hot"]}
valid_brand_model_pairs = {
    f"{brand}-{model}"
    for brand, models in valid_device_map.items()
    for model in models
}

# The fingerprint starts with "<brand>-<model>"; extract exactly those two
# hyphen-separated tokens. Fingerprints without a hyphen yield NaN and are
# therefore counted as a mismatch. This is a vectorised replacement for a
# row-wise DataFrame.apply, and it no longer fails when no value contains '-'.
brand_model = df_all['Device_FP'].str.extract(r'^([^-]*-[^-]*)', expand=False)
df_all['device_mismatch'] = (~brand_model.isin(valid_brand_model_pairs)).astype(int)

# 4. CLEANUP & ENCODING
# Raw columns that are no longer wanted as features; only those present are dropped.
cols_to_drop = [
    'Timestamp', 'Pickup_Lat', 'Pickup_Long', 'Dropoff_Lat', 'Dropoff_Long',
    'Promo_Code', 'lat_diff', 'long_diff', 'Pickup_Zone', 'Dropoff_Zone', 'Car_Model'
]
present_drop_cols = [c for c in cols_to_drop if c in df_all.columns]
df_all = df_all.drop(columns=present_drop_cols)

# Integer-encode the categorical columns; the fitted encoders are kept per column.
categorical_features = ['Payment_Method', 'Device_FP', 'Signal_Strength', 'Weather', 'Traffic']
label_encoders = {}
for col in (c for c in categorical_features if c in df_all.columns):
    le = LabelEncoder()
    # Casting to str first turns missing values into an ordinary category.
    df_all[col] = le.fit_transform(df_all[col].astype(str))
    label_encoders[col] = le

# Drop any remaining object-typed column except the row identifier.
obj_cols = df_all.select_dtypes(include=['object']).columns
cols_to_remove = list(filter(lambda c: c != 'Trip_ID', obj_cols))
if len(cols_to_remove) > 0:
    df_all = df_all.drop(columns=cols_to_remove)

# 5. SPLIT DATA
# Undo the train/test stacking using the is_train flag.
train_final = df_all[df_all['is_train'] == 1].copy()
test_final = df_all[df_all['is_train'] == 0].copy()

# Keep the test identifiers for the submission file before dropping helper columns.
test_ids = test_final['Trip_ID'].values
helper_cols = ['is_train', 'Trip_ID']
train_final = train_final.drop(columns=helper_cols)
test_final = test_final.drop(columns=helper_cols)

# Encode the string target as integer class ids.
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(train_labels)

# Stratified 80/20 hold-out split.
X_train, X_val, y_train, y_val = train_test_split(
    train_final,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# ==============================================================================
# FEATURE SELECTION: LOGISTIC REGRESSION (LASSO/L1) - MULTINOMIAL
# ==============================================================================
print("\n=== STARTING FEATURE SELECTION (L1 LASSO) ===")
print(f"Features before selection: {X_train.shape[1]}")

# 1. Scaling (required so the L1 penalty treats all features on the same scale)
scaler = StandardScaler()
# NaNs are filled with 0 only for this selection step so that logistic
# regression does not fail; the LightGBM model later still receives the
# original values (including NaN) of the selected columns.
X_train_clean_for_select = X_train.fillna(0)
X_train_scaled = scaler.fit_transform(X_train_clean_for_select)

# 2. Logistic regression with an L1 (lasso) penalty.
# The 'saga' solver supports L1 on multiclass problems. `multi_class` is
# intentionally not passed: it is deprecated since scikit-learn 1.5 and
# scheduled for removal, and with a non-liblinear solver the default
# behaviour is already the multinomial loss.
lasso_selector = LogisticRegression(
    penalty='l1',            # L1 regularization (lasso)
    solver='saga',           # solver that supports L1
    C=0.5,                   # inverse regularization strength (smaller C -> fewer features kept)
    random_state=42,
    max_iter=1000,           # high iteration cap to help convergence
    n_jobs=-1
)

# 3. SelectFromModel keeps the features whose coefficients pass its importance threshold
selector = SelectFromModel(estimator=lasso_selector)
selector.fit(X_train_scaled, y_train)

# 4. Boolean mask of the selected features -> column names
selected_mask = selector.get_support()
selected_columns = X_train.columns[selected_mask]

print(f"Features selected: {len(selected_columns)}")
print(f"Dropped features: {len(X_train.columns) - len(selected_columns)}")
print(f"List Selected: {list(selected_columns)}")

# 5. Apply the selection to the original (unscaled) frames: X_train, X_val, test_final
X_train = X_train[selected_columns]
X_val = X_val[selected_columns]
test_final = test_final[selected_columns]

print("Feature selection applied successfully.\n")
# ==============================================================================


# Class weights: "balanced" weighting computed from the training labels.
classes = np.unique(y_train)
weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight_dict = {cls: weight for cls, weight in zip(classes, weights)}
print(f"Class Weights: {class_weight_dict}")

# 6. MODELING (LGBM Optimized)
# Hyper-parameters forwarded to lgb.LGBMClassifier. 'n_estimators' is only an
# upper bound on the number of boosting rounds: the early-stopping callback
# passed to fit() below can end training earlier.
lgb_params = {
    'objective': 'multiclass',
    'n_estimators': 6200,
    'learning_rate': 0.015,
    'num_leaves': 90,
    'max_depth': 12,
    'min_child_samples': 40,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'reg_alpha': 0.5,
    'reg_lambda': 0.5,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': class_weight_dict,
    'verbose': -1
}

lgb_model = lgb.LGBMClassifier(**lgb_params)

# Train on the selected features. The hold-out set (X_val, y_val) is used as
# the evaluation set: training stops once multi_logloss has not improved for
# 100 rounds, and the metric is logged every 200 rounds.
print("Training Initial LightGBM (with Selected Features)...")
lgb_model.fit(
    X_train, 
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='multi_logloss',
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=200)
    ]
)

# 7. INITIAL EVALUATION
# Macro F1 and a per-class report on the same hold-out set that drove early
# stopping above, so the score is not from fully unseen data.
y_val_pred = lgb_model.predict(X_val)
macro_f1 = f1_score(y_val, y_val_pred, average='macro')
print(f"\n{'='*50}")
print(f"INITIAL MACRO F1-SCORE (VALIDATION): {macro_f1:.4f}")
print(f"{'='*50}\n")
print(classification_report(y_val, y_val_pred, target_names=le_target.classes_))

# 8. PSEUDO LABELING
# Idea: label the test rows the initial model is very confident about, add them
# to the training data, and retrain.
print("\n=== STARTING PSEUDO LABELING ===")

# 1. Predict class probabilities on the TEST data (already reduced to the selected features)
print("Predicting test data probabilities...")
test_probs = lgb_model.predict_proba(test_final)

# 2. Take the predicted label (column index of the highest probability) and its confidence
# NOTE(review): using the argmax column index directly as the label assumes
# lgb_model.classes_ is exactly 0..k-1 -- confirm.
test_preds_idx = np.argmax(test_probs, axis=1)
test_confidence = np.max(test_probs, axis=1)

# 3. Keep only the rows predicted with confidence > 95%
high_conf_mask = test_confidence > 0.95
X_pseudo = test_final[high_conf_mask].copy()
y_pseudo = test_preds_idx[high_conf_mask]

print(f"Total data test: {len(test_final)}")
print(f"Data dengan confidence > 95%: {len(X_pseudo)} ({len(X_pseudo)/len(test_final)*100:.2f}%)")

if len(X_pseudo) > 0:
    # 4. Combine the original training data with the pseudo-labelled rows
    print("Combining Train Data with Pseudo Data...")
    X_train_augmented = pd.concat([X_train, X_pseudo], axis=0)
    y_train_augmented = np.concatenate([y_train, y_pseudo], axis=0)
    
    # 5. Retrain the model
    print("Retraining LightGBM with Pseudo Labels...")
    
    # Recompute the balanced class weights on the augmented label set.
    classes_aug = np.unique(y_train_augmented)
    weights_aug = compute_class_weight('balanced', classes=classes_aug, y=y_train_augmented)
    class_weight_dict_aug = dict(zip(classes_aug, weights_aug))
    
    # This overwrites the entry in the shared lgb_params dict; after this point
    # lgb_params describes the retrained model, not the initial one.
    lgb_params['class_weight'] = class_weight_dict_aug
    
    lgb_model_pseudo = lgb.LGBMClassifier(**lgb_params)
    
    # Same hold-out set and early-stopping setup as in the initial fit.
    lgb_model_pseudo.fit(
        X_train_augmented, 
        y_train_augmented,
        eval_set=[(X_val, y_val)], 
        eval_metric='multi_logloss',
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200)
        ]
    )
    
    # Score the retrained model on the hold-out set and compare with the initial macro F1.
    y_val_pred_pseudo = lgb_model_pseudo.predict(X_val)
    macro_f1_pseudo = f1_score(y_val, y_val_pred_pseudo, average='macro')
    print(f"\n{'='*50}")
    print(f"PSEUDO LABELING MACRO F1-SCORE (VALIDATION): {macro_f1_pseudo:.4f}")
    print(f"Improvement: {macro_f1_pseudo - macro_f1:.4f}")
    print(f"{'='*50}\n")
    
    # The retrained model becomes the final model whether or not the score improved.
    final_model = lgb_model_pseudo
    final_model_name = 'lgbm_model_pseudo_lasso.pkl'
else:
    # No test row passed the confidence threshold: fall back to the initial model.
    print("Tidak ada data yang memenuhi threshold confidence > 95%. Menggunakan model awal.")
    final_model = lgb_model
    final_model_name = 'lgbm_model_initial_lasso.pkl'

# 9. FINAL SUBMISSION
print("Creating final submission...")
# Predict encoded class ids, then map them back to the original label strings.
y_test_pred_idx_final = final_model.predict(test_final)
y_test_pred_label_final = le_target.inverse_transform(y_test_pred_idx_final)

submission_columns = {'Trip_ID': test_ids, 'Trip_Label': y_test_pred_label_final}
submission = pd.DataFrame(submission_columns)
submission_path = 'submission_lgbm_lasso_pseudo.csv'
submission.to_csv(submission_path, index=False)
print("Selesai! File: submission_lgbm_lasso_pseudo.csv")

# 10. SAVE MODEL
print("Saving final model to .pkl...")
joblib.dump(final_model, final_model_name)
print(f"Model berhasil disimpan: {final_model_name}")

# Additionally store a compressed copy under a .joblib extension.
final_model_name_joblib = final_model_name.replace('.pkl', '.joblib')
joblib.dump(final_model, final_model_name_joblib, compress=3)
print(f"Model juga disimpan dalam format .joblib: {final_model_name_joblib}")

# 🎯 Feature Importance & SHAP Analysis

Bagian ini menganalisis fitur-fitur yang paling berpengaruh dalam model menggunakan Feature Importance dari LightGBM dan SHAP values untuk interpretability yang lebih mendalam.

In [None]:
# Feature importance from LightGBM
section_rule = "="*60
print(section_rule)
print("FEATURE IMPORTANCE ANALYSIS - LGBM")
print(section_rule)

# Importances of the final model, paired with the selected feature names
feature_importance = final_model.feature_importances_
feature_names = selected_columns.tolist()

# One row per feature, most important first
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values('Importance', ascending=False)
    .reset_index(drop=True)
)

# Share of the total importance and its running sum, both in percent
total_importance = importance_df['Importance'].sum()
importance_df['Importance_Pct'] = (importance_df['Importance'] / total_importance * 100).round(2)
importance_df['Cumulative_Pct'] = importance_df['Importance_Pct'].cumsum().round(2)

# Show the 20 most important features
print("\nüîù Top 20 Fitur Paling Penting:")
top_20_table = importance_df.head(20).to_string(index=False)
print(top_20_table)

# Persist the full table
importance_df.to_csv('feature_importance_lgbm.csv', index=False)
print(f"\n‚úÖ Feature importance disimpan: feature_importance_lgbm.csv")

# Visualise the top 20 features: a two-panel figure saved as a PNG and shown inline.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Plot 1: horizontal bar chart of the top 20 features
top_20 = importance_df.head(20)
axes[0].barh(range(len(top_20)), top_20['Importance'], color='steelblue')
axes[0].set_yticks(range(len(top_20)))
axes[0].set_yticklabels(top_20['Feature'])
axes[0].invert_yaxis()  # most important feature at the top
axes[0].set_xlabel('Importance Score', fontsize=10)
axes[0].set_title('Top 20 Feature Importance (LightGBM)', fontsize=12, fontweight='bold')
axes[0].grid(axis='x', alpha=0.3)

# Plot 2: cumulative importance over all features, with 80% and 90% reference lines
axes[1].plot(range(1, len(importance_df)+1), importance_df['Cumulative_Pct'], 
             color='darkgreen', linewidth=2)
axes[1].axhline(y=80, color='red', linestyle='--', linewidth=1, label='80% Threshold')
axes[1].axhline(y=90, color='orange', linestyle='--', linewidth=1, label='90% Threshold')
axes[1].set_xlabel('Number of Features', fontsize=10)
axes[1].set_ylabel('Cumulative Importance (%)', fontsize=10)
axes[1].set_title('Cumulative Feature Importance', fontsize=12, fontweight='bold')
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.savefig('feature_importance_lgbm.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"‚úÖ Plot feature importance disimpan: feature_importance_lgbm.png")

# Threshold analysis: how many top-ranked features are needed to reach 80% / 90%
# of the total importance. Counting the rows that are still below the threshold
# and adding one gives the first row that reaches it; counting rows with
# `<= threshold` under-reports (e.g. 0 when the top feature alone exceeds 80%).
# The result is capped at the number of features.
cumulative_pct = importance_df['Cumulative_Pct']
n_features_80 = min(int((cumulative_pct < 80).sum()) + 1, len(importance_df))
n_features_90 = min(int((cumulative_pct < 90).sum()) + 1, len(importance_df))
print(f"\nüìä Analisis:")
print(f"   - Fitur yang berkontribusi 80% importance: {n_features_80}/{len(importance_df)}")
print(f"   - Fitur yang berkontribusi 90% importance: {n_features_90}/{len(importance_df)}")

In [None]:
# Make sure SHAP is importable; install it with pip on the fly when it is
# missing (intended for the Kaggle environment).
try:
    import shap
except ImportError:
    print("‚ö†Ô∏è SHAP library belum terinstall. Menginstall...")
    import subprocess
    import sys
    pip_install_cmd = [sys.executable, "-m", "pip", "install", "shap"]
    subprocess.check_call(pip_install_cmd)
    import shap
    print("‚úÖ SHAP library berhasil diinstall")
else:
    print("‚úÖ SHAP library sudah terinstall")

In [None]:
# SHAP Analysis - compute SHAP values
print("="*60)
print("SHAP VALUES ANALYSIS")
print("="*60)

# SHAP computation is expensive, so use at most 1000 validation rows.
sample_size = min(1000, len(X_val))
X_val_sample = X_val.sample(n=sample_size, random_state=42)

print(f"\nüîÑ Menghitung SHAP values untuk {sample_size} sampel validasi...")
print("   (Proses ini mungkin memakan waktu beberapa menit)")

# Create the SHAP explainer for the tree model
explainer = shap.TreeExplainer(final_model)

# Calculate SHAP values
shap_values = explainer.shap_values(X_val_sample)

# Older SHAP releases return a list with one (n_samples, n_features) array per
# class for multiclass models; newer releases return a single array shaped
# (n_samples, n_features, n_classes). The analysis cells further down expect the
# list form, so convert the 3-D array into a per-class list here.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, class_idx] for class_idx in range(shap_values.shape[2])]

print("‚úÖ SHAP values berhasil dihitung")

# Report the shape of the SHAP values
if isinstance(shap_values, list):
    print(f"   - SHAP values shape: {len(shap_values)} classes √ó {shap_values[0].shape}")
else:
    print(f"   - SHAP values shape: {shap_values.shape}")

In [None]:
# SHAP summary plots - global feature importance
print("="*60)
print("SHAP SUMMARY PLOT")
print("="*60)

# Each entry: (figure size, extra summary_plot arguments, title, output file, message).
# The first plot is the bar summary; the second uses SHAP's default plot type.
summary_plot_specs = [
    (
        (12, 8),
        {'plot_type': "bar"},
        'SHAP Feature Importance (Mean |SHAP Value|)',
        'shap_summary_bar.png',
        "‚úÖ SHAP summary bar plot disimpan: shap_summary_bar.png",
    ),
    (
        (12, 10),
        {},
        'SHAP Feature Impact Distribution',
        'shap_summary_distribution.png',
        "‚úÖ SHAP distribution plot disimpan: shap_summary_distribution.png",
    ),
]

for plot_size, plot_options, plot_title, output_png, done_message in summary_plot_specs:
    fig, ax = plt.subplots(figsize=plot_size)
    shap.summary_plot(shap_values, X_val_sample, show=False, **plot_options)
    plt.title(plot_title, fontsize=12, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.savefig(output_png, dpi=300, bbox_inches='tight')
    plt.show()

    print(done_message)

In [None]:
# SHAP values per class: rank features by mean |SHAP| separately for every
# class, plot the top 15 of each, and write one CSV per class.
print("="*60)
print("SHAP ANALYSIS PER CLASS")
print("="*60)

# For multiclass models this cell expects SHAP values as a list (one array per class)
if isinstance(shap_values, list):
    n_classes = len(shap_values)
    class_names = le_target.classes_
    
    print(f"\nüìä Total Classes: {n_classes}")
    print(f"   Classes: {', '.join(class_names)}\n")
    
    # One subplot row per class
    fig, axes = plt.subplots(n_classes, 1, figsize=(12, 6*n_classes))
    
    # With a single row plt.subplots returns a bare Axes; wrap it so axes[idx] works.
    if n_classes == 1:
        axes = [axes]
    
    for idx, (class_shap, class_name) in enumerate(zip(shap_values, class_names)):
        print(f"üîç Class: {class_name}")
        
        # Mean absolute SHAP value per feature for this class
        mean_abs_shap = np.abs(class_shap).mean(axis=0)
        
        # Build a ranking table (highest mean |SHAP| first)
        shap_class_df = pd.DataFrame({
            'Feature': selected_columns,
            'Mean_Abs_SHAP': mean_abs_shap
        }).sort_values('Mean_Abs_SHAP', ascending=False)
        
        print(f"   Top 5 Features: {', '.join(shap_class_df.head(5)['Feature'].tolist())}\n")
        
        # Plot the top 15 features for this class
        top_15 = shap_class_df.head(15)
        axes[idx].barh(range(len(top_15)), top_15['Mean_Abs_SHAP'], color=f'C{idx}')
        axes[idx].set_yticks(range(len(top_15)))
        axes[idx].set_yticklabels(top_15['Feature'])
        axes[idx].invert_yaxis()
        axes[idx].set_xlabel('Mean |SHAP Value|', fontsize=10)
        axes[idx].set_title(f'Top 15 Features for Class: {class_name}', 
                           fontsize=11, fontweight='bold')
        axes[idx].grid(axis='x', alpha=0.3)
        
        # Save the per-class table; the class name is used verbatim in the file name
        shap_class_df.to_csv(f'shap_values_class_{class_name}.csv', index=False)
        print(f"   ‚úÖ Disimpan: shap_values_class_{class_name}.csv")
    
    plt.tight_layout()
    plt.savefig('shap_per_class.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    print(f"\n‚úÖ SHAP per class plot disimpan: shap_per_class.png")
else:
    # Nothing is plotted when shap_values is not a list.
    print("‚ö†Ô∏è SHAP values tidak dalam format multiclass list")

In [None]:
# SHAP dependence plots for the top features
print("="*60)
print("SHAP DEPENDENCE PLOTS")
print("="*60)

# Take the top 6 features from the LightGBM feature-importance table
top_features_for_dependence = importance_df.head(6)['Feature'].tolist()

print(f"üìä Membuat dependence plots untuk top {len(top_features_for_dependence)} fitur:\n")

# For multiclass output, use the SHAP values of the first class only
# (index 0 of le_target.classes_); another class could be chosen instead.
if isinstance(shap_values, list):
    shap_for_dependence = shap_values[0]  # first class
    selected_class = le_target.classes_[0]
    print(f"   Menggunakan SHAP values untuk class: {selected_class}\n")
else:
    shap_for_dependence = shap_values

# Build the subplot grid: 2 columns, as many rows as needed (ceiling division)
n_cols = 2
n_rows = (len(top_features_for_dependence) + n_cols - 1) // n_cols

fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5*n_rows))
# With more than one row, flatten the 2-D Axes array so it can be indexed by idx;
# with a single row plt.subplots already returns a 1-D array.
axes = axes.flatten() if n_rows > 1 else axes

for idx, feature in enumerate(top_features_for_dependence):
    print(f"   - {feature}")
    
    # Column position of the feature within the selected columns
    feature_idx = list(selected_columns).index(feature)
    
    # Draw into the matching subplot
    plt.sca(axes[idx])
    shap.dependence_plot(
        feature_idx, 
        shap_for_dependence, 
        X_val_sample,
        show=False,
        ax=axes[idx]
    )
    axes[idx].set_title(f'SHAP Dependence: {feature}', fontsize=10, fontweight='bold')

# Hide unused subplots
for idx in range(len(top_features_for_dependence), len(axes)):
    axes[idx].axis('off')

plt.tight_layout()
plt.savefig('shap_dependence_plots.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\n‚úÖ SHAP dependence plots disimpan: shap_dependence_plots.png")

In [None]:
# Summary comparison: LightGBM feature importance vs SHAP
section_rule = "="*60
print(section_rule)
print("COMPARISON: LGBM FEATURE IMPORTANCE vs SHAP")
print(section_rule)

# Mean absolute SHAP value per feature, aggregated over all classes
if isinstance(shap_values, list):
    # One mean-|SHAP| vector per class, then averaged across classes
    per_class_mean_abs = [np.abs(sv).mean(axis=0) for sv in shap_values]
    mean_abs_shap_all = np.mean(per_class_mean_abs, axis=0)
else:
    mean_abs_shap_all = np.abs(shap_values).mean(axis=0)

# Side-by-side table of both importance measures
comparison_df = pd.DataFrame({
    'Feature': selected_columns,
    'LGBM_Importance': feature_importance,
    'SHAP_MeanAbs': mean_abs_shap_all
})

score_columns = (('LGBM_Importance', 'LGBM'), ('SHAP_MeanAbs', 'SHAP'))

# Scale each measure so its largest value becomes 100
for score_col, prefix in score_columns:
    comparison_df[f'{prefix}_Norm'] = (comparison_df[score_col] / comparison_df[score_col].max() * 100).round(2)

# Rank the features under each measure (1 = most important) and compare the ranks
for score_col, prefix in score_columns:
    comparison_df[f'{prefix}_Rank'] = comparison_df[score_col].rank(ascending=False)
comparison_df['Rank_Diff'] = (comparison_df['LGBM_Rank'] - comparison_df['SHAP_Rank']).abs()

# Order by LightGBM importance
comparison_df = comparison_df.sort_values('LGBM_Importance', ascending=False).reset_index(drop=True)

print("\nüîù Top 15 Features Comparison:")
report_columns = ['Feature', 'LGBM_Norm', 'SHAP_Norm', 'LGBM_Rank', 'SHAP_Rank', 'Rank_Diff']
print(comparison_df[report_columns].head(15).to_string(index=False))

# Persist the comparison table
comparison_df.to_csv('feature_importance_comparison.csv', index=False)
print(f"\n‚úÖ Comparison disimpan: feature_importance_comparison.csv")

# Visualise the comparison for the top 15 features (by LightGBM importance)
top_15_comp = comparison_df.head(15)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: normalised scores side by side (two bars per feature, offset by half a bar width)
x_pos = np.arange(len(top_15_comp))
width = 0.35

axes[0].barh(x_pos - width/2, top_15_comp['LGBM_Norm'], width, 
            label='LGBM Importance', color='steelblue', alpha=0.8)
axes[0].barh(x_pos + width/2, top_15_comp['SHAP_Norm'], width, 
            label='SHAP Mean |Value|', color='coral', alpha=0.8)
axes[0].set_yticks(x_pos)
axes[0].set_yticklabels(top_15_comp['Feature'])
axes[0].invert_yaxis()
axes[0].set_xlabel('Normalized Importance (0-100)', fontsize=10)
axes[0].set_title('Top 15: LGBM vs SHAP Importance', fontsize=12, fontweight='bold')
axes[0].legend()
axes[0].grid(axis='x', alpha=0.3)

# Plot 2: absolute rank difference per feature
axes[1].barh(range(len(top_15_comp)), top_15_comp['Rank_Diff'], color='green', alpha=0.6)
axes[1].set_yticks(range(len(top_15_comp)))
axes[1].set_yticklabels(top_15_comp['Feature'])
axes[1].invert_yaxis()
axes[1].set_xlabel('Rank Difference (|LGBM Rank - SHAP Rank|)', fontsize=10)
axes[1].set_title('Ranking Consistency (Lower = More Consistent)', fontsize=12, fontweight='bold')
axes[1].grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('lgbm_vs_shap_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"‚úÖ Comparison plot disimpan: lgbm_vs_shap_comparison.png")

# Correlation analysis: Spearman correlation between the LightGBM ranking and
# the SHAP ranking (spearmanr is imported in the notebook's import cell).
corr, p_value = spearmanr(comparison_df['LGBM_Rank'], comparison_df['SHAP_Rank'])
print(f"\nüìà Spearman Rank Correlation: {corr:.4f} (p-value: {p_value:.4e})")

# spearmanr returns NaN when one ranking is constant (or there are too few
# features); report that explicitly instead of classifying it as inconsistent.
if np.isnan(corr):
    interpretation = 'Tidak Terdefinisi'
elif corr > 0.8:
    interpretation = 'Sangat Konsisten'
elif corr > 0.6:
    interpretation = 'Cukup Konsisten'
else:
    interpretation = 'Kurang Konsisten'
print(f"   Interpretasi: {interpretation}")

# 📊 Analisis Data Setelah Sampling

Bagian ini menganalisis data training setelah proses stratified sampling untuk memahami distribusi dan karakteristik data yang akan digunakan untuk training model.

In [None]:
# Row counts and class proportions after sampling
section_rule = "="*60
print(section_rule)
print("INFORMASI DATA SETELAH SAMPLING")
print(section_rule)

n_rows_full = len(train_df)
n_rows_sampled = len(train_sampled)
print(f"\nüìå Total Data Awal (Train): {n_rows_full:,}")
print(f"üìå Total Data Setelah Sampling: {n_rows_sampled:,}")
print(f"üìå Persentase Sampling: {n_rows_sampled/n_rows_full*100:.2f}%")

print("\n" + section_rule)
print("DISTRIBUSI KELAS")
print(section_rule)

# Class counts (sorted by label) and their share of the sample in percent
class_distribution = train_labels.value_counts().sort_index()
class_percentage = (class_distribution / len(train_labels) * 100).round(2)

# Tabular form, used for printing here and for plotting later
dist_columns = {
    'Kelas': class_distribution.index,
    'Jumlah': class_distribution.values,
    'Persentase (%)': class_percentage.values,
}
dist_df = pd.DataFrame(dist_columns)

print(dist_df.to_string(index=False))

# Visualise the class distribution: bar chart of counts next to a pie chart of shares
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot: number of samples per class
dist_df.plot(x='Kelas', y='Jumlah', kind='bar', ax=axes[0], color='steelblue', legend=False)
axes[0].set_title('Distribusi Jumlah Sampel per Kelas', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Kelas', fontsize=10)
axes[0].set_ylabel('Jumlah Sampel', fontsize=10)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(axis='y', alpha=0.3)

# Pie chart: class proportions, one Set3 colour per class
colors = plt.cm.Set3(range(len(dist_df)))
axes[1].pie(dist_df['Jumlah'], labels=dist_df['Kelas'], autopct='%1.1f%%', 
            colors=colors, startangle=90)
axes[1].set_title('Proporsi Kelas (%)', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('class_distribution_after_sampling.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\n‚úÖ Plot distribusi kelas disimpan: class_distribution_after_sampling.png")

In [None]:
# Descriptive statistics of the sampled training data
section_rule = "="*60
print(section_rule)
print("STATISTIK DESKRIPTIF - DATA NUMERIK")
print(section_rule)

# Numeric columns of train_sampled, excluding the 'is_train' helper flag if present
numeric_cols = [
    col
    for col in train_sampled.select_dtypes(include=[np.number]).columns.tolist()
    if col != 'is_train'
]
numeric_sample = train_sampled[numeric_cols]

# describe() output with one row per column, plus missing-value counts and shares
stats_desc = numeric_sample.describe().T
stats_desc['missing'] = numeric_sample.isnull().sum()
stats_desc['missing_pct'] = (stats_desc['missing'] / len(train_sampled) * 100).round(2)

# Put the count / missing columns first
report_column_order = ['count', 'missing', 'missing_pct', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
stats_desc = stats_desc[report_column_order]

print(stats_desc.to_string())

# Persist the table
stats_desc.to_csv('statistics_after_sampling.csv')
print(f"\n‚úÖ Statistik deskriptif disimpan: statistics_after_sampling.csv")

In [None]:
# Visualise the distributions of selected numeric features as histograms
print("="*60)
print("VISUALISASI DISTRIBUSI FITUR NUMERIK")
print("="*60)

# Candidate features to visualise
important_features = [
    'Distance_KM', 'Duration_Minutes', 'Est_Price_IDR', 
    'GPS_Accuracy_M', 'Battery_Level', 'Accel_X', 'Accel_Y', 'Accel_Z'
]

# Keep only those that exist in the sampled data
available_features = [f for f in important_features if f in train_sampled.columns]

if len(available_features) > 0:
    # Number of grid rows needed for 3 columns (ceiling division)
    n_cols = 3
    n_rows = (len(available_features) + n_cols - 1) // n_cols
    
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows*4))
    # Flatten a multi-row grid so it can be indexed by idx; a single-row grid is
    # already 1-D (the [axes] branch only applies when n_cols == 1, which is not the case here).
    axes = axes.flatten() if n_rows > 1 else [axes] if n_cols == 1 else axes
    
    for idx, feature in enumerate(available_features):
        train_sampled[feature].hist(bins=50, ax=axes[idx], color='skyblue', edgecolor='black', alpha=0.7)
        axes[idx].set_title(f'Distribusi {feature}', fontsize=10, fontweight='bold')
        axes[idx].set_xlabel(feature, fontsize=9)
        axes[idx].set_ylabel('Frekuensi', fontsize=9)
        axes[idx].grid(axis='y', alpha=0.3)
    
    # Hide unused subplots
    for idx in range(len(available_features), len(axes)):
        axes[idx].axis('off')
    
    plt.tight_layout()
    plt.savefig('feature_distributions_after_sampling.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    print(f"‚úÖ Plot distribusi fitur disimpan: feature_distributions_after_sampling.png")
else:
    # None of the candidate features exists in train_sampled.
    print("‚ö†Ô∏è Tidak ada fitur numerik yang tersedia untuk divisualisasikan")

In [None]:
# Save the sampled training data to CSV for further analysis
print("="*60)
print("EXPORT DATA SETELAH SAMPLING")
print("="*60)

# Re-attach the label column for the export (rows align on the shared 0..n-1 index)
train_sampled_with_label = train_sampled.copy()
train_sampled_with_label['Trip_Label'] = train_labels

# Write the CSV
output_filename = 'train_data_after_sampling.csv'
train_sampled_with_label.to_csv(output_filename, index=False)

# Report the size of the file actually written to disk. The in-memory footprint
# of the DataFrame (memory_usage) is a different quantity and must not be
# labelled "File Size".
output_size_mb = os.path.getsize(output_filename) / 1024**2

print(f"‚úÖ Data training setelah sampling disimpan: {output_filename}")
print(f"   - Total Rows: {len(train_sampled_with_label):,}")
print(f"   - Total Columns: {len(train_sampled_with_label.columns)}")
print(f"   - File Size: {output_size_mb:.2f} MB")

# Short summary: number of columns per dtype
print("\nüìã Summary Kolom:")
print(train_sampled_with_label.dtypes.value_counts())

# Swap

In [None]:
# Swap the two labels in the existing submission. The result is written to a
# NEW frame instead of mutating `submission` in place: with in-place mutation,
# re-running this cell would swap the labels back and silently save the
# un-swapped predictions under the "swap" file name.
label_swap = {'Navigation_Issue': 'Service_Complaint', 'Service_Complaint': 'Navigation_Issue'}
submission_swapped = submission.assign(
    Trip_Label=submission['Trip_Label'].replace(label_swap)
)

# Save the swapped submission
submission_swapped.to_csv('submission_lgbm_pseudo_swap_all_v3.csv', index=False)