In [1]:
import pandas as pd
import numpy as np
import joblib
import json
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import os

print("--- Memulai Proses Training dan Penyimpanan Aset ---")

# --- 1. LOAD AND CLEAN THE DATA ---
# Point base_path at your own data directory if it differs.
base_path = r'C:\Users\HP\Documents\ptbdb'
data_path = os.path.join(base_path, 'dataset_final_untuk_ml_final.csv')
# Separate folder that receives every asset written by this script.
assets_path = os.path.join(base_path, 'assets_final')
os.makedirs(assets_path, exist_ok=True)

df_final = pd.read_csv(data_path)

# Class grouping: fold fine-grained diagnosis labels into broader classes.
# The mapping is expanded from "group -> member labels" into the flat
# "label -> group" dictionary that Series.replace expects.
replacement_map = {
    label: group
    for group, labels in (
        ('Heart failure', (
            'Heart failure (NYHA 2)',
            'Heart failure (NYHA 3)',
            'Heart failure (NYHA 4)',
        )),
        ('Angina/Other Symptoms', (
            'Stable angina',
            'Unstable angina',
            'Palpitation',
        )),
        ('Myocardial hypertrophy', ('Hypertrophy',)),
    )
    for label in labels
}
df_final['Diagnosis'] = df_final['Diagnosis'].replace(replacement_map)

# One-hot encoding: every object-dtype column except 'Record' and 'Diagnosis',
# which are left as-is here.
kolom_untuk_encode = [
    kolom
    for kolom in df_final.select_dtypes(include=['object']).columns
    if kolom not in ('Record', 'Diagnosis')
]
df_encoded = pd.get_dummies(df_final, columns=kolom_untuk_encode, dummy_na=False)
print("✅ Data cleaning dan encoding selesai.")

# --- 2. PREPARE X AND y ---
# Encode the target as integer category codes. The categorical is kept in a
# variable so the code -> class-name mapping can be persisted below; without
# it the integers predicted by the saved model cannot be translated back into
# diagnosis names at inference time.
# NOTE(review): pandas assigns code -1 to missing values, so any row with a
# missing 'Diagnosis' would be trained as its own class -1 and has no entry in
# the saved label list — confirm the dataset has no missing diagnoses.
diagnosis_categories = df_encoded['Diagnosis'].astype('category')
df_encoded['Diagnosis_ID'] = diagnosis_categories.cat.codes
y = df_encoded['Diagnosis_ID']
X = df_encoded.drop(columns=['Record', 'Diagnosis', 'Diagnosis_ID'])

# Save the feature names and their order BEFORE imputation, so the same
# column layout can be rebuilt when the assets are loaded later.
feature_columns = X.columns.tolist()
with open(os.path.join(assets_path, 'feature_columns.json'), 'w') as f:
    json.dump(feature_columns, f)
print("✅ Urutan kolom fitur berhasil disimpan.")

# Save the class names; the list index of each name equals its Diagnosis_ID.
label_classes = diagnosis_categories.cat.categories.tolist()
with open(os.path.join(assets_path, 'label_classes.json'), 'w') as f:
    json.dump(label_classes, f)
print("✅ Pemetaan label kelas berhasil disimpan.")

# --- 3. TRAIN-TEST SPLIT ---
# Hold out 25% of the rows, stratified on y, with a fixed seed for a
# reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.25)

# --- 4. NaN IMPUTATION ---
# Fit a median imputer on X_train ONLY (the test split must not influence the
# fill values), then persist the fitted object.
imputer = SimpleImputer(strategy='median').fit(X_train)
imputer_path = os.path.join(assets_path, 'imputer.joblib')
joblib.dump(imputer, imputer_path)
print("✅ Imputer yang sudah dilatih berhasil disimpan.")

# Apply the fitted imputer to both the training and the test split.
X_train_imputed, X_test_imputed = (
    imputer.transform(split) for split in (X_train, X_test)
)

# --- 5. SMOTE ---
# Oversample the imputed training split only; the test split is left untouched.
smote = SMOTE(k_neighbors=1, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X=X_train_imputed,
    y=y_train,
)
print("✅ SMOTE selesai.")

# --- 6. MODEL TRAINING ---
# Random forest with 150 trees and a fixed seed, trained on the resampled
# training data.
model = RandomForestClassifier(random_state=42, n_estimators=150).fit(
    X_train_resampled, y_train_resampled
)
print("✅ Model berhasil dilatih.")

# --- 7. SAVE THE MODEL ---
# The trained model is written to the assets folder.
model_path = os.path.join(assets_path, 'model_ekg_randomforest.joblib')
joblib.dump(model, model_path)
print("✅ Model yang sudah dilatih berhasil disimpan.")
print("\n--- Semua Aset Berhasil Dibuat di Folder 'assets_final' ---")

--- Memulai Proses Training dan Penyimpanan Aset ---
✅ Data cleaning dan encoding selesai.
✅ Urutan kolom fitur berhasil disimpan.
✅ Imputer yang sudah dilatih berhasil disimpan.
✅ SMOTE selesai.
✅ Model berhasil dilatih.
✅ Model yang sudah dilatih berhasil disimpan.

--- Semua Aset Berhasil Dibuat di Folder 'assets_final' ---
