In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

In [None]:
# === 1. Read the dataset ===
# Semicolon-separated file, read with header=None so the columns are numbered positionally.
df = pd.read_csv(
    'lap_belanja_jan-juni2025.csv',
    sep=';',
    header=None,
    low_memory=False,
)

In [None]:
# Structural overview of the loaded frame: size, column labels, dtypes, first rows.
banner = "=" * 60
print(banner)
print("CEK STRUKTUR DATAFRAME:")
print(banner)
n_columns = len(df.columns)
print(f"Jumlah kolom: {n_columns}")
n_rows = len(df)
print(f"Jumlah baris: {n_rows}")
print("\nDaftar kolom yang ada:")
for position, label in enumerate(df.columns):
    print(f"  {position}: {label}")

print("\nInfo DataFrame:")
# df.info() writes its own report and returns None, so a trailing "None" is printed too.
print(df.info())

# 2. LOOK AT THE DATA
print("\nSample 3 baris pertama:")
first_rows = df.head(3)
print(first_rows)

In [None]:
# === 2. Define the column names ===
# Twelve labels, assigned positionally to the frame's columns.
columns = [
    'id_transaksi',
    'id_pasien',
    'id_kunjungan',
    'nama_pasien',
    'waktu',
    'dokter',
    'jenis_layanan',
    'poli',
    'sumber_pembayaran',
    'biaya',
    'diskon',
    'flags',
]
df.columns = columns

In [None]:
# === 3. Preprocessing ===
# Progress marker printed before the preprocessing steps.
print("Preprocessing...")

In [None]:
# Parse 'waktu' as day/month/year; values that do not match the format become NaT.
parsed_waktu = pd.to_datetime(df['waktu'], format='%d/%m/%Y', errors='coerce')
df['waktu'] = parsed_waktu
# Keep only the rows that have both a timestamp and a cost.
has_required = df['waktu'].notna() & df['biaya'].notna()
df = df[has_required].copy()

In [None]:
# Derive calendar features (month, weekday, day of month) from the timestamp.
waktu_parts = df['waktu'].dt
df['bulan'] = waktu_parts.month
df['hari_dlm_minggu'] = waktu_parts.dayofweek
df['hari_dlm_bulan'] = waktu_parts.day

In [None]:
# Encode the categorical features as integer codes, keeping one fitted encoder per column.
kategori_cols = ['dokter', 'poli', 'jenis_layanan']
label_encoders = {name: LabelEncoder() for name in kategori_cols}
for name, encoder in label_encoders.items():
    as_text = df[name].astype(str)
    df[name + '_encoded'] = encoder.fit_transform(as_text)

In [None]:
# Select the features used for prediction
feature_cols = ['bulan', 'hari_dlm_minggu', 'hari_dlm_bulan',
                'dokter_encoded', 'poli_encoded', 'jenis_layanan_encoded']
X = df[feature_cols]

# Target: the transaction cost. If the column was read as text, strip commas and
# spaces and parse it so the regressors get a numeric target; values that still
# cannot be parsed become NaN.
# NOTE(review): assumes ',' is a thousands separator and '.' the decimal mark
# (amounts like '790,000.00') — confirm against the raw file.
y = df['biaya']
if not pd.api.types.is_numeric_dtype(y):
    y = pd.to_numeric(
        y.astype(str)
        .str.replace(',', '', regex=False)
        .str.replace(' ', '', regex=False),
        errors='coerce',
    )

In [None]:
# === 4. Split the data ===
# 80/20 train/test split with a fixed random_state.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Jumlah data latih: {n_train}, data uji: {n_test}")

In [None]:
# === 5. Train the models ===
# Progress marker only; no training happens in this cell.
print("\nMelatih model...")

In [None]:
banner = "=" * 60
print(banner)
print("CEK DATA SEBELUM SPLIT:")
print(banner)

# X and y must already be defined at this point, e.g.:
# X = df.drop('target_column', axis=1)
# y = df['target_column']

x_shape = X.shape
print(f"Shape X: {x_shape}")
y_shape = y.shape
print(f"Shape y: {y_shape}")
print(f"\nTipe data X:\n{X.dtypes}")
print(f"\nTipe data y: {y.dtype}")

# Missing-value counts
x_missing = X.isna().sum().sum()
print(f"\nMissing values di X: {x_missing}")
y_missing = y.isna().sum()
print(f"Missing values di y: {y_missing}")

# Infinite-value counts (y is only inspected when its dtype is numeric)
x_infinite = np.isinf(X.select_dtypes(include=[np.number])).sum().sum()
print(f"\nInfinite values di X: {x_infinite}")
y_infinite = np.isinf(y).sum() if np.issubdtype(y.dtype, np.number) else 0
print(f"Infinite values di y: {y_infinite}")

# Small previews
print(f"\nSample X (3 baris pertama):")
print(X.head(3))
print(f"\nSample y (3 nilai pertama):")
print(y.head(3))

# ===== 2. CLEAN THE DATA =====
print("\n" + banner)
print("MEMBERSIHKAN DATA:")
print(banner)

# Keep numeric feature columns only
X_numeric = X.select_dtypes(include=[np.number])
numeric_names = X_numeric.columns.tolist()
print(f"Kolom numerik di X: {numeric_names}")

# Report any columns that were left out
if X_numeric.shape[1] < X.shape[1]:
    dropped = set(X.columns) - set(X_numeric.columns)
    print(f"Kolom non-numerik yang dihapus: {dropped}")

X = X_numeric.copy()

# Fill missing values with the median
if X.isna().any().any():
    print("Mengisi missing values dengan median...")
    X = X.fillna(X.median())

if y.isna().any():
    print("Mengisi missing values di y dengan median...")
    y = y.fillna(y.median())

# Turn infinities into NaN, then fill those with the median as well
X = X.replace([np.inf, -np.inf], np.nan)
if X.isna().any().any():
    X = X.fillna(X.median())

if np.issubdtype(y.dtype, np.number):
    y = y.replace([np.inf, -np.inf], np.nan)
    if y.isna().any():
        y = y.fillna(y.median())

print(f"✓ Data setelah cleaning:")
print(f"  X shape: {X.shape}")
print(f"  y shape: {y.shape}")
remaining_x_missing = X.isna().sum().sum()
print(f"  Missing values di X: {remaining_x_missing}")
remaining_y_missing = y.isna().sum()
print(f"  Missing values di y: {remaining_y_missing}")

# ===== 3. KEEP ONLY VALID ROWS =====
# A row is valid when none of its features and not its target is NaN.
valid_idx = X.notna().all(axis=1) & y.notna()
X = X[valid_idx]
y = y[valid_idx]

n_valid = len(X)
print(f"\n✓ Data valid: {n_valid} baris")

In [None]:
# ===== 4. CHECK WHETHER THERE IS ENOUGH DATA =====
# With fewer than 10 rows only an explanation is printed; otherwise the data is
# split and a linear regression is fitted and evaluated.
if len(X) >= 10:
    banner = "=" * 60

    # ===== 5. SPLIT THE DATA =====
    print("\n" + banner)
    print("SPLIT DATA:")
    print(banner)

    split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split_parts

    print(f" Split berhasil!")
    print(f"  X_train: {X_train.shape}")
    print(f"  X_test: {X_test.shape}")
    print(f"  y_train: {y_train.shape}")
    print(f"  y_test: {y_test.shape}")

    # Validate the training split once more
    print(f"\n Validasi data training:")
    print(f"  X_train kosong? {X_train.empty}")
    print(f"  y_train kosong? {y_train.empty}")
    x_train_has_nan = X_train.isna().sum().sum() > 0
    print(f"  X_train punya NaN? {x_train_has_nan}")
    y_train_has_nan = y_train.isna().sum() > 0
    print(f"  y_train punya NaN? {y_train_has_nan}")

    # ===== 6. TRAIN THE MODEL =====
    print("\n" + banner)
    print("TRAINING MODEL:")
    print(banner)

    try:
        # Model 1: Linear Regression
        lr = LinearRegression()
        lr.fit(X=X_train, y=y_train)

        # Predictions on the held-out rows
        y_pred_lr = lr.predict(X=X_test)

        # Metrics
        train_score = lr.score(X=X_train, y=y_train)
        test_score = lr.score(X=X_test, y=y_test)
        mse = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)
        mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_lr)
        rmse = np.sqrt(mse)

        print("✓ Linear Regression berhasil dilatih!")
        print(f"\nPerforma Model:")
        print(f"  Train R² Score: {train_score:.4f}")
        print(f"  Test R² Score: {test_score:.4f}")
        print(f"  RMSE: {rmse:.2f}")
        print(f"  MAE: {mae:.2f}")
        print(f"  MSE: {mse:.2f}")

        # Coefficients, largest absolute value first
        print(f"\n Feature Importance (Koefisien):")
        coef_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': lr.coef_})
        coef_df = coef_df.sort_values('Coefficient', key=abs, ascending=False)
        print(coef_df.head(10))

    except Exception as exc:
        # Best-effort cell: report the failure together with dtype and sample information.
        print(f"Error saat training: {str(exc)}")
        print("\nDebug info:")
        print(f"  X_train dtype: {X_train.dtypes.unique()}")
        print(f"  y_train dtype: {y_train.dtype}")
        print(f"  X_train sample:\n{X_train.head()}")
        print(f"  y_train sample:\n{y_train.head()}")
else:
    print(f"\n ERROR: Data terlalu sedikit ({len(X)} baris)")
    print("Perlu minimal 10 data untuk training model")
    print("\nKemungkinan penyebab:")
    print("1. Target column punya terlalu banyak nilai kosong")
    print("2. Terlalu banyak missing values di features")
    print("3. Filtering terlalu ketat")

# RANDOM FOREST REGRESSOR MODEL

In [None]:
# ===== 1. IDENTIFY PROBLEM COLUMNS =====
banner = "=" * 60
print(banner)
print("IDENTIFIKASI KOLOM BERMASALAH:")
print(banner)

# Dtype of every feature column
print("Tipe data setiap kolom:")
print(X_train.dtypes)

# Columns stored as 'object' (most likely strings)
object_columns = list(X_train.select_dtypes(include=['object']).columns)
print(f"\nKolom bertipe 'object' (string): {object_columns}")

# Show the first ten raw values of each object column
for object_col in object_columns:
    print(f"\n{object_col} - Sample data:")
    preview = X_train[object_col].head(10).values
    print(preview)

# ===== 2. CLEAN NUMBER FORMATTING =====
print("\n" + banner)
print("MEMBERSIHKAN FORMAT ANGKA:")
print(banner)

def clean_numeric_column(series):
    """
    Convert a text column that holds formatted numbers into a numeric Series.

    Commas and spaces are removed before parsing, e.g. '790,000.00' -> 790000.00.
    Values that still cannot be parsed become NaN.

    Parameters
    ----------
    series : pandas.Series
        Column to clean.

    Returns
    -------
    pandas.Series
        The parsed numeric column when `series` is text-typed (object or a
        pandas string dtype); otherwise `series` itself, unchanged.
    """
    # Check for string dtypes as well as 'object': a bare `dtype == 'object'`
    # comparison skips pandas' dedicated string dtypes and leaves them as text.
    is_text = pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)
    if not is_text:
        return series

    # Remove thousands separators and spaces; the decimal point and minus sign stay.
    cleaned = (
        series.astype(str)
        .str.replace(',', '', regex=False)
        .str.replace(' ', '', regex=False)
        .str.strip()
    )

    # errors='coerce' maps unparseable values to NaN instead of raising, so no
    # blanket try/except is needed around the conversion.
    return pd.to_numeric(cleaned, errors='coerce')

# Apply the cleaner to every feature column of both splits. apply() builds new
# frames instead of assigning column by column into the objects returned by
# train_test_split, which pandas may treat as copies of the original frame.
print("Membersihkan X_train...")
X_train = X_train.apply(clean_numeric_column)

print("Membersihkan X_test...")
X_test = X_test.apply(clean_numeric_column)

# Clean y_train and y_test when they are still stored as text
if y_train.dtype == 'object':
    print("Membersihkan y_train...")
    y_train = clean_numeric_column(y_train)

if y_test.dtype == 'object':
    print("Membersihkan y_test...")
    y_test = clean_numeric_column(y_test)

print("✓ Pembersihan selesai!")

# ===== 3. CHECK THE RESULT OF THE CLEANING =====
print("\n" + "="*60)
print("HASIL SETELAH PEMBERSIHAN:")
print("="*60)

print("Tipe data X_train:")
print(X_train.dtypes)

print(f"\nTipe data y_train: {y_train.dtype}")

# Conversion may have produced missing values
print(f"\nMissing values di X_train: {X_train.isnull().sum().sum()}")
print(f"Missing values di y_train: {y_train.isnull().sum()}")

# ===== 4. HANDLE MISSING VALUES =====
# Impute features with medians computed on the training split only and reuse
# them for the test split, so no test-set statistics enter the pipeline. The
# test split is checked too, because coercion can create NaN there even when
# the training split has none.
if X_train.isnull().sum().sum() > 0 or X_test.isnull().sum().sum() > 0:
    print("\n Ada missing values setelah konversi, mengisi dengan median...")
    train_medians = X_train.median()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

# Rows without a target cannot be used for fitting or scoring; drop them
# instead of inventing a median target, which would distort the metrics.
train_has_target = y_train.notna()
test_has_target = y_test.notna()
if not (train_has_target.all() and test_has_target.all()):
    print(" Ada missing values di target, baris tanpa target dibuang...")
    X_train, y_train = X_train.loc[train_has_target], y_train.loc[train_has_target]
    X_test, y_test = X_test.loc[test_has_target], y_test.loc[test_has_target]

# ===== 5. FINAL VALIDATION =====
print("\n" + "="*60)
print("VALIDASI FINAL:")
print("="*60)

print(f"Shape X_train: {X_train.shape}")
print(f"Shape X_test: {X_test.shape}")
print(f"Shape y_train: {y_train.shape}")
print(f"Shape y_test: {y_test.shape}")

print(f"\nSemua kolom X_train numerik? {X_train.select_dtypes(include=[np.number]).shape[1] == X_train.shape[1]}")
print(f"y_train numerik? {np.issubdtype(y_train.dtype, np.number)}")

print("\nSample X_train (3 baris):")
print(X_train.head(3))
print("\nSample y_train (5 nilai):")
print(y_train.head(5))

# ===== 6. TRAIN THE MODEL =====
print("\n" + "="*60)
print("TRAINING RANDOM FOREST:")
print("="*60)

try:
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Predictions on the held-out rows
    y_pred_rf = rf.predict(X_test)

    # Metrics (the metric functions are imported once at the top of the notebook)
    train_score = rf.score(X_train, y_train)
    test_score = rf.score(X_test, y_test)
    mse = mean_squared_error(y_test, y_pred_rf)
    mae = mean_absolute_error(y_test, y_pred_rf)
    rmse = np.sqrt(mse)

    print(" Random Forest berhasil dilatih!")
    print(f"\n Performa Model:")
    print(f"  Train R² Score: {train_score:.4f}")
    print(f"  Test R² Score: {test_score:.4f}")
    print(f"  RMSE: {rmse:,.2f}")
    print(f"  MAE: {mae:,.2f}")
    print(f"  MSE: {mse:,.2f}")

    # Feature importances, highest first
    print(f"\n Top 10 Feature Importance:")
    importance_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': rf.feature_importances_
    }).sort_values('Importance', ascending=False)
    print(importance_df.head(10).to_string(index=False))

except Exception as e:
    # Best-effort cell: report the failure and the dtype of every feature column.
    print(f" Error saat training: {str(e)}")
    print("\nDebug lebih lanjut:")
    print("Tipe data X_train:")
    for col in X_train.columns:
        print(f"  {col}: {X_train[col].dtype}")

In [None]:
# Model 2: Random Forest Regressor (100 trees, fixed random_state)
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X=X_train, y=y_train)
y_pred_rf = rf.predict(X=X_test)

In [None]:
# === 6. Evaluation ===
def evaluate(y_true, y_pred, model_name):
    """Print MAE, RMSE and R² for one model and return them as (mae, rmse, r2)."""
    scores = (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )
    abs_error, root_error, fit_quality = scores
    report = [
        f"\n{model_name}:",
        f"  MAE: {abs_error:,.2f}",
        f"  RMSE: {root_error:,.2f}",
        f"  R²: {fit_quality:.4f}",
    ]
    print(*report, sep="\n")
    return scores


In [None]:
# ===== EVALUATION FUNCTION =====
def evaluate(y_true, y_pred, model_name):
    """
    Report regression metrics for one model and hand them back.

    Prints a banner with the model name followed by MAE, RMSE and R²,
    then returns the three values as a (mae, rmse, r2) tuple.
    """
    banner = '=' * 60
    abs_error = mean_absolute_error(y_true, y_pred)
    root_error = np.sqrt(mean_squared_error(y_true, y_pred))
    fit_quality = r2_score(y_true, y_pred)

    report = [
        f"\n{banner}",
        f"EVALUASI MODEL: {model_name}",
        f"{banner}",
        f"MAE (Mean Absolute Error)  : {abs_error:,.2f}",
        f"RMSE (Root Mean Squared Error): {root_error:,.2f}",
        f"R² Score                   : {fit_quality:.4f}",
    ]
    print(*report, sep="\n")

    return abs_error, root_error, fit_quality

In [None]:
# ===== EVALUATION FUNCTION =====
def evaluate(y_true, y_pred, model_name):
    """
    Evaluate a model's predictions against the true values.

    Prints MAE, RMSE and R² under a banner naming the model and returns
    them as a tuple in that order.
    """
    metrics = {
        'mae': mean_absolute_error(y_true, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'r2': r2_score(y_true, y_pred),
    }
    rule = '=' * 60

    print(f"\n{rule}")
    print(f"EVALUASI MODEL: {model_name}")
    print(f"{rule}")
    print(f"MAE (Mean Absolute Error)  : {metrics['mae']:,.2f}")
    print(f"RMSE (Root Mean Squared Error): {metrics['rmse']:,.2f}")
    print(f"R² Score                   : {metrics['r2']:.4f}")

    return metrics['mae'], metrics['rmse'], metrics['r2']

# ===== MAKE SURE THE DATA IS CLEAN AND ALREADY SPLIT =====
banner = "=" * 60
print(banner)
print("CEK DATA:")
print(banner)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Dtype overview
feature_dtypes = X_train.dtypes.unique()
print(f"\nTipe data X_train: {feature_dtypes}")
print(f"Tipe data y_train: {y_train.dtype}")

# Missing-value counts per split
print(f"\nMissing values X_train: {X_train.isna().sum().sum()}")
print(f"Missing values X_test: {X_test.isna().sum().sum()}")
print(f"Missing values y_train: {y_train.isna().sum()}")
print(f"Missing values y_test: {y_test.isna().sum()}")

# ===== MODEL 1: LINEAR REGRESSION =====
print("\n" + banner)
print("TRAINING MODEL 1: LINEAR REGRESSION")
print(banner)

try:
    # Create and fit the model
    lr = LinearRegression()
    lr.fit(X=X_train, y=y_train)

    # Predictions on the held-out rows
    y_pred_lr = lr.predict(X=X_test)

    print("✓ Linear Regression berhasil dilatih!")
    lr_train_r2 = lr.score(X_train, y_train)
    print(f"Train Score: {lr_train_r2:.4f}")
    lr_test_r2 = lr.score(X_test, y_test)
    print(f"Test Score: {lr_test_r2:.4f}")

    # Metrics
    mae_lr, rmse_lr, r2_lr = evaluate(y_test, y_pred_lr, "Linear Regression")

except Exception as exc:
    # On failure, mark this model's outputs as unavailable so the comparison is skipped.
    print(f" Error pada Linear Regression: {str(exc)}")
    y_pred_lr = None
    mae_lr = rmse_lr = r2_lr = None

# ===== MODEL 2: RANDOM FOREST =====
print("\n" + banner)
print("TRAINING MODEL 2: RANDOM FOREST")
print(banner)

try:
    # Create and fit the model
    rf = RandomForestRegressor(random_state=42, n_estimators=100)
    rf.fit(X=X_train, y=y_train)

    # Predictions on the held-out rows
    y_pred_rf = rf.predict(X=X_test)

    print(" Random Forest berhasil dilatih!")
    rf_train_r2 = rf.score(X_train, y_train)
    print(f"Train Score: {rf_train_r2:.4f}")
    rf_test_r2 = rf.score(X_test, y_test)
    print(f"Test Score: {rf_test_r2:.4f}")

    # Metrics
    mae_rf, rmse_rf, r2_rf = evaluate(y_test, y_pred_rf, "Random Forest")

    # Feature importances, highest first
    print(f"\n Top 10 Feature Importance:")
    importance_df = pd.DataFrame(
        {'Feature': X_train.columns, 'Importance': rf.feature_importances_}
    )
    importance_df = importance_df.sort_values('Importance', ascending=False)
    print(importance_df.head(10).to_string(index=False))

except Exception as exc:
    # On failure, mark this model's outputs as unavailable so the comparison is skipped.
    print(f" Error pada Random Forest: {str(exc)}")
    y_pred_rf = None
    mae_rf = rmse_rf = r2_rf = None

# ===== MODEL COMPARISON =====
if y_pred_lr is None or y_pred_rf is None:
    print("\n Tidak bisa membandingkan model karena ada error pada salah satu model")

else:
    print("\n" + banner)
    print("PERBANDINGAN PERFORMA MODEL:")
    print(banner)

    comparison_df = pd.DataFrame({
        'Model': ['Linear Regression', 'Random Forest'],
        'MAE': [mae_lr, mae_rf],
        'RMSE': [rmse_lr, rmse_rf],
        'R² Score': [r2_lr, r2_rf]
    })

    print(comparison_df.to_string(index=False))

    # Pick the model with the higher R² (ties go to Linear Regression)
    if r2_rf > r2_lr:
        best_model = 'Random Forest'
    else:
        best_model = 'Linear Regression'
    print(f"\n Model terbaik (berdasarkan R² Score): {best_model}")

    # Side-by-side table of actual values and both models' predictions (optional)
    print("\n" + banner)
    print("VISUALISASI PREDIKSI vs AKTUAL:")
    print(banner)

    first_ten = slice(None, 10)
    result_df = pd.DataFrame({
        'Aktual': y_test.values[first_ten],
        'Prediksi LR': y_pred_lr[first_ten],
        'Prediksi RF': y_pred_rf[first_ten]
    })
    print("\nSample 10 prediksi pertama:")
    print(result_df.to_string(index=False))

In [None]:
# Make sure X_train, X_test, y_train, y_test already exist
# (only the shapes of the training split are printed).
print(f"Data ready: X_train {X_train.shape}, y_train {y_train.shape}")

In [None]:
# CELL 2: Train Linear Regression and report its R² on the test split
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)
y_pred_lr = lr.predict(X=X_test)

print("✓ Linear Regression selesai")
lr_test_r2 = r2_score(y_test, y_pred_lr)
print(f"R² Score: {lr_test_r2:.4f}")

In [None]:
# CELL 3: Train Random Forest and report its R² on the test split
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X=X_train, y=y_train)
y_pred_rf = rf.predict(X=X_test)

print("✓ Random Forest selesai")
rf_test_r2 = r2_score(y_test, y_pred_rf)
print(f"R² Score: {rf_test_r2:.4f}")

In [None]:
# CELL 4: Evaluation (y_pred_lr and y_pred_rf exist by now)
def evaluate(y_true, y_pred, model_name):
    """Print MAE, RMSE and R² for `model_name` and return them as (mae, rmse, r2)."""
    fit_quality = r2_score(y_true, y_pred)
    root_error = np.sqrt(mean_squared_error(y_true, y_pred))
    abs_error = mean_absolute_error(y_true, y_pred)

    print(f"\n{model_name}:")
    for line in (
        f"  MAE: {abs_error:,.2f}",
        f"  RMSE: {root_error:,.2f}",
        f"  R²: {fit_quality:.4f}",
    ):
        print(line)

    return abs_error, root_error, fit_quality

# Score both models on the test split and unpack their (mae, rmse, r2) tuples.
lr_metrics = evaluate(y_test, y_pred_lr, "Linear Regression")
mae_lr, rmse_lr, r2_lr = lr_metrics
rf_metrics = evaluate(y_test, y_pred_rf, "Random Forest")
mae_rf, rmse_rf, r2_rf = rf_metrics

In [None]:
# === 7. Pick the best model ===
# Higher R² wins; a tie goes to Linear Regression.
best_model, best_r2 = (
    ("Random Forest", r2_rf) if r2_rf > r2_lr else ("Linear Regression", r2_lr)
)

In [43]:
# Report the selected model together with its R² score.
print(f"\n Model terbaik: {best_model} (R² = {best_r2:.4f})")


 Model terbaik: Linear Regression (R² = 0.3438)


In [45]:
# Example input: one row with a value for each of the model's features.
# NOTE(review): the categorical positions hold LabelEncoder codes, not raw names;
# map real values through the fitted encoders before predicting — confirm which
# positions those are against the feature list.
input_data = np.array([[6, 2, 15, 3, 1, 2]])

# The model was fitted on a DataFrame, so pass the row with the same column
# labels. scikit-learn can then check feature names and order instead of
# trusting position (the warning for bare arrays is hidden by the global
# warnings filter).
input_frame = pd.DataFrame(input_data, columns=X_train.columns)
predicted_cost = lr.predict(input_frame)

# NOTE(review): the recorded output for this input is a negative cost, so the
# linear model's predictions should be sanity-checked before use.
print(f"\n Prediksi biaya untuk input {input_data[0]}: {predicted_cost[0]:,.2f}")


 Prediksi biaya untuk input [ 6  2 15  3  1  2]: -17,615,423.44


In [44]:
import pickle

# Persist the fitted linear regression model.
# NOTE(review): only the model is saved; the fitted label encoders are not, so a
# consumer of this file cannot encode raw categorical values on its own.
# NOTE(review): pickle files execute code when loaded — only load this file from
# a trusted source.
filename = 'prediksibelanja.sav'
# The context manager closes (and flushes) the file even if dumping fails.
with open(filename, 'wb') as model_file:
    pickle.dump(lr, model_file)