In [1]:
# Import library yang dibutuhkan
import pickle
import pandas as pd
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error,root_mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Load the dataset from 'stunting.csv' (path is relative to the working directory).
data = pd.read_csv('stunting.csv')
# Show the first 10 rows as the cell output for a quick visual check.
data.head(10)

Unnamed: 0,Kabupaten,Tahun,PrevalensiStunting,BayiBBLR,IbuNifasVitA,K4,IPM,MinumLayak,SanitasiLayak
0,KAB ASAHAN,2023,11.0,0.69,86.75,92.6,71.56,93.94,92.13
1,KAB ASAHAN,2022,15.3,0.4,62.23,87.23,71.13,92.92,89.33
2,KAB ASAHAN,2021,18.9,0.23,95.29,80.0,70.49,95.78,89.09
3,KAB BATU BARA,2023,17.7,0.59,97.13,86.0,70.31,97.21,92.54
4,KAB BATU BARA,2022,21.7,0.09,99.7,94.98,69.51,96.96,88.25
5,KAB BATU BARA,2021,30.9,0.86,99.7,92.0,68.58,97.83,88.04
6,KAB DAIRI,2023,32.6,1.62,98.73,71.3,73.27,92.31,94.42
7,KAB DAIRI,2022,28.6,0.95,99.05,66.43,72.56,96.68,94.14
8,KAB DAIRI,2021,34.2,0.98,89.72,113.0,71.84,91.9,92.35
9,KAB DELI SERDANG,2023,33.8,0.75,88.95,98.2,76.52,95.51,96.78


In [3]:
# Define the feature (predictor) columns and the target column.
# `features` is the list of column names used as model inputs and as the
# columns that get winsorized; `target` is the column the models predict.
features = ['BayiBBLR','IbuNifasVitA', 'K4', 'IPM', 'MinumLayak', 'SanitasiLayak']
target = 'PrevalensiStunting'

In [4]:
# Winsorization bounds, expressed as quantiles of each feature within one kabupaten.
LOWER_QUANTILE = 0.05
UPPER_QUANTILE = 0.95

# Step 1: winsorize the feature columns separately for every kabupaten.
#
# The per-kabupaten frames are collected in a list and concatenated once at
# the end. Growing a DataFrame with pd.concat inside the loop re-copies all
# previously accumulated rows on every iteration.
#
# NOTE(review): the limits are computed from all rows of a kabupaten, before
# any split by year, so rows later used for testing influence how the
# training rows are clipped. Confirm whether that is intended.
winsorized_frames = []
for kabupaten in data["Kabupaten"].unique():
    # Work on a copy of this kabupaten's rows so `data` itself is never modified.
    kabupaten_data = data[data["Kabupaten"] == kabupaten].copy()

    # Clip every feature column to its own [5th, 95th] percentile range.
    for col in features:
        lower_limit = kabupaten_data[col].quantile(LOWER_QUANTILE)
        upper_limit = kabupaten_data[col].quantile(UPPER_QUANTILE)
        kabupaten_data[col] = kabupaten_data[col].clip(lower=lower_limit, upper=upper_limit)

    winsorized_frames.append(kabupaten_data)

# Single concatenation; rows stay grouped by kabupaten in order of first
# appearance and receive a fresh 0..n-1 index. pd.concat raises on an empty
# list, so fall back to an empty frame when `data` has no rows.
winsorized_data = (
    pd.concat(winsorized_frames, ignore_index=True)
    if winsorized_frames
    else pd.DataFrame()
)

In [5]:
# Year-based hold-out split on the 'Tahun' column:
# rows from 2021 and 2022 form the training set, rows from 2023 the test set.
is_train_year = winsorized_data["Tahun"].isin([2021, 2022])
is_test_year = winsorized_data["Tahun"] == 2023

train_data = winsorized_data.loc[is_train_year]
test_data = winsorized_data.loc[is_test_year]

In [9]:
# Column-wise accumulator for the evaluation table: one list per output column.
result_columns = ('Kabupaten', 'Tahun', 'Metode', 'Aktual', 'Prediksi', 'MSE', 'MAPE')
results = {column: [] for column in result_columns}

# Train and evaluate a separate set of models for every kabupaten in the test set.
for city in test_data['Kabupaten'].unique():
    # Rows belonging to the current kabupaten in each split.
    city_train_data = train_data.loc[train_data['Kabupaten'] == city]
    city_test_data = test_data.loc[test_data['Kabupaten'] == city]

    # Nothing to fit or nothing to score: skip this kabupaten.
    if city_train_data.empty or city_test_data.empty:
        continue

    # Separate predictors and target for both splits.
    X_train_city, y_train_city = city_train_data[features], city_train_data[target]
    X_test_city, y_test_city = city_test_data[features], city_test_data[target]

    # Standardize the predictors for SVR only; the scaler is fitted on the
    # training rows and then applied to both splits.
    scaler = StandardScaler().fit(X_train_city)
    X_train_city_scaled = scaler.transform(X_train_city)
    X_test_city_scaled = scaler.transform(X_test_city)

    # Fit the three regressors. SVR uses the scaled predictors, the
    # tree-based models use the unscaled ones.
    svr_model = SVR(kernel='linear').fit(X_train_city_scaled, y_train_city)
    dt_model = DecisionTreeRegressor(random_state=42).fit(X_train_city, y_train_city)
    rf_model = RandomForestRegressor(n_estimators=3, random_state=42).fit(X_train_city, y_train_city)

    # Test-set predictions, keyed by the label written to the 'Metode' column.
    y_pred_svr = svr_model.predict(X_test_city_scaled)
    y_pred_dt = dt_model.predict(X_test_city)
    y_pred_rf = rf_model.predict(X_test_city)
    predictions = {
        'SVR': y_pred_svr,
        'Decision Tree': y_pred_dt,
        'Random Forest': y_pred_rf,
    }

    # Score each model and record one result row per test observation.
    for model_name, y_pred in predictions.items():
        mse = mean_squared_error(y_test_city, y_pred)
        # Mean absolute percentage error, expressed in percent.
        abs_pct_error = np.abs((y_test_city - y_pred) / y_test_city)
        mape = np.mean(abs_pct_error) * 100

        # The per-model metrics are repeated on every row of that model.
        n_rows = len(y_pred)
        results['Kabupaten'].extend([city] * n_rows)
        results['Tahun'].extend(city_test_data['Tahun'])
        results['Metode'].extend([model_name] * n_rows)
        results['Aktual'].extend(y_test_city.values)
        results['Prediksi'].extend(y_pred)
        results['MSE'].extend([mse] * n_rows)
        results['MAPE'].extend([mape] * n_rows)

# Convert the accumulated columns into a DataFrame and preview the first 20 rows.
final_results_df = pd.DataFrame(results)
final_results_df.head(20)


Unnamed: 0,Kabupaten,Tahun,Metode,Aktual,Prediksi,MSE,MAPE
0,KAB ASAHAN,2023,SVR,11.0,7.642992,11.269502,30.518254
1,KAB ASAHAN,2023,Decision Tree,11.0,15.3,18.49,39.090909
2,KAB ASAHAN,2023,Random Forest,11.0,16.5,30.25,50.0
3,KAB BATU BARA,2023,SVR,17.7,-9.409582,734.929439,153.16148
4,KAB BATU BARA,2023,Decision Tree,17.7,21.7,16.0,22.59887
5,KAB BATU BARA,2023,Random Forest,17.7,27.833333,102.684444,57.250471
6,KAB DAIRI,2023,SVR,32.6,48.719506,259.838466,49.446337
7,KAB DAIRI,2023,Decision Tree,32.6,34.2,2.56,4.907975
8,KAB DAIRI,2023,Random Forest,32.6,30.466667,4.551111,6.543967
9,KAB DELI SERDANG,2023,SVR,33.8,21.23862,157.788265,37.163846


In [7]:
# Save the results DataFrame to a CSV file.
# index=False keeps the DataFrame's row index out of the written file.
output_file = 'final_results.csv'
final_results_df.to_csv(output_file, index=False)
