In [125]:
import pandas as pd
import statsmodels.formula.api as smf

In [126]:
df = pd.read_csv('../data/gaji_dummy.csv')

In [127]:
print("=== 5 Data Pertama ===")
df.head()

=== 5 Data Pertama ===


Unnamed: 0,id,experience_years,gender,education_level,salary_million
0,1,10,L,SMA,14.0
1,2,3,L,SMA,8.4
2,3,10,L,S2,17.3
3,4,0,L,SMA,5.0
4,5,9,L,S2,16.1


In [128]:
df_gender_dummy = pd.get_dummies(df['gender'], dtype='int', prefix='gender')

In [129]:
df = pd.concat([df, df_gender_dummy], axis=1)

In [130]:
df['education_level'] = df['education_level'].map({'SMA': 0, 'S1': 1, 'S2': 2, })

In [131]:
print("=== Distribusi Kategori ===")
df[["gender", "education_level"]].value_counts()

=== Distribusi Kategori ===


gender  education_level
L       0                  7
        2                  5
P       2                  4
        0                  2
L       1                  1
P       1                  1
Name: count, dtype: int64

In [132]:
df.drop(['gender'], axis=1, inplace=True)

In [133]:
df

Unnamed: 0,id,experience_years,education_level,salary_million,gender_L,gender_P
0,1,10,0,14.0,1,0
1,2,3,0,8.4,1,0
2,3,10,2,17.3,1,0
3,4,0,0,5.0,1,0
4,5,9,2,16.1,1,0
5,6,10,0,11.9,0,1
6,7,4,0,9.1,1,0
7,8,5,0,7.4,0,1
8,9,5,0,9.3,1,0
9,10,5,2,11.5,0,1


In [139]:
# 2. Model regresi dengan variabel dummy (pakai formula)
# salary_million ~ experience_years + gender + education_level
model = smf.ols(
    "salary_million ~ experience_years + C(gender_L) + C(gender_P) + C(education_level)",
    data=df
).fit()

In [138]:
print("=== Ringkasan Model Regresi Dummy (Dasar) ===")
model.summary()

=== Ringkasan Model Regresi Dummy (Dasar) ===


0,1,2,3
Dep. Variable:,salary_million,R-squared:,0.986
Model:,OLS,Adj. R-squared:,0.982
Method:,Least Squares,F-statistic:,263.9
Date:,"Tue, 02 Dec 2025",Prob (F-statistic):,1.05e-13
Time:,15:13:15,Log-Likelihood:,-11.172
No. Observations:,20,AIC:,32.34
Df Residuals:,15,BIC:,37.32
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.9862,0.171,17.514,0.000,2.623,3.350
C(gender_L)[T.1],2.3137,0.122,18.929,0.000,2.053,2.574
C(gender_P)[T.1],0.6726,0.166,4.043,0.001,0.318,1.027
C(education_level)[T.1],1.9695,0.387,5.083,0.000,1.144,2.795
C(education_level)[T.2],3.6250,0.240,15.134,0.000,3.114,4.136
experience_years,0.8548,0.033,25.767,0.000,0.784,0.925

0,1,2,3
Omnibus:,3.521,Durbin-Watson:,1.808
Prob(Omnibus):,0.172,Jarque-Bera (JB):,2.421
Skew:,0.852,Prob(JB):,0.298
Kurtosis:,2.946,Cond. No.,1.28e+17


In [136]:
# Catatan interpretasi singkat:
print("\nCatatan:")
print("- Intercept    : kategori baseline (misal: P & SMA) dengan experience_years = 0")
print("- experience_years : tambahan gaji (juta) tiap 1 tahun pengalaman")
print("- C(gender)[T.L]   : selisih gaji L terhadap P (baseline)")
print("- C(education_level)[T.S1], [T.S2] : selisih terhadap SMA (baseline)")


Catatan:
- Intercept    : kategori baseline (misal: P & SMA) dengan experience_years = 0
- experience_years : tambahan gaji (juta) tiap 1 tahun pengalaman
- C(gender)[T.L]   : selisih gaji L terhadap P (baseline)
- C(education_level)[T.S1], [T.S2] : selisih terhadap SMA (baseline)
