# Preprocessing & Feature Engineering

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# --- Begin dummy-data generation block ---
# Seed the legacy global RNG so every run produces the same synthetic schools.
np.random.seed(42)

# NOTE: the order of the np.random calls below determines the generated
# values, so it must stay: IDs -> levels -> enrolment -> per-school uniforms.
school_ids = np.random.randint(1000, 9999, 10)
school_levels = np.random.choice(['Elementary', 'Middle', 'High', 'K-8'], 10)
enrolment_counts = np.random.randint(300, 1200, 10)

# One (present, absent, released_early) triple per school.
attendance_rows = []
for total in enrolment_counts:
    n_present = int(total * np.random.uniform(0.7, 0.98))
    n_absent = int(total * np.random.uniform(0.01, 0.2))
    # Present + absent may not exceed the enrolment: cap the absentees at
    # whatever is left once the present students are accounted for.
    n_absent = min(n_absent, total - n_present)
    # Everyone who is neither present nor absent was released early.
    attendance_rows.append((n_present, n_absent, total - n_present - n_absent))

present_counts, absent_counts, early_release_counts = (
    list(column) for column in zip(*attendance_rows)
)

data = {
    'School_ID': school_ids,
    'Date': pd.date_range(start='2018-09-01', periods=10).strftime('%Y-%m-%d'),
    'School_Name': [f'School {i}' for i in range(1, 11)],
    'School_Level': school_levels,
    'Enrolled': enrolment_counts,
    'Present': present_counts,
    'Absent': absent_counts,
    'Released_Early': early_release_counts,
}
df = pd.DataFrame(data)
# --- End dummy-data generation block ---

In [2]:
# Express each head-count as a percentage of the school's enrolment.
# Keys are the new rate columns; values are the count columns they derive from.
rate_sources = {
    'Attendance_Rate': 'Present',            # attendance percentage
    'Absence_Rate': 'Absent',                # absence percentage
    'Early_Release_Rate': 'Released_Early',  # early-release percentage
}
for rate_column, count_column in rate_sources.items():
    df[rate_column] = df[count_column] / df['Enrolled'] * 100



In [3]:
# Labelling: 'High' when the attendance rate is at least 90%, otherwise 'Low'.
meets_threshold = df['Attendance_Rate'] >= 90
df['Attendance_Label'] = meets_threshold.map({True: 'High', False: 'Low'})

# Show the first few rows of the engineered columns.
preview_columns = [
    'School_Name', 'Enrolled', 'Present', 'Absent',
    'Attendance_Rate', 'Absence_Rate',
    'Early_Release_Rate', 'Attendance_Label',
]
df[preview_columns].head()


Unnamed: 0,School_Name,Enrolled,Present,Absent,Attendance_Rate,Absence_Rate,Early_Release_Rate,Attendance_Label
0,School 1,576,501,21,86.979167,3.645833,9.375,Low
1,School 2,460,359,36,78.043478,7.826087,14.130435,Low
2,School 3,759,628,120,82.740448,15.810277,1.449275,Low
3,School 4,613,463,66,75.530179,10.766721,13.7031,Low
4,School 5,321,277,6,86.292835,1.869159,11.838006,Low


> Langkah ini menghasilkan tiga fitur utama untuk klasifikasi:
> - `Attendance_Rate` = (Present / Enrolled) * 100
> - `Absence_Rate` = (Absent / Enrolled) * 100
> - `Early_Release_Rate` = (Released_Early / Enrolled) * 100
>
> Label target `Attendance_Label` berisi `High` jika kehadiran ≥ 90%, sisanya `Low`.


In [4]:
# Save the preprocessed and labelled frame to CSV.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists first; otherwise this cell raises OSError on a
# fresh checkout where `datasets/` has not been created yet.
output_path = Path("datasets/processed_dataset.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print("✅ processed_dataset.csv berhasil dibuat di folder datasets/")

✅ processed_dataset.csv berhasil dibuat di folder datasets/
