# Load & Preview Dataset

In [1]:
import os
# Make sure the "datasets" output folder exists; nothing happens if it is already there
os.makedirs(name="datasets", exist_ok=True)

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Seed NumPy's global RNG so the random draws below are reproducible
np.random.seed(42)

# Assemble 10 rows of dummy data, one column at a time.
# The random columns must be drawn in this order (School_ID, School_Level,
# Enrolled) for the seeded values to stay the same.
data = {}
data['School_ID'] = np.random.randint(1000, 9999, 10)
data['Date'] = pd.date_range(start='2018-09-01', periods=10).strftime('%Y-%m-%d')
data['School_Name'] = [f'School {i}' for i in range(1, 11)]
data['School_Level'] = np.random.choice(['Elementary', 'Middle', 'High', 'K-8'], 10)
data['Enrolled'] = np.random.randint(300, 1200, 10)

# The attendance columns start out empty; they are filled in per school
# by the loop in the following cell.
data['Present'] = []
data['Absent'] = []
data['Released_Early'] = []

In [4]:
# Split each school's enrollment into Present / Absent / Released_Early so that
# the three counts always add up to Enrolled.
#
# The counts are collected in fresh lists and assigned to `data` at the end.
# Appending directly to data['Present'] etc. would make this cell grow those
# columns on every re-run, leaving them longer than the other columns.
present_counts = []
absent_counts = []
released_early_counts = []

for enrolled in data['Enrolled']:
    # Keep this draw order (present first, then absent): it determines which
    # values the seeded RNG hands out for each school.
    present = int(enrolled * np.random.uniform(0.7, 0.98))  # 70–98% attendance
    absent = int(enrolled * np.random.uniform(0.01, 0.2))   # 1–20% absence

    # Cap absences at the number of students not already counted as present,
    # so present + absent never exceeds the enrollment.
    absent = min(absent, enrolled - present)

    # Whatever is left over is counted as released early
    released_early = enrolled - present - absent

    present_counts.append(present)
    absent_counts.append(absent)
    released_early_counts.append(released_early)

data['Present'] = present_counts
data['Absent'] = absent_counts
data['Released_Early'] = released_early_counts

# Sanity check: the three attendance counts must account for every enrolled student
assert all(
    p + a + r == e
    for p, a, r, e in zip(present_counts, absent_counts, released_early_counts, data['Enrolled'])
), "Present + Absent + Released_Early must equal Enrolled"


In [5]:
# Turn the dictionary of dummy columns into a DataFrame
df = pd.DataFrame(data=data)

# Preview the first 10 rows
df.head(n=10)


Unnamed: 0,School_ID,Date,School_Name,School_Level,Enrolled,Present,Absent,Released_Early
0,8270,2018-09-01,School 1,Middle,576,501,21,54
1,1860,2018-09-02,School 2,Elementary,460,359,36,65
2,6390,2018-09-03,School 3,Middle,759,628,120,11
3,6191,2018-09-04,School 4,K-8,613,463,66,84
4,6734,2018-09-05,School 5,K-8,321,277,6,38
5,7265,2018-09-06,School 6,Middle,552,480,23,49
6,1466,2018-09-07,School 7,Middle,1047,751,199,97
7,5426,2018-09-08,School 8,Middle,1156,1121,35,0
8,6578,2018-09-09,School 9,K-8,860,675,24,161
9,9322,2018-09-10,School 10,K-8,774,690,72,12


In [6]:
# Write the dummy data (a stand-in for the original dataset) to CSV,
# leaving out the DataFrame index column
df.to_csv(path_or_buf="datasets/original_dataset.csv", index=False)
print("✅ original_dataset.csv berhasil dibuat di folder datasets/")

✅ original_dataset.csv berhasil dibuat di folder datasets/


> Dataset ini merupakan contoh tiruan dari NYC 2018-2019 Student Attendance, dengan struktur kolom yang identik.
> Kolom-kolom utama: `Enrolled`, `Present`, `Absent`, `Released_Early`, serta atribut sekolah.