# 📘 Preprocessing Data Mahasiswa
Contoh preprocessing data mahasiswa menggunakan **pandas**, **scikit-learn**, dan **SMOTE**.


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [None]:
# 1. Load the data (this example reads from a CSV file)
# Change 'data_mahasiswa.csv' to match the name of your dataset file
# 2. Data cleaning starts here: exact duplicate rows are dropped right after loading
data = pd.read_csv("data_mahasiswa.csv").drop_duplicates()

# Replacement values for missing entries, computed on the de-duplicated data:
# the column mean for IPK, the column median for ABSEN, and 0 for
# MATA KULIAH GAGAL.
fill_values = {
    'IPK': data['IPK'].mean(),
    'ABSEN': data['ABSEN'].median(),
    'MATA KULIAH GAGAL': 0,
}
for column, value in fill_values.items():
    data[column] = data[column].fillna(value)

# Preview the first rows of the cleaned frame
data.head()

In [None]:
# 3. Encode the categorical columns
# Each column gets its own LabelEncoder so that every fitted mapping stays
# available afterwards (e.g. label_encoders['KELAMIN'].inverse_transform(...)).
# Re-fitting a single shared encoder would overwrite the KELAMIN classes with
# the JURUSAN ones.
label_encoders = {}
for column in ('KELAMIN', 'JURUSAN'):
    # After the loop `label_enc` refers to the JURUSAN encoder.
    label_enc = LabelEncoder()
    data[column] = label_enc.fit_transform(data[column])
    label_encoders[column] = label_enc

# 4. Feature engineering
# PERSEN_SKS: TOTAL SKS as a fraction of (TOTAL SKS + SISA SKS).
sks_denominator = data['TOTAL SKS'] + data['SISA SKS']
# A zero denominator would yield NaN/inf; record those rows as 0.0 instead so
# the later scaling/resampling steps do not receive non-finite values.
data['PERSEN_SKS'] = (data['TOTAL SKS'] / sks_denominator).mask(
    sks_denominator == 0, 0.0
)
# NOTE(review): this presumably expects ABSEN to be a fraction in [0, 1] --
# confirm the scale of the column before relying on ABSEN_NORM.
data['ABSEN_NORM'] = 1 - data['ABSEN']

# Preview the first rows with the encoded and engineered columns
data.head()

In [None]:
# 5. Separate the features from the target
X = data.drop(columns=['NIM','LULUS DALAM 4 TAHUN'])
y = data['LULUS DALAM 4 TAHUN']

# 6. Train/test split
# The split happens BEFORE scaling and SMOTE. Resampling or fitting the scaler
# on the full dataset leaks information into the test set: synthetic samples
# interpolated from training rows end up in the test data and the scaler
# statistics include test rows, which inflates evaluation scores.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 7. Scaling: fit on the training portion only, then apply to the test portion
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept so that later code referring to X_scaled keeps working; it is the whole
# feature matrix transformed with the scaler fitted on the training data.
X_scaled = scaler.transform(X)

# 8. Handle class imbalance with SMOTE, on the training data only, so the
# test set keeps the original class distribution and contains only real rows
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train_raw)
X_train, y_train = X_resampled, y_resampled

print("Jumlah data setelah SMOTE:", X_resampled.shape)
print("Distribusi target:\n", y_resampled.value_counts())