In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.metrics import accuracy_score
from category_encoders import TargetEncoder

# === LOAD DATA ===
df = pd.read_excel('Dataset TWS.xlsx')  # replace with your own file name

# === TARGET ENCODING (SAME CODES AS PYCARET) ===
# Integer class code for every study-programme label the model should predict.
target_mapping = {
    'Sains Data Terapan': 0,
    'Sistem Pembangkit Energi': 1,
    'Teknik Elektro Industri': 2,
    'Teknik Elektronika': 3,
    'Teknik Informatika': 4,
    'Teknik Komputer': 5,
    'Teknik Mekatronika': 6,
    'Teknik Telekomunikasi': 7,
    'Teknologi Game': 8,
    'Teknologi Multimedia Broadcasting': 9,
    'Teknologi Rekayasa Internet': 10,
    'Teknologi Rekayasa Multimedia': 11
}

raw_labels = df['Program Studi']
encoded_labels = raw_labels.map(target_mapping)

# A label that is present but missing from target_mapping (typo, alternative
# spelling, stray whitespace) maps to NaN and is dropped below together with
# the genuinely empty labels. Report those rows so that a whole class cannot
# disappear from the training data without anyone noticing.
unmapped_labels = raw_labels[encoded_labels.isna() & raw_labels.notna()]
if not unmapped_labels.empty:
    print(
        f"WARNING: dropping {len(unmapped_labels)} row(s) whose 'Program Studi' "
        f"is not in target_mapping: {unmapped_labels.value_counts().to_dict()}"
    )

# ==== HANDLE MISSING VALUES ====
# Rows without a usable target cannot be used for supervised training.
# assign/astype build a fresh frame instead of writing into the result of
# dropna(), so no SettingWithCopyWarning can be triggered.
df = (
    df.assign(**{'Program Studi': encoded_labels})
    .dropna(subset=['Program Studi'])
    .astype({'Program Studi': int})
)

# === SPLIT FEATURES & TARGET ===
X = df.drop('Program Studi', axis=1)
y = df['Program Studi']

# === FEATURE COLUMNS ===
# Only the columns named here are routed into the model by the
# ColumnTransformer defined below.
numeric_features = ['Rata-rata Nilai Masuk PENS']
categorical_features = [
    'Jenjang Pendidikan',
    'Minat dan Bakat',
    'Jalur Pendaftaran PENS',
    'Rencana Karir',
]

# === PREPROCESSING ===
# Numeric branch: fill gaps from the 3 nearest neighbours, then standardise.
numeric_transformer = Pipeline([
    ('imputer', KNNImputer(n_neighbors=3)),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then replace
# each category with a statistic of the target (TargetEncoder) so the tree can
# pick up the category/target relationship.
# NOTE(review): the target here is an arbitrary integer class code, so the
# encoded value is an average of class ids - confirm this is the intended
# (PyCaret-like) behaviour for a 12-class problem.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', TargetEncoder()),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# === DECISION TREE CLASSIFIER ===
# Preprocessing and the classifier live in one pipeline so that every CV fold
# re-fits the imputers/encoder on its own training part only.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=4824)),
])

# === HYPERPARAMETER GRID ===
# 2 * 4 * 3 * 3 * 3 * 2 * 2 = 864 candidate settings for the tree.
param_grid = {
    'classifier__criterion': ['entropy', 'gini'],
    'classifier__max_depth': [5, 10, 15, None],
    'classifier__min_samples_leaf': [1, 5, 10],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__max_features': [1.0, 'sqrt', 'log2'],
    'classifier__min_impurity_decrease': [0.0, 0.001],
    'classifier__splitter': ['best', 'random'],
}

# === TRAIN/TEST SPLIT ===
# Keep 30% of the rows aside for the final check; stratify on the label so
# both parts have the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=4824,
)

# === CROSS-VALIDATION & GRID SEARCH ===
# 10 shuffled, stratified folds on the training part only.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=4824)

# Exhaustive search over param_grid, using every available core.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=skf,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print(f"\nBest Hyperparameters: {grid_search.best_params_}")
print(f"Best CV Accuracy: {grid_search.best_score_:.4f}")

# === EVALUATION ON THE TEST SET ===
# predict() on the fitted search object uses the best pipeline found above.
y_pred = grid_search.predict(X_test)
test_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Set Accuracy: {test_acc:.4f}")


Fitting 10 folds for each of 864 candidates, totalling 8640 fits

Best Hyperparameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': 10, 'classifier__max_features': 1.0, 'classifier__min_impurity_decrease': 0.0, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__splitter': 'best'}
Best CV Accuracy: 0.9651
Test Set Accuracy: 0.9668


In [None]:
# Best pipeline (preprocessing + tree) found by the grid search.
best_model = grid_search.best_estimator_

import numpy as np  # not used in this cell; kept in case later cells rely on it

# New record to classify, as a one-row DataFrame with the training columns.
data_baru = pd.DataFrame({
    'Jenjang Pendidikan': ['D3'],
    'Minat dan Bakat': ['Teknologi Informasi'],
    'Jalur Pendaftaran PENS': ['SNBP'],
    'Rencana Karir': ['Software Engineer'],
    'Rata-rata Nilai Masuk PENS': [85.0]
})

# Sanity check: a category spelled differently from the training data (for
# example an abbreviation instead of the full label) carries no information
# for the model. NOTE(review): with category_encoders' default settings such
# values are presumably encoded with a fallback value rather than raising -
# confirm. Warn so the prediction is not trusted blindly.
for column in categorical_features:
    known_values = set(X_train[column].dropna().unique())
    unknown_values = [
        value
        for value in data_baru[column].dropna().unique()
        if value not in known_values
    ]
    if unknown_values:
        print(
            f"WARNING: column '{column}' has value(s) not present in the "
            f"training data: {unknown_values}"
        )

# Predict the study programme (integer class code).
prediksi = best_model.predict(data_baru)
print(f'Hasil Prediksi Program Studi (numerik): {prediksi[0]}')

# Map the class code back to its original label.
reverse_mapping = {v: k for k, v in target_mapping.items()}
print(f'Hasil Prediksi Program Studi: {reverse_mapping[prediksi[0]]}')

Hasil Prediksi Program Studi (numerik): 4
Hasil Prediksi Program Studi: Teknik Informatika


In [14]:
import pandas as pd

dataset_prev = pd.read_excel("Dataset TWS.xlsx")

# Integer code written to the CSV for each study-programme label.
program_studi_codes = {
    "Teknik Elektronika": 1,
    "Teknik Telekomunikasi": 2,
    "Teknik Elektro Industri": 3,
    "Teknik Informatika": 4,
    "Teknik Mekatronika": 5,
    "Teknik Komputer": 6,
    "Teknologi Multimedia Broadcasting": 7,
    "Sistem Pembangkitan Energi": 8,
    # The training cell of this notebook spells the same programme without
    # the "-an" suffix; accept both spellings so neither is left as raw text.
    "Sistem Pembangkit Energi": 8,
    "Teknologi Game": 9,
    "Teknologi Rekayasa Internet": 10,
    "Teknologi Rekayasa Multimedia": 11,
    "Sains Data Terapan": 12,
}

# Short names replacing the verbose admission-path labels.
jalur_short_names = {
    "SBMPN (Seleksi Bersama Masuk Politeknik Negeri)": "SBMPN",
    "SNMPN (Seleksi Nasional Masuk Politeknik Negeri)": "SNMPN",
    "SBMPTN/SNBT (Seleksi Nasional Berdasarkan Tes)": "SBMPTN/SNBT",
    "SNMPTN / SNBP (Seleksi Nasional Berdasarkan Prestasi)": "SNMPTN/SNBP",
    "SIMANDIRI PENS": "SIMANDIRI",
}

# Apply every replacement with the same exact-match .loc assignment as before;
# values that match no key (including missing values) are left untouched.
column_replacements = (
    ("Program Studi", program_studi_codes),
    ("Jalur Pendaftaran PENS", jalur_short_names),
)
for column, replacements in column_replacements:
    for old_value, new_value in replacements.items():
        dataset_prev.loc[dataset_prev[column] == old_value, column] = new_value

# Number the rows from 1 and write that index as the first CSV column.
dataset_prev.index += 1
dataset_prev.to_csv("dataset_new.csv", index=True)