In [3]:
import numpy as np
import pandas as pd
import warnings

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# --- TensorFlow / Keras ---
try:
    import tensorflow as tf
    from tensorflow.keras import layers, models, callbacks, regularizers
except ImportError as e:
    raise ImportError(
        "TensorFlow is not installed. Please install it first, e.g.:\n"
        "  pip install tensorflow\n"
        "or\n"
        "  conda install -c conda-forge tensorflow"
    ) from e

# Set seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

In [4]:
# ========= HYPERPARAMETERS (adjust here) =========
learning_rate = 1e-3          # Adam learning rate
epochs = 100                  # Max epochs (early stopping is used)
batch_size = 512              # Batch size
hidden_units = [256, 128]     # Hidden layer widths (add/remove/resize layers)
dropout_rate = 0.30           # Dropout probability
l2_reg = 0.0                  # L2 regularization factor (e.g., 0.0001)
use_class_weight = True       # Use class weights for imbalanced data
early_stopping_patience = 10  # Patience for early stopping
reduce_lr_patience = 5        # Patience for reducing learning rate
reduce_lr_factor = 0.5        # Factor to reduce learning rate by
decision_threshold = 0.50     # Inference threshold (default 0.5)

# Embedding dimension rule: dim = min(50, max(4, ceil(vocab_size^0.5)))
def embedding_dim(vocab_size: int) -> int:
    return int(min(50, max(4, np.ceil(vocab_size ** 0.5))))
# ================================================

In [5]:
# ---------- 1) Load data ----------
in_path = "data/dropoutgraduate.csv"
df = pd.read_csv(in_path, sep=";")

# Find Target column (case-insensitive)
target_col = next((c for c in df.columns if c.strip().lower() == "target"), None)
if target_col is None:
    raise KeyError("Couldn't find a 'Target' column (case-insensitive).")

# Ensure binary target {0,1}; map labels if needed; drop any rows with class 2 if present
y_num = pd.to_numeric(df[target_col], errors="coerce")
if y_num.isna().any():
    print("Non-numeric target found, mapping 'Dropout'->0, 'Graduate'->1.")
    label_to_code = {"dropout": 0, "graduate": 1, "enrolled": 2}
    y_num = df[target_col].astype(str).str.strip().str.lower().map(label_to_code)
df[target_col] = y_num.astype(int)
df = df[df[target_col].isin([0, 1])].copy()

# Split features/labels
X = df.drop(columns=[target_col])
y = df[target_col].astype(int).values

# Identify column types
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = [c for c in X.columns if c not in numeric_cols]

print(f"Data loaded. Shape: {df.shape}")
print(f"Numeric features: {len(numeric_cols)}")
print(f"Categorical features: {len(categorical_cols)}")

Data loaded. Shape: (3630, 37)
Numeric features: 36
Categorical features: 0


In [6]:
# ---------- 2) Train / test split (80/20 with stratify) ----------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"Train set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

Train set size: 2904
Test set size: 726


In [8]:
# ---------- 3) Preprocess: numeric impute+scale; categorical -> integer ids ----------
# Numeric: fill NaN with train medians, then StandardScaler
if numeric_cols:
    num_medians = X_train[numeric_cols].median()
    X_train_num = X_train[numeric_cols].copy().fillna(num_medians)
    X_test_num  = X_test[numeric_cols].copy().fillna(num_medians)

    scaler = StandardScaler()
    X_train_num_scaled = scaler.fit_transform(X_train_num)
    X_test_num_scaled  = scaler.transform(X_test_num)
    print("Numeric features scaled.")
else:
    X_train_num_scaled = None
    X_test_num_scaled = None

# Categorical: build per-column vocab on TRAIN only; map to int ids (0=unknown)
cat_mapping = {}
cat_vocab_size = {}
def build_mapping(series: pd.Series):
    uniq = pd.Series(series.dropna().astype(str).unique())
    mapping = {v: i+1 for i, v in enumerate(uniq)}  # 1..V; 0 reserved for unknown/missing
    vocab_size = len(mapping) + 1
    return mapping, vocab_size

for col in categorical_cols:
    m, V = build_mapping(X_train[col])
    cat_mapping[col] = m
    cat_vocab_size[col] = V

def encode_categorical(df_part: pd.DataFrame, cols, mappings):
    out = {}
    for c in cols:
        m = mappings[c]
        arr = df_part[c].astype(str).map(m).fillna(0).astype("int32").values  # unseen -> 0
        out[c] = arr
    return out

if categorical_cols:
    X_train_cat = encode_categorical(X_train, categorical_cols, cat_mapping)
    X_test_cat  = encode_categorical(X_test,  categorical_cols, cat_mapping)
    print("Categorical features encoded.")
else:
    X_train_cat = {}
    X_test_cat = {}

Numeric features scaled.


In [9]:
# ---------- 4) Build Keras MLP with embeddings (Functional API) ----------
inputs = []
feats = []

# numeric input
if numeric_cols:
    inp_num = layers.Input(shape=(len(numeric_cols),), name="num")
    inputs.append(inp_num)
    feats.append(inp_num)

# categorical inputs + embeddings
for col in categorical_cols:
    inp = layers.Input(shape=(1,), dtype="int32", name=f"cat_{col}")
    dim = embedding_dim(cat_vocab_size[col])
    emb = layers.Embedding(input_dim=cat_vocab_size[col], output_dim=dim, name=f"emb_{col}")(inp)
    emb = layers.Reshape((dim,), name=f"reshape_{col}")(emb)
    inputs.append(inp)
    feats.append(emb)

# concatenate
if len(feats) == 1:
    x = feats[0]
else:
    x = layers.Concatenate(name="concat")(feats)

x = layers.BatchNormalization(name="bn0")(x)

for i, units in enumerate(hidden_units, start=1):
    x = layers.Dense(units, activation="relu",
                     kernel_regularizer=regularizers.l2(l2_reg),
                     name=f"dense{i}")(x)
    x = layers.BatchNormalization(name=f"bn{i}")(x)
    x = layers.Dropout(dropout_rate, name=f"drop{i}")(x)

out = layers.Dense(1, activation="sigmoid", name="out")(x)
model = models.Model(inputs=inputs, outputs=out)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss="binary_crossentropy",
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")]
)

model.summary()

In [10]:
# ---------- 5) Prepare model inputs ----------
def pack_inputs(num_scaled, cat_dict):
    inp = []
    if numeric_cols:
        inp.append(num_scaled.astype("float32"))
    for col in categorical_cols:
        inp.append(cat_dict[col])
    return inp

train_inp = pack_inputs(X_train_num_scaled if numeric_cols else np.empty((len(X_train), 0)),
                        X_train_cat)
test_inp  = pack_inputs(X_test_num_scaled  if numeric_cols else np.empty((len(X_test), 0)),
                        X_test_cat)

# class weights (optional)
class_weight = None
if use_class_weight:
    classes = np.array([0, 1])
    weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
    class_weight = {int(c): float(w) for c, w in zip(classes, weights)}
    print(f"Using class weights: {class_weight}")

# callbacks
cbs = [
    callbacks.EarlyStopping(monitor="val_auc", mode="max",
                            patience=early_stopping_patience, restore_best_weights=True),
    callbacks.ReduceLROnPlateau(monitor="val_auc", mode="max",
                                patience=reduce_lr_patience, factor=reduce_lr_factor, min_lr=1e-6),
]

Using class weights: {0: 1.2770448548812665, 1: 0.8217317487266553}


In [16]:
# ---------- 6) Fit & evaluate ----------
# NOTE: Using the test set for validation is for demonstration.
# In a rigorous setting, you should create a separate validation set from the training data.
history = model.fit(
    train_inp, y_train,
    validation_data=(test_inp, y_test),
    epochs=epochs,
    batch_size=batch_size,
    class_weight=class_weight,
    callbacks=cbs,
    verbose=1
)

Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9153 - auc: 0.9668 - loss: 0.2216 - val_accuracy: 0.8967 - val_auc: 0.9517 - val_loss: 0.2989 - learning_rate: 1.0000e-06
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9236 - auc: 0.9691 - loss: 0.2133 - val_accuracy: 0.8953 - val_auc: 0.9519 - val_loss: 0.2949 - learning_rate: 1.0000e-06
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9194 - auc: 0.9679 - loss: 0.2169 - val_accuracy: 0.8953 - val_auc: 0.9519 - val_loss: 0.2908 - learning_rate: 1.0000e-06
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9167 - auc: 0.9655 - loss: 0.2249 - val_accuracy: 0.8953 - val_auc: 0.9519 - val_loss: 0.2871 - learning_rate: 1.0000e-06
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9132 - auc: 0.9658 - l

In [17]:
# Evaluate
proba = model.predict(test_inp, batch_size=batch_size).ravel()
pred  = (proba >= decision_threshold).astype(int)

print("\n--- Model Evaluation on Test Set ---")
print("Accuracy:", f"{accuracy_score(y_test, pred):.4f}")
try:
    auc = roc_auc_score(y_test, proba)
    print("ROC-AUC:", f"{auc:.4f}")
except Exception as e:
    print("ROC-AUC could not be computed:", e)

print("\nClassification report:\n", classification_report(y_test, pred, digits=4, target_names=["Dropout (0)", "Graduate (1)"]))
print("\nConfusion matrix:\n", confusion_matrix(y_test, pred))

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step

--- Model Evaluation on Test Set ---
Accuracy: 0.8967
ROC-AUC: 0.9520

Classification report:
               precision    recall  f1-score   support

 Dropout (0)     0.8693    0.8662    0.8677       284
Graduate (1)     0.9142    0.9163    0.9153       442

    accuracy                         0.8967       726
   macro avg     0.8917    0.8912    0.8915       726
weighted avg     0.8966    0.8967    0.8967       726


Confusion matrix:
 [[246  38]
 [ 37 405]]


In [18]:
# ---------- 7) Interactive inference ----------
def infer_from_input(model, numeric_cols, categorical_cols, num_medians, scaler, cat_mapping):
    """
    Prompts the user for a new student's data and makes a prediction.
    """
    print("\n--- Interactive Inference ---")
    print("Enter values for a NEW student (press Enter to use the training set median/default):")
    record_num = None
    if numeric_cols:
        record_num = []
        for col in numeric_cols:
            raw = input(f"  > {col} (numeric): ")
            if raw.strip() == "":
                val = num_medians[col]
            else:
                try:
                    val = float(raw)
                except ValueError:
                    print(f"    Invalid input. Using median value for '{col}'.")
                    val = num_medians[col]
            record_num.append(val)
        record_num = np.array(record_num, dtype="float32").reshape(1, -1)
        record_num = scaler.transform(record_num).astype("float32")

    record_cat = {}
    if categorical_cols:
        for col in categorical_cols:
            # Show up to 10 examples for context
            examples = pd.Series(list(cat_mapping[col].keys()))[:10]
            ex_str = ", ".join(map(str, examples))
            raw = input(f"  > {col} (categorical, e.g., {ex_str}): ")
            key = raw.strip()
            # Map the input to its integer ID, defaulting to 0 for unknown values
            idx = cat_mapping[col].get(key, 0)
            record_cat[col] = np.array([idx], dtype="int32")

    # Pack the processed inputs for the model
    inp = []
    if numeric_cols:
        inp.append(record_num)
    for col in categorical_cols:
        inp.append(record_cat[col])

    # Predict
    p = float(model.predict(inp, verbose=0)[0, 0])
    label = 1 if p >= decision_threshold else 0
    label_map = {0: "Dropout", 1: "Graduate"}

    print("\n--- Prediction Result ---")
    print(f"Predicted Status: {label_map[label]}")
    print(f"Probability Graduate (class 1): {p:.4f}")
    print(f"Probability Dropout (class 0): {1-p:.4f}")

In [20]:
# Call the function to start interactive inference
infer_from_input(
    model,
    numeric_cols,
    categorical_cols,
    num_medians if numeric_cols else pd.Series(dtype=float),
    scaler if numeric_cols else None,
    cat_mapping
)


--- Interactive Inference ---
Enter values for a NEW student (press Enter to use the training set median/default):

--- Prediction Result ---
Predicted Status: Graduate
Probability Graduate (class 1): 0.6422
Probability Dropout (class 0): 0.3578


