In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix

In [2]:
# ---------- 1) Load data ----------
in_path = "data/dropoutgraduate.csv"
df = pd.read_csv(in_path, sep=";")

# Find Target column (case-insensitive, ignoring surrounding whitespace)
target_col = next((c for c in df.columns if c.strip().lower() == "target"), None)
if target_col is None:
    raise KeyError("Couldn't find a 'Target' column (case-insensitive).")

# Normalise the target to numeric codes. Entries that already parse as numbers
# are kept; only the entries that fail to parse are looked up in the label
# table, so a mixed or partially-missing column does not wipe out valid rows.
label_to_code = {"dropout": 0, "graduate": 1, "enrolled": 2}
y_num = pd.to_numeric(df[target_col], errors="coerce")
not_numeric = y_num.isna()
if not_numeric.any():
    from_labels = (
        df.loc[not_numeric, target_col]
        .astype(str)
        .str.strip()
        .str.lower()
        .map(label_to_code)
    )
    y_num = y_num.fillna(from_labels)

# Keep only the binary classes {0, 1}. Filtering happens BEFORE the integer
# cast so rows with class 2, missing targets, or unrecognised labels are
# dropped instead of making astype(int) fail on NaN.
is_binary = y_num.isin([0, 1])
n_excluded = int((~is_binary).sum())
if n_excluded:
    print(f"Excluded {n_excluded} rows whose target is not 0 or 1.")
df = df.loc[is_binary].copy()
df[target_col] = y_num.loc[is_binary].astype(int)

# Split features/labels
X = df.drop(columns=[target_col])
y = df[target_col]

print("Data loaded and prepared.")
print("Features shape:", X.shape)
print("Target distribution:\n", y.value_counts())

Data loaded and prepared.
Features shape: (3630, 36)
Target distribution:
 Target
1    2209
0    1421
Name: count, dtype: int64


In [3]:
# Partition the feature columns by dtype: anything pandas reports as a numpy
# numeric dtype is "numeric"; every remaining column is treated as categorical.
numeric_cols = list(X.select_dtypes(include=[np.number]).columns)
categorical_cols = []
for column_name in X.columns:
    if column_name not in numeric_cols:
        categorical_cols.append(column_name)

print(f"Found {len(numeric_cols)} numeric columns.")
print(f"Found {len(categorical_cols)} categorical columns.")

Found 36 numeric columns.
Found 0 categorical columns.


In [5]:
# ---------- 2) Build pipeline ----------
# Numeric branch: only fills missing values with the column median.
numeric_tf = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at predict time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
]
categorical_tf = Pipeline(categorical_steps)

# Route each column group to its branch; columns in neither list are dropped.
column_branches = [
    ("num", numeric_tf, numeric_cols),
    ("cat", categorical_tf, categorical_cols),
]
preprocess = ColumnTransformer(column_branches, remainder="drop")

forest_settings = {
    "n_estimators": 300,         # trees (start reasonably high)
    "max_depth": None,           # let trees expand fully unless limited
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "sqrt",      # common default for classification
    "class_weight": "balanced",  # handle class imbalance if present
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**forest_settings)

# Preprocessing and model are chained so both are fitted on training data only.
pipe = Pipeline([("preprocess", preprocess), ("rf", rf)])

print("Pipeline created successfully.")
pipe

Pipeline created successfully.


0,1,2
,steps,"[('preprocess', ...), ('rf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [6]:
# ---------- 3) Train / test split (80/20 with stratify) ----------
# Stratifying on y keeps the class proportions the same in both partitions;
# the fixed random_state makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (2904, 36)
X_test shape: (726, 36)


In [7]:
# ---------- 4) Fit the pipeline ----------
# Fits the preprocessing steps and the forest on the training partition only.
print("Training the model...")
pipe.fit(X=X_train, y=y_train)
print("Training complete.")

Training the model...
Training complete.


In [9]:
# ---------- 4) Evaluate the model ----------
# Hard predictions for the threshold-based metrics; column 1 of predict_proba
# is kept as the score for ROC-AUC when the final estimator offers it.
y_pred = pipe.predict(X_test)
if hasattr(pipe.named_steps["rf"], "predict_proba"):
    y_proba = pipe.predict_proba(X_test)[:, 1]
else:
    y_proba = None

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", f"{test_accuracy:.4f}")

report_text = classification_report(y_test, y_pred, digits=4)
print("\nClassification report:\n", report_text)

# ROC-AUC is only attempted when scores exist and both classes occur in y_test.
has_two_classes = len(np.unique(y_test)) == 2
if has_two_classes and y_proba is not None:
    try:
        auc = roc_auc_score(y_test, y_proba)
        print("ROC-AUC:", f"{auc:.4f}")
    except Exception as e:
        print("ROC-AUC could not be computed:", e)

test_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion matrix:\n", test_confusion)

Accuracy: 0.9105

Classification report:
               precision    recall  f1-score   support

           0     0.9544    0.8099    0.8762       284
           1     0.8887    0.9751    0.9299       442

    accuracy                         0.9105       726
   macro avg     0.9215    0.8925    0.9030       726
weighted avg     0.9144    0.9105    0.9089       726

ROC-AUC: 0.9565

Confusion matrix:
 [[230  54]
 [ 11 431]]


In [10]:
# ---------- 5) Interactive inference function definition ----------
def infer_from_input(pipeline, feature_frame):
    """
    Interactively collect one record and print the pipeline's prediction for it.

    Prompts once per column of ``feature_frame`` (in column order) via
    ``input()``, builds a single-row DataFrame with the same columns, and
    prints the predicted label plus both class probabilities when the pipeline
    exposes ``predict_proba``.

    Parameters
    ----------
    pipeline : fitted estimator
        Must provide ``predict``; ``predict_proba`` is used when present.
    feature_frame : pandas.DataFrame
        Reference frame whose columns and dtypes decide which prompts are
        shown (numeric vs. categorical) and the column order of the new row.

    Returns
    -------
    None
        The result is printed, not returned.

    Notes
    -----
    Numeric answers that are blank, unparsable, or non-finite are stored as
    NaN; blank categorical answers are stored as NaN as well. Handling of
    those NaNs is left to ``pipeline``.
    """
    print("\nEnter values for a NEW student (press Enter to skip a field):")

    # Resolve the numeric columns once instead of re-running select_dtypes
    # for every column inside the loop.
    numeric_columns = set(feature_frame.select_dtypes(include=[np.number]).columns)

    record = {}
    for col in feature_frame.columns:
        if col in numeric_columns:
            raw = input(f"{col} (numeric): ")
            text = raw.strip()
            if text == "":
                record[col] = np.nan
                continue
            try:
                value = float(text)
            except ValueError:
                # Only a parse failure is tolerated; anything else (e.g.
                # KeyboardInterrupt) must propagate to the user.
                print(f"  Could not parse '{raw}', setting as NaN.")
                value = np.nan
            else:
                # float() accepts "inf"/"-inf"; treat those as missing rather
                # than passing a non-finite value on to the model.
                if np.isinf(value):
                    print(f"  Value '{raw}' is not finite, setting as NaN.")
                    value = np.nan
            record[col] = value
        else:
            # show up to 10 example categories
            examples = feature_frame[col].dropna().astype(str).unique()[:10]
            ex_str = ", ".join(map(str, examples))
            text = input(f"{col} (categorical e.g. {ex_str}): ").strip()
            record[col] = text if text != "" else np.nan

    # Single-row frame with exactly the reference column order.
    new_df = pd.DataFrame([record], columns=feature_frame.columns)

    # Make predictions. Probability support is checked on the pipeline itself
    # so the function does not depend on a step being named "rf".
    pred = pipeline.predict(new_df)[0]
    proba = pipeline.predict_proba(new_df)[0] if hasattr(pipeline, "predict_proba") else None

    # Map prediction to human-readable label; unknown codes fall back to str().
    label_map = {0: "Dropout", 1: "Graduate"}
    label = label_map.get(int(pred), str(pred))

    # Print results. Index 0/1 of proba is reported as class 0/1, which
    # assumes the model's classes are ordered [0, 1].
    print("\n--- Prediction Result ---")
    print("Predicted Outcome:", label)
    if proba is not None and len(proba) == 2:
        print(f"Probability Graduate (class 1): {proba[1]:.4f}")
        print(f"Probability Dropout (class 0):  {proba[0]:.4f}")
    print("-------------------------\n")

In [12]:
# Interactive smoke test: prompts for every feature column of X and prints the
# fitted pipeline's prediction. The cell is safe to re-run for another record.
infer_from_input(pipeline=pipe, feature_frame=X)


Enter values for a NEW student (press Enter to skip a field):

--- Prediction Result ---
Predicted Outcome: Graduate
Probability Graduate (class 1): 0.9567
Probability Dropout (class 0):  0.0433
-------------------------

