In [1]:
# Cell 1: Import Libraries
# --- standard library ---
import os

# --- third-party: persistence and data handling ---
import joblib
import numpy as np
import pandas as pd

# --- third-party: scikit-learn building blocks ---
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Visible confirmation that the import cell ran to completion
print("Libraries imported successfully.")

Libraries imported successfully.


In [3]:
# Cell 2: Load and Preprocess Data
# =====================
# 1. Load Data
# =====================
# The CSV is semicolon-delimited. Header names are trimmed of surrounding
# whitespace immediately after reading so later lookups by name are reliable.
df = pd.read_csv("data/dropoutgraduate.csv", sep=";")
df = df.rename(columns=str.strip)

# Fail fast if the label column is missing once the headers are cleaned
assert "Target" in df.columns, "‚ö†Ô∏è 'Target' column not found, please check the CSV file."

# The label is kept as its own Series; every remaining column is a feature.
y = df["Target"]

# One-hot encode any categorical feature columns. drop_first=True drops the
# first level of each encoded column to avoid redundant (collinear) indicators.
X = pd.get_dummies(df.drop(columns="Target"), drop_first=True)

print("Data loaded and preprocessed.")
print("Shape of features (X):", X.shape)
print("Shape of target (y):", y.shape)

Data loaded and preprocessed.
Shape of features (X): (3630, 36)
Shape of target (y): (3630,)


In [4]:
# Cell 3: Split Data into Training and Testing Sets
# =====================
# Dataset Splitting
# =====================
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions aligned between the two partitions, and the fixed seed makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Data split into training and testing sets.")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

Data split into training and testing sets.
X_train shape: (2904, 36)
X_test shape: (726, 36)


In [5]:
# Cell 4: Define Pipeline and Hyperparameter Grid
# =====================
# 2. Pipeline + GridSearchCV
# =====================

# Two-step pipeline: standardise the features, then fit a decision tree.
# Decision trees do not strictly need scaled inputs; the scaler is kept as a
# conventional preprocessing step inside the pipeline.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", DecisionTreeClassifier(random_state=42)),
    ]
)

# Search space for the grid search. The "clf__" prefix routes each entry to
# the pipeline step named "clf".
param_grid = {
    "clf__criterion": ["gini", "entropy"],
    "clf__max_depth": [None, 5, 10, 20, 30],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__class_weight": [None, "balanced"],
}

print("Pipeline and hyperparameter grid defined.")

Pipeline and hyperparameter grid defined.


In [6]:
# Cell 5: Train the Model with GridSearchCV

# Shuffled, stratified 5-fold splitter with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid. Candidates are ranked by weighted F1,
# all CPU cores are used, and progress is reported at verbosity level 1.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="f1_weighted",
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

# fit() returns the search object itself, so the best pipeline can be read
# straight off the result of the call.
print("Starting GridSearchCV training...")
best_model = grid.fit(X_train, y_train).best_estimator_

print("\n‚úÖ Training complete.")
print("Best Hyperparameters:", grid.best_params_)

Starting GridSearchCV training...
Fitting 5 folds for each of 180 candidates, totalling 900 fits

‚úÖ Training complete.
Best Hyperparameters: {'clf__class_weight': None, 'clf__criterion': 'gini', 'clf__max_depth': 5, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 2}


In [7]:
# Cell 6: Evaluate the Best Model
# =====================
# 3. Model Evaluation
# =====================

# Hard class predictions on the held-out test set
y_pred = best_model.predict(X_test)

# ROC AUC needs a continuous score for the positive class; column 1 of
# predict_proba is used, which presumes a two-class problem.
# Only the failures that mean "AUC is not computable here" are caught:
#   * ValueError  - raised by roc_auc_score (e.g. y_test holds a single class)
#   * IndexError  - predict_proba returned a single column, so [:, 1] is invalid
# Any other exception is a genuine bug and is allowed to surface.
try:
    y_prob = best_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)
except (ValueError, IndexError) as e:
    auc = None
    print(f"Could not calculate AUC: {e}")

# Print evaluation metrics (precision / recall / F1 are support-weighted averages)
print("\nüìä Model Evaluation Results")
print("-" * 30)
print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall   : {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1 Score : {f1_score(y_test, y_pred, average='weighted'):.4f}")
# Compare against None explicitly: an AUC of exactly 0.0 is falsy and would be
# silently omitted by a bare truthiness check.
if auc is not None:
    print(f"AUC      : {auc:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


üìä Model Evaluation Results
------------------------------
Accuracy : 0.8939
Precision: 0.8976
Recall   : 0.8939
F1 Score : 0.8919
AUC      : 0.8981

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.79      0.85       284
           1       0.87      0.96      0.92       442

    accuracy                           0.89       726
   macro avg       0.90      0.87      0.88       726
weighted avg       0.90      0.89      0.89       726


Confusion Matrix:
[[223  61]
 [ 16 426]]


In [8]:
# Cell 7: Save the Best Model
# =====================
# 4. Save Best Model
# =====================

# Destination file for the serialized pipeline
model_path = "models/best_decisiontree.pkl"

# Make sure the target folder is present; exist_ok=True makes re-running safe
os.makedirs("models", exist_ok=True)

# Persist the tuned pipeline (scaler + tree) with joblib
joblib.dump(best_model, model_path)

print(f"\nüíæ Best model saved to {model_path}")


üíæ Best model saved to models/best_decisiontree.pkl


In [9]:
# Cell 8: Save Evaluation Results
# =====================
# 5. Save Evaluation Results
# =====================

# Test-set metrics gathered into a single record. Precision, recall and F1 use
# the weighted average; `auc` is whatever the evaluation cell stored (a float,
# or None when it could not be computed).
results = {
    "accuracy": accuracy_score(y_test, y_pred),
    **{
        key: scorer(y_test, y_pred, average="weighted")
        for key, scorer in (
            ("precision", precision_score),
            ("recall", recall_score),
            ("f1", f1_score),
        )
    },
    "auc": auc,
}

# Output location for the one-row summary table
results_path = "data/DecisionTreeResults.csv"

# One-row DataFrame written as a semicolon-delimited CSV without the index
results_df = pd.DataFrame([results])
results_df.to_csv(results_path, sep=";", index=False)

print(f"üìÑ Evaluation results saved to {results_path}")

üìÑ Evaluation results saved to data/DecisionTreeResults.csv
