# 6.2 Final Model Selection and Deployment - Code Brief

Condensed reference for model selection, validation, and deployment.

## Setup

In [None]:
import numpy as np
import pandas as pd
import joblib
import json
import os
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report
)
from sklearn.calibration import calibration_curve

## Final Model Training

In [None]:
# Hyperparameters for the final random forest, gathered in one mapping so the
# whole configuration can be read (and logged) as a unit.
rf_settings = dict(
    n_estimators=200,
    max_depth=12,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
    oob_score=True,
)

final_model = RandomForestClassifier(**rf_settings)
final_model.fit(X_train, y_train)

# oob_score=True above is what makes oob_score_ available after fitting.
oob_estimate = final_model.oob_score_
print(f"OOB Score: {oob_estimate:.4f}")

## Model Evaluation

In [None]:
# Score the held-out set: positive-class probabilities (column 1 of
# predict_proba) and the model's default hard labels.
y_prob = final_model.predict_proba(X_test)[:, 1]
y_pred = final_model.predict(X_test)

# Metrics computed from hard labels, keyed by their report name.
label_scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}
metrics = {
    name: scorer(y_test, y_pred) for name, scorer in label_scorers.items()
}
# ROC-AUC is the only metric that is fed probabilities instead of labels.
metrics['ROC-AUC'] = roc_auc_score(y_test, y_prob)

report_text = classification_report(y_test, y_pred)
print(report_text)

## Threshold Optimization

In [None]:
# Sweep candidate decision thresholds and record precision / recall / F1 at
# each one, then pick the threshold with the highest F1.
#
# NOTE(review): the sweep is scored against y_test, i.e. the threshold is
# tuned on the same labels the evaluation metrics are reported on. Consider
# tuning on a separate validation split so the reported numbers stay unbiased.
#
# linspace + round yields an exact 0.10, 0.15, ..., 0.85 grid (16 points).
# arange with a fractional step accumulates floating-point error, producing
# values such as 0.15000000000000002 that would leak into the saved metadata.
thresholds = np.round(np.linspace(0.10, 0.85, 16), 2)
threshold_metrics = []

for thresh in thresholds:
    # Positive class whenever the predicted probability reaches the threshold.
    y_pred_thresh = (y_prob >= thresh).astype(int)
    threshold_metrics.append({
        'Threshold': thresh,
        # zero_division=0: score 0 instead of warning when the denominator
        # of a metric is empty at this threshold.
        'Precision': precision_score(y_test, y_pred_thresh, zero_division=0),
        'Recall': recall_score(y_test, y_pred_thresh, zero_division=0),
        'F1 Score': f1_score(y_test, y_pred_thresh, zero_division=0)
    })

thresh_df = pd.DataFrame(threshold_metrics)
# idxmax returns the first row on ties, so the lowest tied threshold wins.
optimal_f1_thresh = thresh_df.loc[thresh_df['F1 Score'].idxmax(), 'Threshold']

## Feature Importance

In [None]:
# Pair every feature name with the forest's importance score, then rank the
# features from most to least important.
importance_df = pd.DataFrame(
    {
        'Feature': feature_names,
        'Importance': final_model.feature_importances_,
    }
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Show the ten highest-ranked features.
top_features = importance_df.head(10)
print(top_features)

## Model Serialization

In [None]:
# Persist the fitted model together with the artefacts needed to reload and
# use it: the feature-name list and a metadata record.
model_dir = '../../models/'
os.makedirs(model_dir, exist_ok=True)

joblib.dump(final_model, os.path.join(model_dir, 'final_model.joblib'))

# Save feature names.
# json cannot encode a pandas Index or a NumPy array, so coerce to a plain
# list first in case feature_names is one of those (e.g. taken from
# DataFrame.columns). A list passes through unchanged.
feature_name_list = list(feature_names)
with open(os.path.join(model_dir, 'feature_names.json'), 'w',
          encoding='utf-8') as f:
    json.dump(feature_name_list, f)

# Save metadata.
metadata = {
    'model_type': 'RandomForestClassifier',
    'version': '1.0.0',
    'created_date': datetime.now().isoformat(),
    'n_features': len(feature_name_list),
    # Cast to built-in float so the metrics serialize regardless of which
    # NumPy scalar type the scoring functions returned.
    'performance_metrics': {
        name: float(value) for name, value in metrics.items()
    },
    'recommended_thresholds': {
        'default': 0.5,
        'high_recall': 0.35,
        # The threshold that maximised F1 in the threshold sweep.
        'balanced': float(optimal_f1_thresh)
    }
}

with open(os.path.join(model_dir, 'metadata.json'), 'w',
          encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)

## Prediction Pipeline

In [None]:
class PredictionPipeline:
    """Load a saved classifier and turn its probabilities into risk labels.

    Attributes:
        model: Estimator restored with ``joblib.load``; it must expose
            ``predict_proba``.
        feature_names: Feature names read from the JSON file. They are used
            to select and order the columns of labelled (DataFrame-like)
            input before scoring.
        threshold: Probability at or above which the positive class is
            predicted.
    """

    # Probability cut-offs for the risk labels. Kept as class attributes so
    # they can be overridden on a subclass or an instance without changing
    # the constructor signature.
    HIGH_RISK_CUTOFF = 0.7
    MODERATE_RISK_CUTOFF = 0.4

    def __init__(self, model_path, feature_names_path, threshold=0.5):
        """Restore the model and its feature list from disk.

        Args:
            model_path: Path to the joblib file holding the fitted model.
            feature_names_path: Path to a JSON file with the feature names.
            threshold: Decision threshold for the positive class.
        """
        # NOTE: joblib.load unpickles arbitrary objects; only load model
        # files that come from a trusted source.
        self.model = joblib.load(model_path)
        with open(feature_names_path, 'r', encoding='utf-8') as f:
            self.feature_names = json.load(f)
        self.threshold = threshold

    def _align_features(self, X):
        """Return X restricted to, and ordered by, the saved feature names.

        Input that exposes ``columns`` is re-indexed so extra columns are
        dropped and column order matches ``self.feature_names``. Input
        without column labels cannot be re-ordered; only its width is
        checked when a 2-D ``shape`` is available.

        Raises:
            ValueError: If a required feature is missing, or an unlabelled
                2-D input has the wrong number of columns.
        """
        columns = getattr(X, 'columns', None)
        if columns is None:
            shape = getattr(X, 'shape', None)
            expected = len(self.feature_names)
            if shape is not None and len(shape) == 2 and shape[1] != expected:
                raise ValueError(
                    f"Expected {expected} features, got {shape[1]}"
                )
            return X

        missing = [name for name in self.feature_names if name not in columns]
        if missing:
            raise ValueError(f"Input is missing required features: {missing}")
        return X[self.feature_names]

    def predict(self, X):
        """Score X and classify each row.

        Args:
            X: Feature matrix; DataFrame-like input is aligned to the saved
                feature names first.

        Returns:
            dict with ``probabilities`` (positive-class probability per row),
            ``predictions`` (0/1 integers obtained with ``self.threshold``)
            and ``risk_levels`` (list of 'High Risk' / 'Moderate Risk' /
            'Low Risk' strings).

        Raises:
            ValueError: If X lacks a required feature or has the wrong width.
        """
        X = self._align_features(X)
        probabilities = self.model.predict_proba(X)[:, 1]
        predictions = (probabilities >= self.threshold).astype(int)

        # Vectorized labelling; the high-risk test takes precedence, and
        # anything below the moderate cut-off falls through to 'Low Risk'.
        risk_levels = np.where(
            probabilities >= self.HIGH_RISK_CUTOFF,
            'High Risk',
            np.where(
                probabilities >= self.MODERATE_RISK_CUTOFF,
                'Moderate Risk',
                'Low Risk',
            ),
        ).tolist()

        return {
            'probabilities': probabilities,
            'predictions': predictions,
            'risk_levels': risk_levels
        }

## Load and Use Saved Model

In [None]:
# Round-trip check: restore the model from disk and confirm it reproduces the
# in-memory model's probabilities on the first five test rows.
saved_model_path = os.path.join(model_dir, 'final_model.joblib')
loaded_model = joblib.load(saved_model_path)

sample_rows = X_test[:5]
test_pred = loaded_model.predict_proba(sample_rows)[:, 1]
original_pred = final_model.predict_proba(sample_rows)[:, 1]

predictions_agree = np.allclose(test_pred, original_pred)
print(f"Predictions match: {predictions_agree}")

## Key Concepts

| Threshold | Priority | Trade-off |
|:----------|:---------|:----------|
| 0.35 | High Recall | More false alarms |
| 0.45-0.50 | Balanced | Good F1 score |
| 0.65+ | High Precision | May miss at-risk students |

## Risk Levels

| Level | Probability | Action |
|:------|:------------|:-------|
| High | >= 70% | Immediate intervention |
| Moderate | 40% to < 70% | Proactive outreach |
| Low | < 40% | Standard support |