In [2]:
!pip install shap

Collecting shap
  Downloading shap-0.50.0-cp313-cp313-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.50.0-cp313-cp313-win_amd64.whl (549 kB)
   ---------------------------------------- 0.0/549.1 kB ? eta -:--:--
   ---------------------------------------- 549.1/549.1 kB 9.6 MB/s eta 0:00:00
Downloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap

   -------------------- ------------------- 1/2 [shap]
   -------------------- ------------------- 1/2 [shap]
   ---------------------------------------- 2/2 [shap]

Successfully installed shap-0.50.0 slicer-0.0.8


In [3]:
#1. Import Libraries
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

In [4]:
#2. Load Data & Trained Model Inputs
X_train = pd.read_csv("../data/processed/X_train.csv")
X_test  = pd.read_csv("../data/processed/X_test.csv")
y_train = pd.read_csv("../data/processed/y_train.csv").values.ravel()
y_test  = pd.read_csv("../data/processed/y_test.csv").values.ravel()

In [5]:
#3. Retrain Best Model (Random Forest)
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight="balanced"
)
rf.fit(X_train, y_train)

In [6]:
#4. Initialize SHAP Explainer
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_test)

In [7]:
#5. Global Feature Importance (MOST IMPORTANT)
#5.1 SHAP Summary Plot
shap.summary_plot(
    shap_values[1],
    X_test,
    plot_type="dot"
)

AssertionError: The shape of the shap_values matrix does not match the shape of the provided data matrix.

In [None]:
#6. Bar Plot ‚Äì Average Impact
shap.summary_plot(
    shap_values[1],
    X_test,
    plot_type="bar"
)

In [None]:
#7. Individual Student Explanation (VERY IMPRESSIVE)
#7.1 Pick a High-Risk Student
high_risk_index = np.argmax(rf.predict_proba(X_test)[:,1])
high_risk_index

In [None]:
#7.2 SHAP Force Plot
shap.initjs()

shap.force_plot(
    explainer.expected_value[1],
    shap_values[1][high_risk_index],
    X_test.iloc[high_risk_index]
)

8. Business Interpretation (WRITE THIS AS MARKDOWN)

‚úçÔ∏è Add a Markdown cell:

Key Explainability Insights:

Extended inactivity significantly increases dropout risk

Consistent engagement reduces churn probability

Missed assessments act as strong early-warning signals

Academic performance complements behavioral indicators

9. How This Helps the Business (CRITICAL)

‚úçÔ∏è Markdown cell:

Actionable Interventions:

Auto-alert instructors when inactivity exceeds 14 days

Trigger mentoring for students missing assessments

Recommend engaging content for low-activity users

Prioritize high-risk students for early support

üî• This connects AI ‚Üí real decisions

In [12]:
#10. Save SHAP Feature Importance (Optional)
type(shap_values)
np.array(shap_values).shape
# Handle different SHAP output formats safely
if isinstance(shap_values, list):
    # Binary classification (older SHAP versions)
    shap_vals_churn = shap_values[1]
else:
    # Newer SHAP versions (3D array)
    shap_vals_churn = shap_values[:, :, 1]
shap_importance = pd.DataFrame({
    "feature": X_test.columns,
    "mean_abs_shap": np.abs(shap_vals_churn).mean(axis=0)
}).sort_values(by="mean_abs_shap", ascending=False)

In [13]:
shap_importance.to_csv(
    "../data/processed/shap_feature_importance.csv",
    index=False
)

In [14]:
shap_importance.head(10)

Unnamed: 0,feature,mean_abs_shap
40,final_result_Withdrawn,0.283978
39,final_result_Pass,0.072681
38,final_result_Fail,0.067441
5,last_active_day,0.051813
11,submissions,0.032312
8,max_score,0.017829
7,avg_score,0.0135
3,active_days,0.008662
9,min_score,0.007755
0,total_clicks,0.00483


‚úÖ END OF NOTEBOOK 06
What You Have Achieved:

‚úî Built explainable ML
‚úî Identified root causes of churn
‚úî Explained individual predictions
‚úî Delivered business-ready insigh