In [None]:
# Install XGBoost if you haven't already (uncomment if needed)
# !pip install xgboost

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

print("--- Configuring Model Training ---")

# NOTE(review): `df` and `pd` are used below but are neither created nor imported
# in this cell -- presumably an earlier cell provides them; confirm on a fresh kernel.

# 1. DEFINE LEAKY COLUMNS TO DROP
# Identifier columns are kept out of the feature matrix so the model cannot
# "memorize" individual records instead of learning general patterns.
id_cols_to_drop = [
    'Request ID', 
    'Case Id', 
    'Service Provider Id', 
    'Service Provider Contact Id',
    'Claim ID' # Just in case
]

# 2. PREPARE X and y
# Only 'Wages Reliability' is predicted here; all three reliability columns are
# removed from X so that none of them can be used as a feature.
target_name = 'Wages Reliability'
targets_to_exclude = ['Annual Leave Reliability', 'Long Service Leave Reliability', 'Wages Reliability']

# Drop Targets AND the IDs
cols_to_drop = targets_to_exclude + id_cols_to_drop
# Keep only the names actually present in df so drop() does not raise on absent columns
existing_drop = [c for c in cols_to_drop if c in df.columns]

X = df.drop(columns=existing_drop)
y = df[target_name]

# 3. SPLIT DATA
# 80% train / 20% test; random_state=42 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training Features: {X_train.shape[1]} columns")
print(f"Training Rows: {X_train.shape[0]}")

# 4. TRAIN XGBOOST (The Upgrade)
# n_estimators=500: More trees
# learning_rate=0.05: Learns slower but more accurately
# max_depth=6: Caps the depth of each individual tree
# n_jobs=-1: Uses all CPU cores
model = XGBRegressor(
    n_estimators=500, 
    learning_rate=0.05, 
    max_depth=6, 
    random_state=42,
    n_jobs=-1
)

print("Training XGBoost... (This might take 30 seconds)")
model.fit(X_train, y_train)

# 5. EVALUATE
# Metrics are computed on the held-out test rows only.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n--- Final Model Performance ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
# NOTE: r2_score is the coefficient of determination for a regression; the
# "(Accuracy)" wording in the label below is informal, not a classification accuracy.
print(f"R2 Score (Accuracy):       {r2:.4f}")

# 6. PLOT FEATURE IMPORTANCE (The "Clean" View)
# Now that IDs are gone, what REALLY matters?
# Pairs each feature column with the fitted model's importance and keeps the 10 largest.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False).head(10)

# Horizontal bar chart: one bar per feature, longest bar = most important.
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance, palette='magma')
plt.title(f'True Drivers of {target_name} (IDs Removed)')
plt.xlabel('Importance Score')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# 1. SELECT FEATURES & TARGET
# The Target is what we want to predict
target_name = 'Wages Reliability'

# The Features (X) are everything EXCEPT the target columns
# We drop ALL reliability targets so the model doesn't "cheat" by looking at Annual Leave to predict Wages
# NOTE(review): identifier columns (e.g. 'Request ID', 'Case Id') are NOT removed in
# this cell, so they stay in X if df still contains them -- confirm that is intended
# for this baseline. errors='ignore' skips any listed target that is absent from df.
drop_targets = ['Annual Leave Reliability', 'Long Service Leave Reliability', 'Wages Reliability']
X = df.drop(columns=drop_targets, errors='ignore')
y = df[target_name]

# 2. SPLIT DATA (80% Training, 20% Testing)
# random_state=42 ensures we get the same split every time (reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training on {X_train.shape[0]} rows. Testing on {X_test.shape[0]} rows.")

# 3. TRAIN THE MODEL
# n_estimators=100 means "build 100 decision trees"
# n_jobs=-1 asks scikit-learn to use all available CPU cores.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

print("‚úÖ Model Training Complete.")

# 4. EVALUATE PERFORMANCE
# Make predictions on the Test Set (data the model has never seen)
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n--- Model Performance Results ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R2 Score (Accuracy):       {r2:.4f}")
print("---------------------------------")
# NOTE(review): reporting mae*100 as a percentage assumes the target is a
# fraction on a 0-1 scale -- confirm against the data.
print(f"Interpretation: On average, the model's reliability score is off by {mae*100:.2f}%.")

# 5. VISUALIZE FEATURE IMPORTANCE (The "Why")
# This shows which columns drove the decisions the most
# Keeps only the 10 features with the largest importance, largest first.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False).head(10)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance, palette='viridis')
plt.title(f'Top 10 Predictors for {target_name}')
plt.xlabel('Importance Score')
plt.show()

In [None]:
# Diagnostic cell: confirm the engineered features reached the training matrix,
# then report how each one correlates with the target.

print("--- Checking for Key Features in Training Data ---")

# Engineered ("smart") features we expect to find among the model inputs
key_features = [
    'Provider_Reputation_Score',
    'IP_Wage_to_ABS_Ratio',
    'IP_Tenure_Years',
    'IP Wages',
]

# 1. Presence check against the columns of the training split
for col in key_features:
    print(
        f"‚úÖ FOUND: {col}"
        if col in X_train.columns
        else f"‚ùå MISSING: {col} (This is why it's not in the chart!)"
    )

# 2. Strength check: correlation of each available feature with the target.
# The full df is used rather than X_train, which has already been split.
if 'Wages Reliability' in df.columns:
    print("\n--- Correlation with Target ---")
    for col in key_features:
        if col not in df.columns:
            continue  # nothing to correlate for a feature df does not carry
        corr = df[col].corr(df['Wages Reliability'])
        print(f"{col}: {corr:.4f}")

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

print("--- Pivoting to Binary Classification ---")

# 1. DEFINE THE THRESHOLD (Business Logic)
# User Proposal: Anything below 0.95 is "Unreliable" (Risk)
THRESHOLD = 0.95

# Create Binary Target: 1 = RISK (Unreliable), 0 = SAFE (Reliable)
# The "Positive Class" (1) is the thing we want to catch (The Risk)
y_class = (df['Wages Reliability'] < THRESHOLD).astype(int)

print(f"Reliability Threshold: {THRESHOLD*100}%")
print("Class Distribution:")
print(y_class.value_counts(normalize=True).rename({0: 'Safe (Majority)', 1: 'Risk (Minority)'}))

# 2. SPLIT DATA
# NOTE(review): X is whatever feature matrix the most recently executed modelling
# cell left in the kernel. Only the XGBoost regression cell removes the ID columns;
# the RandomForest cell does not -- confirm X is the ID-free version before running.
# stratify=y_class keeps the same share of Risk cases in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.2, random_state=42, stratify=y_class)

# 3. CALCULATE SCALE_POS_WEIGHT
# This tells XGBoost: "Pay X times more attention to the minority class"
# Formula: Count(Majority) / Count(Minority)
ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
print(f"\nImbalance Ratio: {ratio:.2f} (Model will weight 'Risk' cases {ratio:.2f}x more)")

# 4. TRAIN CLASSIFIER
model_class = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    scale_pos_weight=ratio,  # up-weights the positive (Risk) class to offset imbalance
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss'
)

model_class.fit(X_train, y_train)

# 5. EVALUATE (Confusion Matrix)
y_pred_class = model_class.predict(X_test)

print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred_class, target_names=['Safe', 'Risk']))

# Visualize.
# from_predictions() creates its own figure unless it is handed an Axes, so the
# sized Axes is created first and passed in; otherwise an empty extra figure is
# shown and the requested figsize is ignored.
fig, ax = plt.subplots(figsize=(6, 5))
ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred_class,
    display_labels=['Safe', 'Risk'],
    cmap='Blues',
    colorbar=False,
    ax=ax,
)
ax.set_title("Confusion Matrix (Did we catch the risks?)")
plt.show()

# 6. CHECK FEATURE IMPORTANCE (Again)
# Does the logic change when looking for anomalies?
# Top 10 features by the classifier's importance score, largest first.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model_class.feature_importances_
}).sort_values('importance', ascending=False).head(10)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance, palette='coolwarm')
plt.title(f'Top Predictors of UNRELIABILITY (< {THRESHOLD})')
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve

# 1. Get the Probability Scores (0% to 100% risk) instead of just Yes/No
# Column 1 of predict_proba is the probability of the positive (Risk) class.
y_probs = model_class.predict_proba(X_test)[:, 1]

# 2. Test different Thresholds
thresholds = [0.50, 0.60, 0.70, 0.80, 0.85, 0.90, 0.95]

print(f"{'Threshold':<10} | {'Precision (Accuracy of Flags)':<30} | {'Recall (Risks Caught)':<25}")
print("-" * 75)

for t in thresholds:
    # Flag a case as Risk only when its predicted probability reaches the threshold
    y_pred_new = (y_probs >= t).astype(int)

    # At strict thresholds nothing may be flagged at all; precision is then
    # undefined. zero_division=0 reports it as 0 without emitting a warning.
    report = classification_report(y_test, y_pred_new, output_dict=True, zero_division=0)
    prec = report['1']['precision']
    rec = report['1']['recall']

    # Same column widths as the header so rows stay aligned for any value width
    print(f"{t:<10.2f} | {prec:<30.2%} | {rec:.2%}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visual leakage check: if 'IP Wages' alone determined 'Wages Reliability',
# the points would collapse onto a clear curve.
plt.figure(figsize=(10, 6))

# Plot at most 2000 points to keep the chart readable. The cap matters:
# DataFrame.sample raises ValueError when asked for more rows than df holds.
sample_size = min(2000, len(df))
sns.scatterplot(data=df.sample(sample_size, random_state=42), x='IP Wages', y='Wages Reliability', alpha=0.3)

plt.title("Leakage Test: IP Wage vs. Reliability")
plt.xlabel("IP Wage ($)")
plt.ylabel("Reliability Score (0.0 - 1.0)")
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visual leakage check (this cell repeats the preceding scatter-plot cell):
# if 'IP Wages' alone determined 'Wages Reliability', the points would collapse
# onto a clear curve.
plt.figure(figsize=(10, 6))

# Plot at most 2000 points to keep the chart readable. The cap matters:
# DataFrame.sample raises ValueError when asked for more rows than df holds.
sample_size = min(2000, len(df))
sns.scatterplot(data=df.sample(sample_size, random_state=42), x='IP Wages', y='Wages Reliability', alpha=0.3)

plt.title("Leakage Test: IP Wage vs. Reliability")
plt.xlabel("IP Wage ($)")
plt.ylabel("Reliability Score (0.0 - 1.0)")
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
import re

print("--- Sanitizing Column Names for LightGBM ---")

def clean_col_names(df):
    """Return a copy of ``df`` whose column labels are safe for LightGBM.

    Every run of characters that is not a letter, digit or underscore is
    replaced by a single underscore.

    Args:
        df: DataFrame whose columns should be sanitized. It is not modified.

    Returns:
        A new DataFrame with the same data and sanitized, unique column names.
    """
    seen = set()
    new_columns = []
    for col in df.columns:
        # str() lets non-string labels (e.g. integers) through re.sub
        base = re.sub(r'[^A-Za-z0-9_]+', '_', str(col))

        # Two different labels can sanitize to the same text ('A B' and 'A_B');
        # add a numeric suffix so the resulting names stay unique.
        name = base
        suffix = 0
        while name in seen:
            suffix += 1
            name = f"{base}_{suffix}"

        seen.add(name)
        new_columns.append(name)

    # Work on a copy so the caller's frame keeps its original labels
    cleaned = df.copy()
    cleaned.columns = new_columns
    return cleaned

# Sanitize both halves of the split so train and test keep matching column names
X_train, X_test = (clean_col_names(part) for part in (X_train, X_test))

print("‚úÖ Columns sanitized. Example of new names:")
# Show the first five sanitized names as a quick sanity check
print(list(X_train.columns[:5]))

In [None]:
# 1. Install LightGBM (if needed)
# !pip install lightgbm

from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix

print("--- Benchmarking LightGBM ---")

# NOTE(review): X_train / X_test / y_train / y_test are taken from the kernel as left
# by earlier cells -- presumably the stratified binary split with sanitized column
# names; confirm the execution order before trusting the numbers.

# Calculate the scale_pos_weight equivalent for LightGBM
# LightGBM uses 'scale_pos_weight' just like XGBoost
# ratio = count of class 0 / count of class 1 in the training labels
ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)

# 2. Configure Model
# Same tree count, learning rate and depth as the XGBoost classifier cell.
lgbm = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    scale_pos_weight=ratio,  # Handle Imbalance
    random_state=42,
    n_jobs=-1,
    verbose=-1 # Silences warnings
)

# 3. Train
lgbm.fit(X_train, y_train)

# 4. Evaluate
# predict() returns hard class labels for the held-out rows.
y_pred_lgbm = lgbm.predict(X_test)

print("\n--- LightGBM Results ---")
print(classification_report(y_test, y_pred_lgbm, target_names=['Safe', 'Risk']))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

print("--- Benchmarking Logistic Regression (The Baseline) ---")

# 1. Logistic Regression requires Scaling (Standardizing inputs)
# XGBoost didn't care, but Logistic Regression breaks if Wages are 5000 and Ratio is 1.2
# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. Configure Model
# class_weight='balanced' is the Logistic equivalent of scale_pos_weight
log_reg = LogisticRegression(class_weight='balanced', random_state=42, max_iter=1000)

# 3. Train
log_reg.fit(X_train_scaled, y_train)

# 4. Evaluate
y_pred_log = log_reg.predict(X_test_scaled)

print("\n--- Logistic Regression Results ---")
print(classification_report(y_test, y_pred_log, target_names=['Safe', 'Risk']))

# 5. "Coefficients" (The Explainer)
# In Logistic Regression, we don't get "Importance", we get "Coefficients" (Weights)
# Positive Coeff = Increases Risk. Negative Coeff = Decreases Risk.
# Labels come from X_train -- the frame the scaler and model were actually fitted
# on -- so each coefficient is paired with its own column even if the global X
# has been rebuilt or renamed by another cell since the split.
coeffs = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': log_reg.coef_[0]
}).sort_values('Coefficient', ascending=False)

print("\n--- Top Risk Drivers (Logistic) ---")
print(coeffs.head(5))
print("\n--- Top Safety Drivers (Logistic) ---")
print(coeffs.tail(5))

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

print("--- Training Single Decision Tree for Visualization ---")

# 1. Train a shallow tree (max_depth=3)
# We keep it shallow so the diagram is readable by humans.
# We use class_weight='balanced' to handle the risk imbalance.
dt_model = DecisionTreeClassifier(
    max_depth=3,
    class_weight='balanced',
    random_state=42
)

dt_model.fit(X_train, y_train)

# 2. Evaluate (Just to see how it compares)
y_pred_dt = dt_model.predict(X_test)
print("\n--- Single Tree Performance ---")
print(classification_report(y_test, y_pred_dt, target_names=['Safe', 'Risk']))

# 3. PLOT THE LOGIC
plt.figure(figsize=(20, 10))
plot_tree(
    dt_model,
    # Use the columns the tree was fitted on, as a plain list: a pandas Index is
    # rejected by some scikit-learn releases, and the global X may be stale.
    feature_names=list(X_train.columns),
    class_names=['Safe', 'Risk'],
    filled=True,             # Color each node by its majority class
    rounded=True,
    fontsize=10
)
plt.title("The Business Logic Flowchart (Top 3 Levels)")
plt.show()

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score

print("--- STARTING GRAND MODEL TOURNAMENT ---")

# NOTE(review): this cell reuses X_train / X_test / y_train / y_test from whichever
# split cell ran last in the kernel; it presumably expects the binary Risk target
# (1 = Risk, 0 = Safe) -- confirm before running.

# ==========================================
# 1. DATA PREP (Sanitization for LightGBM)
# ==========================================
def clean_col_names(df):
    """Return `df` with every run of non-alphanumeric/underscore characters in its
    column labels replaced by a single underscore (the input is not modified)."""
    # str() keeps non-string labels from crashing re.sub
    return df.rename(columns=lambda x: re.sub(r'[^A-Za-z0-9_]+', '_', str(x)))


def leaderboard_row(label, y_true, y_pred):
    """Build one leaderboard entry for a set of hard 0/1 predictions.

    Args:
        label: Text shown in the 'Model' column.
        y_true: True binary labels.
        y_pred: Predicted binary labels.

    Returns:
        Dict with the model label plus recall, precision and F1 of class 1.
    """
    return {
        'Model': label,
        'Recall (Catch Rate)': recall_score(y_true, y_pred),
        # zero_division=0: a strict threshold may flag nothing, which leaves
        # precision/F1 undefined; score that as 0 without a warning.
        'Precision (Accuracy)': precision_score(y_true, y_pred, zero_division=0),
        'F1-Score': f1_score(y_true, y_pred, zero_division=0),
    }


X_train = clean_col_names(X_train)
X_test = clean_col_names(X_test)

# Calculate Imbalance Ratio for Boosting Models (class 0 count / class 1 count)
ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
print(f"Imbalance Ratio: {ratio:.2f}x")

# Store results here
leaderboard = []

# ==========================================
# 2. XGBOOST (The Favorite) - Multi-Threshold
# ==========================================
print("Training XGBoost...")
xgb_model = XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=6,
                          scale_pos_weight=ratio, random_state=42, n_jobs=-1, verbosity=0)
xgb_model.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the positive (Risk) class
xgb_probs = xgb_model.predict_proba(X_test)[:, 1]

# Test 3 Thresholds
for thresh in [0.50, 0.75, 0.90]:
    preds = (xgb_probs >= thresh).astype(int)
    leaderboard.append(leaderboard_row(f'XGBoost (Threshold {thresh})', y_test, preds))

# ==========================================
# 3. LIGHTGBM (The Speedster)
# ==========================================
print("Training LightGBM...")
lgbm_model = LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=6,
                            scale_pos_weight=ratio, random_state=42, n_jobs=-1, verbose=-1)
lgbm_model.fit(X_train, y_train)
lgbm_preds = lgbm_model.predict(X_test)

leaderboard.append(leaderboard_row('LightGBM (Default)', y_test, lgbm_preds))

# ==========================================
# 4. DECISION TREE (The White Box)
# ==========================================
print("Training Decision Tree...")
dt_model = DecisionTreeClassifier(max_depth=4, class_weight='balanced', random_state=42)
dt_model.fit(X_train, y_train)
dt_preds = dt_model.predict(X_test)

leaderboard.append(leaderboard_row('Decision Tree (Depth 4)', y_test, dt_preds))

# ==========================================
# 5. LOGISTIC REGRESSION (The Baseline)
# ==========================================
print("Training Logistic Regression...")
# Needs scaling; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
log_model.fit(X_train_scaled, y_train)
log_preds = log_model.predict(X_test_scaled)

leaderboard.append(leaderboard_row('Logistic Regression', y_test, log_preds))

# ==========================================
# 6. FINAL RESULTS
# ==========================================
# Best recall first
df_results = pd.DataFrame(leaderboard).sort_values('Recall (Catch Rate)', ascending=False)

print("\n--- üèÜ MODEL LEADERBOARD üèÜ ---")
# Display with nice formatting
display(df_results.style.background_gradient(cmap='Greens', subset=['Recall (Catch Rate)', 'Precision (Accuracy)']))

# Visual Comparison
plt.figure(figsize=(12, 6))
sns.barplot(data=df_results, x='Recall (Catch Rate)', y='Model', palette='viridis')
plt.title('Recall Comparison: Which model catches the most errors?')
plt.xlabel('Recall Score (0.0 - 1.0)')
plt.xlim(0, 1.0)
plt.axvline(0.80, color='red', linestyle='--', label='Target Recall (80%)')
plt.legend()
plt.show()

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score

print("--- STARTING GRAND MODEL TOURNAMENT (Binary Classification) ---")

# ==========================================
# 0. PREPARE TARGET (The Binary Conversion)
# ==========================================
# Business Rule: If Reliability < 0.95, it is a RISK (1). Otherwise, SAFE (0).
THRESHOLD = 0.95
target_name = 'Wages Reliability'

print(f"Converting {target_name} to Binary Risk (Threshold < {THRESHOLD})...")
y_binary = (df[target_name] < THRESHOLD).astype(int)

print("Class Distribution:")
print(y_binary.value_counts(normalize=True).rename({0: 'Safe', 1: 'Risk'}))

# Prepare Features (Drop Targets and IDs)
# Note: 'df' must already exist from the previous cleaning steps.
# errors='ignore' skips any listed column that df does not contain.
ids_to_drop = ['Request ID', 'Case Id', 'Service Provider Id', 'Service Provider Contact Id', 'Claim ID']
targets_to_drop = ['Annual Leave Reliability', 'Long Service Leave Reliability', 'Wages Reliability']
X = df.drop(columns=ids_to_drop + targets_to_drop, errors='ignore')

# Split Data (Stratify ensures we keep the same % of risks in test set)
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42, stratify=y_binary)

# ==========================================
# 1. DATA SANITIZATION (Fixing LightGBM Error)
# ==========================================
def clean_col_names(df):
    """Return `df` with every run of non-alphanumeric/underscore characters in its
    column labels (spaces, slashes, etc.) replaced by a single underscore.
    The input frame is not modified."""
    # str() keeps non-string labels from crashing re.sub
    return df.rename(columns=lambda x: re.sub(r'[^A-Za-z0-9_]+', '_', str(x)))


def leaderboard_row(label, y_true, y_pred):
    """Build one leaderboard entry for a set of hard 0/1 predictions.

    Args:
        label: Text shown in the 'Model' column.
        y_true: True binary labels.
        y_pred: Predicted binary labels.

    Returns:
        Dict with the model label plus recall, precision and F1 of class 1.
    """
    return {
        'Model': label,
        'Recall (Catch Rate)': recall_score(y_true, y_pred),
        # zero_division=0: a strict threshold may flag nothing, which leaves
        # precision/F1 undefined; score that as 0 without a warning.
        'Precision (Accuracy)': precision_score(y_true, y_pred, zero_division=0),
        'F1-Score': f1_score(y_true, y_pred, zero_division=0),
    }


X_train = clean_col_names(X_train)
X_test = clean_col_names(X_test)

# Calculate Imbalance Ratio for Boosting Models (class 0 count / class 1 count)
ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
print(f"Imbalance Ratio: {ratio:.2f}x (Models will weight Risk cases heavier)")

# Store results here
leaderboard = []

# ==========================================
# 2. XGBOOST (The Favorite)
# ==========================================
print("\nTraining XGBoost...")
xgb_model = XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=6,
                          scale_pos_weight=ratio, random_state=42, n_jobs=-1)
xgb_model.fit(X_train, y_train)
# Column 1 of predict_proba is the probability of the positive (Risk) class
xgb_probs = xgb_model.predict_proba(X_test)[:, 1]

# Test 3 Thresholds
for thresh in [0.50, 0.75, 0.90]:
    preds = (xgb_probs >= thresh).astype(int)
    leaderboard.append(leaderboard_row(f'XGBoost (Threshold {thresh})', y_test, preds))

# ==========================================
# 3. LIGHTGBM (The Speedster)
# ==========================================
print("Training LightGBM...")
lgbm_model = LGBMClassifier(n_estimators=500, learning_rate=0.05, max_depth=6,
                            scale_pos_weight=ratio, random_state=42, n_jobs=-1, verbose=-1)
lgbm_model.fit(X_train, y_train)
lgbm_preds = lgbm_model.predict(X_test)

leaderboard.append(leaderboard_row('LightGBM (Default)', y_test, lgbm_preds))

# ==========================================
# 4. DECISION TREE (The White Box)
# ==========================================
print("Training Decision Tree...")
dt_model = DecisionTreeClassifier(max_depth=4, class_weight='balanced', random_state=42)
dt_model.fit(X_train, y_train)
dt_preds = dt_model.predict(X_test)

leaderboard.append(leaderboard_row('Decision Tree (Depth 4)', y_test, dt_preds))

# ==========================================
# 5. LOGISTIC REGRESSION (The Baseline)
# ==========================================
print("Training Logistic Regression...")
# Needs scaling; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
log_model.fit(X_train_scaled, y_train)
log_preds = log_model.predict(X_test_scaled)

leaderboard.append(leaderboard_row('Logistic Regression', y_test, log_preds))

# ==========================================
# 6. FINAL RESULTS DISPLAY
# ==========================================
# Best recall first
df_results = pd.DataFrame(leaderboard).sort_values('Recall (Catch Rate)', ascending=False)

print("\n--- üèÜ MODEL LEADERBOARD üèÜ ---")
display(df_results.style.background_gradient(cmap='Greens', subset=['Recall (Catch Rate)', 'Precision (Accuracy)']))

# Visual Comparison
plt.figure(figsize=(12, 6))
sns.barplot(data=df_results, x='Recall (Catch Rate)', y='Model', palette='viridis')
plt.title('Recall Comparison: Which model catches the most errors?')
plt.xlabel('Recall Score (0.0 - 1.0)')
plt.xlim(0, 1.0)
plt.axvline(0.80, color='red', linestyle='--', label='Target Recall (80%)')
plt.legend()
plt.show()

In [None]:
from xgboost import XGBClassifier
# recall_score is used for the verdict below, so it is imported here rather than
# relied upon as a leftover from another cell.
from sklearn.metrics import classification_report, recall_score
import pandas as pd
import numpy as np

print("--- üïµÔ∏è‚Äç‚ôÄÔ∏è LEAKAGE DETECTION: THE QUALIFYING ROUND üïµÔ∏è‚Äç‚ôÄÔ∏è ---")

# 1. PREPARE TIME-SERIES DATA
# The request date is fetched back from the raw dataset so rows can be ordered
# chronologically. This cell expects X and y_binary from the tournament cell and
# df_main_raw (with a 'Request Received Date' column) from the data-loading cell.
try:
    # Create a temporary dataframe for testing
    df_leakage = X.copy() # X is our feature set

    # Re-attach the target
    df_leakage['Target_Risk'] = y_binary

    # Re-attach the Date (Critical Step); pandas aligns the assignment on index labels.
    # We assume df_main_raw was defined in Cell 2. If not, reload your CSV here.
    df_leakage['Request_Date'] = pd.to_datetime(df_main_raw['Request Received Date'])

    # Rows without a date cannot be placed on the timeline. sort_values() puts
    # missing dates last, which would silently push them into the "future" test
    # fold, so they are removed before sorting.
    missing_dates = int(df_leakage['Request_Date'].isna().sum())
    if missing_dates:
        print(f"Dropping {missing_dates} rows with no request date.")
        df_leakage = df_leakage.dropna(subset=['Request_Date'])

    # Sort by Date (Oldest -> Newest)
    df_leakage = df_leakage.sort_values('Request_Date')

    print("‚úÖ Successfully sorted data chronologically.")

    # 2. STRICT TIME SPLIT (No Shuffling)
    # Train on the Past (First 80%), Test on the Future (Last 20%)
    cutoff = int(len(df_leakage) * 0.8)

    # Split Features and Target (the helper columns are not features)
    X_time = df_leakage.drop(columns=['Target_Risk', 'Request_Date'])
    y_time = df_leakage['Target_Risk']

    X_train_time = X_time.iloc[:cutoff]
    y_train_time = y_time.iloc[:cutoff]

    X_test_time = X_time.iloc[cutoff:]
    y_test_time = y_time.iloc[cutoff:]

    print(f"Training on oldest {len(X_train_time)} claims.")
    print(f"Testing on newest {len(X_test_time)} claims (The Future).")

    # 3. TRAIN TEST MODEL
    # We use a quick XGBoost to see if it holds up
    print("\nTraining Time-Aware Model...")
    # Class 0 count / class 1 count within the training period only
    ratio_time = float(np.sum(y_train_time == 0)) / np.sum(y_train_time == 1)

    model_time = XGBClassifier(n_estimators=200, learning_rate=0.05, scale_pos_weight=ratio_time, random_state=42, n_jobs=-1)
    model_time.fit(X_train_time, y_train_time)

    # 4. EVALUATE
    y_pred_time = model_time.predict(X_test_time)

    print("\n--- ‚è≥ TIME-TRAVEL CHECK RESULTS ‚è≥ ---")
    print(classification_report(y_test_time, y_pred_time, target_names=['Safe', 'Risk']))

    # 5. VERDICT
    # Recall of the Risk class on the newest 20% decides the message.
    rec_score = recall_score(y_test_time, y_pred_time)
    if rec_score < 0.60:
        print("‚ùå FAILURE: Recall dropped significantly on future data. You likely have Time-Based Leakage.")
    elif rec_score < 0.80:
        print("‚ö†Ô∏è CAUTION: Performance dropped slightly. This is normal, but check your features.")
    else:
        print("‚úÖ PASS: Model generalizes well to the future. You are ready for the Tournament!")

except Exception as e:
    # Best-effort diagnostic: report the problem instead of stopping the notebook.
    print(f"Could not run Time Check. Error: {e}")
    print("Did you reload 'df_main_raw' in Cell 2?")

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, recall_score

print("--- üïµÔ∏è‚Äç‚ôÄÔ∏è LEAKAGE DETECTION: THE QUALIFYING ROUND üïµÔ∏è‚Äç‚ôÄÔ∏è ---")

# 1. DEFINE TARGET & FEATURES (Locally for this test)
# We recreate X and y here because they haven't been defined in the notebook yet
THRESHOLD = 0.95
target_col = 'Wages Reliability'

# Create Binary Target
y_check = (df[target_col] < THRESHOLD).astype(int)

# Create Features (Drop Targets and potential IDs if they still exist)
cols_to_exclude = ['Annual Leave Reliability', 'Long Service Leave Reliability', 'Wages Reliability',
                   'Request ID', 'Case Id', 'Service Provider Id', 'Service Provider Contact Id', 'Claim ID']
existing_exclude = [c for c in cols_to_exclude if c in df.columns]
X_check = df.drop(columns=existing_exclude)

# 2. RECOVER THE DATE (The "Join" Step)
# We need to fetch the 'Request Received Date' from the RAW dataframe.
# We use .loc[df.index] to ensure we only get dates for the rows that survived data cleaning.
try:
    # Make sure df_main_raw is available (from Cell 2)
    if 'df_main_raw' not in locals():
        raise ValueError("df_main_raw is missing! Please re-run Cell 2.")

    # Create a temporary dataframe for sorting
    df_leakage = X_check.copy()
    df_leakage['Target_Risk'] = y_check
    
    # CRITICAL: Fetch the date using the index to match rows perfectly
    # Change 'Request Received Date' below if your raw column has a different name
    date_col_name = 'Request Received Date' 
    df_leakage['Request_Date'] = df_main_raw.loc[df.index, date_col_name]
    
    # Convert to datetime just in case
    df_leakage['Request_Date'] = pd.to_datetime(df_leakage['Request_Date'], errors='coerce')
    
    # Drop rows where date is missing (can't sort them)
    df_leakage = df_leakage.dropna(subset=['Request_Date'])
    
    # 3. SORT CHRONOLOGICALLY (Oldest -> Newest)
    df_leakage = df_leakage.sort_values('Request_Date')
    print(f"‚úÖ Successfully sorted {len(df_leakage)} claims by Date.")

    # 4. STRICT TIME SPLIT (First 80% vs Last 20%)
    cutoff = int(len(df_leakage) * 0.8)
    
    # Split Features (drop the temp date column) and Target
    X_time = df_leakage.drop(columns=['Target_Risk', 'Request_Date'])
    y_time = df_leakage['Target_Risk']
    
    X_train_time = X_time.iloc[:cutoff]
    y_train_time = y_time.iloc[:cutoff]
    
    X_test_time = X_time.iloc[cutoff:]
    y_test_time = y_time.iloc[cutoff:]
    
    print(f"Training on Oldest {len(X_train_time)} claims (Past).")
    print(f"Testing on Newest {len(X_test_time)} claims (Future).")

    # 5. TRAIN & TEST
    print("\nTraining Time-Aware Model (XGBoost)...")
    # Calculate class weight for this