In [None]:
# Install XGBoost if you haven't already (uncomment if needed)
# !pip install xgboost

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Cell purpose: train an XGBoost regressor for 'Wages Reliability' with the
# identifier columns removed, report MAE / R2 on a 20% hold-out split, and
# plot the ten highest feature importances.
print("--- Configuring Model Training ---")

# 1. DEFINE LEAKY COLUMNS TO DROP
# Identifier columns let the model "memorize" individual rows instead of
# learning patterns, so they are excluded from the features.
id_cols_to_drop = [
    'Request ID',
    'Case Id',
    'Service Provider Id',
    'Service Provider Contact Id',
    'Claim ID',  # Just in case; names missing from df are filtered out below
]

# 2. PREPARE X and y
target_name = 'Wages Reliability'
# All three reliability columns are removed from the features, not only the
# one being predicted.
targets_to_exclude = ['Annual Leave Reliability', 'Long Service Leave Reliability', 'Wages Reliability']

# Drop targets AND the IDs, restricted to the columns df really has so that
# DataFrame.drop does not raise on a missing name.
cols_to_drop = targets_to_exclude + id_cols_to_drop
existing_drop = [c for c in cols_to_drop if c in df.columns]

X = df.drop(columns=existing_drop)
y = df[target_name]

# 3. SPLIT DATA (80% train / 20% test, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training Features: {X_train.shape[1]} columns")
print(f"Training Rows: {X_train.shape[0]}")

# 4. TRAIN XGBOOST (The Upgrade)
# n_estimators=500: More trees
# learning_rate=0.05: Smaller contribution per tree (slower, steadier fit)
# n_jobs=-1: Uses all CPU cores
model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
    n_jobs=-1
)

print("Training XGBoost... (This might take 30 seconds)")
model.fit(X_train, y_train)

# 5. EVALUATE on the hold-out rows
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n--- Final Model Performance ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
# NOTE: R2 is the coefficient of determination, not a classification accuracy,
# despite the wording of the label below.
print(f"R2 Score (Accuracy):       {r2:.4f}")

# 6. PLOT FEATURE IMPORTANCE (The "Clean" View)
# Top 10 features by the fitted model's importance scores.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False).head(10)

plt.figure(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the
# y variable is also assigned to `hue`. dodge=False keeps one full-width bar
# per feature on seaborn versions that would otherwise dodge hue levels.
ax = sns.barplot(
    x='importance',
    y='feature',
    hue='feature',
    data=feature_importance,
    palette='magma',
    dodge=False,
)
# Some seaborn versions add a legend for `hue`; it would only repeat the
# y-axis labels, so remove it when present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title(f'True Drivers of {target_name} (IDs Removed)')
plt.xlabel('Importance Score')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Cell purpose: baseline RandomForest regressor for 'Wages Reliability' on an
# 80/20 split, followed by MAE / R2 and a top-10 feature-importance chart.

# 1. SELECT FEATURES & TARGET
# The Target is what we want to predict
target_name = 'Wages Reliability'

# The Features (X) are everything EXCEPT the target columns.
# All reliability targets are dropped so the model cannot "cheat" by looking at
# Annual Leave to predict Wages. errors='ignore' tolerates a missing column.
# NOTE(review): identifier columns (e.g. 'Request ID') are NOT removed in this
# cell, so they stay in X if df contains them - confirm this is intended.
drop_targets = ['Annual Leave Reliability', 'Long Service Leave Reliability', 'Wages Reliability']
X = df.drop(columns=drop_targets, errors='ignore')
y = df[target_name]

# 2. SPLIT DATA (80% Training, 20% Testing)
# random_state=42 ensures we get the same split every time (reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training on {X_train.shape[0]} rows. Testing on {X_test.shape[0]} rows.")

# 3. TRAIN THE MODEL
# n_estimators=100 means "build 100 decision trees"
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

print("✅ Model Training Complete.")

# 4. EVALUATE PERFORMANCE
# Predictions are made on the test split, which was not used for fitting.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n--- Model Performance Results ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R2 Score (Accuracy):       {r2:.4f}")
print("---------------------------------")
# mae*100 reads as percentage points only if the target is on a 0-1 scale
# (assumed here - TODO confirm).
print(f"Interpretation: On average, the model's reliability score is off by {mae*100:.2f}%.")

# 5. VISUALIZE FEATURE IMPORTANCE (The "Why")
# Top 10 columns by the fitted forest's importance scores.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False).head(10)

plt.figure(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the
# y variable is also assigned to `hue`. dodge=False keeps one full-width bar
# per feature on seaborn versions that would otherwise dodge hue levels.
ax = sns.barplot(
    x='importance',
    y='feature',
    hue='feature',
    data=feature_importance,
    palette='viridis',
    dodge=False,
)
# Some seaborn versions add a legend for `hue`; it would only repeat the
# y-axis labels, so remove it when present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title(f'Top 10 Predictors for {target_name}')
plt.xlabel('Importance Score')
plt.show()

In [None]:
# Diagnostic cell: report whether each expected engineered feature is present
# in the training matrix, then print its correlation with the target column.

print("--- Checking for Key Features in Training Data ---")

# The 'smart' features we expect to see
key_features = [
    'Provider_Reputation_Score',
    'IP_Wage_to_ABS_Ratio',
    'IP_Tenure_Years',
    'IP Wages',
]

# 1. Presence check against the columns of the current training split
training_columns = X_train.columns
for feature in key_features:
    status = (
        f"✅ FOUND: {feature}"
        if feature in training_columns
        else f"❌ MISSING: {feature} (This is why it's not in the chart!)"
    )
    print(status)

# 2. Strength check: correlation of each available feature with the target.
# The full df is used here rather than the split training frame.
if 'Wages Reliability' in df.columns:
    print("\n--- Correlation with Target ---")
    target_series = df['Wages Reliability']
    for feature in (name for name in key_features if name in df.columns):
        print(f"{feature}: {df[feature].corr(target_series):.4f}")

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Cell purpose: recast the problem as binary classification (reliability below
# THRESHOLD = "Risk"), train a class-weighted XGBoost classifier on a
# stratified 80/20 split, and show the classification report, confusion
# matrix and top-10 feature importances.
print("--- Pivoting to Binary Classification ---")

# 1. DEFINE THE THRESHOLD (Business Logic)
# User Proposal: Anything below 0.95 is "Unreliable" (Risk)
THRESHOLD = 0.95

# Create Binary Target: 1 = RISK (Unreliable), 0 = SAFE (Reliable)
# The positive class (1) is the thing we want to catch (the risk).
# NOTE: a missing reliability value compares False against the threshold and
# is therefore labelled 0 (Safe), if any are present.
y_class = (df['Wages Reliability'] < THRESHOLD).astype(int)

# Everything below (stratified split, class weighting, two-label report)
# needs both classes to exist, so fail early with a clear message otherwise.
if y_class.nunique() < 2:
    raise ValueError(
        f"Threshold {THRESHOLD} puts every row in a single class; "
        "binary classification needs both Safe and Risk examples."
    )

print(f"Reliability Threshold: {THRESHOLD*100}%")
print("Class Distribution:")
print(y_class.value_counts(normalize=True).rename({0: 'Safe (Majority)', 1: 'Risk (Minority)'}))

# 2. BUILD FEATURES AND SPLIT DATA
# X is rebuilt here instead of reusing whatever an earlier cell left in the
# kernel, so the features really are "targets and IDs removed" regardless of
# which cell ran last. errors='ignore' skips names df does not have.
non_feature_cols = [
    'Annual Leave Reliability',
    'Long Service Leave Reliability',
    'Wages Reliability',
    'Request ID',
    'Case Id',
    'Service Provider Id',
    'Service Provider Contact Id',
    'Claim ID',
]
X = df.drop(columns=non_feature_cols, errors='ignore')

# stratify=y_class keeps the same share of Risk cases in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.2, random_state=42, stratify=y_class)

# 3. CALCULATE SCALE_POS_WEIGHT
# This tells XGBoost: "Pay X times more attention to the positive (Risk) class"
# Formula: Count(label 0) / Count(label 1), computed on the training split.
n_safe = int((y_train == 0).sum())
n_risk = int((y_train == 1).sum())
ratio = n_safe / n_risk
print(f"\nImbalance Ratio: {ratio:.2f} (Model will weight 'Risk' cases {ratio:.2f}x more)")

# 4. TRAIN CLASSIFIER
model_class = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    scale_pos_weight=ratio,  # re-weights the positive class to offset imbalance
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss'
)

model_class.fit(X_train, y_train)

# 5. EVALUATE (Confusion Matrix)
y_pred_class = model_class.predict(X_test)

print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred_class, target_names=['Safe', 'Risk']))

# Visualize. from_predictions creates its own figure unless an Axes is
# supplied, so the sized Axes is passed in explicitly; otherwise an empty
# figure is left behind and figsize is ignored.
fig, ax = plt.subplots(figsize=(6, 5))
ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred_class,
    display_labels=['Safe', 'Risk'],
    cmap='Blues',
    colorbar=False,
    ax=ax,
)
ax.set_title("Confusion Matrix (Did we catch the risks?)")
plt.show()

# 6. CHECK FEATURE IMPORTANCE (Again)
# Top 10 features by the classifier's importance scores.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model_class.feature_importances_
}).sort_values('importance', ascending=False).head(10)

plt.figure(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the
# y variable is also assigned to `hue`. dodge=False keeps one full-width bar
# per feature on seaborn versions that would otherwise dodge hue levels.
importance_ax = sns.barplot(
    x='importance',
    y='feature',
    hue='feature',
    data=feature_importance,
    palette='coolwarm',
    dodge=False,
)
# Some seaborn versions add a legend for `hue`; it would only repeat the
# y-axis labels, so remove it when present.
if importance_ax.get_legend() is not None:
    importance_ax.get_legend().remove()
plt.title(f'Top Predictors of UNRELIABILITY (< {THRESHOLD})')
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve

# Cell purpose: sweep the decision threshold applied to the classifier's
# predicted Risk probability and tabulate precision / recall for class 1.

# 1. Get the probability of the positive class (column 1) instead of Yes/No
y_probs = model_class.predict_proba(X_test)[:, 1]

# 2. Test different Thresholds
thresholds = [0.50, 0.60, 0.70, 0.80, 0.85, 0.90, 0.95]

print(f"{'Threshold':<10} | {'Precision (Accuracy of Flags)':<30} | {'Recall (Risks Caught)':<25}")
print("-" * 75)

for t in thresholds:
    # Flag a row as Risk when its probability reaches the threshold
    y_pred_new = (y_probs >= t).astype(int)

    # zero_division=0: at a high threshold nothing may be flagged, which makes
    # precision undefined; report 0 instead of emitting a warning per row.
    report = classification_report(y_test, y_pred_new, output_dict=True, zero_division=0)
    # With output_dict=True the per-class entries are keyed by the label as a string
    prec = report['1']['precision']
    rec = report['1']['recall']

    # Same column widths as the header so rows stay aligned whatever the
    # rendered width of the percentage (e.g. 0.00% vs 100.00%).
    print(f"{t:<10.2f} | {prec:<30.2%} | {rec:.2%}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Cell purpose: scatter 'IP Wages' against 'Wages Reliability' as a visual
# leakage check between that feature and the target.

# Plot at most 2000 points to keep the chart readable. The size is capped at
# len(df) because DataFrame.sample raises when asked for more rows than exist
# (sampling without replacement). The fixed seed keeps the sample repeatable.
plot_sample = df.sample(n=min(2000, len(df)), random_state=42)

plt.figure(figsize=(10, 6))
sns.scatterplot(data=plot_sample, x='IP Wages', y='Wages Reliability', alpha=0.3)

plt.title("Leakage Test: IP Wage vs. Reliability")
plt.xlabel("IP Wage ($)")
plt.ylabel("Reliability Score (0.0 - 1.0)")
plt.grid(True, alpha=0.3)
plt.show()