In [None]:
# Install XGBoost if you haven't already (uncomment if needed)
# !pip install xgboost

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

print("--- Configuring Model Training ---")

# Purpose of this cell: fit an XGBoost regressor for one reliability target
# using every column of `df` except the reliability targets and the
# identifier columns, score it on a 20% hold-out split, and plot the ten
# most important features.
# NOTE(review): `df` and `pd` are not created in this cell; they are assumed
# to come from an earlier cell -- confirm on a fresh "Restart & Run All".

# 1. DEFINE LEAKY COLUMNS TO DROP
# Identifier columns are excluded so the model cannot key on row identity
# ("memorize") instead of learning from real attributes.
id_cols_to_drop = [
    'Request ID',
    'Case Id',
    'Service Provider Id',
    'Service Provider Contact Id',
    'Claim ID'  # Just in case
]

# 2. PREPARE X and y
target_name = 'Wages Reliability'
# Every reliability column is excluded from the features, not only the one
# being predicted, so no target can act as a predictor for another.
targets_to_exclude = ['Annual Leave Reliability', 'Long Service Leave Reliability', 'Wages Reliability']

# Drop Targets AND the IDs
cols_to_drop = targets_to_exclude + id_cols_to_drop
# Only drop what actually exists, so a missing optional column is not an error
existing_drop = [c for c in cols_to_drop if c in df.columns]

X = df.drop(columns=existing_drop)
y = df[target_name]

# 3. SPLIT DATA
# 80% train / 20% test; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training Features: {X_train.shape[1]} columns")
print(f"Training Rows: {X_train.shape[0]}")

# 4. TRAIN XGBOOST (The Upgrade)
# n_estimators=500: number of boosted trees
# learning_rate=0.05: small step per tree, paired with the larger tree count
# max_depth=6: maximum depth of each tree
# n_jobs=-1: uses all CPU cores
model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
    n_jobs=-1
)

print("Training XGBoost... (This might take 30 seconds)")
model.fit(X_train, y_train)

# 5. EVALUATE
# Score on the hold-out rows only; these were not used for fitting.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# NOTE(review): R2 is the coefficient of determination, not a classification
# accuracy; the printed label is kept unchanged for output compatibility.
print("\n--- Final Model Performance ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R2 Score (Accuracy):       {r2:.4f}")

# 6. PLOT FEATURE IMPORTANCE (The "Clean" View)
# Now that IDs are gone, what REALLY matters?
# Importances are paired with X's column order, then the top 10 are kept.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False).head(10)

fig, ax = plt.subplots(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn 0.13 (removal planned for
# 0.14). Mapping hue to the same column as y keeps one colour per bar;
# dodge=False keeps full-width bars on older seaborn releases.
sns.barplot(
    x='importance',
    y='feature',
    hue='feature',
    data=feature_importance,
    palette='magma',
    dodge=False,
    ax=ax,
)
# Older seaborn draws a legend whenever hue is set; it only repeats the
# y-axis labels here, so remove it when present.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title(f'True Drivers of {target_name} (IDs Removed)')
ax.set_xlabel('Importance Score')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Purpose of this cell: fit a RandomForest regressor for one reliability
# target on every non-target column of `df`, score it on a 20% hold-out
# split, and plot the ten most important features.
# NOTE(review): `df` and `pd` are not created in this cell; they are assumed
# to come from an earlier cell -- confirm on a fresh "Restart & Run All".

# 1. SELECT FEATURES & TARGET
# The Target is what we want to predict
target_name = 'Wages Reliability'

# The Features (X) are everything EXCEPT the target columns
# We drop ALL reliability targets so the model doesn't "cheat" by looking at Annual Leave to predict Wages
# errors='ignore' skips any listed column that is absent from df.
# NOTE(review): only the target columns are removed here; any identifier-like
# columns present in df stay in X and may dominate the importances -- confirm
# that is intended.
drop_targets = ['Annual Leave Reliability', 'Long Service Leave Reliability', 'Wages Reliability']
X = df.drop(columns=drop_targets, errors='ignore')
y = df[target_name]

# 2. SPLIT DATA (80% Training, 20% Testing)
# random_state=42 ensures we get the same split every time (reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training on {X_train.shape[0]} rows. Testing on {X_test.shape[0]} rows.")

# 3. TRAIN THE MODEL
# n_estimators=100 means "build 100 decision trees"; n_jobs=-1 uses all CPU cores
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# The original literal was mojibake ("âœ…"): the UTF-8 bytes of the check-mark
# emoji decoded as cp1252. The intended character is restored here.
print("✅ Model Training Complete.")

# 4. EVALUATE PERFORMANCE
# Make predictions on the Test Set (rows held out from fitting)
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# NOTE(review): R2 is the coefficient of determination, not a classification
# accuracy; the printed label is kept unchanged for output compatibility.
print("\n--- Model Performance Results ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R2 Score (Accuracy):       {r2:.4f}")
print("---------------------------------")
# NOTE(review): mae*100 reads as a percentage only if the target is on a
# 0-1 scale -- TODO confirm the scale of the reliability columns.
print(f"Interpretation: On average, the model's reliability score is off by {mae*100:.2f}%.")

# 5. VISUALIZE FEATURE IMPORTANCE (The "Why")
# This shows which columns drove the decisions the most
# Importances are paired with X's column order, then the top 10 are kept.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False).head(10)

fig, ax = plt.subplots(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn 0.13 (removal planned for
# 0.14). Mapping hue to the same column as y keeps one colour per bar;
# dodge=False keeps full-width bars on older seaborn releases.
sns.barplot(
    x='importance',
    y='feature',
    hue='feature',
    data=feature_importance,
    palette='viridis',
    dodge=False,
    ax=ax,
)
# Older seaborn draws a legend whenever hue is set; it only repeats the
# y-axis labels here, so remove it when present.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title(f'Top 10 Predictors for {target_name}')
ax.set_xlabel('Importance Score')
plt.show()