In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# 1. SELECT FEATURES & TARGET
# Column the model learns to predict.
target_name = 'Wages Reliability'

# Every reliability column is removed from the inputs; otherwise the model could
# "cheat" by leaning on one reliability score (e.g. Annual Leave) to predict Wages.
drop_targets = [
    'Annual Leave Reliability',
    'Long Service Leave Reliability',
    'Wages Reliability',
]
# errors='ignore' skips any listed column that is not present in df.
X = df.drop(columns=drop_targets, errors='ignore')
y = df[target_name]

# 2. SPLIT DATA (80% Training, 20% Testing)
# A fixed random_state keeps the shuffle, and therefore the split, repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training on {X_train.shape[0]} rows. Testing on {X_test.shape[0]} rows.")

# 3. TRAIN THE MODEL
# n_estimators=100 means "build 100 decision trees"; random_state=42 fixes the
# forest's randomness so reruns give the same model; n_jobs=-1 uses all CPU cores.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Check-mark emoji (U+2705) stored as proper UTF-8 text.
print("✅ Model Training Complete.")

# 4. EVALUATE PERFORMANCE
# Predict on the held-out test set, which was not used to fit the model.
y_pred = model.predict(X_test)

# MAE is expressed in the same units as the target column.
mae = mean_absolute_error(y_test, y_pred)
# R2 is the coefficient of determination, not an accuracy: 1.0 is a perfect fit,
# 0.0 is no better than always predicting the mean, and it can be negative.
r2 = r2_score(y_test, y_pred)

print("\n--- Model Performance Results ---")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R2 Score (R-squared):      {r2:.4f}")
print("---------------------------------")
# NOTE(review): mae*100 only reads as a percentage if the target is stored as a
# 0-1 fraction -- confirm the scale of the reliability columns.
print(f"Interpretation: On average, the model's reliability score is off by {mae*100:.2f}%.")

# 5. VISUALIZE FEATURE IMPORTANCE (The "Why")
# Pair each input column with the fitted forest's importance score and keep the
# ten highest-scoring columns.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False).head(10)

plt.figure(figsize=(10, 6))
# seaborn 0.13 deprecated passing `palette` without `hue`, so the y variable is
# also assigned to hue. dodge=False keeps one full-width bar per feature on
# older seaborn releases too.
importance_ax = sns.barplot(
    x='importance',
    y='feature',
    hue='feature',
    data=feature_importance,
    palette='viridis',
    dodge=False,
)
# A hue legend would only repeat the y-axis labels; remove it if one was drawn.
importance_legend = importance_ax.get_legend()
if importance_legend is not None:
    importance_legend.remove()
plt.title(f'Top 10 Predictors for {target_name}')
plt.xlabel('Importance Score')
plt.show()