In [41]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import time
import warnings
warnings.filterwarnings("ignore")

In [42]:
# Load and Prepare Data
df = pd.read_csv('../data/processed/processed_data.csv')

#Separate features (X) from the target (y)
X = df.drop('Recycling Rate (%)', axis=1)
y = df['Recycling Rate (%)']

# Split data for training and validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_test.shape}")


Training set shape: (680, 22)
Validation set shape: (170, 22)


In [43]:
# The Baseline Model (Linear Regression)
# # Insight: We start with the simplest possible model, Linear Regression. Its performance
# gives us a "baseline" score. Any other model we build must be better than this
# to be considered useful.

# Feature Scaling 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [44]:
# Train the model 

lr_model =LinearRegression()
lr_model.fit(X_train_scaled, y_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [45]:
# Evaluate And Store the baseline RMSE

lr_preds = lr_model.predict(X_test_scaled)
baseline_rmse = np.sqrt(mean_squared_error(y_test, lr_preds))
print(f"\n>> Baseline Linear Regression RMSE: {baseline_rmse:.4f}")
print("   Insight: An error of ~16.5 is very high. This tells us that a simple straight-line model")
print("   is not sufficient to capture the complex, non-linear patterns in our data.")


>> Baseline Linear Regression RMSE: 16.5016
   Insight: An error of ~16.5 is very high. This tells us that a simple straight-line model
   is not sufficient to capture the complex, non-linear patterns in our data.


In [46]:
# We get high RMSE value for the model, which means the model is not good at predicting the target variable.
# We can use more advence model like Random Forest or Gradient Boosting to improve the model.

# We Use A More Powerful Tree-Based Model (Random Forest)

rf_model = RandomForestRegressor(n_estimators=100,n_jobs=-1,random_state=42)
rf_model.fit(X_train,y_train)


0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [47]:
# Evaluate And Compare
rf_preds = rf_model.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_preds))
print(f"\n>> Random Forest RMSE: {rf_rmse:.4f}")
print("Insight: Interestingly, the default Random Forest performed worse than our baseline.")
print("This highlights that more complexity is not always better without proper tuning.")



>> Random Forest RMSE: 17.8755
Insight: Interestingly, the default Random Forest performed worse than our baseline.
This highlights that more complexity is not always better without proper tuning.


In [48]:
# Train XGBoost with its default settings.
xgb_model_default = xgb.XGBRegressor(random_state=42)
xgb_model_default.fit(X_train, y_train)


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [49]:

# Evaluate and compare
xgb_default_preds = xgb_model_default.predict(X_test)
xgb_default_rmse = np.sqrt(mean_squared_error(y_test, xgb_default_preds))
print(f"\n>> Default XGBoost RMSE: {xgb_default_rmse:.4f}")
print("Insight: The default XGBoost performs the worst. This proves that the power of advanced models is only unlocked through careful tuning.")



>> Default XGBoost RMSE: 19.9709
Insight: The default XGBoost performs the worst. This proves that the power of advanced models is only unlocked through careful tuning.


In [50]:
print("--- Optimizing XGBoost with RandomizedSearchCV ---")
start_time = time.time()

# Define the grid of parameters to search
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

# Set up and run the search
random_search = RandomizedSearchCV(
    estimator=xgb.XGBRegressor(random_state=42),
    param_distributions=param_grid, n_iter=30, cv=5, verbose=1,
    n_jobs=-1, scoring='neg_root_mean_squared_error', random_state=42
)
random_search.fit(X_train, y_train)
end_time = time.time()

print(f"\nTuning process finished in {end_time - start_time:.2f} seconds.")
print(f"Best parameters found: {random_search.best_params_}")

# Evaluate the tuned model
best_xgb_model = random_search.best_estimator_
tuned_preds = best_xgb_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, tuned_preds))

print(f"\n>> Final Tuned XGBoost RMSE: {final_rmse:.4f}")
print("Insight: After tuning, the XGBoost RMSE is now our best score, but it did not significantly beat the baseline.")


--- Optimizing XGBoost with RandomizedSearchCV ---
Fitting 5 folds for each of 30 candidates, totalling 150 fits

Tuning process finished in 18.62 seconds.
Best parameters found: {'subsample': 0.7, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.7}

>> Final Tuned XGBoost RMSE: 16.8338
Insight: After tuning, the XGBoost RMSE is now our best score, but it did not significantly beat the baseline.


In [51]:
# --- Final Conclusion and Model Justification ---
print("Methodology Conclusion:")
print(f"1. We established a simple but effective baseline RMSE of {baseline_rmse:.4f} with Linear Regression.")
print(f"2. Our key finding is that even a highly tuned XGBoost model (RMSE: {final_rmse:.4f}) could not")
print("   significantly outperform the baseline. This suggests we have reached the")
print("   predictive limit of the features available in this dataset.")

print("\nFinal Model Choice: Tuned XGBoost Regressor")
print("Justification: Despite the similar RMSE, XGBoost is a more robust and theoretically sound")
print("model for this complex data and is more likely to generalize well to new, unseen data.")

# Retrain the final model on the ENTIRE dataset for maximum performance
print("\nRetraining the final model on all available data...")
final_model = random_search.best_estimator_
final_model.fit(X, y)

# Save the final model for our Flask application
joblib.dump(final_model, '../models/final_waste_recycling_model.pkl')
print("\nFinal model has been successfully trained and saved to '../models/final_waste_recycling_model.pkl'")


Methodology Conclusion:
1. We established a simple but effective baseline RMSE of 16.5016 with Linear Regression.
2. Our key finding is that even a highly tuned XGBoost model (RMSE: 16.8338) could not
   significantly outperform the baseline. This suggests we have reached the
   predictive limit of the features available in this dataset.

Final Model Choice: Tuned XGBoost Regressor
Justification: Despite the similar RMSE, XGBoost is a more robust and theoretically sound
model for this complex data and is more likely to generalize well to new, unseen data.

Retraining the final model on all available data...

Final model has been successfully trained and saved to '../models/final_waste_recycling_model.pkl'
