# 03_Yield_Prediction.ipynb – FarmPulse

## 1️⃣ Import Libraries

In [1]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

## 2️⃣ Load Dataset & Pipeline

In [4]:
# Locations of the featured dataset and the pickled preprocessing pipeline.
featured_csv = "/workspaces/FarmPulse-Smart-Agriculture-App/Data/processed/FarmPulse_featured.csv"
pipeline_pkl = "/workspaces/FarmPulse-Smart-Agriculture-App/models/preprocessing_pipeline.pkl"

# Read the table into a DataFrame and restore the pipeline object from disk.
df = pd.read_csv(featured_csv)
full_pipeline = joblib.load(pipeline_pkl)
print("Dataset shape:", df.shape)

Dataset shape: (9986, 22)


## 3️⃣ Define Features & Target

In [6]:
# The yield column is the regression target; every other column is a predictor.
target_col = "Yield_Per_Hectare (tons)"
y = df[target_col]
X = df.drop(columns=[target_col])

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")


Train: (7988, 21), Test: (1998, 21)


## 4️⃣ Create Full ML Pipeline – Random Forest

In [8]:
# Give this model its own copy of the preprocessing steps. A scikit-learn
# Pipeline fits the step objects it is handed (it does not copy them), so
# passing `full_pipeline` itself would let any later fit of that same object
# silently replace the preprocessing inside this already-trained pipeline.
# `clone` returns an unfitted estimator with identical parameters.
rf_pipeline = Pipeline(steps=[
    ("preprocessing", clone(full_pipeline)),
    ("model", RandomForestRegressor(n_estimators=200, random_state=42))
])

# Train: preprocessing and forest are both fitted on the training split only
rf_pipeline.fit(X_train, y_train)

# Predict on the held-out rows
y_pred_rf = rf_pipeline.predict(X_test)

# Evaluate: RMSE is in the units of the target, R² is unitless
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest RMSE:", rmse_rf)
print("Random Forest R²:", r2_rf)

Random Forest RMSE: 0.4998966902818053
Random Forest R²: 0.9937314661222143


## 5️⃣ Feature Importance – Random Forest

In [15]:
# Extract the fitted column transformer nested inside the preprocessing step
preprocessor = rf_pipeline.named_steps["preprocessing"].named_steps["preprocessing"]

# Numeric features: the column list handed to the first transformer
# (assumes the numeric branch is registered first — the length check below
# catches a layout that no longer matches)
numeric_features = preprocessor.transformers_[0][2]

# Categorical features: the second transformer is a pipeline whose "encoder"
# step reports the expanded (one-hot) output column names
cat_pipeline = preprocessor.transformers_[1][1]  # the pipeline
cat_features = preprocessor.transformers_[1][2]  # original categorical columns
ohe = cat_pipeline.named_steps["encoder"]
cat_feature_names = ohe.get_feature_names_out(cat_features)

# Combine all, in the order the transformers are concatenated
feature_names = list(numeric_features) + list(cat_feature_names)

# Feature importances, one value per column seen by the model
importances = rf_pipeline.named_steps["model"].feature_importances_

# The names above are assembled by position, so fail loudly if they do not
# line up one-to-one with the importances instead of plotting wrong labels.
if len(feature_names) != len(importances):
    raise ValueError(
        f"Got {len(feature_names)} feature names for {len(importances)} importances; "
        "the preprocessor layout does not match the two-transformer assumption."
    )

# Largest importance first so the chart reads top-down
importance_by_feature = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# Plot: grow the figure with the number of bars so labels stay legible
fig, ax = plt.subplots(figsize=(10, max(6, 0.3 * len(importance_by_feature))))
sns.barplot(
    x=importance_by_feature.to_numpy(),
    y=importance_by_feature.index.to_list(),
    ax=ax,
)
ax.set_title("Random Forest Feature Importance")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
fig.tight_layout()

# NOTE(review): this path is relative to the kernel's working directory,
# unlike the absolute paths used for data and models — confirm it lands
# where the project expects.
os.makedirs("reports/figures", exist_ok=True)
fig.savefig("reports/figures/feature_importance_rf.png")
plt.close(fig)  # figure is written to disk only; closing keeps it out of the cell output


## 6️⃣ XGBoost Model

In [16]:
# Use an independent copy of the preprocessing steps. A scikit-learn Pipeline
# fits the step objects it is given in place, so reusing the `full_pipeline`
# instance here would refit an object that other pipelines may also hold.
# `clone` returns an unfitted estimator with identical parameters.
xgb_pipeline = Pipeline(steps=[
    ("preprocessing", clone(full_pipeline)),
    ("model", XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42))
])

# Fit on the training split, then score on the held-out rows
xgb_pipeline.fit(X_train, y_train)
y_pred_xgb = xgb_pipeline.predict(X_test)

# Same metrics as the Random Forest cell so the two models are comparable
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)

print("XGBoost RMSE:", rmse_xgb)
print("XGBoost R²:", r2_xgb)

XGBoost RMSE: 0.5325598051044532
XGBoost R²: 0.9928855353703936


## 7️⃣ Save Final Model

In [17]:

# Persist the fitted Random Forest pipeline (preprocessing + model) as a single artifact.
model_path = "/workspaces/FarmPulse-Smart-Agriculture-App/models/yield_model.pkl"
joblib.dump(rf_pipeline, model_path)
print("ML Model saved to models/yield_model.pkl")

ML Model saved to models/yield_model.pkl
