In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [2]:
df = pd.read_csv("sample_crop_yield.csv")
print(f"Dataset shape: {df.shape}")
print(df.head())

Dataset shape: (1200, 14)
  soil_type       crop        ph  organic_matter_pct  nitrogen_kg_per_ha  \
0     Peaty      Wheat  6.858952            2.031840           87.475326   
1     Peaty       Rice  5.962177            1.844218           89.322672   
2     Sandy      Maize  5.747881            2.547665           85.211525   
3     Sandy  Sugarcane  6.034530            1.906096           77.065594   
4      Silt     Potato  6.330931            0.684163          111.645871   

   phosphorus_kg_per_ha  potassium_kg_per_ha  avg_temp_c  rainfall_mm  \
0             36.097557            46.513856   27.498379  1093.758552   
1             35.859079            54.153350   26.157831   744.040322   
2             53.586779            86.012905   29.401948   411.401234   
3             30.269230            46.215190   33.420733   816.284516   
4             43.270191            64.149458   30.866384   639.875453   

   humidity_pct  irrigation_mm  fertilizer_cost_rs_per_ha  \
0     55.843875  

In [14]:
 #Separate Features and Target

target_col = "yield_kg_per_ha"
y = df[target_col]
X = df.drop(columns=[target_col])

# Detect categorical vs numerical columns
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=["number"]).columns.tolist()

print(f"Categorical columns: {cat_cols}")
print(f"Numerical columns: {num_cols}")

Categorical columns: ['soil_type', 'crop']
Numerical columns: ['ph', 'organic_matter_pct', 'nitrogen_kg_per_ha', 'phosphorus_kg_per_ha', 'potassium_kg_per_ha', 'avg_temp_c', 'rainfall_mm', 'humidity_pct', 'irrigation_mm', 'fertilizer_cost_rs_per_ha', 'previous_yield_kg_per_ha']


In [15]:
# Preprocessing
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ("num", StandardScaler(), num_cols)
])

In [16]:
# Model
model = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

In [17]:
# Pipeline
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model)
])


In [18]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [19]:
# Train Model
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [20]:
# Predictions
y_pred = pipeline.predict(X_test)

In [21]:
# Evaluation Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [11]:
print("\n📊 Model Evaluation:")
print(f"MAE  : {mae:.2f}")
print(f"RMSE : {rmse:.2f}")
print(f"R²   : {r2:.4f}")


📊 Model Evaluation:
MAE  : 212.57
RMSE : 71728.60
R²   : 0.8383


In [23]:
# Cross-Validation for Robustness
cv_mae = -cross_val_score(pipeline, X, y, cv=5, scoring="neg_mean_absolute_error")
print(f"5-Fold CV MAE: {cv_mae.mean():.2f} ± {cv_mae.std():.2f}")

5-Fold CV MAE: 226.14 ± 11.40


In [24]:
# Save the trained pipeline
joblib.dump(pipeline, "crop_yield_model.joblib")
print("\n✅ Model saved as 'crop_yield_model.joblib'")


✅ Model saved as 'crop_yield_model.joblib'
