In [23]:
import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR

In [24]:
# Load and clean data
# Read the crop dataset and keep only the rows that have both an NDVI reading
# and a Yield value.
# NOTE(review): Crop_Type is not part of the dropna subset — confirm that rows
# with a missing crop type are acceptable for the downstream encoder.
df = (
    pd.read_csv("crop_ndvi_yield_large.csv")
    .dropna(subset=['NDVI', 'Yield'])
)

In [25]:
# Features and target
# Model inputs: the crop type label together with the NDVI reading.
X = df.loc[:, ['Crop_Type', 'NDVI']]
# Prediction target: the Yield column.
y = df.loc[:, 'Yield']

In [26]:
# Preprocessing
# One-hot encode the categorical crop type and standardize the numeric NDVI.
# handle_unknown='ignore' makes the encoder emit an all-zero block for a crop
# type it did not see during fit instead of raising at transform time; the
# default ('error') breaks prediction when a rare crop lands only in the test
# split or shows up in new data passed to the saved model.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['Crop_Type']),
    ('num', StandardScaler(), ['NDVI'])
])

In [27]:
# Models
# Candidate regressors, keyed by the display name used in the report.
# Every pipeline receives its own clone of the preprocessor: a Pipeline without
# caching fits its steps in place, so sharing a single ColumnTransformer
# instance would let each fit overwrite the transformer state held by the
# other pipelines (including the one that is later saved to disk).
models = {
    'Linear Regression': make_pipeline(clone(preprocessor), LinearRegression()),
    'Random Forest': make_pipeline(
        clone(preprocessor),
        RandomForestRegressor(n_estimators=100, random_state=42),
    ),
    'Support Vector Regressor': make_pipeline(clone(preprocessor), SVR()),
}

In [28]:
# Split data
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Train and evaluate
# Fit every candidate on the training split, score it on the held-out split,
# and track the name of the candidate with the highest R2.
results = []
best_model, best_r2 = None, -float('inf')  # best_model holds the model's name

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # R2 plus RMSE, the latter derived as the square root of the MSE.
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    # Metrics are rounded for the report only; selection below compares the
    # unrounded R2.
    results.append({
        'Model': name,
        'R2 Score': round(r2, 4),
        'RMSE': round(rmse, 4),
    })

    # Strict comparison: on a tie the earlier candidate stays the best.
    if not r2 > best_r2:
        continue
    best_r2, best_model = r2, name

In [30]:
# Show report
# Rank the candidates from highest to lowest R2, then print the table followed
# by the name and unrounded R2 of the winner.
report_df = pd.DataFrame(results).sort_values('R2 Score', ascending=False)
print("\nModel Performance Report:", report_df, sep="\n")

print(f"\nBest Model: {best_model} (R2: {best_r2})")


Model Performance Report:
                      Model  R2 Score    RMSE
2  Support Vector Regressor    0.9611  0.1948
0         Linear Regression    0.9519  0.2166
1             Random Forest    0.9449  0.2319

Best Model: Support Vector Regressor (R2: 0.961118725070321)


In [31]:
# Save best model
# best_model is the winning model's name; look up its fitted pipeline and
# serialize it with joblib.
best_pipeline = models[best_model]
joblib.dump(value=best_pipeline, filename='crop_yield_predictor.pkl')
print("Model saved as crop_yield_predictor.pkl")

Model saved as crop_yield_predictor.pkl
