In [7]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR

# Load the dataset and drop rows that lack either the NDVI feature or the
# Yield target; such rows cannot be used for training or evaluation.
df = pd.read_csv("crop_ndvi_yield_large.csv")
df = df.dropna(subset=['NDVI', 'Yield'])

# ✅ Normalize crop type (lowercase, surrounding whitespace removed) so that
# spelling variants such as "Wheat " and "wheat" map to a single category.
df['Crop_Type'] = df['Crop_Type'].str.lower().str.strip()

# Features and target
X = df[['Crop_Type', 'NDVI']]
y = df['Yield']

# Preprocessing: one-hot encode the crop type and standardize NDVI.
# handle_unknown='ignore' makes a category that was absent when the encoder
# was fitted encode as an all-zero row instead of raising ValueError. Without
# it, a crop that ends up only in the test split -- or is passed to the saved
# model later -- makes predict() fail.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['Crop_Type']),
    ('num', StandardScaler(), ['NDVI'])
])

# Models - one explicit Pipeline (preprocessing step + regressor) per
# candidate, keyed by the display name used in the report.
#
# Pipeline does not copy the steps it is given, so handing the same
# `preprocessor` object to every pipeline would make all of them share (and
# repeatedly refit) one transformer instance. clone() gives each pipeline its
# own unfitted copy with identical parameters, so fitting or persisting one
# model can never alter another.
_regressors = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Support Vector Regressor': SVR(),
}

models = {
    name: Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('regressor', regressor),
    ])
    for name, regressor in _regressors.items()
}

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit every candidate on the training rows, score it on the held-out rows,
# and remember the fitted pipeline with the highest R2.
results = []
best_model, best_r2 = None, -float('inf')

for name, model in models.items():
    # fit() returns the fitted pipeline itself, so prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    results.append({
        'Model': name,
        'R2 Score': round(r2, 4),
        'RMSE': round(rmse, 4),
    })

    # Strict comparison: on a tie the earlier candidate is kept. The fitted
    # pipeline object itself is stored so it can be saved afterwards.
    if r2 > best_r2:
        best_r2, best_model = r2, model

# Build the comparison table (one row per model) and order it best R2 first.
report_df = pd.DataFrame(results)
report_df = report_df.sort_values(by='R2 Score', ascending=False)

print("\nModel Performance Report:")
print(report_df)

# Unrounded R2 of the winning pipeline.
print(f"\nBest Model R2: {best_r2}")

# Persist the winning fitted pipeline (preprocessing + regressor) to disk.
joblib.dump(best_model, 'crop_yield_predictor.pkl')
print("Model saved as crop_yield_predictor.pkl")

# Round-trip check: reload the pipeline that was just written to disk.
loaded_model = joblib.load('crop_yield_predictor.pkl')
print("\nLoaded model type:", type(loaded_model))

# One sample row with the same two columns the pipeline was trained on.
# The crop type is given in lowercase to match the normalized training data.
test_data = pd.DataFrame({'Crop_Type': ['wheat'], 'NDVI': [0.75]})

# Best-effort smoke test: a failure is reported on stdout rather than raised,
# so the script still finishes.
try:
    prediction = loaded_model.predict(test_data)
except Exception as e:
    print("Prediction error:", str(e))
else:
    print("Test prediction:", prediction)


Model Performance Report:
                      Model  R2 Score    RMSE
2  Support Vector Regressor    0.9611  0.1948
0         Linear Regression    0.9519  0.2166
1             Random Forest    0.9449  0.2319

Best Model R2: 0.961118725070321
Model saved as crop_yield_predictor.pkl

Loaded model type: <class 'sklearn.pipeline.Pipeline'>
Test prediction: [4.84990405]
