# Agricultural Crop Yield Prediction Model

## Mission: Optimize Agricultural Production for Food Security
This model predicts crop yields based on environmental and agricultural factors to help farmers and agricultural planners optimize production and ensure food security.

In [8]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and convergence warnings from
# pandas / scikit-learn — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Plot styling: apply the matplotlib style sheet first, then set seaborn's
# default colour palette (the order is kept as-is because both touch global plot settings).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Data Loading and Exploration

In [None]:
# Read the crop-yield table; the path is resolved relative to the working directory.
# NOTE(review): the output saved with this cell is a ParserError
# ("Expected 1 fields in line 9, saw 2"), so the file on disk was presumably not a
# plain comma-separated table when this was last run — verify the file before re-running.
csv_path = 'agriculture_crop_yield.csv'
df = pd.read_csv(csv_path)

# First look: dimensions, then column dtypes / non-null counts, then the leading rows
print("Dataset Shape:", df.shape)

print("\nDataset Info:")
df.info()

print("\nFirst 5 rows:")
df.head(5)

ParserError: Error tokenizing data. C error: Expected 1 fields in line 9, saw 2


In [None]:
# Count the missing entries in each column
missing_per_column = df.isna().sum()
print("Missing values:")
print(missing_per_column)

# Summary statistics of the numeric columns; kept as the cell's last expression
# so the notebook renders the table
print("\nBasic statistics:")
df.describe()

## Data Visualization and Analysis

In [None]:
# Visualization 1: how the target variable is distributed
# Left panel: histogram of yields; right panel: boxplot of the same values.
yield_values = df['Yield_tons_per_hectare']
fig_dist, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 4))

ax_hist.hist(yield_values, bins=50, alpha=0.7, color='skyblue')
ax_hist.set_title('Distribution of Crop Yield')
ax_hist.set_xlabel('Yield (tons per hectare)')
ax_hist.set_ylabel('Frequency')

ax_box.boxplot(yield_values)
ax_box.set_title('Boxplot of Crop Yield')
ax_box.set_ylabel('Yield (tons per hectare)')

fig_dist.tight_layout()
plt.show()

In [None]:
# Visualization 2: correlation heatmap
# Build an all-numeric copy of the data first: the text columns are replaced by
# integer label codes and the two boolean flags are cast to 0/1.
categorical_cols = ['Region', 'Soil_Type', 'Crop', 'Weather_Condition']
le = LabelEncoder()

# The same encoder object is re-fitted for each column in turn
label_codes = {col: le.fit_transform(df[col]) for col in categorical_cols}
df_numeric = (
    df
    .assign(**label_codes)
    .astype({'Fertilizer_Used': int, 'Irrigation_Used': int})
)

# Pairwise correlations between every column, drawn as an annotated heatmap
correlation_matrix = df_numeric.corr()

fig_corr, ax_corr = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    ax=ax_corr,
)
ax_corr.set_title('Correlation Heatmap of All Variables')
fig_corr.tight_layout()
plt.show()

# Rank every column by its correlation with the target, strongest positive first
target_corr = correlation_matrix['Yield_tons_per_hectare'].sort_values(ascending=False)
print("Correlations with Yield:")
print(target_corr)

In [None]:
# Visualization 3: Key relationships
# 2x2 grid: two scatter plots of continuous drivers against yield (top row) and
# two grouped boxplots for the fertilizer / irrigation flags (bottom row).
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Rainfall vs Yield
axes[0,0].scatter(df['Rainfall_mm'], df['Yield_tons_per_hectare'], alpha=0.5)
axes[0,0].set_xlabel('Rainfall (mm)')
axes[0,0].set_ylabel('Yield (tons/hectare)')
axes[0,0].set_title('Rainfall vs Crop Yield')

# Temperature vs Yield
axes[0,1].scatter(df['Temperature_Celsius'], df['Yield_tons_per_hectare'], alpha=0.5, color='orange')
axes[0,1].set_xlabel('Temperature (°C)')
axes[0,1].set_ylabel('Yield (tons/hectare)')
axes[0,1].set_title('Temperature vs Crop Yield')

# Fertilizer usage impact
df.boxplot(column='Yield_tons_per_hectare', by='Fertilizer_Used', ax=axes[1,0])
axes[1,0].set_xlabel('Fertilizer Used')
axes[1,0].set_ylabel('Yield (tons/hectare)')
axes[1,0].set_title('Fertilizer Usage vs Crop Yield')

# Irrigation usage impact
df.boxplot(column='Yield_tons_per_hectare', by='Irrigation_Used', ax=axes[1,1])
axes[1,1].set_xlabel('Irrigation Used')
axes[1,1].set_ylabel('Yield (tons/hectare)')
axes[1,1].set_title('Irrigation Usage vs Crop Yield')

# pandas' grouped boxplot writes a figure-level "Boxplot grouped by ..." title.
# On this shared 2x2 figure that title would sit above all four panels (and only
# mention the last grouping column), so clear it; each panel has its own title.
fig.suptitle('')

plt.tight_layout()
plt.show()

## Feature Engineering and Data Preprocessing

In [None]:
# Feature engineering
# Show the raw column list before anything is transformed
print("Original features:", df.columns.tolist())

# Cast the two usage flags to integers; this writes back into `df` itself
for flag_col in ('Fertilizer_Used', 'Irrigation_Used'):
    df[flag_col] = df[flag_col].astype(int)

# One-hot encode the text columns; drop_first=True omits the first level of each
# column so the remaining indicators are not perfectly collinear
nominal_cols = ['Region', 'Soil_Type', 'Crop', 'Weather_Condition']
df_encoded = pd.get_dummies(df, columns=nominal_cols, drop_first=True)

print("\nFeatures after encoding:", df_encoded.columns.tolist())
print("\nDataset shape after encoding:", df_encoded.shape)

In [None]:
# Separate the encoded table into the feature matrix X and the target vector y
target_col = 'Yield_tons_per_hectare'
X = df_encoded.drop(columns=[target_col])
y = df_encoded[target_col]

# Numbered listing of every model input, starting at 1
print("Feature columns:")
for position, column_name in enumerate(X.columns, start=1):
    print(f"{position}. {column_name}")

print(f"\nTotal features: {X.shape[1]}")
print("Target variable: Yield_tons_per_hectare")

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report (rows, columns) of each partition
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

In [None]:
# Standardize the features
# The scaler learns its statistics from the training rows only and is then applied
# to both partitions, so nothing from the test set leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check over the whole scaled training matrix
print("Data standardization completed.")
print(f"Training data mean: {X_train_scaled.mean():.4f}")
print(f"Training data std: {X_train_scaled.std():.4f}")

## Model Training and Evaluation

In [None]:
# Candidate regressors, keyed by the display name used in all later cells
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42, max_depth=10),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
}

# Per-model metrics and test-set predictions, filled in by the loop below
results = {}
predictions = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Linear Regression is fitted on the standardized matrices;
    # the tree-based models are fitted on the unscaled features.
    use_scaled = name == 'Linear Regression'
    train_features = X_train_scaled if use_scaled else X_train
    test_features = X_test_scaled if use_scaled else X_test

    model.fit(train_features, y_train)
    y_pred_train = model.predict(train_features)
    y_pred_test = model.predict(test_features)

    # Error and goodness-of-fit on both partitions
    scores = {
        'train_mse': mean_squared_error(y_train, y_pred_train),
        'test_mse': mean_squared_error(y_test, y_pred_test),
        'train_r2': r2_score(y_train, y_pred_train),
        'test_r2': r2_score(y_test, y_pred_test)
    }
    results[name] = scores
    predictions[name] = y_pred_test

    print(f"Train MSE: {scores['train_mse']:.4f}")
    print(f"Test MSE: {scores['test_mse']:.4f}")
    print(f"Train R²: {scores['train_r2']:.4f}")
    print(f"Test R²: {scores['test_r2']:.4f}")

In [None]:
# Grouped bar charts comparing train vs. test scores for every model
# (left: mean squared error, right: R²)
model_names = list(results.keys())
x = np.arange(len(model_names))
width = 0.35

# Collect each metric in model order
train_mse = [results[name]['train_mse'] for name in model_names]
test_mse = [results[name]['test_mse'] for name in model_names]
train_r2 = [results[name]['train_r2'] for name in model_names]
test_r2 = [results[name]['test_r2'] for name in model_names]

fig_perf, (ax_mse, ax_r2) = plt.subplots(1, 2, figsize=(12, 5))

# MSE panel: train bar shifted left of each tick, test bar shifted right
ax_mse.bar(x - width/2, train_mse, width, label='Train MSE', alpha=0.8)
ax_mse.bar(x + width/2, test_mse, width, label='Test MSE', alpha=0.8)
ax_mse.set_xlabel('Models')
ax_mse.set_ylabel('Mean Squared Error')
ax_mse.set_title('Model Performance - MSE Comparison')
ax_mse.set_xticks(x)
ax_mse.set_xticklabels(model_names, rotation=45)
ax_mse.legend()

# R² panel, same layout
ax_r2.bar(x - width/2, train_r2, width, label='Train R²', alpha=0.8)
ax_r2.bar(x + width/2, test_r2, width, label='Test R²', alpha=0.8)
ax_r2.set_xlabel('Models')
ax_r2.set_ylabel('R² Score')
ax_r2.set_title('Model Performance - R² Comparison')
ax_r2.set_xticks(x)
ax_r2.set_xticklabels(model_names, rotation=45)
ax_r2.legend()

fig_perf.tight_layout()
plt.show()

In [None]:
# Pick the model whose test-set R² is highest (the first one wins on a tie)
best_model_name = max(results, key=lambda model_name: results[model_name]['test_r2'])
best_model = models[best_model_name]
best_scores = results[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"Best Test R²: {best_scores['test_r2']:.4f}")
print(f"Best Test MSE: {best_scores['test_mse']:.4f}")

## Linear Regression Visualization

In [None]:
# Diagnostic plots for the Linear Regression model on the test set
lr_model = models['Linear Regression']
lr_predictions = predictions['Linear Regression']

fig_lr, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

# Actual vs predicted: points on the dashed diagonal are predicted exactly.
# The reference is drawn once, as a labeled line, before the legend is built,
# so the "Perfect Prediction" legend entry points at the line itself rather than
# at a second scatter of y_test against itself.
ax_fit.scatter(y_test, lr_predictions, alpha=0.5, label='Linear Regression')
yield_range = [y_test.min(), y_test.max()]
ax_fit.plot(yield_range, yield_range, 'r--', lw=2, label='Perfect Prediction')
ax_fit.set_xlabel('Actual Yield')
ax_fit.set_ylabel('Predicted Yield')
ax_fit.set_title('Linear Regression: Actual vs Predicted')
ax_fit.legend()

# Residuals plot: actual minus predicted, against the predicted value;
# the dashed line marks zero error
residuals = y_test - lr_predictions
ax_resid.scatter(lr_predictions, residuals, alpha=0.5)
ax_resid.axhline(y=0, color='red', linestyle='--')
ax_resid.set_xlabel('Predicted Yield')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residuals Plot')

fig_lr.tight_layout()
plt.show()

## Model Prediction on Test Data

In [None]:
# Run the selected model on a single held-out row and compare with the true yield
sample_idx = 0
sample_data = X_test.iloc[sample_idx:sample_idx+1]  # one-row DataFrame, not a Series
actual_yield = y_test.iloc[sample_idx]

print("Sample Input Data:")
for feature, value in sample_data.iloc[0].items():
    print(f"{feature}: {value}")

print(f"\nActual Yield: {actual_yield:.2f} tons/hectare")

# Linear Regression was trained on standardized inputs, so the row must go through
# the same scaler; the other models take the raw feature row.
needs_scaling = best_model_name == 'Linear Regression'
model_input = scaler.transform(sample_data) if needs_scaling else sample_data
predicted_yield = best_model.predict(model_input)[0]

absolute_error = abs(actual_yield - predicted_yield)
print(f"Predicted Yield ({best_model_name}): {predicted_yield:.2f} tons/hectare")
print(f"Prediction Error: {absolute_error:.2f} tons/hectare")

## Save the Best Model

In [None]:
# Persist everything needed to serve predictions: the selected model, the fitted
# scaler, and the ordered feature names (saved for the API).
# NOTE(review): during training the scaler was applied only for 'Linear Regression';
# whoever loads these files needs to know which model was saved before scaling inputs.
feature_names = X.columns.tolist()

artifacts = [
    (best_model, '../../best_yield_model.pkl'),
    (scaler, '../../scaler.pkl'),
    (feature_names, '../../feature_names.pkl'),
]
for artifact, target_path in artifacts:
    joblib.dump(artifact, target_path)

best_scores = results[best_model_name]

print(f"Best model ({best_model_name}) saved successfully!")
print("Scaler saved successfully!")
print("Feature names saved successfully!")
print("\nModel Performance Summary:")
print(f"- Test R² Score: {best_scores['test_r2']:.4f}")
print(f"- Test MSE: {best_scores['test_mse']:.4f}")

## Model Interpretation

### Key Findings:
1. **Dataset**: Agricultural crop yield prediction with environmental and farming factors
2. **Features**: Region, soil type, crop type, rainfall, temperature, weather condition, and fertilizer/irrigation usage
3. **Target**: Crop yield in tons per hectare
4. **Best Model**: The model with the highest R² score on the test data (selected automatically in the "Find best model" cell)
5. **Performance**: Measured by R² and MSE on the held-out 20% test set; a high test R² that stays close to the training R² indicates good predictive capability for agricultural planning

### Business Impact:
- Helps farmers optimize resource allocation
- Supports agricultural policy decisions
- Contributes to food security planning
- Enables precision agriculture practices