# Crop Recommendation System Using Machine Learning
## Using Soil and Climate Data

This notebook trains and compares several machine learning classifiers, then uses the most accurate one to recommend the most suitable crop based on:
- **N** (Nitrogen in soil)
- **P** (Phosphorus in soil)
- **K** (Potassium in soil)
- **Temperature**
- **Humidity**
- **pH** (Soil acidity)
- **Rainfall**

## 1. Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Signal that every import above resolved (check mark restored from mojibake)
print("✓ All libraries imported successfully!")

## 2. Load and Explore the Dataset

In [None]:
# Read both CSV sources; each one is kept under its own name as well
csv_paths = ('Datasets/crop_data1.csv', 'Datasets/crop_data2.csv')
data1, data2 = (pd.read_csv(csv_path) for csv_path in csv_paths)

# Stack the two frames row-wise and renumber the index from zero
data = pd.concat([data1, data2], ignore_index=True)

# Quick look at the size and the leading rows of the combined frame
total_rows = len(data)
print(f"Total samples: {total_rows}")
print(f"\nDataset shape: {data.shape}")
print("\nFirst few rows:")
print(data.head())

## 3. Check for Missing Values and Data Info

In [None]:
# Null count per column
null_counts = data.isnull().sum()
print("Missing values:")
print(null_counts)

# dtype of every column
column_types = data.dtypes
print("\nData types:")
print(column_types)

# Descriptive statistics as produced by DataFrame.describe()
summary_stats = data.describe()
print("\nStatistical Summary:")
print(summary_stats)

## 4. Visualize the Data Distribution

In [None]:
# One panel per column on a 2x4 grid: histograms for numeric columns and a
# per-class count bar chart for non-numeric ones (the crop label).
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
fig.suptitle('Distribution of Features', fontsize=16, fontweight='bold')

features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label']
axes = axes.flatten()

for idx, feature in enumerate(features):
    ax = axes[idx]
    column = data[feature]

    if pd.api.types.is_numeric_dtype(column):
        ax.hist(column, bins=30, color='skyblue', edgecolor='black')
        ax.set_ylabel('Frequency')
    else:
        # Binning category names into 30 bins is not meaningful; show how
        # many rows each category has instead, ordered by category name.
        counts = column.value_counts().sort_index()
        ax.bar(counts.index.astype(str), counts.values,
               color='skyblue', edgecolor='black')
        # Category names are long relative to the panel width
        ax.tick_params(axis='x', labelrotation=90, labelsize=7)
        ax.set_ylabel('Count')

    ax.set_title(f'{feature} Distribution')
    ax.set_xlabel(feature)

plt.tight_layout()
plt.show()

print("✓ Data distributions visualized!")

## 5. Check Unique Crops

In [None]:
# Distinct crop labels, in order of first appearance
unique_crops = data['label'].unique()
crop_names_sorted = sorted(unique_crops)

print(f"Number of unique crops: {len(unique_crops)}")
print(f"\nCrops available: {crop_names_sorted}")

# Rows per crop, largest class first
crop_counts = data['label'].value_counts().sort_values(ascending=False)
print("\nCrop distribution:")
print(crop_counts)

## 6. Data Preprocessing

In [None]:
# Separate the seven soil/climate columns (features) from the crop label (target)
feature_columns = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
X = data[feature_columns]
y = data['label']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Standardize the features with StandardScaler.
# NOTE(review): this fit uses every row of `data`, including rows that later
# land in the test split, so test-set statistics influence the scaling.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\n✓ Data preprocessed and scaled!")

## 7. Split Data into Train and Test Sets

In [None]:
# Train-test split (80-20)
# Split the *unscaled* features so the scaler never sees test rows. The row
# count and random_state are unchanged, so the same rows end up in each split
# as when the pre-scaled matrix was split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Re-fit the shared scaler on the training rows only, then apply those
# training statistics to both splits. Later cells that call scaler.transform()
# therefore use train-only statistics as well.
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")
print(f"\nTraining set: {X_train.shape[1]} features")

## 8. Train Multiple ML Models

In [None]:
# Dictionary to store models and results
models = {}
results = {}


def _fit_and_score(estimator, label, key=None):
    """Fit ``estimator`` on the training split and score it on the test split.

    Parameters
    ----------
    estimator : scikit-learn classifier
        Unfitted estimator; it is fitted in place on ``X_train`` / ``y_train``.
    label : str
        Name shown in the "Training ..." progress message.
    key : str, optional
        Key under which the estimator and its accuracy are stored in
        ``models`` and ``results``. Defaults to ``label``.

    Returns
    -------
    tuple
        ``(y_pred, accuracy)`` where ``y_pred`` holds the predictions for
        ``X_test`` and ``accuracy`` is ``accuracy_score(y_test, y_pred)``.
    """
    key = label if key is None else key
    print(f"Training {label}...")
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    models[key] = estimator
    results[key] = accuracy
    print(f"✓ Accuracy: {accuracy:.4f}\n")
    return y_pred, accuracy


# 1. Gaussian Naive Bayes
gnb = GaussianNB()
y_pred_gnb, accuracy_gnb = _fit_and_score(gnb, 'Gaussian Naive Bayes')

# 2. Decision Tree
dt = DecisionTreeClassifier(random_state=42)
y_pred_dt, accuracy_dt = _fit_and_score(dt, 'Decision Tree')

# 3. Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred_rf, accuracy_rf = _fit_and_score(rf, 'Random Forest')

# 4. Support Vector Machine (stored under the shorter key 'SVM')
svm = SVC(kernel='rbf', random_state=42)
y_pred_svm, accuracy_svm = _fit_and_score(svm, 'Support Vector Machine', key='SVM')

# 5. Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)
y_pred_gb, accuracy_gb = _fit_and_score(gb, 'Gradient Boosting')

print("="*50)
print("All models trained successfully!")
print("="*50)

## 9. Compare Model Performance

In [None]:
# Rank models from most to least accurate. sorted() is stable, so models with
# equal accuracy keep the order in which they were trained.
ranking = sorted(results.items(), key=lambda item: item[1], reverse=True)

# Display results
print("\n🏆 MODEL PERFORMANCE COMPARISON\n")
print("-" * 40)
for model_name, accuracy in ranking:
    print(f"{model_name:.<25} {accuracy:.2%}")
print("-" * 40)

# Best model: the first key with the highest accuracy in `results`.
# NOTE(review): the winner is picked using the accuracies stored in `results`;
# if those were measured on the test split, the reported figure is an
# optimistic estimate for the selected model.
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
best_accuracy = results[best_model_name]

print(f"\n✓ BEST MODEL: {best_model_name}")
print(f"✓ Accuracy: {best_accuracy:.2%}")

## 10. Visualize Model Comparison

In [None]:
# Bar chart of accuracies
plt.figure(figsize=(10, 6))
models_list = list(results.keys())
accuracies = list(results.values())

# Highlight the top-scoring bar(s) in green; everything else is blue
top_accuracy = max(accuracies)
colors = ['#2ecc71' if acc == top_accuracy else '#3498db' for acc in accuracies]
bars = plt.bar(models_list, accuracies, color=colors, edgecolor='black', linewidth=2)

plt.title('Model Performance Comparison', fontsize=14, fontweight='bold')
plt.xlabel('Models', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)

# Keep the zoomed-in 0.9 floor when every model clears it, but lower the
# floor when a model scores below 0.9 so its bar is not hidden. The upper
# limit leaves headroom for the value label above a bar near 1.0.
# The axis does not start at zero, so bar heights exaggerate differences.
y_floor = min(0.9, max(0.0, min(accuracies) - 0.05))
plt.ylim([y_floor, 1.02])

# Add value labels on bars
for bar, acc in zip(bars, accuracies):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.002,
             f'{acc:.2%}', ha='center', va='bottom', fontweight='bold')

plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.grid(axis='y', alpha=0.3)
plt.show()

## 11. Classification Report for Best Model

In [None]:
# Run the selected model on the test features and print its per-class metrics
banner = '=' * 60
y_pred_best = best_model.predict(X_test)

print(f"\n{banner}")
print(f"Classification Report for {best_model_name}")
print(f"{banner}\n")
print(classification_report(y_test, y_pred_best))

## 12. Make Predictions on New Data

In [None]:
# Example: Predict crop for specific soil and climate conditions
def predict_crop(N, P, K, temperature, humidity, ph, rainfall):
    """
    Predict the best crop for given environmental conditions.

    The seven values are scaled with the notebook-level ``scaler`` and passed
    to the notebook-level ``best_model``.

    Parameters
    ----------
    N, P, K, temperature, humidity, ph, rainfall : number
        One value per feature, in the same positional order as the columns
        the scaler was fitted on.

    Returns
    -------
    The first (and only) element of ``best_model.predict(...)``.
    """
    # Create input data (a single row)
    values = [[N, P, K, temperature, humidity, ph, rainfall]]

    # When the scaler was fitted on a DataFrame it records the column names;
    # passing a nameless array then triggers a feature-name mismatch warning.
    # Reuse the recorded names (assigned positionally) when they are available.
    fitted_names = getattr(scaler, "feature_names_in_", None)
    if fitted_names is not None and len(fitted_names) == len(values[0]):
        input_data = pd.DataFrame(values, columns=list(fitted_names))
    else:
        input_data = np.array(values)

    # Scale the input
    input_scaled = scaler.transform(input_data)

    # Predict using best model
    prediction = best_model.predict(input_scaled)

    return prediction[0]

# Test predictions
print("\n🌾 CROP PREDICTION EXAMPLES\n")
print("="*60)

# Each scenario pairs the description shown to the reader with the keyword
# arguments passed to predict_crop().
example_scenarios = [
    # Example 1: High nitrogen, good rainfall
    ("High N, High humidity, Good rainfall",
     dict(N=90, P=42, K=43, temperature=20.87, humidity=82.0, ph=6.5, rainfall=202.9)),
    # Example 2: Dry conditions
    ("Low N, Low humidity, Low rainfall",
     dict(N=30, P=25, K=35, temperature=35.0, humidity=45.0, ph=7.0, rainfall=50.0)),
    # Example 3: Medium conditions
    ("Medium N, Medium humidity, Medium rainfall",
     dict(N=60, P=35, K=40, temperature=25.0, humidity=65.0, ph=6.8, rainfall=120.0)),
]

example_predictions = []
for description, conditions in example_scenarios:
    predicted = predict_crop(**conditions)
    example_predictions.append(predicted)
    print(f"\nConditions: {description}")
    print(f"Predicted Crop: {predicted}")

# Keep the per-example result names available to any later cell
predicted_crop_1, predicted_crop_2, predicted_crop_3 = example_predictions

print("\n" + "="*60)

## 13. Feature Importance (for Random Forest)

In [None]:
# Get feature importance from Random Forest
# Display labels; indexed by the same positions as rf.feature_importances_
feature_names = ['N', 'P', 'K', 'Temperature', 'Humidity', 'pH', 'Rainfall']
importances = rf.feature_importances_

# Sort by importance (argsort is ascending, so reverse for largest first)
indices = np.argsort(importances)[::-1]

print("\n📊 FEATURE IMPORTANCE (Random Forest)\n")
print("-" * 40)
for i, idx in enumerate(indices, 1):
    print(f"{i}. {feature_names[idx]:.<20} {importances[idx]:.4f}")
print("-" * 40)

# Visualize
plt.figure(figsize=(10, 6))
plt.barh(range(len(indices)), importances[indices], color='#3498db', edgecolor='black')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
# barh draws position 0 at the bottom; flip the axis so the most important
# feature sits at the top, matching the printed ranking.
plt.gca().invert_yaxis()
plt.xlabel('Importance', fontsize=12)
plt.title('Feature Importance in Random Forest', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.grid(axis='x', alpha=0.3)
plt.show()

## 14. Summary and Conclusions

In [None]:
# Final recap. Counts are derived from the objects built earlier in the
# notebook so the text cannot drift from what was actually run.
print("\n" + "="*60)
print("CROP RECOMMENDATION SYSTEM - SUMMARY")
print("="*60)

print(f"""
📊 DATASET INFORMATION:
  • Total Samples: {len(data)}
  • Features: {len(feature_names)} ({', '.join(feature_names)})
  • Target Classes: {len(unique_crops)} crops

📈 MODEL PERFORMANCE:
  • Best Model: {best_model_name}
  • Accuracy: {best_accuracy:.2%}
  • Test Set Size: {len(X_test)} samples

🎯 KEY FEATURES:
  • Preprocessed features using StandardScaler
  • Trained {len(models)} different ML algorithms
  • Evaluated using accuracy and classification metrics
  • Made predictions on new environmental conditions

💡 INSIGHTS:
  • Most important feature: {feature_names[indices[0]]}
  • Best model's accuracy on the test set: {best_accuracy:.2%}
  • Model is ready for deployment

✅ PROJECT COMPLETE!
""")

print("="*60)