In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
import pickle

# Manual mapping from crop name to integer code.
# The keys are matched against the values of the CSV's 'label' column below.
crop_dict = {
    'rice': 1,
    'maize': 2,
    'jute': 3,
    'cotton': 4,
    'coconut': 5,
    'papaya': 6,
    'orange': 7,
    'apple': 8,
    'muskmelon': 9,
    'watermelon': 10,
    'grapes': 11,
    'mango': 12,
    'banana': 13,
    'pomegranate': 14,
    'lentil': 15,
    'blackgram': 16,
    'mungbean': 17,
    'mothbeans': 18,
    'pigeonpeas': 19,
    'kidneybeans': 20,
    'chickpea': 21,
    'coffee': 22
}

# Load Data
crop = pd.read_csv("Crop_recommendation.csv")

# Map string labels to numerical values.
# Series.map() silently produces NaN for any label that is not a key of crop_dict,
# and that NaN would only surface much later (label encoding / name lookup).
# Fail here instead, naming the offending labels.
unknown_labels = sorted(set(crop['label'].dropna().unique()) - set(crop_dict), key=str)
if unknown_labels:
    raise ValueError(f"Labels in Crop_recommendation.csv missing from crop_dict: {unknown_labels}")
crop['label'] = crop['label'].map(crop_dict)

## Exploratory Data Analysis (EDA)

In [2]:
# Basic structural overview of the loaded dataset
print("Dataset Shape:", crop.shape)
print("\nDataset Info:")
crop.info()
print("\nMissing Values:\n", crop.isnull().sum())
print("\nDuplicate Rows:", crop.duplicated().sum())
print("\nSummary Statistics:\n", crop.describe())

# Debug: Check unique labels after mapping
print("\nUnique Labels in Dataset (after mapping):\n", crop['label'].unique())

# Visualization 1: Distribution of Features
plt.figure(figsize=(15, 10))
features = ['N', 'P', 'K', 'temperature', 'humidity', 'ph']
colors = ['#8B4513', '#D2B48C', '#6B8E23', '#F5F5DC', '#A8B5A2', '#D4B996']
for i, (feature, color) in enumerate(zip(features, colors), 1):
    plt.subplot(2, 3, i)
    sns.histplot(crop[feature], bins=30, kde=True, color=color)
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# Visualization 2: Crop Distribution
# Work on a derived frame carrying the readable crop name, so `crop` itself is
# never mutated by the EDA (no temporary column to add and remove again).
inv_crop_dict = {v: k for k, v in crop_dict.items()}
crop_named = crop.assign(label_name=crop['label'].map(inv_crop_dict))
crop_order = crop_named['label_name'].value_counts().index
# `colors` holds only 6 entries, so slicing it by the number of crops still gives
# 6 colours. Build a palette with exactly one distinct colour per crop instead.
crop_palette = sns.color_palette('husl', n_colors=len(crop_order))

plt.figure(figsize=(12, 6))
sns.countplot(data=crop_named, x='label_name', palette=crop_palette, order=crop_order)
plt.title('Distribution of Crops in Dataset')
plt.xlabel('Crop')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Visualization 3: Pair Plot
# The integer `label` column is the hue in numeric disguise, not a feature, so it
# is excluded from the grid of plotted variables.
sns.pairplot(
    crop_named.drop(columns='label'),
    hue='label_name',
    hue_order=list(crop_order),
    palette=crop_palette,
    diag_kind='kde',
)
plt.suptitle('Pairwise Relationships of Features by Crop', y=1.02)
plt.show()

# Visualization 4: Correlation Heatmap
# Feature columns only: the label codes are arbitrary identifiers, so a Pearson
# correlation against them carries no meaning.
numeric_crop = crop.drop(columns='label').select_dtypes(include=['number'])
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_crop.corr(), annot=True, cmap='YlOrBr', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Features')
plt.tight_layout()
plt.show()

## Preprocessing

In [3]:
# Drop 'rainfall' (assuming domain knowledge justifies this; otherwise, analyze feature importance)
# errors='ignore' keeps the cell re-runnable: once the column is gone, a second
# execution would otherwise raise KeyError.
crop = crop.drop(columns='rainfall', errors='ignore')

# Correlation Analysis
# Restricted to the feature columns: the label codes are arbitrary identifiers,
# so their linear correlation with the features is not meaningful.
numeric_crop = crop.drop(columns='label').select_dtypes(include=['number'])
print("\nCorrelation Matrix:\n", numeric_crop.corr())

# Prepare features and target
X = crop.drop('label', axis=1)
y = crop['label']

# Encode labels: LabelEncoder re-indexes the sorted crop codes to 0..n_classes-1.
# The model is trained on these indices, and `le.inverse_transform` is what turns
# a prediction back into a crop code.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Debug: Check LabelEncoder classes
print("\nLabelEncoder Classes:", le.classes_)
print("Type of LabelEncoder Classes:", type(le.classes_[0]))

## Model Training

In [4]:
# Hold out 20% of the rows for the final evaluation; stratifying on the encoded
# target keeps the class proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Standardise the features: the scaler is fitted on the training rows only and
# then applied unchanged to the held-out rows.
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

# Search space for the random forest
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold grid search over a seeded forest, using all available cores
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Keep the winning configuration (already refitted on the full training set)
randclf = grid_search.best_estimator_
print("\nBest Parameters from GridSearchCV:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Per-fold accuracy of the selected model on the training data
cv_scores = cross_val_score(
    randclf,
    X_train_scaled,
    y_train,
    cv=5,
    scoring='accuracy',
)
for caption, value in (
    ("\nCross-Validation Scores:", cv_scores),
    ("Mean CV Score:", cv_scores.mean()),
    ("Standard Deviation CV Score:", cv_scores.std()),
):
    print(caption, value)

## Model Evaluation

In [5]:
# Evaluate on test set
y_pred = randclf.predict(X_test_scaled)

# Confusion Matrix
# Derive the display names from the fitted encoder rather than from the key order
# of crop_dict: index i of the encoded target corresponds to le.classes_[i], so
# this stays correct even if the data does not contain every crop in crop_dict.
inv_crop_dict = {code: name for name, code in crop_dict.items()}
crop_labels = [inv_crop_dict[code] for code in le.classes_]
class_ids = np.arange(len(crop_labels))
print("\nCrop Labels for Confusion Matrix:", crop_labels)
# Passing `labels` explicitly fixes the matrix to one row/column per known class,
# so it always lines up with the tick labels.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(12, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=crop_labels, yticklabels=crop_labels)
plt.title('Confusion Matrix for Crop Recommendation System', pad=20)
plt.xlabel('Predicted Crops')
plt.ylabel('Actual Crops')
plt.tight_layout()
plt.show()

# Detailed classification report (same explicit class list as the matrix above)
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, labels=class_ids, target_names=crop_labels),
)

# Feature Importance
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': randclf.feature_importances_})
# Sort once and reuse the result for both the printout and the bar plot
importance_sorted = feature_importance.sort_values(by='Importance', ascending=False)
print("\nFeature Importance:\n", importance_sorted)

# Visualization: Feature Importance Bar Plot
plt.figure(figsize=(8, 6))
colors = ['#8B4513', '#D2B48C', '#6B8E23', '#F5F5DC', '#A8B5A2', '#D4B996']
sns.barplot(data=importance_sorted, x='Importance', y='Feature', palette=colors[:len(X.columns)])
plt.title('Feature Importance in Random Forest Model')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

## Recommendation Function

In [6]:
# Recommendation Function (returns crop name as string)
def recommendation(N, P, K, temperature, humidity, ph, verbose=True):
    """Predict a crop name for a single set of feature values.

    The six values are scaled with the notebook-level scaler ``sc``, classified
    with ``randclf``, decoded through ``le`` and finally translated to a crop
    name via ``crop_dict``.

    Parameters
    ----------
    N, P, K, temperature, humidity, ph
        Feature values, given in this order to the scaler and the model.
    verbose : bool, default True
        When True, print the raw features, the scaled features and the raw
        model prediction (the original debugging output). Pass False to
        silence it.

    Returns
    -------
    str
        The ``crop_dict`` key whose code matches the decoded prediction.

    Raises
    ------
    KeyError
        If the decoded prediction is not a value of ``crop_dict``.
    """
    features = np.array([[N, P, K, temperature, humidity, ph]])
    if verbose:
        print(f"Raw features: {features}")

    # A scaler fitted on a DataFrame records its column names and warns when it is
    # later given a bare array. Re-attach those names so the call is warning-free
    # and the column count is checked against what the scaler was fitted on.
    fitted_names = getattr(sc, 'feature_names_in_', None)
    if fitted_names is None:
        scaler_input = features
    else:
        scaler_input = pd.DataFrame(features, columns=fitted_names)

    sc_features = sc.transform(scaler_input)
    if verbose:
        print(f"After StandardScaler: {sc_features}")

    prediction = randclf.predict(sc_features)
    if verbose:
        print(f"Raw prediction: {prediction}")

    # Convert numerical prediction to crop name using crop_dict
    inv_crop_dict = {v: k for k, v in crop_dict.items()}
    return inv_crop_dict[le.inverse_transform(prediction)[0]]

## Test Recommendation with Visualization

In [7]:
# Example input for the recommender
N = 90
P = 42
K = 43
temperature = 20.879744
humidity = 82.002744
ph = 6.502985
predict = recommendation(N, P, K, temperature, humidity, ph)
print("\nPredicted Crop:", predict)

# Visualization: class probabilities for the sample predicted above.
# The probabilities are computed for this exact input, so the chart shows the
# confidence behind the printed recommendation rather than that of an unrelated
# row of the test set.
sample = np.array([[N, P, K, temperature, humidity, ph]])
scaler_columns = getattr(sc, 'feature_names_in_', None)
if scaler_columns is not None:
    # Re-attach the column names the scaler was fitted with (avoids the
    # feature-name warning raised for bare arrays).
    sample = pd.DataFrame(sample, columns=scaler_columns)
sample_confidence = randclf.predict_proba(sc.transform(sample))[0]

# One readable name per probability column, in the model's own class order
code_to_crop = {code: name for name, code in crop_dict.items()}
confidence_labels = [code_to_crop[code] for code in le.inverse_transform(randclf.classes_)]

plt.figure(figsize=(8, 6))
# A single colour: the 6-entry `colors` list cannot give each class its own colour
sns.barplot(x=confidence_labels, y=sample_confidence, color='#6B8E23')
plt.title('Confidence Scores for Sample Prediction')
plt.xlabel('Crop Index')
plt.ylabel('Confidence Score')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## Save the Model, Scaler, and Label Encoder

In [8]:
# Persist the fitted artefacts. Each file is opened in a `with` block so the
# handle is flushed and closed deterministically instead of being left to the
# garbage collector.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(randclf, model_file)
with open('standscaler.pkl', 'wb') as scaler_file:
    pickle.dump(sc, scaler_file)
with open('labelencoder.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)