In [6]:
import pandas as pd
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score

# Read the crop dataset from the working directory and print how many rows
# carry each value of the 'label' column.
data = pd.read_csv('Crop_recommendation.csv')
va = data.loc[:, 'label'].value_counts()
print(va)

label
rice           100
maize          100
chickpea       100
kidneybeans    100
pigeonpeas     100
mothbeans      100
mungbean       100
blackgram      100
lentil         100
pomegranate    100
banana         100
mango          100
grapes         100
watermelon     100
muskmelon      100
apple          100
orange         100
papaya         100
coconut        100
cotton         100
jute           100
coffee         100
Name: count, dtype: int64


In [7]:
# Encode crop names as integer class ids. The encoded values live in their own
# Series instead of being written back into `data`, so re-running this cell
# never re-fits the encoder on already-encoded integers (which would make
# `le.inverse_transform` return numbers instead of crop names).
le = LabelEncoder()
y = pd.Series(le.fit_transform(data['label']), index=data.index, name='label')

# Every column except the target is used as a feature.
X = data.drop('label', axis=1)

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Scaling + classifier are bundled so the same preprocessing is applied at
# prediction time. The forest is seeded so accuracy numbers are reproducible
# across kernel restarts.
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize Input Features
    ('model', RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42))
])

# Train Pipeline
pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Convert Predictions Back to Crop Names
y_pred_labels = le.inverse_transform(y_pred)

print("Predicted Crops:", y_pred_labels[:10])
print(f'Random Forest Model Accuracy Score: {accuracy:.4f}')

Predicted Crops: ['muskmelon' 'watermelon' 'papaya' 'papaya' 'apple' 'mango' 'apple'
 'mothbeans' 'mungbean' 'lentil']
Random Forest Model Accuracy Score: 0.9788


In [8]:
from sklearn.model_selection import cross_val_score, KFold

# 10-fold cross-validation on the training split (shuffled, fixed seed).
# Each fold re-fits a fresh clone of `pipeline`, so the scaler is fitted only
# on that fold's training portion.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='accuracy')

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
print("Standard Deviation CV Accuracy:", cv_scores.std())

Cross-Validation Scores: [0.99350649 0.98051948 1.         0.98701299 1.         0.98051948
 0.98701299 1.         0.99350649 0.96103896]
Mean CV Accuracy: 0.9883116883116884
Standard Deviation CV Accuracy: 0.01154310963287739


In [9]:
from sklearn.model_selection import StratifiedKFold

# Repeat the cross-validation with stratified folds: 5 shuffled, seeded splits
# of the training data, scored by accuracy.
skf = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)
cv_scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=skf,
)

print("Stratified CV Scores:", cv_scores)
print("Mean Stratified CV Accuracy:", cv_scores.mean())

Stratified CV Scores: [0.99675325 0.97077922 0.98701299 0.98701299 0.99025974]
Mean Stratified CV Accuracy: 0.9863636363636363


## hyperparameter tuning 

In [10]:
from sklearn.model_selection import GridSearchCV

# Search space for the pipeline's 'model' step: 3 forest sizes x 3 tree depths.
param_grid = dict(
    model__n_estimators=[50, 100, 200],
    model__max_depth=[5, 6, 7],
)

# Exhaustive search over the grid, scored by accuracy on the stratified folds.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=skf,
)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

# Keep the winning pipeline and check it against the held-out test split.
best_pipeline = grid_search.best_estimator_
y_pred_best = best_pipeline.predict(X_test)
print("Best Accuracy:", accuracy_score(y_test, y_pred_best))

Best Parameters: {'model__max_depth': 7, 'model__n_estimators': 200}
Best Accuracy: 0.990909090909091


## sample data prediction

In [11]:
import numpy as np
# One hand-written sample. Values are assumed to follow the column order of the
# training frame `X` — TODO confirm against the CSV header.
# NOTE(review): the first value (1150) and the sixth (30.9) are far outside the
# ranges used by the other samples in this notebook — confirm these are not typos.
sample_data = np.array([[1150, 12, 43, 36, 35.95, 30.9, 50]])

# The pipeline was fitted on a DataFrame, so give the sample the same column
# names; passing the bare array makes scikit-learn warn about missing feature names.
sample_df = pd.DataFrame(sample_data, columns=X.columns)

# Predict Crop Recommendation
predicted_label = best_pipeline.predict(sample_df)
crop_name = le.inverse_transform(predicted_label)[0]  # Extract the single string

print(f"Recommended Crop: {crop_name}")

Recommended Crop: watermelon




In [12]:
import numpy as np
import pandas as pd

# Hand-picked inputs, one row per crop; the trailing comment on each row is the
# crop the author expects the model to recommend.
samples = np.array([
    [90, 42, 43, 20.88, 82.00, 6.50, 202.94],      # Rice
    [60, 55, 44, 23.00, 82.32, 6.80, 263.96],      # Maize
    [40, 60, 80, 18.87, 14.65, 7.13, 67.63],       # Chickpea
    [3, 76, 8, 20.82, 17.85, 6.72, 79.21],         # Kidneybeans
    [100, 10, 50, 25.59, 94.73, 6.45, 112.36],     # Watermelon
    [120, 20, 50, 27.95, 92.34, 6.63, 97.57],      # Muskmelon
    [20, 130, 200, 22.94, 92.40, 5.58, 104.63],    # Apple
    [20, 30, 30, 31.21, 50.35, 5.98, 94.70],       # Mango
    [100, 20, 30, 25.63, 58.76, 6.79, 162.58],     # Coffee
    [120, 40, 20, 23.85, 79.64, 7.35, 117.43]      # Cotton
])

expected_crops = [
    'rice', 'maize', 'chickpea', 'kidneybeans', 'watermelon',
    'muskmelon', 'apple', 'mango', 'coffee', 'cotton'
]

# Take the column names from the training frame instead of hard-coding them, so
# the DataFrame always matches what the pipeline was fitted on (and no
# feature-name warning is raised). The sample rows are assumed to be written in
# that same column order — TODO confirm against the CSV header.
feature_names = list(X.columns)
samples_df = pd.DataFrame(samples, columns=feature_names)

# Predict using your trained best_pipeline
predictions = best_pipeline.predict(samples_df)
crop_names = le.inverse_transform(predictions)

# Display results, numbering samples from 1
for sample_no, (pred, expected) in enumerate(zip(crop_names, expected_crops), start=1):
    print(f"Sample {sample_no}: Predicted = {pred}, Expected = {expected}")

Sample 1: Predicted = rice, Expected = rice
Sample 2: Predicted = rice, Expected = maize
Sample 3: Predicted = chickpea, Expected = chickpea
Sample 4: Predicted = kidneybeans, Expected = kidneybeans
Sample 5: Predicted = watermelon, Expected = watermelon
Sample 6: Predicted = papaya, Expected = muskmelon
Sample 7: Predicted = apple, Expected = apple
Sample 8: Predicted = mango, Expected = mango
Sample 9: Predicted = coffee, Expected = coffee
Sample 10: Predicted = cotton, Expected = cotton


In [16]:
import sklearn
# Print the installed scikit-learn version.
# NOTE(review): presumably recorded because the persisted model is tied to the
# library version it was saved with — confirm.
print(sklearn.__version__)
# Disabled legacy loader, kept for reference only: it expects a single pickle
# file holding a (pipeline, le) tuple. The notebook's save cell writes two
# joblib files instead (model_artifacts/model.joblib and model_artifacts/le.joblib),
# so this path does not match what is saved. pickle.load must never be used on
# untrusted files.
# with open("model_artifacts/final_model.pkl", "rb") as model_file:
#     pipeline, le = pickle.load(model_file)

# Input feature order noted by the author: (N, P, K, Temperature, Humidity, pH, Rainfall)


1.7.1


In [18]:
import joblib

# Make sure the output directory exists first; joblib.dump only opens the target
# path and fails if the parent folder is missing (e.g. on a fresh checkout).
os.makedirs("model_artifacts", exist_ok=True)

# Persist the tuned pipeline (scaler + forest) and the label encoder as two
# separate files; both are needed to turn raw inputs into crop names.
joblib.dump(best_pipeline, "model_artifacts/model.joblib")
joblib.dump(le, "model_artifacts/le.joblib")

# Report the paths that were actually written.
print("Final Model (with Preprocessing) and Label Encoder Saved Successfully as "
      "'model_artifacts/model.joblib' and 'model_artifacts/le.joblib'!")

Final Model (with Preprocessing) Saved Successfully as 'final_model.joblib'!


## further testing of model

In [None]:
import pandas as pd
import numpy as np
# cross_val_score is imported here too, so this self-contained cell does not
# depend on an earlier cell having been run first.
from sklearn.model_selection import (
    train_test_split, cross_val_score, KFold, StratifiedKFold, GridSearchCV
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score

# Load Data (re-read from disk so the 'label' column holds crop names again)
data = pd.read_csv('Crop_recommendation.csv')

# Encode the target variable
le = LabelEncoder()
data['label'] = le.fit_transform(data['label'])

# Split features and target
X = data.drop('label', axis=1)
y = data['label']

# Train-test split (30% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Define pipeline; the forest is seeded so every score below is reproducible
# from one run of this cell to the next.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42))
])

# Train initial pipeline
pipeline.fit(X_train, y_train)

# Evaluate initial model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
y_pred_labels = le.inverse_transform(y_pred)
print("Predicted Crops (first 10):", y_pred_labels[:10])
print(f'Random Forest Model Accuracy Score: {accuracy:.4f}')

# K-Fold Cross-Validation (5 shuffled folds of the training split)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='accuracy')
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
print("Standard Deviation CV Accuracy:", cv_scores.std())

# Stratified K-Fold Cross-Validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=skf, scoring='accuracy')
print("Stratified CV Scores:", cv_scores)
print("Mean Stratified CV Accuracy:", cv_scores.mean())

# Grid Search over forest size and tree depth, scored on the stratified folds
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [5, 6, 7],
}
grid_search = GridSearchCV(pipeline, param_grid, cv=skf, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

# Best model evaluation on the held-out test split
best_pipeline = grid_search.best_estimator_
y_pred_best = best_pipeline.predict(X_test)
print("Best Accuracy on Test Set:", accuracy_score(y_test, y_pred_best))

# Sample Prediction with DataFrame to avoid warnings: the column names come
# from the training frame so they match what the pipeline was fitted on.
feature_names = X.columns
sample_data = np.array([[68, 58, 38, 23.223974, 83.033227, 6.336254, 221.209196]])
sample_data_df = pd.DataFrame(sample_data, columns=feature_names)
predicted_label = best_pipeline.predict(sample_data_df)
crop_name = le.inverse_transform(predicted_label)[0]
print("Predicted Label (encoded):", predicted_label)
print("Label Encoder Classes:", le.classes_)
print(f"Recommended Crop: {crop_name}")


Predicted Crops (first 10): ['muskmelon' 'watermelon' 'papaya' 'papaya' 'apple' 'mango' 'apple'
 'mothbeans' 'mungbean' 'lentil']
Random Forest Model Accuracy Score: 0.9788
Cross-Validation Scores: [0.98701299 0.99025974 0.99025974 0.99675325 0.98701299]
Mean CV Accuracy: 0.9902597402597403
Standard Deviation CV Accuracy: 0.003556639983799771
Stratified CV Scores: [0.99675325 0.97077922 0.99675325 0.97727273 0.99675325]
Mean Stratified CV Accuracy: 0.9876623376623378
Best Parameters: {'model__max_depth': 7, 'model__n_estimators': 200}
Best Accuracy on Test Set: 0.9878787878787879
Predicted Label (encoded): [20]
Label Encoder Classes: ['apple' 'banana' 'blackgram' 'chickpea' 'coconut' 'coffee' 'cotton'
 'grapes' 'jute' 'kidneybeans' 'lentil' 'maize' 'mango' 'mothbeans'
 'mungbean' 'muskmelon' 'orange' 'papaya' 'pigeonpeas' 'pomegranate'
 'rice' 'watermelon']
Recommended Crop: rice


In [None]:
# One more hand-written sample, assumed to follow the column order in
# `feature_names` — TODO confirm.
# NOTE(review): the fourth value (2.774637) is much lower than the fourth value
# of every other sample in this notebook — confirm it is not a typo.
sample_data_raw = np.array([[10, 34, 32, 2.774637, 66.413269, 6.780064, 177.774507]])

# Attach the training column names before predicting; the pipeline was fitted on
# a DataFrame and scikit-learn warns when it is given a bare array.
sample_data_raw_df = pd.DataFrame(sample_data_raw, columns=feature_names)
predicted_label_raw = best_pipeline.predict(sample_data_raw_df)
crop_name_raw = le.inverse_transform(predicted_label_raw)[0]
print(f"Recommended Crop (sample_data_raw): {crop_name_raw}")

Recommended Crop (sample_data_raw): mango




In [None]:
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder

# Re-read the dataset so the 'label' column contains the original crop names.
data = pd.read_csv("Crop_recommendation.csv")

# Fit a fresh encoder on the crop names and store the integer codes in place.
le = LabelEncoder()
data['label'] = le.fit_transform(data['label'])

# Position in `le.classes_` is the encoded id, so enumerating the classes
# yields the id -> crop-name lookup directly.
label_mapping = dict(enumerate(le.classes_))

# Write the lookup to disk as pretty-printed JSON (the JSON format stores the
# integer keys as strings).
with open("label_mapping.json", "w") as f:
    json.dump(label_mapping, f, indent=4)

print("Label mapping saved to label_mapping.json")



Label mapping saved to label_mapping.json
