In [1]:
import pandas as pd

# Load the dataset (adjust path if filename has typo; use 'Crop_recommendation.csv')
data_path = '../data/Crop_recommendation.csv'
df = pd.read_csv(data_path)

# Inspect
print(df.head())
print(df.info())
print(df['label'].value_counts())  # Crop distribution

    N   P   K  temperature   humidity        ph    rainfall label
0  90  42  43    20.879744  82.002744  6.502985  202.935536  rice
1  85  58  41    21.770462  80.319644  7.038096  226.655537  rice
2  60  55  44    23.004459  82.320763  7.840207  263.964248  rice
3  74  35  40    26.491096  80.158363  6.980401  242.864034  rice
4  78  42  42    20.130175  81.604873  7.628473  262.717340  rice
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB
None
label
ric

In [3]:
df = df.dropna()

In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Features (X) and target (y)
X = df.drop('label', axis=1)
y = df['label']

# Scale all features (all numerical)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Split into train/test (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")

Training set shape: (1760, 7), Test set shape: (440, 7)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score,confusion_matrix

# create instances of all models
models = {
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': GaussianNB(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Bagging': BaggingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Extra Trees': ExtraTreeClassifier(),
}


for name, model in models.items():
    model.fit(X_train,y_train)
    ypred = model.predict(X_test)
    
    print(f"{name}  with accuracy : {accuracy_score(y_test,ypred)}")
    print("Confusion matrix : ",confusion_matrix(y_test,ypred))
    print("==========================================================")
    

Logistic Regression  with accuracy : 0.9636363636363636
Confusion matrix :  [[23  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 21  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 19  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 26  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 27  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 17  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 14  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  1  0  0 20  0  0  0  0  0  0  0  0  1  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  0 19  0  0  0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 11  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0  0 20  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 19  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  

In [11]:
import joblib

# Save
joblib.dump(model, '../model/crop_rec_model.pkl')
joblib.dump(scaler, '../model/scaler.pkl')

print("Model saved to model folder!")

Model saved to model folder!


In [14]:
import joblib
import pandas as pd
import numpy as np

# Load the model and scaler (assuming paths from your setup)
loaded_model = joblib.load('../model/crop_rec_model.pkl')
loaded_scaler = joblib.load('../model/scaler.pkl')

# Sample input (replace with your actual input values)
sample_input = pd.DataFrame([[87, 22, 13, 40.88, 12.00, 4.50, 102.94]], 
                            columns=['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall'])

# Scale the input
sample_input_scaled = loaded_scaler.transform(sample_input)

# Get probabilities for all classes
probs = loaded_model.predict_proba(sample_input_scaled)[0]

# Get the class names (crops)
classes = loaded_model.classes_

# Get indices of top 3 probabilities (descending order)
top3_indices = np.argsort(probs)[-3:][::-1]

# Get top 3 crops and their probabilities
top3_crops = [(classes[i], probs[i]) for i in top3_indices]

# Print the results
for crop, prob in top3_crops:
    print(f"Crop: {crop}, Probability: {prob:.4f}")

Crop: pigeonpeas, Probability: 1.0000
Crop: watermelon, Probability: 0.0000
Crop: rice, Probability: 0.0000




In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
import numpy as np

# Train a probability-friendly model (predict_proba is well-defined)
rf_model = RandomForestClassifier(n_estimators=300, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate
y_pred_rf = rf_model.predict(X_test)
print("RandomForest accuracy:", accuracy_score(y_test, y_pred_rf))

# Quick probability sanity check on one test sample
probs = rf_model.predict_proba(X_test[:1])[0]
print("Proba head:", [round(float(p), 4) for p in probs[:10]])
print("Sum:", float(np.sum(probs)))

# Save as production model (overwrite)
joblib.dump(rf_model, '../model/crop_rec_model.pkl')
print("Saved RandomForest model to ../model/crop_rec_model.pkl")
