## Supervised Learning

1. Problem Definition and Objective Setting

    Our goal is to classify which crop is most suitable given specific soil and weather conditions.

2. Data Collection and Preparation

    Load and inspect the data.

In [2]:
import pandas as pd

# Load dataset
data = pd.read_csv('crop_recommendation.csv')

# Quick data overview
print(data.head())
print(data['label'].value_counts())


    N   P   K  temperature   humidity        ph    rainfall label
0  90  42  43    20.879744  82.002744  6.502985  202.935536  rice
1  85  58  41    21.770462  80.319644  7.038096  226.655537  rice
2  60  55  44    23.004459  82.320763  7.840207  263.964248  rice
3  74  35  40    26.491096  80.158363  6.980401  242.864034  rice
4  78  42  42    20.130175  81.604873  7.628473  262.717340  rice
label
rice           100
maize          100
chickpea       100
kidneybeans    100
pigeonpeas     100
mothbeans      100
mungbean       100
blackgram      100
lentil         100
pomegranate    100
banana         100
mango          100
grapes         100
watermelon     100
muskmelon      100
apple          100
orange         100
papaya         100
coconut        100
cotton         100
jute           100
coffee         100
Name: count, dtype: int64


3. Data Preprocessing

    Handle missing values, encode categorical labels, and split the data.

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Fill missing values only for numeric columns
data.fillna(data.select_dtypes(include='number').mean(), inplace=True)

# Encode target variable (Crop/label) if it’s categorical
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Split features and target
X = data.drop('label', axis=1)
y = data['label']

# Split data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)


In [5]:
X_train

array([[-8.14149162e-01, -8.22608476e-01, -4.17586751e-01, ...,
        -1.10914730e+00, -1.00850068e+00, -1.14762954e-01],
       [-5.99794073e-01, -5.52511028e-01, -3.98018725e-01, ...,
        -8.39738838e-01,  6.40463882e-01, -2.12947619e-01],
       [ 2.30831896e-01, -2.82413580e-01, -4.95858854e-01, ...,
        -8.13537964e-02, -9.78595756e-01, -4.59356367e-01],
       ...,
       [-1.08209302e+00, -5.22500201e-01, -3.39314648e-01, ...,
        -9.20572349e-01,  6.00471872e-04, -3.53408620e-02],
       [-1.08209302e+00,  2.11845263e+00,  3.04595380e+00, ...,
         3.79045864e-01, -1.48070939e-01, -5.55371242e-01],
       [-5.19410914e-01,  7.37954558e-01, -5.15426879e-01, ...,
        -4.34666852e-01,  6.91994073e-01, -8.79579938e-01]])

In [6]:
X_val

array([[ 0.07006558,  0.40783545,  0.07161389, ...,  0.98735509,
         0.14452437,  0.70492223],
       [ 1.83849506, -1.03268427,  0.01290981, ...,  0.57210327,
         0.37390708, -0.82474019],
       [-0.5462053 , -0.40245689, -0.51542688, ...,  0.65904539,
        -0.14238395, -0.88055647],
       ...,
       [-0.33185021, -0.88263013, -0.18277044, ...,  0.98828113,
        -0.05686553,  0.13673526],
       [ 1.27581295,  0.04770552, -0.26104255, ...,  0.38697818,
         0.03473976,  3.16692774],
       [-0.5462053 , -0.82260848, -0.76981121, ...,  0.98505646,
         1.41920673,  0.01767524]])

In [7]:
X_test

array([[-1.00170986, -1.03268427, -0.67197108, ...,  1.03567652,
         0.29156921,  0.10531156],
       [-0.84094355,  0.46785711, -0.65240306, ..., -2.2532899 ,
        -0.70344132,  0.09359675],
       [-1.13568179, -1.27277089, -0.78937924, ...,  1.03713991,
         0.10957597,  0.17228478],
       ...,
       [-0.33185021, -0.8526193 , -0.24147452, ...,  1.06511878,
        -1.17989855,  0.09713946],
       [-0.38543898, -1.27277089, -0.24147452, ...,  0.63146481,
         0.30584783,  0.08472861],
       [-0.43902776,  2.14846345,  3.0459538 , ...,  0.38298874,
        -0.47764686, -0.51401723]])

4. Model Selection

    Select a classification model. Here, we’ll use a Random Forest classifier, which often works well with tabular data.

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Initialize model
model = RandomForestClassifier(random_state=42)

5. Model Training with Fine-Tuning

    Train the model and then fine-tune the number of trees in the forest (using n_estimators).

In [9]:
from sklearn.metrics import accuracy_score

# Initial model training
model.fit(X_train, y_train)

# Evaluate initial model performance on validation set
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Initial Validation Accuracy: {val_accuracy:.2f}')

# Fine-tuning: Try different numbers of trees
fine_tuned_accuracies = []
for n_estimators in [50, 100, 150, 200, 250]:
    fine_tuned_model = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    fine_tuned_model.fit(X_train, y_train)
    y_val_pred = fine_tuned_model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    fine_tuned_accuracies.append((n_estimators, val_accuracy))
    print(f'Validation Accuracy with n_estimators={n_estimators}: {val_accuracy:.2f}')

# Select the best n_estimators value
best_n, best_accuracy = max(fine_tuned_accuracies, key=lambda x: x[1])
print(f'\nBest n_estimators after fine-tuning: {best_n}, with Validation Accuracy: {best_accuracy:.2f}')

# Re-Train the final model with the best-found parameter
final_model = RandomForestClassifier(n_estimators=best_n, random_state=42)
final_model.fit(X_train, y_train)


Initial Validation Accuracy: 0.99
Validation Accuracy with n_estimators=50: 0.99
Validation Accuracy with n_estimators=100: 0.99
Validation Accuracy with n_estimators=150: 0.99
Validation Accuracy with n_estimators=200: 0.99
Validation Accuracy with n_estimators=250: 0.99

Best n_estimators after fine-tuning: 150, with Validation Accuracy: 0.99


6. Model Evaluation

    Evaluate the final model on the validation set.

In [10]:
# Evaluate the fine-tuned model on the validation set
y_val_pred = final_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Final Validation Accuracy: {val_accuracy:.2f}')


Final Validation Accuracy: 0.99


7. Model Testing

    Test the model on the test dataset.

In [11]:
# Test the final model on the test set
y_test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.2f}')


Test Accuracy: 0.99


8. Model Deployment

    Save the final model for deployment.

In [76]:
import joblib

# Save the model
joblib.dump(final_model, 'crop_recommendation_model.joblib')


['crop_recommendation_model.joblib']

9. Monitoring and Maintenance

    For monitoring, you can write a function that logs predictions and performance.

In [78]:

# Load the model 
loaded_model = joblib.load('crop_recommendation_model.joblib')

In [79]:
sample = [[-1.00170986, -1.03268427, -0.67197108, -1.57036581,  1.03567652,
         0.29156921,  0.10531156]]

import numpy as np

# Convert to a NumPy array
sample_features = np.array(sample)

In [80]:
# Ensure features is a 2D array
if sample_features.ndim == 1:
    sample_features = sample_features.reshape(1, -1)

In [81]:
prediction = loaded_model.predict(sample_features)

In [82]:
print(prediction.shape)

(1,)


In [83]:
crop_name = label_encoder.inverse_transform(prediction)  # Decode back to original crop names
print(f'Recommended Crop: {crop_name[0]}')

Recommended Crop: orange
