In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
data = pd.read_csv('/kaggle/input/final-combined-dataset/filtered_dataset.csv')
data.head()

Unnamed: 0,Soil_color,Nitrogen,Phosphorus,Potassium,pH,Rainfall,Temperature,Crop,Fertilizer
0,Black,75.0,50.0,100.0,6.5,1000.0,20.0,Sugarcane,Urea
1,Black,80.0,50.0,100.0,6.5,1000.0,20.0,Sugarcane,Urea
2,Black,85.0,50.0,100.0,6.5,1000.0,20.0,Sugarcane,Urea
3,Black,90.0,50.0,100.0,6.5,1000.0,20.0,Sugarcane,Urea
4,Black,95.0,50.0,100.0,6.5,1000.0,20.0,Sugarcane,Urea


In [3]:
# columns_to_remove = ['District_Name', 'Fertilizer', 'Link']
# data = data.drop(columns=columns_to_remove)
# data.isnull().sum()

In [4]:
# Information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5073 entries, 0 to 5072
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Soil_color   5073 non-null   object 
 1   Nitrogen     5073 non-null   float64
 2   Phosphorus   5073 non-null   float64
 3   Potassium    5073 non-null   float64
 4   pH           5073 non-null   float64
 5   Rainfall     5073 non-null   float64
 6   Temperature  5073 non-null   float64
 7   Crop         5073 non-null   object 
 8   Fertilizer   5073 non-null   object 
dtypes: float64(6), object(3)
memory usage: 356.8+ KB


In [5]:
# # Exclude non-numeric columns
# numeric_dataset = data.select_dtypes(include=['float64', 'int64'])

# # Calculate correlation matrix
# corr = numeric_dataset.corr()
# print(corr)

In [6]:
# import seaborn as sns
# sns.heatmap(corr,annot=True,cbar=True , cmap='coolwarm')

In [7]:
data['Soil_color'].unique()

array(['Black', 'Red ', 'Medium Brown', 'Dark Brown', 'Red',
       'Reddish Brown', 'Light Brown', 'Brown', 'Reddish'], dtype=object)

In [8]:
# Define a mapping dictionary for Soil_Color
soil_color_mapping = {
    'Black': 1,
    'Red ': 2,  # Notice the trailing space, keep it as it appears in the data
    'Medium Brown': 3,
    'Dark Brown': 4,
    'Red': 5,
    'Light Brown': 6,
    'Reddish Brown': 7,
    'Brown' : 8,
    'Reddish': 9
}

# Map the Soil_Color values to integers
data['Soil_color'] = data['Soil_color'].map(soil_color_mapping)

In [9]:
data.head()

Unnamed: 0,Soil_color,Nitrogen,Phosphorus,Potassium,pH,Rainfall,Temperature,Crop,Fertilizer
0,1,75.0,50.0,100.0,6.5,1000.0,20.0,Sugarcane,Urea
1,1,80.0,50.0,100.0,6.5,1000.0,20.0,Sugarcane,Urea
2,1,85.0,50.0,100.0,6.5,1000.0,20.0,Sugarcane,Urea
3,1,90.0,50.0,100.0,6.5,1000.0,20.0,Sugarcane,Urea
4,1,95.0,50.0,100.0,6.5,1000.0,20.0,Sugarcane,Urea


In [10]:
data['Crop'].unique()

array(['Sugarcane', 'Jowar', 'Cotton', 'Rice', 'Wheat', 'Groundnut',
       'Maize', 'Urad', 'Soybean', 'Turmeric', 'Grapes', 'Potato',
       'Tomato'], dtype=object)

In [11]:
# Define a mapping dictionary for Crop
crop_mapping = {
    'Sugarcane': 1,
    'Jowar': 2,
    'Cotton': 3,
    'Rice': 4,
    'Wheat': 5,
    'Groundnut': 6,
    'Maize': 7,
    'Potato': 8,
    'Urad': 9,
    'Tomato': 10,
    'Soybean': 11,
    'Turmeric': 12,
    'Grapes': 13,
}

# Map the Crop values to integers
data['Crop'] = data['Crop'].map(crop_mapping)
data.head()

Unnamed: 0,Soil_color,Nitrogen,Phosphorus,Potassium,pH,Rainfall,Temperature,Crop,Fertilizer
0,1,75.0,50.0,100.0,6.5,1000.0,20.0,1,Urea
1,1,80.0,50.0,100.0,6.5,1000.0,20.0,1,Urea
2,1,85.0,50.0,100.0,6.5,1000.0,20.0,1,Urea
3,1,90.0,50.0,100.0,6.5,1000.0,20.0,1,Urea
4,1,95.0,50.0,100.0,6.5,1000.0,20.0,1,Urea


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [13]:
X = data[['Soil_color', 'Nitrogen', 'Phosphorus', 'Potassium', 'pH', 'Rainfall', 'Temperature']]
y = data['Crop']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Check for NaN values in the specified columns
nan_counts = X.isna().sum()

# Display the count of NaN values for each column
print(nan_counts)

Soil_color     0
Nitrogen       0
Phosphorus     0
Potassium      0
pH             0
Rainfall       0
Temperature    0
dtype: int64


In [15]:
# Create and fit the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)


In [16]:
# Make predictions
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

In [17]:
# Evaluate the model
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

print("\nClassification Report for Testing Data:\n", classification_report(y_test, y_test_pred))


Training Accuracy: 1.0
Testing Accuracy: 0.9960591133004926

Classification Report for Testing Data:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00       194
           2       0.99      1.00      0.99        73
           3       1.00      1.00      1.00       130
           4       1.00      1.00      1.00        59
           5       0.99      1.00      0.99       175
           6       0.97      1.00      0.98        31
           7       1.00      0.97      0.99        71
           8       1.00      1.00      1.00        98
           9       1.00      0.96      0.98        23
          10       1.00      1.00      1.00       104
          11       1.00      1.00      1.00        13
          12       1.00      0.93      0.96        14
          13       1.00      1.00      1.00        30

    accuracy                           1.00      1015
   macro avg       1.00      0.99      0.99      1015
weighted avg       1.00      1.0

In [18]:
import pickle
# Save the trained model to a file using pickle
with open('rf_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

print("Model saved successfully!")

Model saved successfully!
