In [4]:
import pandas as pd

# Load the dataset
data = pd.read_csv("dataset1.csv")  # use your CSV filename

# Display first 5 rows
data.head()


Unnamed: 0,N,P,K,pH,EC,OC,S,Zn,Fe,Cu,Mn,B,Output
0,138,8.6,560,7.46,0.62,0.7,5.9,0.24,0.31,0.77,8.71,0.11,0
1,213,7.5,338,7.62,0.75,1.06,25.4,0.3,0.86,1.54,2.89,2.29,0
2,163,9.6,718,7.59,0.51,1.11,14.3,0.3,0.86,1.57,2.7,2.03,0
3,157,6.8,475,7.64,0.58,0.94,26.0,0.34,0.54,1.53,2.65,1.82,0
4,270,9.9,444,7.63,0.4,0.86,11.8,0.25,0.76,1.69,2.43,2.26,1


In [5]:
# Basic info about columns, types, missing values
data.info()

# Summary statistics
data.describe()

# Check missing values
data.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 880 entries, 0 to 879
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   N       880 non-null    int64  
 1   P       880 non-null    float64
 2   K       880 non-null    int64  
 3   pH      880 non-null    float64
 4   EC      880 non-null    float64
 5   OC      880 non-null    float64
 6   S       880 non-null    float64
 7   Zn      880 non-null    float64
 8   Fe      880 non-null    float64
 9   Cu      880 non-null    float64
 10  Mn      880 non-null    float64
 11  B       880 non-null    float64
 12  Output  880 non-null    int64  
dtypes: float64(10), int64(3)
memory usage: 89.5 KB


N         0
P         0
K         0
pH        0
EC        0
OC        0
S         0
Zn        0
Fe        0
Cu        0
Mn        0
B         0
Output    0
dtype: int64

In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load dataset
data = pd.read_csv("dataset1.csv")  # use your CSV file name

# Display first 5 rows
print(data.head())

# Check missing values
print(data.isnull().sum())

# Numeric columns to preprocess
numeric_cols = ['N','P','K','pH','EC','OC','S','Zn','Fe','Cu','Mn','B']

# Fill missing numeric values with mean
for col in numeric_cols:
    data[col].fillna(data[col].mean(), inplace=True)

# Encode Output column if it's categorical
# Example:
# data['Output'] = data['Output'].map({'Low':0,'Medium':1,'High':2})

# Normalize numeric columns
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

# Save preprocessed CSV
data.to_csv("preprocessed_soil_fertility.csv", index=False)

print("Preprocessing complete. Preprocessed file saved as 'preprocessed_soil_fertility.csv'")


     N    P    K    pH    EC    OC     S    Zn    Fe    Cu    Mn     B  Output
0  138  8.6  560  7.46  0.62  0.70   5.9  0.24  0.31  0.77  8.71  0.11       0
1  213  7.5  338  7.62  0.75  1.06  25.4  0.30  0.86  1.54  2.89  2.29       0
2  163  9.6  718  7.59  0.51  1.11  14.3  0.30  0.86  1.57  2.70  2.03       0
3  157  6.8  475  7.64  0.58  0.94  26.0  0.34  0.54  1.53  2.65  1.82       0
4  270  9.9  444  7.63  0.40  0.86  11.8  0.25  0.76  1.69  2.43  2.26       1
N         0
P         0
K         0
pH        0
EC        0
OC        0
S         0
Zn        0
Fe        0
Cu        0
Mn        0
B         0
Output    0
dtype: int64
Preprocessing complete. Preprocessed file saved as 'preprocessed_soil_fertility.csv'


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# 1. Load dataset
data = pd.read_csv("dataset1.csv")

# 2. Preprocessing
numeric_cols = ['N','P','K','pH','EC','OC','S','Zn','Fe','Cu','Mn','B']
for col in numeric_cols:
    data[col].fillna(data[col].mean(), inplace=True)

# Normalize numeric columns
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

# Encode target
le = LabelEncoder()
data['Output'] = le.fit_transform(data['Output'])

# 3. Features and target
X = data[numeric_cols]
y = data['Output']

# 4. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5. Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 6. Predict & evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 7. Save preprocessed data and model
data.to_csv("preprocessed_soil_fertility.csv", index=False)
joblib.dump(model, "soil_fertility_model.pkl")
print("Preprocessing complete and model saved.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Accuracy: 0.8863636363636364
              precision    recall  f1-score   support

           0       0.90      0.97      0.94        78
           1       0.87      0.91      0.89        88
           2       0.00      0.00      0.00        10

    accuracy                           0.89       176
   macro avg       0.59      0.63      0.61       176
weighted avg       0.84      0.89      0.86       176

Preprocessing complete and model saved.


In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# =========================
# 1. Load dataset
# =========================
data = pd.read_csv("dataset1.csv")

# =========================
# 2. Preprocessing
# =========================
numeric_cols = ['N','P','K','pH','EC','OC','S','Zn','Fe','Cu','Mn','B']

# Fill missing values with mean
for col in numeric_cols:
    data[col].fillna(data[col].mean(), inplace=True)

# Normalize numeric columns
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

# Encode target column if categorical
le = LabelEncoder()
data['Output'] = le.fit_transform(data['Output'])

# =========================
# 3. Split features & target
# =========================
X = data[numeric_cols]
y = data['Output']

# =========================
# 4. Train-test split
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# =========================
# 5. Train model
# =========================
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# =========================
# 6. Evaluate model
# =========================
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# =========================
# 7. Save preprocessed data and model
# =========================
data.to_csv("preprocessed_soil_fertility.csv", index=False)
joblib.dump(model, "soil_fertility_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le, "label_encoder.pkl")
print("Preprocessing complete and model saved.")

# =========================
# 8. Predict new sample
# =========================
# Replace these values with your new soil sample
new_sample = pd.DataFrame({
    'N': [50],
    'P': [20],
    'K': [30],
    'pH': [6.5],
    'EC': [0.5],
    'OC': [0.8],
    'S': [10],
    'Zn': [1],
    'Fe': [2],
    'Cu': [0.5],
    'Mn': [1.2],
    'B': [0.3]
})

# Load saved scaler & label encoder
scaler = joblib.load("scaler.pkl")
le = joblib.load("label_encoder.pkl")
model = joblib.load("soil_fertility_model.pkl")

# Scale new sample
new_sample_scaled = scaler.transform(new_sample)

# Predict
prediction = model.predict(new_sample_scaled)
predicted_label = le.inverse_transform(prediction)
print("Predicted Fertility Level:", predicted_label[0])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Accuracy: 0.8863636363636364
              precision    recall  f1-score   support

           0       0.90      0.97      0.94        78
           1       0.87      0.91      0.89        88
           2       0.00      0.00      0.00        10

    accuracy                           0.89       176
   macro avg       0.59      0.63      0.61       176
weighted avg       0.84      0.89      0.86       176

Preprocessing complete and model saved.
Predicted Fertility Level: 0




In [14]:
# Example: multiple soil samples
new_samples = pd.DataFrame([
    {'N': 50, 'P': 20, 'K': 30, 'pH': 6.5, 'EC': 0.5, 'OC': 0.8, 'S': 10, 'Zn': 1, 'Fe': 2, 'Cu': 0.5, 'Mn': 1.2, 'B': 0.3},
    {'N': 30, 'P': 15, 'K': 20, 'pH': 7.0, 'EC': 0.6, 'OC': 0.7, 'S': 8, 'Zn': 0.8, 'Fe': 1.5, 'Cu': 0.4, 'Mn': 1.0, 'B': 0.2},
    {'N': 70, 'P': 25, 'K': 35, 'pH': 6.0, 'EC': 0.4, 'OC': 1.0, 'S': 12, 'Zn': 1.2, 'Fe': 2.5, 'Cu': 0.6, 'Mn': 1.5, 'B': 0.4}
])


In [15]:
# Load saved scaler & label encoder & model
scaler = joblib.load("scaler.pkl")
le = joblib.load("label_encoder.pkl")
model = joblib.load("soil_fertility_model.pkl")

# Scale features
new_samples_scaled = scaler.transform(new_samples)

# Predict
predictions = model.predict(new_samples_scaled)
predicted_labels = le.inverse_transform(predictions)

# Display results
for i, label in enumerate(predicted_labels):
    print(f"Sample {i+1}: Predicted Fertility Level = {label}")


Sample 1: Predicted Fertility Level = 0
Sample 2: Predicted Fertility Level = 0
Sample 3: Predicted Fertility Level = 0




In [17]:
import pandas as pd
import joblib

# Columns to use
numeric_cols = ['N','P','K','pH','EC','OC','S','Zn','Fe','Cu','Mn','B']

# Load the dataset (already used for training)
new_samples = pd.read_csv("dataset1.csv")

# Ensure columns match numeric_cols
new_samples_features = new_samples[numeric_cols]

# Load saved scaler, model, and label encoder
scaler = joblib.load("scaler.pkl")
model = joblib.load("soil_fertility_model.pkl")
le = joblib.load("label_encoder.pkl")

# Scale features
new_samples_scaled = scaler.transform(new_samples_features)

# Predict
predictions = model.predict(new_samples_scaled)
predicted_labels = le.inverse_transform(predictions)

# Add predictions to dataframe
new_samples['Predicted_Fertility'] = predicted_labels

# Save predictions
new_samples.to_csv("predicted_soil_fertility.csv", index=False)
print("Predictions saved to 'predicted_soil_fertility.csv'.")



Predictions saved to 'predicted_soil_fertility.csv'.




In [18]:
# Scale features
new_samples_scaled = scaler.transform(new_samples_features)

# Convert back to DataFrame with same column names
new_samples_scaled_df = pd.DataFrame(new_samples_scaled, columns=numeric_cols)

# Predict
predictions = model.predict(new_samples_scaled_df)
predicted_labels = le.inverse_transform(predictions)


In [19]:
# Scale features and keep column names
new_samples_scaled = scaler.transform(new_samples_features)
new_samples_scaled_df = pd.DataFrame(new_samples_scaled, columns=numeric_cols)

# Predict
predictions = model.predict(new_samples_scaled_df)
predicted_labels = le.inverse_transform(predictions)

# Add predictions to original dataframe
new_samples['Predicted_Fertility'] = predicted_labels

# Save predictions
new_samples.to_csv("predicted_soil_fertility.csv", index=False)
print("Predictions saved to 'predicted_soil_fertility.csv'.")


Predictions saved to 'predicted_soil_fertility.csv'.


In [20]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# =========================
# 1. Load dataset
# =========================
data = pd.read_csv("dataset1.csv")

# =========================
# 2. Preprocessing
# =========================
numeric_cols = ['N','P','K','pH','EC','OC','S','Zn','Fe','Cu','Mn','B']

# Fill missing values with mean
for col in numeric_cols:
    data[col].fillna(data[col].mean(), inplace=True)

# Normalize numeric columns
scaler = StandardScaler()
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

# Encode target column
le = LabelEncoder()
data['Output'] = le.fit_transform(data['Output'])

# =========================
# 3. Features & target
# =========================
X = data[numeric_cols]
y = data['Output']

# =========================
# 4. Train-test split
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# =========================
# 5. Train model
# =========================
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# =========================
# 6. Evaluate model
# =========================
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# =========================
# 7. Save model, scaler, and label encoder
# =========================
joblib.dump(model, "soil_fertility_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le, "label_encoder.pkl")
print("Model, scaler, and label encoder saved.")

# =========================
# 8. Predict fertility for all rows in dataset
# =========================
# Keep original data for predictions
new_samples = pd.read_csv("dataset1.csv")
new_samples_features = new_samples[numeric_cols]

# Load saved objects
scaler = joblib.load("scaler.pkl")
model = joblib.load("soil_fertility_model.pkl")
le = joblib.load("label_encoder.pkl")

# Scale features and keep column names to avoid warnings
new_samples_scaled = scaler.transform(new_samples_features)
new_samples_scaled_df = pd.DataFrame(new_samples_scaled, columns=numeric_cols)

# Predict
predictions = model.predict(new_samples_scaled_df)
predicted_labels = le.inverse_transform(predictions)

# Add predictions to original dataframe
new_samples['Predicted_Fertility'] = predicted_labels

# Save predictions
new_samples.to_csv("predicted_soil_fertility.csv", index=False)
print("Predictions saved to 'predicted_soil_fertility.csv'.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


Accuracy: 0.8863636363636364
              precision    recall  f1-score   support

           0       0.90      0.97      0.94        78
           1       0.87      0.91      0.89        88
           2       0.00      0.00      0.00        10

    accuracy                           0.89       176
   macro avg       0.59      0.63      0.61       176
weighted avg       0.84      0.89      0.86       176

Model, scaler, and label encoder saved.
Predictions saved to 'predicted_soil_fertility.csv'.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
