# 🌱 Soil Fertility Prediction (CSE Dataset)

### Workflow:
1. Import libraries & load dataset  
2. Dataset structure & missing values  
3. Visualizations (distribution, correlation, target balance)  
4. Preprocessing pipeline (impute + scale)  
5. Train/Test split  
6. Train models (RandomForest, SVM, LogisticRegression)  
7. Evaluate performance  
8. Feature importance (for tree-based models)  
9. Save best model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib

# Read the soil dataset from the CSV in the working directory and preview
# the first five rows (the notebook displays the last expression of a cell).
df = pd.read_csv(filepath_or_buffer="dataset1.csv")
df.head(n=5)

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Summarize the dataset structure: dimensions, per-column dtypes and the
# number of missing entries in each column.
dataset_shape = df.shape
column_dtypes = df.dtypes
missing_per_column = df.isna().sum()

print("Shape:", dataset_shape)
print("\nData Types:\n", column_dtypes)
print("\nMissing Values:\n", missing_per_column)

# Descriptive statistics, rendered as the cell output.
df.describe()

In [None]:
# Visualize where values are missing: each cell of the boolean mask is
# colored by whether the corresponding entry of `df` is null.
_, missing_ax = plt.subplots(figsize=(8, 5))
missing_mask = df.isna()
sns.heatmap(missing_mask, cbar=False, cmap="viridis", ax=missing_ax)
missing_ax.set_title("Missing Values Heatmap")
plt.show()

In [None]:
# Plot a grid of histograms (one per column that `DataFrame.hist` draws)
# under a shared figure-level title.
hist_options = {"figsize": (15, 12), "bins": 30, "edgecolor": "black"}
df.hist(**hist_options)
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()

In [None]:
# Pairwise correlation heatmap of the numeric columns.
# `DataFrame.corr()` raises on pandas >= 2.0 when any column is non-numeric
# (older versions silently dropped such columns), so restrict the frame to
# numeric dtypes explicitly to get the same result on every pandas version.
numeric_df = df.select_dtypes(include="number")

plt.figure(figsize=(10,8))
sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Bar chart of how many rows fall into each "Output" class.
class_ax = sns.countplot(data=df, x="Output", palette="viridis")
class_ax.set_title("Output Class Distribution")
plt.show()

# Exact per-class counts, rendered as the cell output.
df["Output"].value_counts()

In [None]:
# Column names used as the target and as model inputs.
TARGET = "Output"
FEATURES = [
    'N', 'P', 'K', 'pH', 'EC', 'OC',
    'S', 'Zn', 'Fe', 'Cu', 'Mn', 'B',
]

# Separate the label column from the input columns.
y = df[TARGET]
X = df[FEATURES]

# Numeric preprocessing: fill missing values with the column median, then
# standardize each column.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Apply the numeric pipeline to every feature column.
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, FEATURES)],
)

# NOTE: this fit uses every row of X, i.e. the statistics are computed over
# the whole dataset.
X_preprocessed = preprocessor.fit_transform(X)
print("Preprocessed shape:", X_preprocessed.shape)

In [None]:
# Split the RAW features first (stratified on the target, 80/20) so that the
# held-out rows cannot influence the imputation medians or the scaling
# statistics. The split depends only on `y`, `test_size` and `random_state`,
# so the same rows land in each partition as when splitting preprocessed data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Re-fit the preprocessor on the training rows only, then apply the learned
# transformation to the test rows. This avoids train/test leakage and leaves
# `preprocessor` in the state that matches the models trained below.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

print("Train size:", X_train.shape, " Test size:", X_test.shape)

In [None]:
# Fit the three candidate classifiers on the training split. Each estimator
# gets a fixed random_state; `fit` returns the estimator itself, so
# construction and fitting are chained.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

svm_model = SVC(
    kernel="rbf",
    probability=True,
    random_state=42,
).fit(X_train, y_train)

log_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Predictions on the held-out split, consumed by the evaluation cell.
rf_pred = rf_model.predict(X_test)
svm_pred = svm_model.predict(X_test)
log_pred = log_model.predict(X_test)

In [None]:
def evaluate_model(name, y_true, y_pred):
    """Print accuracy and a classification report, then plot the confusion matrix.

    Args:
        name: Display name of the model, used in the printed header and the
            plot title.
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model for the same samples.

    Returns:
        None. Results are printed and the heatmap is shown via ``plt.show()``.
    """
    header = f"🔹 {name} Model Performance 🔹"
    print(header)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred))

    # Rows of the matrix are actual classes, columns are predicted classes.
    matrix = confusion_matrix(y_true, y_pred)
    matrix_ax = sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues")
    matrix_ax.set_title(f"{name} Confusion Matrix")
    matrix_ax.set_xlabel("Predicted")
    matrix_ax.set_ylabel("Actual")
    plt.show()

# Report test-set performance for each trained model, in training order.
model_predictions = (
    ("Random Forest", rf_pred),
    ("SVM", svm_pred),
    ("Logistic Regression", log_pred),
)
for model_name, predictions in model_predictions:
    evaluate_model(model_name, y_test, predictions)

In [None]:
# Pair each feature name with the importance reported by the fitted random
# forest and rank the features from most to least important.
importances = rf_model.feature_importances_
feature_importance = (
    pd.DataFrame({"feature": FEATURES, "importance": importances})
    .sort_values(by="importance", ascending=False)
)

# Horizontal bar chart of the ranked importances.
importance_ax = sns.barplot(
    data=feature_importance,
    x="importance",
    y="feature",
    palette="mako",
)
importance_ax.set_title("RandomForest Feature Importance")
plt.show()

# Ranked table, rendered as the cell output.
feature_importance

In [None]:
# Persist the random forest classifier under its original file name.
joblib.dump(rf_model, "soil_fertility_rf_model.joblib")

# The classifier was trained on imputed + standardized features, so it cannot
# be applied to raw measurements without the fitted preprocessor. Save that
# alongside the model so inference code can reproduce the transformation.
joblib.dump(preprocessor, "soil_fertility_preprocessor.joblib")

# Record the column order the preprocessor/model expect, one name per line.
with open("feature_order.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(FEATURES))

print("✅ Best model and feature order saved!")