In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Load the CSV file. The raw frame keeps its own name so that the filtered
# frame below can be rebuilt by re-running this cell without re-reading state
# that has already been overwritten.
raw_df = pd.read_csv("soil_pollution_diseases.csv")

# Select the input and output features
input_cols = ["Pollutant_Type", "Pollutant_Concentration_mg_kg", "Soil_pH", "Temperature_C"]
output_cols = ["Disease_Type", "Disease_Severity"]

# Keep only the modelling columns and drop rows with missing values in them
df = raw_df[input_cols + output_cols].dropna()

# Encode every object-dtype column (inputs and targets alike) as integer codes.
# The fitted encoders are kept by column name so the codes can be mapped back
# to the original labels later.
# NOTE(review): LabelEncoder assigns codes in sorted-label order, which gives
# categorical inputs an arbitrary ordinal meaning; tree models usually tolerate
# this, but consider a one-hot/ordinal encoder for the input columns.
label_encoders = {}
for col in df.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Split data into input X and outputs y1, y2
X = df[input_cols]
y1 = df["Disease_Type"]
y2 = df["Disease_Severity"]

# A single call splits X and both targets with the same row partition. This
# replaces two separate calls that used identical arguments and silently
# overwrote X_train / X_test.
X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(
    X, y1, y2, test_size=0.2, random_state=42
)


def fit_and_report(target_name, X_train, X_test, y_train, y_test, encoder=None):
    """Fit a random forest for one target and print its classification report.

    Parameters
    ----------
    target_name : str
        Name of the target column; used as the prefix of the report heading.
    X_train, X_test : DataFrame
        Training and held-out feature rows.
    y_train, y_test : Series
        Training and held-out target values aligned with the feature rows.
    encoder : LabelEncoder or None
        Fitted encoder of the target column. When given, the report rows are
        labelled with the original class names instead of integer codes.

    Returns
    -------
    tuple
        ``(clf, y_pred)``: the fitted classifier and its predictions on X_test.
    """
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    labels = None
    target_names = None
    if encoder is not None:
        # Codes produced by LabelEncoder are 0..n_classes-1; passing them
        # explicitly keeps the names aligned even if a class is absent from
        # the held-out rows.
        labels = list(range(len(encoder.classes_)))
        target_names = [str(name) for name in encoder.classes_]

    print(f"{target_name} Classification Report:")
    print(classification_report(y_test, y_pred, labels=labels, target_names=target_names))
    return clf, y_pred


# NOTE(review): the recorded run of this notebook reports ~0.21 accuracy over
# 5 disease types and ~0.32 over 3 severity levels, i.e. roughly chance level.
# The selected inputs may carry little signal for these targets; confirm
# before relying on the fitted models.

# Disease_Type prediction
clf1, y1_pred = fit_and_report(
    "Disease_Type", X_train, X_test, y1_train, y1_test,
    encoder=label_encoders.get("Disease_Type"),
)

# Disease_Severity prediction
clf2, y2_pred = fit_and_report(
    "Disease_Severity", X_train, X_test, y2_train, y2_test,
    encoder=label_encoders.get("Disease_Severity"),
)


Disease_Type Classification Report:
              precision    recall  f1-score   support

           0       0.21      0.22      0.21       119
           1       0.20      0.21      0.20       117
           2       0.18      0.17      0.18       116
           3       0.25      0.21      0.23       122
           4       0.19      0.22      0.21       126

    accuracy                           0.21       600
   macro avg       0.21      0.21      0.21       600
weighted avg       0.21      0.21      0.21       600

Disease_Severity Classification Report:
              precision    recall  f1-score   support

           0       0.31      0.30      0.30       209
           1       0.38      0.34      0.36       204
           2       0.27      0.31      0.29       187

    accuracy                           0.32       600
   macro avg       0.32      0.32      0.32       600
weighted avg       0.32      0.32      0.32       600



In [3]:
import joblib

# Persist the two fitted classifiers, one file per prediction target.
for fitted_model, target_path in (
    (clf1, "disease_type_model.pkl"),
    (clf2, "disease_severity_model.pkl"),
):
    joblib.dump(fitted_model, target_path)

# Persist the fitted label encoders as well: they are required to encode new
# inputs the same way and to turn predicted codes back into labels.
# Kept as the cell's last expression so the written path is displayed.
joblib.dump(label_encoders, "label_encoders.pkl")

['label_encoders.pkl']