In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from collections import Counter
import joblib


# --- Load the sensor readings and separate the label column ----------------
data = pd.read_csv("iot_sensor_dataset.csv")

y = data["OutbreakRisk"]
X = data.drop(columns=["OutbreakRisk"])

# Hold out 20% of the rows for testing; stratify=y keeps the split stratified
# on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --- Preprocessing: scale the numeric columns, one-hot encode the categorical
numeric_features = [
    "Water_pH",
    "Turbidity_NTU",
    "Chlorine_mg_L",
    "EColi_MPN",
    "Rainfall_mm",
    "AvgTemperature_C",
]
categorical_features = ["BacterialPresence"]

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# --- Class imbalance: ask SMOTE for max_count samples of every training class
class_counts = Counter(y_train)
max_count = max(class_counts.values())
smote_strategy = dict.fromkeys(class_counts, max_count)
smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)

# --- Model ------------------------------------------------------------------
classifier = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42)

# imblearn's Pipeline is used so the SMOTE step can sit between the
# preprocessor and the classifier.
pipeline = ImbPipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("smote", smote),
        ("classifier", classifier),
    ]
)
pipeline.fit(X_train, y_train)

# --- Evaluate on the held-out split ----------------------------------------
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("Classification Report:\n", report)
print("Confusion Matrix:\n", matrix)

# --- Persist the fitted pipeline for the prediction cells -------------------
joblib.dump(pipeline, "final_pipeline_balanced.joblib")
print("Pipeline saved as 'final_pipeline_balanced.joblib'")


Classification Report:
               precision    recall  f1-score   support

        High       1.00      0.99      1.00       152
         Low       0.85      0.96      0.90        49
      Medium       0.98      0.96      0.97       199

    accuracy                           0.97       400
   macro avg       0.95      0.97      0.96       400
weighted avg       0.97      0.97      0.97       400

Confusion Matrix:
 [[151   0   1]
 [  0  47   2]
 [  0   8 191]]
Pipeline saved as 'final_pipeline_balanced.joblib'


In [7]:
import pandas as pd
import joblib

# Restore the fitted pipeline from the joblib file written by the training cell
MODEL_PATH = 'final_pipeline_balanced.joblib'
pipeline = joblib.load(MODEL_PATH)

def predict_risk(sample_data: dict, model=None):
    """Predict the risk class for a single sample and report its confidence.

    Parameters
    ----------
    sample_data : dict
        Mapping of feature name to value for one sample. It is wrapped in a
        one-row DataFrame before being passed to the model.
    model : optional
        Fitted estimator exposing ``predict``, ``predict_proba`` and
        ``classes_``. When omitted, the module-level ``pipeline`` is used,
        so existing ``predict_risk(sample)`` calls behave as before.

    Returns
    -------
    tuple
        ``(predicted_class, confidence)``. ``confidence`` is the probability
        the model assigns to the predicted class, expressed as a percentage
        rounded to two decimals and returned as a built-in ``float``.

    Raises
    ------
    ValueError
        If the predicted class is not present in ``model.classes_``.
    """
    if model is None:
        # Fall back to the notebook-level pipeline only when no model is given,
        # so the function can also be used (and tested) without global state.
        model = pipeline

    df = pd.DataFrame([sample_data])

    pred_class = model.predict(df)[0]
    pred_proba = model.predict_proba(df)[0]

    # predict_proba columns are looked up by position in classes_, so find the
    # index of the predicted label there.
    class_labels = model.classes_
    pred_idx = list(class_labels).index(pred_class)
    pred_confidence = pred_proba[pred_idx] * 100

    # float(...) turns a NumPy scalar into a plain Python float so the returned
    # tuple displays cleanly; the numeric value is unchanged.
    return pred_class, float(round(pred_confidence, 2))


# Single reading to score with the loaded pipeline
sample = {
    "Water_pH": 7.0,
    "Turbidity_NTU": 0.4,
    "Chlorine_mg_L": 0.35,
    "EColi_MPN": 0,
    "Rainfall_mm": 20,
    "AvgTemperature_C": 28,
    "BacterialPresence": "No",
}

# predict_risk returns (class label, confidence in percent)
prediction = predict_risk(sample)
risk, confidence = prediction
print(f"Predicted Risk: {risk} ({confidence}%)")


Predicted Risk: Low (90.52%)


In [8]:
import pandas as pd
import joblib

# Score another reading. This cell reuses predict_risk() and the pipeline
# loaded in the earlier prediction cell instead of repeating the
# load / predict / probability-lookup steps inline.
sample = {
    'Water_pH': 7.2,
    'Turbidity_NTU': 1.0,
    'Chlorine_mg_L': 0.25,
    'EColi_MPN': 5,
    'Rainfall_mm': 25,
    'AvgTemperature_C': 27,
    'BacterialPresence': 'No'
}

# predict_risk returns (class label, confidence in percent)
pred_class, pred_confidence = predict_risk(sample)

print(f"Predicted Risk: {pred_class} ({pred_confidence:.2f}%)")


Predicted Risk: Medium (85.75%)


In [9]:
import pandas as pd
import joblib

# Score another reading. This cell reuses predict_risk() and the pipeline
# loaded in the earlier prediction cell instead of repeating the
# load / predict / probability-lookup steps inline.
sample = {
    'Water_pH': 6.0,
    'Turbidity_NTU': 4.5,
    'Chlorine_mg_L': 0.05,
    'EColi_MPN': 150,
    'Rainfall_mm': 40,
    'AvgTemperature_C': 30,
    'BacterialPresence': 'Yes'
}

# predict_risk returns (class label, confidence in percent)
pred_class, pred_confidence = predict_risk(sample)

print(f"Predicted Risk: {pred_class} ({pred_confidence:.2f}%)")


Predicted Risk: High (99.45%)
