In [12]:
import pandas as pd
import numpy as np
import random

# State → Village mapping
STATE_VILLAGE_MAP = {
    'Arunachal Pradesh': ['Ziro', 'Bomdila', 'Mechuka', 'Anini', 'Daporijo', 'Pasighat', 'Tuting'],
    'Assam': ['Majuli', 'Sualkuchi', 'Hajo', 'Sivasagar', 'Barpeta', 'Tezpur', 'Goalpara', 'Digboi'],
    'Manipur': ['Moirang', 'Andro', 'Noney', 'Ukhrul', 'Kakching', 'Bishnupur', 'Moreh'],
    'Meghalaya': ['Mawlynnong', 'Cherrapunji', 'Nongriat', 'Dawki', 'Jowai', 'Baghmara', 'Tura'],
    'Mizoram': ['Champhai', 'Hmuifang', 'Reiek', 'Falkawn', 'Thenzawl', 'Kolasib', 'Serchhip'],
    'Nagaland': ['Khonoma', 'Touphema', 'Longwa', 'Mokokchung', 'Wokha', 'Mon', 'Phek'],
    'Tripura': ['Udaipur', 'Ambassa', 'Kailashahar', 'Dharmanagar', 'Amarpur', 'Belonia', 'Sabroom']
}

# Risk logic function
def assign_risk(row):
    # High risk conditions
    if row['EColi_MPN'] > 50 or row['BacterialPresence'] == "Yes" or (row['Turbidity_NTU'] > 5 and row['Chlorine_mg_L'] < 0.2):
        return "High"
    # Medium risk conditions
    elif (row['Water_pH'] < 6.5 or row['Water_pH'] > 8.5) or (0.2 <= row['Chlorine_mg_L'] <= 0.3) or (1 <= row['EColi_MPN'] <= 50) or (row['Turbidity_NTU'] > 1):
        return "Medium"
    # Low risk conditions
    elif (6.5 <= row['Water_pH'] <= 8.5) and (0.3 < row['Chlorine_mg_L'] <= 1.0) and (row['EColi_MPN'] < 1) and (row['BacterialPresence'] == "No") and (row['Turbidity_NTU'] <= 1):
        return "Low"
    # Fallback
    else:
        return "Medium"

# Dataset generation
def generate_iot_dataset(n_low=500, n_medium=1000, n_high=500, random_state=42):
    np.random.seed(random_state)
    random.seed(random_state)
    rows = []
    states = list(STATE_VILLAGE_MAP.keys())

    # Low Risk
    for i in range(n_low):
        state = random.choice(states)
        village = random.choice(STATE_VILLAGE_MAP[state])
        row = {
            "SampleID": i + 1,
            "State": state,
            "Village": village,
            "Water_pH": round(random.uniform(6.5, 8.5), 2),
            "Turbidity_NTU": round(random.uniform(0.0, 1.0), 2),
            "Chlorine_mg_L": round(random.uniform(0.3, 1.0), 2),
            "BacterialPresence": "No",
            "EColi_MPN": round(random.uniform(0.0, 0.99), 2),
            "Rainfall_mm": round(random.uniform(0, 50), 2),
            "AvgTemperature_C": round(random.uniform(20, 30), 2),
        }
        row["OutbreakRisk"] = assign_risk(row)
        rows.append(row)

    # Medium Risk
    for i in range(n_medium):
        state = random.choice(states)
        village = random.choice(STATE_VILLAGE_MAP[state])
        row = {
            "SampleID": n_low + i + 1,
            "State": state,
            "Village": village,
            "Water_pH": round(random.choice([random.uniform(5.5, 6.4), random.uniform(8.6, 9.5)]), 2),
            "Turbidity_NTU": round(random.uniform(1.1, 5.0), 2),
            "Chlorine_mg_L": round(random.uniform(0.2, 0.3), 2),
            "BacterialPresence": "No",
            "EColi_MPN": round(random.uniform(1, 50), 2),
            "Rainfall_mm": round(random.uniform(20, 80), 2),
            "AvgTemperature_C": round(random.uniform(25, 35), 2),
        }
        row["OutbreakRisk"] = assign_risk(row)
        rows.append(row)

    # High Risk
    for i in range(n_high):
        state = random.choice(states)
        village = random.choice(STATE_VILLAGE_MAP[state])
        row = {
            "SampleID": n_low + n_medium + i + 1,
            "State": state,
            "Village": village,
            "Water_pH": round(random.uniform(4.5, 9.5), 2),
            "Turbidity_NTU": round(random.uniform(5.1, 10.0), 2),
            "Chlorine_mg_L": round(random.uniform(0.0, 0.19), 2),
            "BacterialPresence": "Yes",
            "EColi_MPN": round(random.uniform(51, 200), 2),
            "Rainfall_mm": round(random.uniform(50, 150), 2),
            "AvgTemperature_C": round(random.uniform(25, 40), 2),
        }
        row["OutbreakRisk"] = assign_risk(row)
        rows.append(row)

    # Create DataFrame & shuffle
    df = pd.DataFrame(rows)
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    return df

# Generate & Save Dataset
df = generate_iot_dataset()
df.to_csv("iot_sensor_dataset_regions.csv", index=False)
print(" Dataset generated and saved as iot_sensor_dataset_with_low_med_high_regions.csv")
print(df['OutbreakRisk'].value_counts())
print(df.head(10))


 Dataset generated and saved as iot_sensor_dataset_with_low_med_high_regions.csv
OutbreakRisk
Medium    1001
High       500
Low        499
Name: count, dtype: int64
   SampleID      State      Village  Water_pH  Turbidity_NTU  Chlorine_mg_L  \
0      1861  Meghalaya        Jowai      5.61           8.50           0.07   
1       354    Manipur    Bishnupur      8.02           0.05           0.74   
2      1334      Assam       Tezpur      5.62           1.91           0.25   
3       906    Tripura  Kailashahar      5.97           4.13           0.26   
4      1290   Nagaland     Touphema      5.78           4.01           0.27   
5      1274    Mizoram     Thenzawl      8.94           4.87           0.27   
6       939      Assam    Sualkuchi      6.32           2.01           0.27   
7      1732  Meghalaya     Baghmara      5.89           5.92           0.14   
8        66  Meghalaya        Dawki      7.16           0.47           0.34   
9      1324  Meghalaya   Mawlynnong      6.09

In [13]:
import pandas as pd

df = pd.read_csv("iot_sensor_dataset_regions.csv")
print(df["OutbreakRisk"].value_counts())


OutbreakRisk
Medium    1001
High       500
Low        499
Name: count, dtype: int64


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from collections import Counter
import joblib

# Load dataset (with State + Village present)
data = pd.read_csv("iot_sensor_dataset_regions.csv")

# Features and target
X = data.drop(columns=["OutbreakRisk", "State", "Village"])  # ❌ exclude regions from training
y = data["OutbreakRisk"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define IoT features
numeric_features = [
    "Water_pH",
    "Turbidity_NTU",
    "Chlorine_mg_L",
    "EColi_MPN",
    "Rainfall_mm",
    "AvgTemperature_C"
]

categorical_features = ["BacterialPresence"]

# Transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# Handle class imbalance with SMOTE
max_count = max(Counter(y_train).values())
smote_strategy = {cls: max_count for cls in Counter(y_train).keys()}
smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)

# Classifier
classifier = RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    random_state=42
)

# Full pipeline
pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor),
    ("smote", smote),
    ("classifier", classifier)
])

# Train pipeline
pipeline.fit(X_train, y_train)

# Evaluate
y_pred = pipeline.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Save pipeline
joblib.dump(pipeline, "final_pipeline_balanced.joblib")  # ✅ same name you use in prediction
print("✅ Pipeline saved as 'final_pipeline_balanced.joblib'")


Classification Report:
               precision    recall  f1-score   support

        High       1.00      1.00      1.00       100
         Low       1.00      1.00      1.00       100
      Medium       1.00      1.00      1.00       200

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400

Confusion Matrix:
 [[100   0   0]
 [  0 100   0]
 [  0   0 200]]
✅ Pipeline saved as 'final_pipeline_balanced.joblib'


In [18]:
import pandas as pd
import joblib

# Load trained pipeline
pipeline = joblib.load("final_pipeline_balanced.joblib")

def predict_with_region(sample_data: dict, state: str, village: str):
    # Only IoT features go into the model
    iot_features = {k: sample_data[k] for k in [
        "Water_pH", "Turbidity_NTU", "Chlorine_mg_L",
        "EColi_MPN", "Rainfall_mm", "AvgTemperature_C", "BacterialPresence"
    ]}

    df = pd.DataFrame([iot_features])

    # Predict
    pred_class = pipeline.predict(df)[0]
    pred_proba = pipeline.predict_proba(df)[0]

    # Confidence
    class_labels = pipeline.classes_
    pred_idx = list(class_labels).index(pred_class)
    pred_confidence = pred_proba[pred_idx] * 100

    # Final output (with region just for display)
    return f"Predicted Risk: {pred_class} ({pred_confidence:.2f}%) in {village}, {state}"


# ✅ Example usage
sample = {
    "Water_pH": 7.0,
    "Turbidity_NTU": 0.4,
    "Chlorine_mg_L": 0.35,
    "EColi_MPN": 0,
    "Rainfall_mm": 20,
    "AvgTemperature_C": 28,
    "BacterialPresence": "No"
}

print(predict_with_region(sample, state="Assam", village="Majuli"))


Predicted Risk: Low (100.00%) in Majuli, Assam


In [20]:
import pandas as pd
import joblib

# Load trained pipeline
pipeline = joblib.load("final_pipeline_balanced_with_regions.joblib")

# 🔴 High Risk Sample
sample_high = {
    "State": "Assam",
    "Village": "Majuli",
    "Water_pH": 6.0,
    "Turbidity_NTU": 4.5,
    "Chlorine_mg_L": 0.05,
    "BacterialPresence": "Yes",
    "EColi_MPN": 150,
    "Rainfall_mm": 40,
    "AvgTemperature_C": 30
}

# 🟡 Medium Risk Sample
sample_medium = {
    "State": "Meghalaya",
    "Village": "Cherrapunji",
    "Water_pH": 6.2,
    "Turbidity_NTU": 3.5,
    "Chlorine_mg_L": 0.25,
    "BacterialPresence": "No",
    "EColi_MPN": 20,
    "Rainfall_mm": 60,
    "AvgTemperature_C": 29
}

# 🟢 Low Risk Sample
sample_low = {
    "State": "Assam",
    "Village": "Sivasagar",
    "Water_pH": 7.0,
    "Turbidity_NTU": 0.4,
    "Chlorine_mg_L": 0.35,
    "EColi_MPN": 0.35,
    "Rainfall_mm": 20,
    "AvgTemperature_C": 28,
    "BacterialPresence": "No"
}

# Function to predict and print nicely
def predict_outbreak(sample):
    df = pd.DataFrame([sample])
    pred_class = pipeline.predict(df)[0]
    pred_proba = pipeline.predict_proba(df)[0]

    class_labels = pipeline.classes_
    pred_idx = list(class_labels).index(pred_class)
    pred_confidence = pred_proba[pred_idx] * 100

    print(
        f"Predicted Risk: {pred_class} ({pred_confidence:.2f}%) "
        f"in {sample['Village']}, {sample['State']}"
    )

# Run predictions
print("\n🔴 High Risk Sample:")
predict_outbreak(sample_high)

print("\n🟡 Medium Risk Sample:")
predict_outbreak(sample_medium)

print("\n🟢 Low Risk Sample:")
predict_outbreak(sample_low)



🔴 High Risk Sample:
Predicted Risk: High (73.48%) in Majuli, Assam

🟡 Medium Risk Sample:
Predicted Risk: Medium (95.88%) in Cherrapunji, Meghalaya

🟢 Low Risk Sample:
Predicted Risk: Low (94.96%) in Sivasagar, Assam
