# Water Pollution Detection – Week 1 (Environmental Monitoring & Pollution Control)
This notebook builds baseline ML models to predict **Potability** (0=Not potable, 1=Potable) from water quality indicators. It includes EDA, preprocessing, model training (Logistic Regression, Random Forest), evaluation, and model export. Use the provided synthetic dataset or replace with your own (same columns).

## 1. Setup

In [1]:

# If running in a new environment, ensure packages are installed:
# !pip install pandas scikit-learn matplotlib joblib

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
import matplotlib.pyplot as plt
import joblib

# Project root. Defaults to the original location, but can be overridden by
# setting the WATER_POLLUTION_BASE_DIR environment variable so the notebook
# also runs on machines where the default directory does not exist.
# An unset or empty variable falls back to the default.
BASE_DIR = (
    os.environ.get("WATER_POLLUTION_BASE_DIR")
    or "/mnt/data/Water_Pollution_Detection_Week1"
)
# CSV file read by the "Load Data" section.
DATA_PATH = os.path.join(BASE_DIR, "data", "water_quality.csv")
print("Using data at:", DATA_PATH)


ModuleNotFoundError: No module named 'pandas'

## 2. Load Data

In [None]:

# Read the CSV at DATA_PATH into a DataFrame, then report a preview of the
# first rows, the frame's dimensions, and the missing-value count per column.
df = pd.read_csv(DATA_PATH)

preview_rows = df.head()
print(preview_rows)

print("\nShape:", df.shape)

missing_per_column = df.isnull().sum()
print("\nMissing values per column:\n", missing_per_column)


## 3. Quick EDA

In [None]:

# Summary statistics, transposed so each column of df becomes one row.
summary_stats = df.describe().transpose()
print(summary_stats)

# Relative frequency of each value in the target column.
print("\nClass balance:")
class_share = df['Potability'].value_counts(normalize=True)
print(class_share)

# Every column except the target gets its own 30-bin histogram figure.
numeric_cols = [name for name in df if name != 'Potability']
for col in numeric_cols:
    fig, ax = plt.subplots()
    df[col].hist(bins=30, ax=ax)
    ax.set_title(col)
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    plt.show()


## 4. Train/Test Split

In [None]:

# Separate the integer-cast target from the remaining feature columns.
target_column = 'Potability'
y = df[target_column].astype(int)
X = df.drop(columns=[target_column])

# Hold out 20% of the rows with a fixed seed; stratify=y requests that the
# split respect the class proportions of the target.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

X_train.shape, X_test.shape


## 5. Preprocessing & Baseline Models

In [None]:

from sklearn.base import clone
from sklearn.metrics import ConfusionMatrixDisplay  # not used in this cell; kept so the name stays available

# Every column of the training features goes through the numeric branch.
numeric_features = X_train.columns.tolist()

# Preprocessing template: median imputation followed by standardization.
preprocess = ColumnTransformer(
    transformers=[('num', Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_features)]
)

# Each pipeline receives its own unfitted copy of the preprocessor. With the
# default memory=None a Pipeline fits its steps in place, so sharing a single
# ColumnTransformer instance would let a later fit of one model overwrite the
# preprocessing state used by the other.
logreg = Pipeline(steps=[('prep', clone(preprocess)),
                        ('clf', LogisticRegression(max_iter=500))])

rf = Pipeline(steps=[('prep', clone(preprocess)),
                    ('clf', RandomForestClassifier(n_estimators=300, random_state=42))])

# Fit each model on the training split and report hold-out metrics.
models = {'LogisticRegression': logreg, 'RandomForest': rf}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # Second column of predict_proba: probability assigned to class 1.
    proba = model.predict_proba(X_test)[:, 1]
    print(f"\n=== {name} ===")
    print(classification_report(y_test, preds, digits=4))
    cm = confusion_matrix(y_test, preds)
    print("Confusion Matrix:\n", cm)
    auc = roc_auc_score(y_test, proba)
    print("ROC AUC:", round(auc, 4))

    # One ROC figure per model, titled with the model's name.
    RocCurveDisplay.from_predictions(y_test, proba)
    plt.title(f"ROC – {name}")
    plt.show()


## 6. Feature Importance (Random Forest)

In [None]:

# Pull the fitted classifier out of the Random Forest pipeline and read its
# feature_importances_ attribute.
rf_model = models['RandomForest']
rf_est = rf_model.named_steps['clf']
importances = rf_est.feature_importances_

# Label each importance with its feature name and order from largest to smallest.
fi = pd.Series(data=importances, index=numeric_features)
fi = fi.sort_values(ascending=False)
print(fi)

# Bar chart of the ranked importances.
fig, ax = plt.subplots()
fi.plot.bar(ax=ax)
ax.set_title("Feature Importance – Random Forest")
ax.set_ylabel("Importance")
fig.tight_layout()
plt.show()


## 7. Cross-Validation (Quick Check)

In [None]:

# Score the Random Forest pipeline with 5-fold cross-validation (ROC AUC)
# over the full feature matrix X and target y.
cv_scores = cross_val_score(
    estimator=models['RandomForest'],
    X=X,
    y=y,
    scoring='roc_auc',
    cv=5,
)
print("5-fold CV ROC AUC (RF):", np.round(cv_scores, 4))

# Aggregate the per-fold scores.
mean_auc, std_auc = cv_scores.mean(), cv_scores.std()
print("Mean ± Std:", round(mean_auc, 4), "±", round(std_auc, 4))


## 8. Export Best Model

In [None]:

# Persist the fitted Random Forest pipeline (preprocessing + classifier)
# as a joblib file under BASE_DIR.
best_model = models['RandomForest']
export_path = os.path.join(BASE_DIR, "model_random_forest.joblib")
# Create the target directory first so the export does not fail with
# FileNotFoundError when BASE_DIR has not been created yet.
os.makedirs(BASE_DIR, exist_ok=True)
joblib.dump(best_model, export_path)
export_path


## 9. How to Replace with Your Own Dataset


- Replace `data/water_quality.csv` (resolved relative to `BASE_DIR`, defined in the Setup section) with your dataset having the same column names:
  `['ph','Hardness','Solids','Chloramines','Sulfate','Conductivity','Organic_carbon','Trihalomethanes','Turbidity','Potability']`  
- Ensure `Potability` is 0/1. If not, map labels accordingly.
- Re-run all cells.
