In [1]:
# Water Justice Watch: Community Water Equity & Alert Map

import pandas as pd
import numpy as np
import folium
from folium.plugins import MarkerCluster
import matplotlib.pyplot as plt


In [2]:
url = "cleaned_water_quality_subset.csv"
df = pd.read_csv(url, parse_dates=['dDATE'])

In [3]:
# Step 1: Define Unsafe Events
# ----------------------------------
df['low_DO'] = df['nDO'] < 5.0
df['hypoxic_DO'] = df['nDO'] < 2.0
df['unsafe_pH'] = (df['nPH'] < 6.5) | (df['nPH'] > 9.0)
df['high_conductivity'] = df['nSPCOND'] > 500
df['shallow_hot'] = (df['nDEPTH'] < 0.2) & (df['nTEMP'] > 22.0)

# Composite Stressor Score (1 point per issue)
df['risk_score'] = df[['low_DO', 'unsafe_pH', 'high_conductivity', 'shallow_hot']].sum(axis=1)

In [None]:
# Step 2: Aggregate by Location
# ----------------------------------
grouped = df.groupby(['Latitude', 'Longitude', 'Watershed', 'Waterbody'])
aggr = grouped.agg(
    total_obs=('nDO', 'count'),
    total_unsafe=('risk_score', lambda x: (x > 0).sum()),
    severe_cases=('hypoxic_DO', 'sum'),
    avg_risk_score=('risk_score', 'mean')
).reset_index()
aggr['percent_unsafe'] = (aggr['total_unsafe'] / aggr['total_obs']) * 100

In [None]:
# Step 3: Visual Map with Risk Scores
# ----------------------------------
m = folium.Map(location=[42.1, -71.8], zoom_start=8, tiles='CartoDB positron')
marker_cluster = MarkerCluster().add_to(m)

for i, row in aggr.iterrows():
    popup = (f"<b>Waterbody:</b> {row['Waterbody']}<br>"
             f"<b>Watershed:</b> {row['Watershed']}<br>"
             f"<b>Total Observations:</b> {row['total_obs']}<br>"
             f"<b>Unsafe Events:</b> {row['total_unsafe']}<br>"
             f"<b>Avg Risk Score:</b> {row['avg_risk_score']:.2f}<br>"
             f"<b>Severe Events:</b> {int(row['severe_cases'])}")

    color = 'green'
    if row['avg_risk_score'] > 1.5:
        color = 'orange'
    if row['avg_risk_score'] > 2.5:
        color = 'red'

    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=6,
        color=color,
        fill=True,
        fill_color=color,
        popup=folium.Popup(popup, max_width=300)
    ).add_to(marker_cluster)
    


In [6]:
# Save map to file
m.save("water_risk_map.html")

In [17]:
# Step 4: Top Risk Hotspots Table
# ----------------------------------
top10 = aggr.sort_values("avg_risk_score", ascending=False)
print("Top 10 High-Risk Locations:\n")
print(top10[['Waterbody', 'Watershed', 'avg_risk_score', 'percent_unsafe', 'severe_cases']])
with open("top10_table.txt", "w") as f:
    f.write(top10[['Waterbody', 'Watershed', 'avg_risk_score', 'percent_unsafe', 'severe_cases']].to_string())


Top 10 High-Risk Locations:

                          Waterbody   Watershed  avg_risk_score  \
1141                    Stony Brook   Merrimack        3.000000   
309               Cold Spring Brook  Blackstone        2.625000   
916               Unnamed Tributary    Chicopee        2.500000   
868                     Moose Brook    Chicopee        2.500000   
1004                     Vine Brook   Shawsheen        2.333333   
...                             ...         ...             ...   
701               Unnamed Tributary    Chicopee        0.000000   
142                     Fresh Brook    Cape Cod        0.000000   
706                 Larrywaug Brook  Housatonic        0.000000   
707   Middle Branch Westfield River   Westfield        0.000000   
0                        Mill Brook     Islands        0.000000   

      percent_unsafe  severe_cases  
1141           100.0             3  
309            100.0             1  
916            100.0             0  
868            100

In [20]:
# Step 5: Placeholder for Community Vulnerability Integration
# ----------------------------------
# Note: In production, this would involve merging shapefiles with census block data from EJScreen or CDC SVI.
# For now, we simulate with a vulnerability placeholder:
aggr['community_vulnerability'] = np.random.choice(['Low', 'Medium', 'High'], size=len(aggr), p=[0.4, 0.4, 0.2])

# Recreate top10 with the new column
top10 = aggr.sort_values("avg_risk_score", ascending=False).head(10)

In [21]:

# Step 6: Sample "Water Justice Alert" Memo
# ----------------------------------
def generate_alert(row):
    return (f"**ALERT**: The waterbody '{row['Waterbody']}' in the '{row['Watershed']}' watershed has a high average "
            f"risk score of {row['avg_risk_score']:.2f} and {row['severe_cases']} hypoxic events. This site also overlaps "
            f"with a '{row['community_vulnerability']}' vulnerability community. Recommended actions include targeted "
            f"monitoring and local remediation outreach.")

# Example alert for top hotspot
print("\nSample Water Justice Alert:\n")
print(generate_alert(top10.iloc[0]))


Sample Water Justice Alert:

**ALERT**: The waterbody 'Stony Brook' in the 'Merrimack' watershed has a high average risk score of 3.00 and 3 hypoxic events. This site also overlaps with a 'Medium' vulnerability community. Recommended actions include targeted monitoring and local remediation outreach.


# Predictive Modeling Ideas for Water Quality Risk

Based on the risk score and available data, here are several predictive modeling approaches:

## 1. **Time Series Forecasting**
- **Predict future risk scores** for specific locations
- **Forecast water quality parameters** (DO, pH, temperature) days/weeks ahead
- Use historical patterns, seasonality, and trends

## 2. **Classification Models**
- **Predict unsafe events** (binary: safe/unsafe based on risk_score > 0)
- **Predict severe hypoxic events** (risk_score >= 2 or hypoxic_DO)
- Early warning system for water quality degradation

## 3. **Regression Models**
- **Predict individual water quality parameters** (nDO, nPH, nTEMP, nSPCOND)
- **Predict risk score** as a continuous value
- Understand which factors drive water quality changes

## 4. **Spatial Prediction Models**
- **Predict risk at unmonitored locations** using spatial features
- **Interpolate water quality** across watersheds
- Use latitude/longitude, watershed characteristics, proximity to other sites

## 5. **Multi-Target Prediction**
- **Predict multiple parameters simultaneously** (multi-output regression)
- Capture correlations between DO, pH, temperature, conductivity

## 6. **Anomaly Detection**
- **Identify unusual water quality events** before they become critical
- Detect outliers that may indicate pollution events or equipment issues

## 7. **Causal/Intervention Analysis**
- **Predict impact of remediation efforts** on risk scores
- **What-if scenarios**: How would risk change if certain parameters improved?

Let's implement a few of these approaches below:


In [22]:
# Prepare data for predictive modeling
# ----------------------------------

# Create time-based features
df['year'] = df['dDATE'].dt.year
df['month'] = df['dDATE'].dt.month
df['day_of_year'] = df['dDATE'].dt.dayofyear
df['is_summer'] = df['month'].isin([6, 7, 8])
df['is_winter'] = df['month'].isin([12, 1, 2])

# Create lag features for time series (previous observation at same location)
df = df.sort_values(['Latitude', 'Longitude', 'dDATE'])
df['prev_risk_score'] = df.groupby(['Latitude', 'Longitude'])['risk_score'].shift(1)
df['prev_nDO'] = df.groupby(['Latitude', 'Longitude'])['nDO'].shift(1)
df['prev_nTEMP'] = df.groupby(['Latitude', 'Longitude'])['nTEMP'].shift(1)

# Create rolling statistics
df['risk_score_rolling_mean'] = df.groupby(['Latitude', 'Longitude'])['risk_score'].transform(
    lambda x: x.rolling(window=3, min_periods=1).mean()
)

# Binary target for classification
df['is_unsafe'] = (df['risk_score'] > 0).astype(int)
df['is_high_risk'] = (df['risk_score'] >= 2).astype(int)

print("Data prepared for modeling!")
print(f"Total observations: {len(df)}")
print(f"Unsafe events: {df['is_unsafe'].sum()} ({df['is_unsafe'].mean()*100:.1f}%)")
print(f"High risk events: {df['is_high_risk'].sum()} ({df['is_high_risk'].mean()*100:.1f}%)")


Data prepared for modeling!
Total observations: 12596
Unsafe events: 4895 (38.9%)
High risk events: 1430 (11.4%)


In [23]:
# Model 1: Binary Classification - Predict Unsafe Events
# --------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Select features for modeling
feature_cols = [
    'nTEMP', 'nSPCOND', 'nTDS', 'nDEPTH',  # Current water quality
    'month', 'day_of_year', 'is_summer', 'is_winter',  # Temporal
    'Latitude', 'Longitude',  # Spatial
    'prev_risk_score', 'prev_nDO', 'prev_nTEMP',  # Lag features
    'risk_score_rolling_mean'  # Rolling stats
]

# Encode categorical variables
le_watershed = LabelEncoder()
df['Watershed_encoded'] = le_watershed.fit_transform(df['Watershed'].fillna('Unknown'))
feature_cols.append('Watershed_encoded')

# Prepare data (drop rows with missing target or key features)
model_df = df.dropna(subset=['is_unsafe'] + [col for col in feature_cols if col in df.columns]).copy()

X = model_df[feature_cols]
y = model_df['is_unsafe']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
rf_classifier.fit(X_train, y_train)

# Predictions
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]

# Evaluate
print("=" * 60)
print("MODEL 1: Binary Classification - Predicting Unsafe Events")
print("=" * 60)
print(f"\nTraining samples: {len(X_train)}, Test samples: {len(X_test)}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Safe', 'Unsafe']))
print(f"\nROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.3f}")
print(f"\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_classifier.feature_importances_
}).sort_values('importance', ascending=False)
print(f"\nTop 10 Most Important Features:")
print(feature_importance.head(10))


MODEL 1: Binary Classification - Predicting Unsafe Events

Training samples: 6641, Test samples: 1661

Classification Report:
              precision    recall  f1-score   support

        Safe       0.96      0.96      0.96       865
      Unsafe       0.96      0.96      0.96       796

    accuracy                           0.96      1661
   macro avg       0.96      0.96      0.96      1661
weighted avg       0.96      0.96      0.96      1661


ROC-AUC Score: 0.995

Confusion Matrix:
[[832  33]
 [ 33 763]]

Top 10 Most Important Features:
                    feature  importance
13  risk_score_rolling_mean    0.405949
10          prev_risk_score    0.168696
1                   nSPCOND    0.112352
2                      nTDS    0.100096
3                    nDEPTH    0.053823
11                 prev_nDO    0.050569
9                 Longitude    0.025107
0                     nTEMP    0.021852
8                  Latitude    0.016209
12               prev_nTEMP    0.015214


In [24]:
# Model 2: Regression - Predict Risk Score
# -----------------------------------------
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Prepare regression target
regression_df = df.dropna(subset=['risk_score'] + [col for col in feature_cols if col in df.columns]).copy()

X_reg = regression_df[feature_cols]
y_reg = regression_df['risk_score']

# Split data
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Train model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
rf_regressor.fit(X_train_reg, y_train_reg)

# Predictions
y_pred_reg = rf_regressor.predict(X_test_reg)

# Evaluate
print("=" * 60)
print("MODEL 2: Regression - Predicting Risk Score")
print("=" * 60)
print(f"\nTraining samples: {len(X_train_reg)}, Test samples: {len(X_test_reg)}")
print(f"\nR² Score: {r2_score(y_test_reg, y_pred_reg):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test_reg, y_pred_reg):.3f}")
print(f"Root Mean Squared Error: {np.sqrt(mean_squared_error(y_test_reg, y_pred_reg)):.3f}")

# Feature importance
feature_importance_reg = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_regressor.feature_importances_
}).sort_values('importance', ascending=False)
print(f"\nTop 10 Most Important Features:")
print(feature_importance_reg.head(10))


MODEL 2: Regression - Predicting Risk Score

Training samples: 6641, Test samples: 1661

R² Score: 0.899
Mean Absolute Error: 0.110
Root Mean Squared Error: 0.249

Top 10 Most Important Features:
                    feature  importance
13  risk_score_rolling_mean    0.789262
3                    nDEPTH    0.049499
10          prev_risk_score    0.037745
11                 prev_nDO    0.030106
1                   nSPCOND    0.021769
0                     nTEMP    0.014563
2                      nTDS    0.012949
5               day_of_year    0.012158
12               prev_nTEMP    0.011474
8                  Latitude    0.007225


In [None]:
# Model 3: Predict Dissolved Oxygen (nDO) - Critical Water Quality Parameter
# ---------------------------------------------------------------------------
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

# Use other water quality parameters as features
do_features = ['nTEMP', 'nPH', 'nSPCOND', 'nTDS', 'nDEPTH', 
               'month', 'day_of_year', 'is_summer', 'is_winter',
               'Latitude', 'Longitude', 'Watershed_encoded',
               'prev_nDO', 'prev_nTEMP']

# Prepare data for DO prediction - drop rows with NaN in target
do_df = df.dropna(subset=['nDO']).copy()

# Select only features that exist in the dataframe
available_features = [col for col in do_features if col in do_df.columns]
X_do = do_df[available_features].copy()
y_do = do_df['nDO']

# Handle NaN values in features with imputation
imputer = SimpleImputer(strategy='mean')
X_do_imputed = pd.DataFrame(
    imputer.fit_transform(X_do),
    columns=X_do.columns,
    index=X_do.index
)

# Split data
X_train_do, X_test_do, y_train_do, y_test_do = train_test_split(
    X_do_imputed, y_do, test_size=0.2, random_state=42
)

# Train model
gb_do = GradientBoostingRegressor(n_estimators=100, random_state=42, max_depth=5)
gb_do.fit(X_train_do, y_train_do)

# Predictions
y_pred_do = gb_do.predict(X_test_do)

# Evaluate
print("=" * 60)
print("MODEL 3: Regression - Predicting Dissolved Oxygen (nDO)")
print("=" * 60)
print(f"\nTraining samples: {len(X_train_do)}, Test samples: {len(X_test_do)}")
print(f"\nR² Score: {r2_score(y_test_do, y_pred_do):.3f}")
print(f"Mean Absolute Error: {mean_absolute_error(y_test_do, y_pred_do):.3f}")
print(f"Root Mean Squared Error: {np.sqrt(mean_squared_error(y_test_do, y_pred_do)):.3f}")

# Feature importance
feature_importance_do = pd.DataFrame({
    'feature': available_features,
    'importance': gb_do.feature_importances_
}).sort_values('importance', ascending=False)
print(f"\nTop 10 Most Important Features:")
print(feature_importance_do.head(10))

MODEL 3: Regression - Predicting Dissolved Oxygen (nDO)

Training samples: 8840, Test samples: 2211

R² Score: 0.849
Mean Absolute Error: 0.754
Root Mean Squared Error: 1.219

Top 10 Most Important Features:
        feature  importance
12     prev_nDO    0.536489
0         nTEMP    0.164069
4        nDEPTH    0.150440
1           nPH    0.059475
6   day_of_year    0.025749
10    Longitude    0.015289
2       nSPCOND    0.014495
13   prev_nTEMP    0.011555
3          nTDS    0.009751
9      Latitude    0.007100


In [36]:
# Model 4: Time Series Forecasting - Predict Future Risk Scores
# ---------------------------------------------------------------
# For a specific location, predict risk score in future time periods

def forecast_risk_for_location(df, lat, lon, forecast_days=30, tolerance=0.001):
    """
    Forecast risk scores for a specific location using historical data
    Uses tolerance-based matching for lat/lon to handle floating point precision
    """
    # Use tolerance-based matching for lat/lon
    location_data = df[
        (np.abs(df['Latitude'] - lat) < tolerance) & 
        (np.abs(df['Longitude'] - lon) < tolerance)
    ].copy()
    location_data = location_data.sort_values('dDATE')
    
    if len(location_data) < 10:
        return None, f"Insufficient data for this location (found {len(location_data)} observations, need at least 10)"
    
    # Create time-based features
    location_data['days_since_start'] = (location_data['dDATE'] - location_data['dDATE'].min()).dt.days
    location_data['risk_score_lag1'] = location_data['risk_score'].shift(1)
    location_data['risk_score_lag7'] = location_data['risk_score'].shift(7)
    location_data['risk_score_ma7'] = location_data['risk_score'].rolling(window=7, min_periods=1).mean()
    
    # Prepare features
    ts_features = ['days_since_start', 'month', 'day_of_year', 'is_summer', 'is_winter',
                   'risk_score_lag1', 'risk_score_lag7', 'risk_score_ma7',
                   'nTEMP', 'nSPCOND']
    
    ts_df = location_data.dropna(subset=['risk_score'] + [f for f in ts_features if f in location_data.columns])
    
    if len(ts_df) < 5:
        return None, f"Insufficient data after cleaning (found {len(ts_df)} valid rows, need at least 5)"
    
    X_ts = ts_df[[f for f in ts_features if f in ts_df.columns]]
    y_ts = ts_df['risk_score']
    
    # Train simple model
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()
    model.fit(X_ts, y_ts)
    
    # Generate forecast (simplified - using last known values)
    last_row = ts_df.iloc[-1]
    forecast = []
    
    for day in range(1, forecast_days + 1):
        future_date = last_row['dDATE'] + pd.Timedelta(days=day)
        future_month = future_date.month
        future_day_of_year = future_date.dayofyear
        future_is_summer = future_month in [6, 7, 8]
        future_is_winter = future_month in [12, 1, 2]
        
        # Use last known values for other features (in practice, you'd forecast these too)
        forecast_features = np.array([[
            last_row['days_since_start'] + day,
            future_month,
            future_day_of_year,
            int(future_is_summer),
            int(future_is_winter),
            last_row['risk_score'],  # lag1
            last_row['risk_score_ma7'] if not pd.isna(last_row['risk_score_ma7']) else last_row['risk_score'],  # lag7
            last_row['risk_score_ma7'] if not pd.isna(last_row['risk_score_ma7']) else last_row['risk_score'],  # ma7
            last_row['nTEMP'] if not pd.isna(last_row['nTEMP']) else 15.0,
            last_row['nSPCOND'] if not pd.isna(last_row['nSPCOND']) else 400.0
        ]])
        
        pred = model.predict(forecast_features)[0]
        forecast.append({'date': future_date, 'predicted_risk': max(0, pred)})
    
    return pd.DataFrame(forecast), None

# Find a location with sufficient data for forecasting
# First, try locations from top10 that have enough observations
forecast_df = None
error = None
selected_location = None

# Filter top10 to only locations with at least 10 observations
top10_with_data = top10[top10['total_obs'] >= 5].head(10)
selected_location=[]
forecast_df_list=[]
print(len(top10_with_data))
if len(top10_with_data) > 0:
    for idx, row in top10_with_data.iterrows():
        lat, lon = row['Latitude'], row['Longitude']
        forecast_df, error = forecast_risk_for_location(df, lat, lon, forecast_days=30)
        if error is None:
            selected_location.append(row)
            forecast_df_list.append(forecast_df)
            

# If no top10 location worked, find any location with sufficient data
if error is not None:
    print(f"Top locations didn't have sufficient data. Searching for any suitable location...")
    # Find locations with at least 10 observations, sorted by risk score
    suitable_locations = aggr[aggr['total_obs'] >= 10].sort_values('avg_risk_score', ascending=False)
    
    for idx, row in suitable_locations.iterrows():
        lat, lon = row['Latitude'], row['Longitude']
        forecast_df, error = forecast_risk_for_location(df, lat, lon, forecast_days=30)
        if error is None:
            selected_location = row
            

# Display results
if error:
    print(f"Error: {error}")
    print(f"\nNote: Could not find any location with sufficient time series data for forecasting.")
    print(f"Consider reducing the minimum data requirement or using a different forecasting approach.")
else:
    for i in selected_location:
        print("=" * 60)
        print("MODEL 4: Time Series Forecast - Future Risk Scores")
        print("=" * 60)
        print(f"\nLocation: {i['Waterbody']}, {i['Watershed']}")
        print(f"Coordinates: ({i['Latitude']:.4f}, {i['Longitude']:.4f})")
        print(f"Historical Observations: {i['total_obs']}")
        print(f"Average Risk Score: {i['avg_risk_score']:.2f}")
        print(f"\n30-Day Forecast:")
        print(forecast_df_list[0].head(10))
        print(f"\nAverage Predicted Risk (next 30 days): {forecast_df_list[0]['predicted_risk'].mean():.2f}")
        print(f"Max Predicted Risk: {forecast_df_list[0]['predicted_risk'].max():.2f}")
        print(f"Min Predicted Risk: {forecast_df_list[0]['predicted_risk'].min():.2f}")

3
MODEL 4: Time Series Forecast - Future Risk Scores

Location: Otter River, Millers
Coordinates: (42.5644, -72.0115)
Historical Observations: 13
Average Risk Score: 2.15

30-Day Forecast:
        date  predicted_risk
0 2005-09-15        1.877556
1 2005-09-16        1.853215
2 2005-09-17        1.828874
3 2005-09-18        1.804532
4 2005-09-19        1.780191
5 2005-09-20        1.755850
6 2005-09-21        1.731509
7 2005-09-22        1.707167
8 2005-09-23        1.682826
9 2005-09-24        1.658485

Average Predicted Risk (next 30 days): 1.53
Max Predicted Risk: 1.88
Min Predicted Risk: 1.18
MODEL 4: Time Series Forecast - Future Risk Scores

Location: Sudbury Reservoir, Concord (SuAsCo)
Coordinates: (42.3019, -71.5129)
Historical Observations: 54
Average Risk Score: 2.02

30-Day Forecast:
        date  predicted_risk
0 2005-09-15        1.877556
1 2005-09-16        1.853215
2 2005-09-17        1.828874
3 2005-09-18        1.804532
4 2005-09-19        1.780191
5 2005-09-20        1



## Next Steps & Advanced Modeling Ideas

### Additional Models to Consider:

1. **LSTM/GRU for Time Series**: Use deep learning for more sophisticated temporal patterns
2. **Spatial Interpolation (Kriging)**: Predict water quality at unmonitored locations
3. **Multi-Output Regression**: Predict all water quality parameters simultaneously
4. **Anomaly Detection**: Use Isolation Forest or Autoencoders to detect unusual events
5. **Causal Inference**: Use methods like Difference-in-Differences to evaluate intervention impacts
6. **Ensemble Methods**: Combine multiple models for better predictions
7. **Feature Engineering**: 
   - Add weather data (temperature, precipitation)
   - Add upstream/downstream relationships
   - Add land use data (urbanization, agriculture)
   - Add distance to pollution sources

### Model Evaluation & Deployment:

- Cross-validation with time-based splits (respect temporal order)
- Feature importance analysis to understand drivers
- Model interpretability (SHAP values, LIME)
- Real-time prediction pipeline
- Alert system when predictions exceed thresholds
