In [2]:
import pandas as pd
from sqlalchemy import create_engine, text, inspect
import psycopg2
from datetime import datetime

# Database connection
engine = create_engine('postgresql://fluuser:flupass@postgres/flu_database')

print(" Connected to database")

 Connected to database


In [3]:
# WA DOH RHINO Data
print("\n" + "=" * 60)
print("COLLECTING WA DOH RHINO DATA")
print("=" * 60)

# WA DOH RHINO downloadable data
doh_rhino_url = "https://doh.wa.gov/sites/default/files/Data/Auto-Uploads/Respiratory-Illness/Respiratory_Disease_RHINO_Downloadable_Data.csv"

# ACH to Counties mapping - MUST match official WA county names exactly
ach_to_counties = {
    "Better Health Together": ["Spokane", "Stevens", "Pend Oreille", "Ferry"],
    "Cascade Pacific Action Alliance": ["Thurston", "Mason", "Grays Harbor", "Pacific", "Lewis"],
    "Elevate Health": ["Yakima", "Kittitas"],
    "Greater Health Now": ["Spokane"],  # Urban Spokane focus - duplicate with Better Health Together
    "Healthier Here": ["King"],
    "North Sound": ["Whatcom", "Skagit", "Snohomish", "San Juan", "Island"],
    "Olympic Community of Health": ["Clallam", "Jefferson", "Kitsap"],
    "Southwest Washington": ["Clark", "Skamania", "Klickitat", "Cowlitz", "Wahkiakum"],
    "Thriving Together NCW": ["Chelan", "Douglas", "Grant", "Okanogan"]
}

# Official WA State counties for validation
wa_counties = [
    "Adams", "Asotin", "Benton", "Chelan", "Clallam", "Clark", "Columbia", "Cowlitz",
    "Douglas", "Ferry", "Franklin", "Garfield", "Grant", "Grays Harbor", "Island",
    "Jefferson", "King", "Kitsap", "Kittitas", "Klickitat", "Lewis", "Lincoln",
    "Mason", "Okanogan", "Pacific", "Pend Oreille", "Pierce", "San Juan", "Skagit",
    "Skamania", "Snohomish", "Spokane", "Stevens", "Thurston", "Wahkiakum",
    "Walla Walla", "Whatcom", "Whitman", "Yakima"
]

try:
    df_doh_rhino = pd.read_csv(doh_rhino_url)

    # Add source column
    df_doh_rhino['source'] = 'WA_DOH_RHINO'

    print(f"\n✓ DOH RHINO data loaded: {len(df_doh_rhino)} records")
    print(f"  Original columns: {df_doh_rhino.columns.tolist()}")

    # Validate counties in mapping
    print(f"\n Validating County Mapping:")
    all_mapped_counties = set()
    for ach, counties in ach_to_counties.items():
        all_mapped_counties.update(counties)

    # Check for invalid county names
    invalid_counties = all_mapped_counties - set(wa_counties)
    if invalid_counties:
        print(f"     WARNING: Invalid county names found: {invalid_counties}")

    # Check for unmapped counties
    unmapped_counties = set(wa_counties) - all_mapped_counties
    if unmapped_counties:
        print(f"     WARNING: Counties not in any ACH: {sorted(unmapped_counties)}")
        print(f"      ({len(unmapped_counties)} counties: likely Pierce, Adams, Asotin, Benton, Columbia, Franklin, Garfield, Lincoln, Walla Walla, Whitman)")

    print(f"   ✓ {len(all_mapped_counties)} counties mapped across {len(ach_to_counties)} ACH regions")

    # Remove Statewide and Unassigned records before exploding
    original_count = len(df_doh_rhino)
    df_doh_rhino = df_doh_rhino[
        ~df_doh_rhino['Location'].isin(['Statewide', 'Unassigned ACH Region'])
    ].copy()
    removed_count = original_count - len(df_doh_rhino)
    print(f"\n🗑️  Removed {removed_count} Statewide/Unassigned records")
    print(f"   Remaining: {len(df_doh_rhino)} ACH region records")

    # Map ACH to counties and explode
    df_doh_rhino['county_list'] = df_doh_rhino['Location'].map(ach_to_counties)

    # Explode: create one row per county
    df_doh_rhino_exploded = df_doh_rhino.explode('county_list').reset_index(drop=True)

    # Rename county_list to county for clarity
    df_doh_rhino_exploded.rename(columns={'county_list': 'county'}, inplace=True)

    print(f"\n After County Explosion:")
    print(f"   - Original ACH records: {len(df_doh_rhino)}")
    print(f"   - Exploded county records: {len(df_doh_rhino_exploded)}")
    print(f"   - Expansion factor: {len(df_doh_rhino_exploded) / len(df_doh_rhino):.2f}x")

    # Verify unique counties
    unique_counties = df_doh_rhino_exploded['county'].unique()
    print(f"\n   Unique counties in data: {len(unique_counties)}")
    print(f"   Counties: {sorted(unique_counties)}")

    # Show county record counts
    print(f"\n Records per County:")
    county_counts = df_doh_rhino_exploded['county'].value_counts().sort_index()
    for county, count in county_counts.items():
        # Show which ACH regions include this county
        achs = [ach for ach, counties in ach_to_counties.items() if county in counties]
        ach_str = ", ".join(achs)
        print(f"   - {county}: {count:,} records (ACH: {ach_str})")

    # Date range
    print(f"\n Date Range:")
    print(f"   - From: {df_doh_rhino_exploded['Week Start'].min()}")
    print(f"   - To: {df_doh_rhino_exploded['Week End'].max()}")

    # Clean up the percentage data
    def clean_percentage(value):
        """Convert empty strings to NaN, keep numeric values"""
        if pd.isna(value):
            return None
        if isinstance(value, str):
            if value.strip() == '':
                return None
        try:
            return float(value)
        except:
            return None

    df_doh_rhino_exploded['1-Week Percent_cleaned'] = df_doh_rhino_exploded['1-Week Percent '].apply(clean_percentage)

    # Show data dimensions
    print(f"\n Data Dimensions:")
    print(f"   - Seasons: {df_doh_rhino_exploded['Season'].nunique()}")
    print(f"   - Counties: {df_doh_rhino_exploded['county'].nunique()}")
    print(f"   - Respiratory Illnesses: {', '.join(df_doh_rhino_exploded['Respiratory Illness Category'].unique())}")
    print(f"   - Care Types: {', '.join(df_doh_rhino_exploded['Care Type'].unique())}")
    print(f"   - Demographic Categories: {', '.join(df_doh_rhino_exploded['Demographic Category'].unique())}")

    # Example: Latest flu data by county
    latest_week = df_doh_rhino_exploded['Week End'].max()
    latest_flu_hosp = df_doh_rhino_exploded[
        (df_doh_rhino_exploded['Week End'] == latest_week) &
        (df_doh_rhino_exploded['Respiratory Illness Category'] == 'Flu') &
        (df_doh_rhino_exploded['Care Type'] == 'Hospitalizations') &
        (df_doh_rhino_exploded['Demographic Category'] == 'Overall')
    ].copy()

    if len(latest_flu_hosp) > 0:
        print(f"\n Latest Flu Hospitalizations by County ({latest_week}):")
        latest_flu_hosp_sorted = latest_flu_hosp.sort_values('1-Week Percent_cleaned', ascending=False)
        for _, row in latest_flu_hosp_sorted.head(10).iterrows():
            pct = row['1-Week Percent_cleaned']
            if pd.notna(pct):
                print(f"   - {row['county']}: {pct}% (from {row['Location']})")

    # Data quality
    total_rows = len(df_doh_rhino_exploded)
    data_rows = df_doh_rhino_exploded['1-Week Percent_cleaned'].notna().sum()
    empty_rows = total_rows - data_rows

    print(f"\n Data Quality:")
    print(f"   - Total records: {total_rows:,}")
    print(f"   - Records with data: {data_rows:,} ({data_rows/total_rows*100:.1f}%)")
    print(f"   - Empty/suppressed: {empty_rows:,} ({empty_rows/total_rows*100:.1f}%)")

    # Save
    output_path = '/app/data/raw/wa_doh_rhino.csv'
    df_doh_rhino_exploded.to_csv(output_path, index=False)
    print(f"\n Saved to: {output_path}")

    print("\n📋 Sample records (showing county-level data):")
    sample_cols = ['county', 'Location', 'Week Start', 'Week End', 'Respiratory Illness Category', 'Care Type', '1-Week Percent_cleaned']
    print(df_doh_rhino_exploded[sample_cols].head(20).to_string(index=False))

except Exception as e:
    print(f" Error: {e}")
    import traceback
    traceback.print_exc()


COLLECTING WA DOH RHINO DATA

✓ DOH RHINO data loaded: 7638 records
  Original columns: ['Season', 'Week Start', 'Week End', 'Week', 'Location', 'Respiratory Illness Category', 'Demographic Category', 'Demographic', 'Care Type', '1-Week Percent ', 'dtm_updated', 'source']

 Validating County Mapping:
      (10 counties: likely Pierce, Adams, Asotin, Benton, Columbia, Franklin, Garfield, Lincoln, Walla Walla, Whitman)
   ✓ 29 counties mapped across 9 ACH regions

🗑️  Removed 1860 Statewide/Unassigned records
   Remaining: 5778 ACH region records

 After County Explosion:
   - Original ACH records: 5778
   - Exploded county records: 19260
   - Expansion factor: 3.33x

   Unique counties in data: 29
   Counties: ['Chelan', 'Clallam', 'Clark', 'Cowlitz', 'Douglas', 'Ferry', 'Grant', 'Grays Harbor', 'Island', 'Jefferson', 'King', 'Kitsap', 'Kittitas', 'Klickitat', 'Lewis', 'Mason', 'Okanogan', 'Pacific', 'Pend Oreille', 'San Juan', 'Skagit', 'Skamania', 'Snohomish', 'Spokane', 'Stevens', '

In [4]:
## Collect census
print("=" * 60)
print("COLLECTING CENSUS DATA")
print("=" * 60)

# Download census data
census_url = "https://data.wa.gov/api/views/e6ip-wkqq/rows.csv?accessType=DOWNLOAD"

try:
    df_census = pd.read_csv(census_url)

    print(f"\n Census data loaded: {len(df_census)} counties")
    print(f" Columns: {df_census.columns.tolist()}")

    # Check for missing values in 2020 data
    missing_2020 = df_census['Population Density 2020'].isna().sum()
    print(f"\n Missing 2020 density values: {missing_2020}")

    # Show summary statistics
    print("\n 2020 Population Density Statistics:")
    print(df_census['Population Density 2020'].describe())

    # Show top 5 most dense counties
    print("\n Top 5 Most Dense Counties (2020):")
    top_counties = df_census.nlargest(5, 'Population Density 2020')[['County Name', 'Population Density 2020']]
    print(top_counties.to_string(index=False))

    # Save to raw data
    output_path = '/app/data/raw/wa_population_density.csv'
    df_census.to_csv(output_path, index=False)
    print(f"\n Saved to: {output_path}")

    print("\nFirst 5 rows:")
    print(df_census.head())

except Exception as e:
    print(f" Error: {e}")
    import traceback
    traceback.print_exc()

COLLECTING CENSUS DATA

 Census data loaded: 39 counties
 Columns: ['County Name', 'Population Density 1900', 'Population Density 1910', 'Population Density 1920', 'Population Density 1930', 'Population Density 1940', 'Population Density 1950', 'Population Density 1960', 'Population Density 1970', 'Population Density 1980', 'Population Density 1990', 'Population Density 2000', 'Population Density 2010', 'Population Density 2020']

 Missing 2020 density values: 0

 2020 Population Density Statistics:
count      39.000000
mean      149.612821
std       250.371017
min         3.220000
25%        17.550000
50%        36.990000
75%       104.950000
max      1073.020000
Name: Population Density 2020, dtype: float64

 Top 5 Most Dense Counties (2020):
County Name  Population Density 2020
       King                  1073.02
      Clark                   800.82
     Kitsap                   697.57
     Pierce                   551.81
     Island                   416.63

 Saved to: /app/data/r

In [5]:
##CDC Flu

import requests
print("\n" + "=" * 60)
print("COLLECTING CDC FLUVIEW DATA")
print("=" * 60)

# API endpoint
api_url = "https://api.delphi.cmu.edu/epidata/fluview/"

# Parameters - Get data from 2020 onwards
params = {
    'regions': 'wa',
    'epiweeks': '202001-202452'  # 2020 through 2024
}

try:
    # Make API request
    response = requests.get(api_url, params=params)
    data = response.json()

    # Check if successful
    if data['result'] == 1:
        df_fluview = pd.DataFrame(data['epidata'])

        print(f"\n FluView data loaded: {len(df_fluview)} weeks")
        print(f" Date range: {df_fluview['epiweek'].min()} to {df_fluview['epiweek'].max()}")

        # Show key columns
        print(f"\n Key columns:")
        key_cols = ['region', 'epiweek', 'num_ili', 'num_patients', 'wili']
        print(f"   {key_cols}")

        # Summary statistics
        print("\n ILI Statistics:")
        print(f"   - Average ILI cases per week: {df_fluview['num_ili'].mean():.0f}")
        print(f"   - Max ILI cases in a week: {df_fluview['num_ili'].max()}")
        print(f"   - Average % ILI: {df_fluview['wili'].mean():.2f}%")
        print(f"   - Max % ILI: {df_fluview['wili'].max():.2f}%")

        # Show weeks with highest ILI
        print("\n Top 5 Weeks by ILI Percentage:")
        top_ili = df_fluview.nlargest(5, 'wili')[['epiweek', 'num_ili', 'num_patients', 'wili']]
        print(top_ili.to_string(index=False))

        # Save to raw data
        output_path = '/app/data/raw/wa_fluview_data.csv'
        df_fluview.to_csv(output_path, index=False)
        print(f"\n Saved to: {output_path}")

        print("\nFirst 5 rows:")
        print(df_fluview.head())

    else:
        print(f" API Error: {data.get('message', 'Unknown error')}")

except Exception as e:
    print(f" Error: {e}")
    import traceback
    traceback.print_exc()


COLLECTING CDC FLUVIEW DATA

 FluView data loaded: 261 weeks
 Date range: 202001 to 202452

 Key columns:
   ['region', 'epiweek', 'num_ili', 'num_patients', 'wili']

 ILI Statistics:
   - Average ILI cases per week: 772
   - Max ILI cases in a week: 6043
   - Average % ILI: 1.72%
   - Max % ILI: 12.93%

 Top 5 Weeks by ILI Percentage:
 epiweek  num_ili  num_patients     wili
  202247     5945         45967 12.93320
  202248     6043         51505 11.73280
  202249     5284         49549 10.66420
  202250     4365         48328  9.03203
  202246     3563         47539  7.49490

 Saved to: /app/data/raw/wa_fluview_data.csv

First 5 rows:
  release_date region   issue  epiweek  lag  num_ili  num_patients  \
0   2021-10-08     wa  202139   202001   91     1449         20298   
1   2021-10-08     wa  202139   202002   90     1075         22028   
2   2021-10-08     wa  202139   202003   89      853         20215   
3   2021-10-08     wa  202139   202004   88      966         21871   
4   