In [1]:
import os
from datetime import datetime

import pandas as pd
import psycopg2
import requests
from sqlalchemy import create_engine, text, inspect

# Database connection
# Read the connection URL from the DATABASE_URL environment variable so that
# credentials do not have to live in the notebook. The literal below is only
# the fallback used when that variable is not set.
DATABASE_URL = os.environ.get(
    'DATABASE_URL',
    'postgresql://fluuser:flupass@postgres/flu_database',
)
engine = create_engine(DATABASE_URL)

# create_engine() is lazy and does not open a connection by itself, so run a
# trivial query first. Without this the message below would be printed even
# when the database is unreachable or the credentials are wrong.
with engine.connect() as connection:
    connection.execute(text("SELECT 1"))

print(" Connected to database")

 Connected to database


In [2]:
# WA DOH
def clean_numeric(value):
    """Convert one raw WA DOH cell into a number.

    Defined before the download so it is available even if the download fails.

    Args:
        value: Raw cell value as read from the CSV (string, number or missing).

    Returns:
        5 for the privacy-suppressed marker '<10' (surrounding whitespace is
        ignored); None for missing values, for the literal string 'NA' (any
        case) and for anything that cannot be parsed as a number; otherwise
        the value converted to float.
    """
    if pd.isna(value):
        return None
    if isinstance(value, str):
        stripped = value.strip()
        if stripped == '<10':
            return 5  # Use midpoint for privacy-suppressed values
        if stripped.upper() == 'NA':
            return None
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # Unparseable text or an unsupported type: treat as missing, but do
        # not swallow unrelated exceptions such as KeyboardInterrupt.
        return None


print("\n" + "=" * 60)
print("COLLECTING WA DOH HOSPITALIZATION DATA")
print("=" * 60)

# WA DOH Hospital Use data
doh_hosp_url = "https://doh.wa.gov/sites/default/files/Data/Auto-Uploads/Respiratory-Illness/wahealth_hospitaluse_download.csv"

try:
    df_doh_hosp = pd.read_csv(doh_hosp_url)

    print(f"\n DOH Hospitalization data loaded: {len(df_doh_hosp)} weeks")
    print(f" Columns: {df_doh_hosp.columns.tolist()}")

    # Date range
    print(f"\n Date Range:")
    print(f"   - From: {df_doh_hosp['Date Range Start'].min()}")
    print(f"   - To: {df_doh_hosp['Date Range End'].max()}")

    # Apply cleaning to numeric columns; each result goes to a new
    # '<column>_cleaned' column so the raw values stay available.
    numeric_cols = ['COVID19 7 day Avg Hospitalized', 'COVID19 7 day AVG in ICU bed',
                    'Influenza 7 day Avg Hospitalized', 'Influenza 7 day AVG in ICU bed']

    for col in numeric_cols:
        df_doh_hosp[f'{col}_cleaned'] = df_doh_hosp[col].apply(clean_numeric)

    # Statistics for Influenza (your main interest)
    flu_hosp = df_doh_hosp['Influenza 7 day Avg Hospitalized_cleaned'].dropna()

    print(f"\n Influenza Hospitalization Statistics:")
    print(f"   - Total weeks with data: {len(flu_hosp)}")
    print(f"   - Average hospitalized: {flu_hosp.mean():.1f}")
    print(f"   - Max hospitalized: {flu_hosp.max():.0f}")
    # NOTE(review): "last week" is the last non-missing row in file order;
    # this assumes the CSV is sorted by date ascending - confirm.
    print(f"   - Recent (last week): {flu_hosp.iloc[-1]:.0f}")

    # Find peak flu weeks (ranked on the cleaned values, shown with raw values)
    print(f"\n🔝 Top 5 Flu Hospitalization Weeks:")
    top_flu = df_doh_hosp.nlargest(5, 'Influenza 7 day Avg Hospitalized_cleaned')[
        ['Date Range Start', 'Date Range End', 'Influenza 7 day Avg Hospitalized']
    ]
    print(top_flu.to_string(index=False))

    # Show data quality
    raw_flu = df_doh_hosp['Influenza 7 day Avg Hospitalized']
    total_rows = len(df_doh_hosp)
    # read_csv parses the text 'NA' into NaN by default, so comparing the
    # column against the string 'NA' never matches. Count parsed-missing cells
    # plus any literal 'NA' strings that survived (e.g. padded with spaces).
    literal_na_rows = (raw_flu.astype(str).str.strip().str.upper() == 'NA').sum()
    flu_na_rows = raw_flu.isna().sum() + literal_na_rows
    flu_data_rows = total_rows - flu_na_rows
    flu_suppressed = (raw_flu.astype(str).str.contains('<10', na=False)).sum()

    print(f"\n Data Quality:")
    print(f"   - Total weeks: {total_rows}")
    print(f"   - Flu data available: {flu_data_rows} ({flu_data_rows/total_rows*100:.1f}%)")
    print(f"   - NA values: {flu_na_rows}")
    print(f"   - Privacy-suppressed (<10): {flu_suppressed}")

    # Save (the file includes the added '_cleaned' columns)
    output_path = '/app/data/raw/wa_hospital_use.csv'
    df_doh_hosp.to_csv(output_path, index=False)
    print(f"\n Saved to: {output_path}")

    print("\nFirst 10 rows:")
    print(df_doh_hosp.head(10))

except Exception as e:
    # Best-effort cell: report the failure and keep the notebook running.
    print(f" Error: {e}")
    import traceback
    traceback.print_exc()


COLLECTING WA DOH HOSPITALIZATION DATA

 DOH Hospitalization data loaded: 264 weeks
 Columns: ['Date Range Start', 'Date Range End', 'COVID19 7 day Avg Hospitalized', 'COVID19 7 day AVG in ICU bed', 'Influenza 7 day Avg Hospitalized', 'Influenza 7 day AVG in ICU bed']

 Date Range:
   - From: 2020-09-27
   - To: 2025-10-18

 Influenza Hospitalization Statistics:
   - Total weeks with data: 216
   - Average hospitalized: 57.5
   - Max hospitalized: 513
   - Recent (last week): 5

🔝 Top 5 Flu Hospitalization Weeks:
Date Range Start Date Range End Influenza 7 day Avg Hospitalized
      2025-02-09     2025-02-15                              513
      2025-02-16     2025-02-22                              511
      2025-02-02     2025-02-08                              476
      2022-11-27     2022-12-03                              449
      2022-12-04     2022-12-10                              446

 Data Quality:
   - Total weeks: 264
   - Flu data available: 216 (81.8%)
   - NA values:

In [3]:
## Collect census
# Banner for this collection step.
print("=" * 60, "COLLECTING CENSUS DATA", "=" * 60, sep="\n")

# Source CSV with one population-density column per census year.
census_url = "https://data.wa.gov/api/views/e6ip-wkqq/rows.csv?accessType=DOWNLOAD"

try:
    df_census = pd.read_csv(census_url)

    print(f"\n Census data loaded: {df_census.shape[0]} counties")
    print(f" Columns: {list(df_census.columns)}")

    # How many rows lack a 2020 density value.
    missing_2020 = df_census['Population Density 2020'].isnull().sum()
    print(f"\n Missing 2020 density values: {missing_2020}")

    # Distribution summary of the 2020 densities.
    print("\n 2020 Population Density Statistics:")
    print(df_census['Population Density 2020'].describe())

    # The five rows with the highest 2020 density, name and value only.
    print("\n Top 5 Most Dense Counties (2020):")
    top_counties = (
        df_census[['County Name', 'Population Density 2020']]
        .nlargest(5, 'Population Density 2020')
    )
    print(top_counties.to_string(index=False))

    # Persist the downloaded table unchanged.
    output_path = '/app/data/raw/wa_population_density.csv'
    df_census.to_csv(output_path, index=False)
    print(f"\n Saved to: {output_path}")

    print("\nFirst 5 rows:")
    print(df_census.head(5))

except Exception as e:
    # Report the failure with its traceback and let the notebook continue.
    import traceback
    print(f" Error: {e}")
    traceback.print_exc()

COLLECTING CENSUS DATA

 Census data loaded: 39 counties
 Columns: ['County Name', 'Population Density 1900', 'Population Density 1910', 'Population Density 1920', 'Population Density 1930', 'Population Density 1940', 'Population Density 1950', 'Population Density 1960', 'Population Density 1970', 'Population Density 1980', 'Population Density 1990', 'Population Density 2000', 'Population Density 2010', 'Population Density 2020']

 Missing 2020 density values: 0

 2020 Population Density Statistics:
count      39.000000
mean      149.612821
std       250.371017
min         3.220000
25%        17.550000
50%        36.990000
75%       104.950000
max      1073.020000
Name: Population Density 2020, dtype: float64

 Top 5 Most Dense Counties (2020):
County Name  Population Density 2020
       King                  1073.02
      Clark                   800.82
     Kitsap                   697.57
     Pierce                   551.81
     Island                   416.63

 Saved to: /app/data/r

In [4]:
##CDC Flu
# `requests` is imported in the notebook's top import cell.

print("\n" + "=" * 60)
print("COLLECTING CDC FLUVIEW DATA")
print("=" * 60)

# API endpoint
api_url = "https://api.delphi.cmu.edu/epidata/fluview/"

# Parameters - Get data from 2020 onwards
params = {
    'regions': 'wa',
    'epiweeks': '202001-202452'  # 2020 through 2024
}

try:
    # Make API request. The timeout (seconds) keeps a stalled connection from
    # hanging the notebook indefinitely.
    response = requests.get(api_url, params=params, timeout=30)
    # Turn HTTP 4xx/5xx responses into a clear error here instead of a
    # confusing JSON decode failure on the next line.
    response.raise_for_status()
    data = response.json()

    # Check if successful. `.get` routes a payload without a 'result' field
    # to the "API Error" branch below rather than raising KeyError.
    if data.get('result') == 1:
        df_fluview = pd.DataFrame(data['epidata'])

        print(f"\n FluView data loaded: {len(df_fluview)} weeks")
        print(f" Date range: {df_fluview['epiweek'].min()} to {df_fluview['epiweek'].max()}")

        # Show key columns
        print(f"\n Key columns:")
        key_cols = ['region', 'epiweek', 'num_ili', 'num_patients', 'wili']
        print(f"   {key_cols}")

        # Summary statistics
        print("\n ILI Statistics:")
        print(f"   - Average ILI cases per week: {df_fluview['num_ili'].mean():.0f}")
        print(f"   - Max ILI cases in a week: {df_fluview['num_ili'].max()}")
        print(f"   - Average % ILI: {df_fluview['wili'].mean():.2f}%")
        print(f"   - Max % ILI: {df_fluview['wili'].max():.2f}%")

        # Show weeks with highest ILI
        print("\n Top 5 Weeks by ILI Percentage:")
        top_ili = df_fluview.nlargest(5, 'wili')[['epiweek', 'num_ili', 'num_patients', 'wili']]
        print(top_ili.to_string(index=False))

        # Save to raw data
        output_path = '/app/data/raw/wa_fluview_data.csv'
        df_fluview.to_csv(output_path, index=False)
        print(f"\n Saved to: {output_path}")

        print("\nFirst 5 rows:")
        print(df_fluview.head())

    else:
        print(f" API Error: {data.get('message', 'Unknown error')}")

except Exception as e:
    # Best-effort cell: report the failure and keep the notebook running.
    print(f" Error: {e}")
    import traceback
    traceback.print_exc()


COLLECTING CDC FLUVIEW DATA

 FluView data loaded: 261 weeks
 Date range: 202001 to 202452

 Key columns:
   ['region', 'epiweek', 'num_ili', 'num_patients', 'wili']

 ILI Statistics:
   - Average ILI cases per week: 772
   - Max ILI cases in a week: 6043
   - Average % ILI: 1.72%
   - Max % ILI: 12.93%

 Top 5 Weeks by ILI Percentage:
 epiweek  num_ili  num_patients     wili
  202247     5945         45967 12.93320
  202248     6043         51505 11.73280
  202249     5284         49549 10.66420
  202250     4365         48328  9.03203
  202246     3563         47539  7.49490

 Saved to: /app/data/raw/wa_fluview_data.csv

First 5 rows:
  release_date region   issue  epiweek  lag  num_ili  num_patients  \
0   2021-10-08     wa  202139   202001   91     1449         20298   
1   2021-10-08     wa  202139   202002   90     1075         22028   
2   2021-10-08     wa  202139   202003   89      853         20215   
3   2021-10-08     wa  202139   202004   88      966         21871   
4   