<a href="https://colab.research.google.com/github/leyli16/HousingPricePrediction/blob/fema_cleaning/Final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Library


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from string import ascii_letters
import matplotlib.pyplot as plt
import datetime as dt
import requests
from lxml import html
import math
import json
import re
import os

# 1. County Demographics Data Cleaning and Wrangling


## 1.1 Loading County Demogrphics Dataset

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("glozab/county-level-us-demographic-data-1990-2020")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/glozab/county-level-us-demographic-data-1990-2020/versions/1


In [None]:
files = os.listdir(path)
print("Files in dataset:", files)

Files in dataset: ['county_demographics.csv']


In [None]:
file_path = os.path.join(path, 'county_demographics.csv')
county_demographics_df = pd.read_csv(file_path)

print(county_demographics_df.head())

   year  fips  population  w_population  b_population  o_population  \
0  1990  1025       27289         15579         11643            35   
1  1990  1031       40293         32869          6950           160   
2  1990  1041       13598         10068          3516            11   
3  1990  1053       35526         24377         10050          1045   
4  1990  1101      209537        119702         87856           415   

   nh_population  hi_population  na_population  male_population  ...  \
0          27196             93              0            13052  ...   
1          39831            462              0            19673  ...   
2          13576             22              0             6421  ...   
3          35378            148              0            17454  ...   
4         207933           1604              0            98854  ...   

   age9_population_ratio  age10_population_ratio  age11_population_ratio  \
0                0.06263                 0.05552                

## 1.2 County Demographics Dataset Cleaning

In [None]:
county_demographics_df.dtypes

Unnamed: 0,0
year,int64
fips,int64
population,int64
w_population,int64
b_population,int64
o_population,int64
nh_population,int64
hi_population,int64
na_population,int64
male_population,int64


In [None]:
print(county_demographics_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97287 entries, 0 to 97286
Data columns (total 57 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     97287 non-null  int64  
 1   fips                     97287 non-null  int64  
 2   population               97287 non-null  int64  
 3   w_population             97287 non-null  int64  
 4   b_population             97287 non-null  int64  
 5   o_population             97287 non-null  int64  
 6   nh_population            97287 non-null  int64  
 7   hi_population            97287 non-null  int64  
 8   na_population            97287 non-null  int64  
 9   male_population          97287 non-null  int64  
 10  female_population        97287 non-null  int64  
 11  age0_population          97287 non-null  int64  
 12  age1_population          97287 non-null  int64  
 13  age2_population          97287 non-null  int64  
 14  age3_population       

In [None]:
print(county_demographics_df.isna().sum().sort_values(ascending=False))

year                       0
fips                       0
population                 0
w_population               0
b_population               0
o_population               0
nh_population              0
hi_population              0
na_population              0
male_population            0
female_population          0
age0_population            0
age1_population            0
age2_population            0
age3_population            0
age4_population            0
age5_population            0
age6_population            0
age7_population            0
age8_population            0
age9_population            0
age10_population           0
age11_population           0
age12_population           0
age13_population           0
age14_population           0
age15_population           0
age16_population           0
age17_population           0
age18_population           0
w_population_ratio         0
b_population_ratio         0
o_population_ratio         0
nh_population_ratio        0
hi_population_

### Rename Columns for Consistency

In [None]:
county_demographics_df.columns = county_demographics_df.columns.str.strip().str.lower().str.replace(' ', '_')

 ### Handle Duplicates

In [None]:
county_demographics_df = county_demographics_df.drop_duplicates()
county_demographics_df = county_demographics_df.dropna()
county_demographics_df.head()

Unnamed: 0,year,fips,population,w_population,b_population,o_population,nh_population,hi_population,na_population,male_population,...,age9_population_ratio,age10_population_ratio,age11_population_ratio,age12_population_ratio,age13_population_ratio,age14_population_ratio,age15_population_ratio,age16_population_ratio,age17_population_ratio,age18_population_ratio
0,1990,1025,27289,15579,11643,35,27196,93,0,13052,...,0.06263,0.05552,0.04998,0.04482,0.04167,0.03767,0.03324,0.02825,0.01843,0.01319
1,1990,1031,40293,32869,6950,160,39831,462,0,19673,...,0.07455,0.06031,0.0518,0.04894,0.04544,0.04145,0.0343,0.0273,0.0166,0.01258
2,1990,1041,13598,10068,3516,11,13576,22,0,6421,...,0.06582,0.05354,0.04574,0.04765,0.05104,0.05273,0.04633,0.03876,0.02456,0.01839
3,1990,1053,35526,24377,10050,1045,35378,148,0,17454,...,0.0682,0.05694,0.05174,0.04771,0.04712,0.04141,0.03502,0.02911,0.01942,0.01368
4,1990,1101,209537,119702,87856,415,207933,1604,0,98854,...,0.06966,0.05219,0.04392,0.04111,0.03967,0.03757,0.02881,0.02294,0.01483,0.01145


In [None]:
county_demographics_df.describe()

Unnamed: 0,year,fips,population,w_population,b_population,o_population,nh_population,hi_population,na_population,male_population,...,age9_population_ratio,age10_population_ratio,age11_population_ratio,age12_population_ratio,age13_population_ratio,age14_population_ratio,age15_population_ratio,age16_population_ratio,age17_population_ratio,age18_population_ratio
count,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,...,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0
mean,2005.009436,30412.85995,93659.57,75298.41,12524.06,1129.919044,80097.54,13562.03,0.0,45980.61,...,0.068098,0.067555,0.064839,0.060595,0.054948,0.04805,0.039687,0.030821,0.021798,0.020099
std,8.942825,15147.842995,303469.9,225952.3,57165.18,4815.43707,217156.2,105839.3,0.0,148924.0,...,0.011292,0.010417,0.011904,0.014153,0.014746,0.013994,0.011871,0.009476,0.007503,0.008808
min,1990.0,1001.0,55.0,29.0,0.0,0.0,49.0,0.0,0.0,29.0,...,0.0084,0.01342,0.00915,0.00771,0.00532,0.00484,0.00257,0.0,0.0,0.0
25%,1997.0,18183.0,10948.0,9225.5,95.0,40.0,10116.0,171.0,0.0,5445.0,...,0.05997,0.0604,0.05595,0.04896,0.04416,0.03862,0.03201,0.02468,0.01689,0.01421
50%,2005.0,29179.0,24902.0,21661.0,800.0,136.0,23270.0,635.0,0.0,12319.0,...,0.06851,0.06778,0.06546,0.06058,0.05277,0.04615,0.03847,0.02995,0.021,0.01859
75%,2013.0,45083.0,63479.0,56247.0,5440.0,550.0,59890.0,3148.0,0.0,31350.5,...,0.07577,0.0745,0.07312,0.0711,0.064605,0.05563,0.04594,0.03597,0.02567,0.02435
max,2020.0,99999.0,10094860.0,7301940.0,1437609.0,172454.0,5508975.0,4868179.0,0.0,4975410.0,...,0.16,0.15596,0.15301,0.25806,0.18182,0.18391,0.18358,0.1434,0.14444,0.1954


## 1.2 Loading Fips Code DataSet




In [None]:
from google.colab import files
uploaded = files.upload()
fips_df = pd.read_csv('state_and_county_fips_master.csv')
fips_df.head()

Saving state_and_county_fips_master.csv to state_and_county_fips_master.csv


Unnamed: 0,fips,name,state
0,0,UNITED STATES,
1,1000,ALABAMA,
2,1001,Autauga County,AL
3,1003,Baldwin County,AL
4,1005,Barbour County,AL


In [None]:
fips_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3195 entries, 0 to 3194
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   fips    3195 non-null   int64 
 1   name    3195 non-null   object
 2   state   3143 non-null   object
dtypes: int64(1), object(2)
memory usage: 75.0+ KB


## 1.3 Merging the county_demographics_df with fips_df

In [None]:
county_demographics_merged_df = county_demographics_df.merge(
    fips_df,
    on='fips',
    how='left'
)
county_demographics_merged_df.head()

Unnamed: 0,year,fips,population,w_population,b_population,o_population,nh_population,hi_population,na_population,male_population,...,age11_population_ratio,age12_population_ratio,age13_population_ratio,age14_population_ratio,age15_population_ratio,age16_population_ratio,age17_population_ratio,age18_population_ratio,name,state
0,1990,1025,27289,15579,11643,35,27196,93,0,13052,...,0.04998,0.04482,0.04167,0.03767,0.03324,0.02825,0.01843,0.01319,Clarke County,AL
1,1990,1031,40293,32869,6950,160,39831,462,0,19673,...,0.0518,0.04894,0.04544,0.04145,0.0343,0.0273,0.0166,0.01258,Coffee County,AL
2,1990,1041,13598,10068,3516,11,13576,22,0,6421,...,0.04574,0.04765,0.05104,0.05273,0.04633,0.03876,0.02456,0.01839,Crenshaw County,AL
3,1990,1053,35526,24377,10050,1045,35378,148,0,17454,...,0.05174,0.04771,0.04712,0.04141,0.03502,0.02911,0.01942,0.01368,Escambia County,AL
4,1990,1101,209537,119702,87856,415,207933,1604,0,98854,...,0.04392,0.04111,0.03967,0.03757,0.02881,0.02294,0.01483,0.01145,Montgomery County,AL


### Sanity Check After Merging

In [None]:
print(county_demographics_merged_df.columns)         # See all columns
print(county_demographics_merged_df[['fips', 'name']].head())  # Confirm correct matches
print(county_demographics_merged_df.isna().sum())    # Check for unmatched FIPS

Index(['year', 'fips', 'population', 'w_population', 'b_population',
       'o_population', 'nh_population', 'hi_population', 'na_population',
       'male_population', 'female_population', 'age0_population',
       'age1_population', 'age2_population', 'age3_population',
       'age4_population', 'age5_population', 'age6_population',
       'age7_population', 'age8_population', 'age9_population',
       'age10_population', 'age11_population', 'age12_population',
       'age13_population', 'age14_population', 'age15_population',
       'age16_population', 'age17_population', 'age18_population',
       'w_population_ratio', 'b_population_ratio', 'o_population_ratio',
       'nh_population_ratio', 'hi_population_ratio', 'na_population_ratio',
       'male_population_ratio', 'female_population_ratio',
       'age0_population_ratio', 'age1_population_ratio',
       'age2_population_ratio', 'age3_population_ratio',
       'age4_population_ratio', 'age5_population_ratio',
       'age6_popula

## 1.4 Cleaning the Merged Data

Checking for unmatched FIPS — that output tells us 174 rows in the county_demographics_df didn't find a match in fips_df, because name, state, and any other info from fips_df came back as NaN.

In [None]:
unmatched = county_demographics_merged_df[county_demographics_merged_df['name'].isna()]
print(unmatched['fips'].unique()[:10])

[ 2910  4910 51917  8911  2201  8912  8913  2280  2232  2010]


In [None]:
county_demographics_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97287 entries, 0 to 97286
Data columns (total 59 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     97287 non-null  int64  
 1   fips                     97287 non-null  int64  
 2   population               97287 non-null  int64  
 3   w_population             97287 non-null  int64  
 4   b_population             97287 non-null  int64  
 5   o_population             97287 non-null  int64  
 6   nh_population            97287 non-null  int64  
 7   hi_population            97287 non-null  int64  
 8   na_population            97287 non-null  int64  
 9   male_population          97287 non-null  int64  
 10  female_population        97287 non-null  int64  
 11  age0_population          97287 non-null  int64  
 12  age1_population          97287 non-null  int64  
 13  age2_population          97287 non-null  int64  
 14  age3_population       

Drop the unmatched rows.

In [None]:
county_demographics_merged_df = county_demographics_merged_df[county_demographics_merged_df['name'].notna()].reset_index(drop=True)
county_demographics_merged_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97113 entries, 0 to 97112
Data columns (total 59 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     97113 non-null  int64  
 1   fips                     97113 non-null  int64  
 2   population               97113 non-null  int64  
 3   w_population             97113 non-null  int64  
 4   b_population             97113 non-null  int64  
 5   o_population             97113 non-null  int64  
 6   nh_population            97113 non-null  int64  
 7   hi_population            97113 non-null  int64  
 8   na_population            97113 non-null  int64  
 9   male_population          97113 non-null  int64  
 10  female_population        97113 non-null  int64  
 11  age0_population          97113 non-null  int64  
 12  age1_population          97113 non-null  int64  
 13  age2_population          97113 non-null  int64  
 14  age3_population       

In [None]:
county_demographics_merged_df = county_demographics_merged_df.rename(columns={'name': 'county'})
county_demographics_merged_df = county_demographics_merged_df.dropna()
county_demographics_merged_df = county_demographics_merged_df.drop_duplicates()
county_demographics_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97113 entries, 0 to 97112
Data columns (total 59 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     97113 non-null  int64  
 1   fips                     97113 non-null  int64  
 2   population               97113 non-null  int64  
 3   w_population             97113 non-null  int64  
 4   b_population             97113 non-null  int64  
 5   o_population             97113 non-null  int64  
 6   nh_population            97113 non-null  int64  
 7   hi_population            97113 non-null  int64  
 8   na_population            97113 non-null  int64  
 9   male_population          97113 non-null  int64  
 10  female_population        97113 non-null  int64  
 11  age0_population          97113 non-null  int64  
 12  age1_population          97113 non-null  int64  
 13  age2_population          97113 non-null  int64  
 14  age3_population       

In [None]:
county_demographics_merged_df.head()

Unnamed: 0,year,fips,population,w_population,b_population,o_population,nh_population,hi_population,na_population,male_population,...,age11_population_ratio,age12_population_ratio,age13_population_ratio,age14_population_ratio,age15_population_ratio,age16_population_ratio,age17_population_ratio,age18_population_ratio,county,state
0,1990,1025,27289,15579,11643,35,27196,93,0,13052,...,0.04998,0.04482,0.04167,0.03767,0.03324,0.02825,0.01843,0.01319,Clarke County,AL
1,1990,1031,40293,32869,6950,160,39831,462,0,19673,...,0.0518,0.04894,0.04544,0.04145,0.0343,0.0273,0.0166,0.01258,Coffee County,AL
2,1990,1041,13598,10068,3516,11,13576,22,0,6421,...,0.04574,0.04765,0.05104,0.05273,0.04633,0.03876,0.02456,0.01839,Crenshaw County,AL
3,1990,1053,35526,24377,10050,1045,35378,148,0,17454,...,0.05174,0.04771,0.04712,0.04141,0.03502,0.02911,0.01942,0.01368,Escambia County,AL
4,1990,1101,209537,119702,87856,415,207933,1604,0,98854,...,0.04392,0.04111,0.03967,0.03757,0.02881,0.02294,0.01483,0.01145,Montgomery County,AL


# 2. FEMA Disaster Declarations Data Cleaning and Wrangling

In this section, we clean and wrangle FEMA disaster declaration data to prepare it for analysis. The raw data is filtered to include only disasters from the past 10 years, joined with ZIP codes using a county-level crosswalk, and aggregated to generate a ZIP-level disaster count feature for downstream housing price modeling.

The final output table, `zip_disaster_counts`, contains the number of FEMA disaster declarations per ZIP code over the last decade. This will be used as a static risk feature in our housing price model.





##2.1 Import Required Libraries



In [7]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import os
import requests

## 2.2 Load FEMA Disaster Declarations Data

Note: We loaded the full dataset here to understand the available columns. In the next step, we’ll select only the relevant ones.

In [8]:
fema_url = "https://www.fema.gov/api/open/v2/DisasterDeclarationsSummaries.csv"
fema_file_path = "DisasterDeclarationsSummaries.csv"

# Download the file if it doesn't already exist
if not os.path.exists(fema_file_path):
    response = requests.get(fema_url)
    with open(fema_file_path, "wb") as file:
        file.write(response.content)

# Load the entire dataset first
fema_raw_df = pd.read_csv(fema_file_path)

# Preview full column names
fema_raw_df.head()


  fema_raw_df = pd.read_csv(fema_file_path)


Unnamed: 0,femaDeclarationString,disasterNumber,state,declarationType,declarationDate,fyDeclared,incidentType,declarationTitle,ihProgramDeclared,iaProgramDeclared,...,placeCode,designatedArea,declarationRequestNumber,lastIAFilingDate,incidentId,region,designatedIncidentTypes,lastRefresh,hash,id
0,EM-3611-VI,3611,VI,EM,2024-08-18T00:00:00.000Z,2024,Tropical Storm,TROPICAL STORM ERNESTO,0,0,...,99010,St. Croix (Island) (County-equivalent),24126,,2024080901,2,"4,M,W,Z",2025-03-25T22:41:27.147Z,4a8baf2d2f9117b4381e823fd96a41ea5f5c72d3,488c7bb4-5f20-4108-b499-99ef33953617
1,FM-5529-OR,5529,OR,FM,2024-08-09T00:00:00.000Z,2024,Fire,LEE FALLS FIRE,0,0,...,99067,Washington (County),24122,,2024081001,10,R,2024-08-27T18:22:14.800Z,ae87cf3c6ed795015b714af7166c7c295b2b67c7,09e3f81a-5e16-4b72-b317-1c64e0cfa59c
2,FM-5528-OR,5528,OR,FM,2024-08-06T00:00:00.000Z,2024,Fire,ELK LANE FIRE,0,0,...,99031,Jefferson (County),24116,,2024080701,10,R,2024-08-27T18:22:14.800Z,432cf0995c47e3895cea696ede5621b810460501,59983f89-30bf-4888-b21b-62e8d57d9aac
3,FM-5527-OR,5527,OR,FM,2024-08-02T00:00:00.000Z,2024,Fire,MILE MARKER 132 FIRE,0,0,...,99017,Deschutes (County),24111,,2024080301,10,R,2024-08-27T18:22:14.800Z,2f21d90cb6bc64b0d4121aa3f18d852bbb4b11fa,8d13ecf0-bc2f-496b-8c9f-b2e73da832a0
4,EM-3611-VI,3611,VI,EM,2024-08-18T00:00:00.000Z,2024,Tropical Storm,TROPICAL STORM ERNESTO,0,0,...,99020,St. John (Island) (County-equivalent),24126,,2024080901,2,"4,M,W,Z",2025-03-25T22:41:27.147Z,049f9b3d480604687cd32784486d584b44381ff4,4576151c-0d18-478f-a342-1f89463a736e


## 2.3 Select Relevant Columns：

In [13]:
# Select only the columns we need
fema_df = fema_raw_df[[
    'disasterNumber', 'state', 'designatedArea', 'declarationDate',
    'fipsStateCode', 'fipsCountyCode'
]].copy()

# Convert to datetime
fema_df['declarationDate'] = pd.to_datetime(fema_df['declarationDate'])

# Preview cleaned dataframe
fema_df.head()


Unnamed: 0,disasterNumber,state,designatedArea,declarationDate,fipsStateCode,fipsCountyCode
0,3611,VI,St. Croix (Island) (County-equivalent),2024-08-18 00:00:00+00:00,78,10
1,5529,OR,Washington (County),2024-08-09 00:00:00+00:00,41,67
2,5528,OR,Jefferson (County),2024-08-06 00:00:00+00:00,41,31
3,5527,OR,Deschutes (County),2024-08-02 00:00:00+00:00,41,17
4,3611,VI,St. John (Island) (County-equivalent),2024-08-18 00:00:00+00:00,78,20


## 2.4 Filter Data from the Last 10 Years

In [17]:
from pandas.tseries.offsets import DateOffset

# Today's date and dynamic 10-year cutoff
today = pd.Timestamp.now(tz='UTC')
cutoff_date = today - DateOffset(years=10)

# Filter FEMA data
fema_recent_df = fema_df[fema_df['declarationDate'] >= cutoff_date].copy()

# Check the date range
print("Earliest disaster:", fema_recent_df['declarationDate'].min())
print("Latest disaster:", fema_recent_df['declarationDate'].max())
print("Total records:", fema_recent_df.shape[0])

# Preview
fema_recent_df.head()

Earliest disaster: 2015-04-08 00:00:00+00:00
Latest disaster: 2025-04-05 00:00:00+00:00
Total records: 23427


Unnamed: 0,disasterNumber,state,designatedArea,declarationDate,fipsStateCode,fipsCountyCode
0,3611,VI,St. Croix (Island) (County-equivalent),2024-08-18 00:00:00+00:00,78,10
1,5529,OR,Washington (County),2024-08-09 00:00:00+00:00,41,67
2,5528,OR,Jefferson (County),2024-08-06 00:00:00+00:00,41,31
3,5527,OR,Deschutes (County),2024-08-02 00:00:00+00:00,41,17
4,3611,VI,St. John (Island) (County-equivalent),2024-08-18 00:00:00+00:00,78,20


## 2.5 Clean FEMA Data (Standardize & Prepare for Join)

In [18]:
# Construct full 5-digit county-level FIPS code
fema_recent_df['fips'] = (
    fema_recent_df['fipsStateCode'].astype(str).str.zfill(2) +
    fema_recent_df['fipsCountyCode'].astype(str).str.zfill(3)
)

# Preview
fema_recent_df.head()

Unnamed: 0,disasterNumber,state,designatedArea,declarationDate,fipsStateCode,fipsCountyCode,fips
0,3611,VI,St. Croix (Island) (County-equivalent),2024-08-18 00:00:00+00:00,78,10,78010
1,5529,OR,Washington (County),2024-08-09 00:00:00+00:00,41,67,41067
2,5528,OR,Jefferson (County),2024-08-06 00:00:00+00:00,41,31,41031
3,5527,OR,Deschutes (County),2024-08-02 00:00:00+00:00,41,17,41017
4,3611,VI,St. John (Island) (County-equivalent),2024-08-18 00:00:00+00:00,78,20,78020


## 2.6 Load County-to-ZIP Mapping Table from Kaggle

In [20]:
# Install kagglehub if not already installed
!pip install kagglehub --quiet

import kagglehub
import os
import pandas as pd

# Download dataset from Kaggle
path = kagglehub.dataset_download("danofer/zipcodes-county-fips-crosswalk")

print("Path to dataset files:", path)

# List files
files = os.listdir(path)
print("Files in dataset:", files)

# Load the crosswalk CSV
file_path = os.path.join(path, "ZIP-COUNTY-FIPS_2017-06.csv")
zip_crosswalk_df = pd.read_csv(file_path)

zip_crosswalk_df['fips'] = zip_crosswalk_df['STCOUNTYFP'].astype(str).str.zfill(5)


# Preview
zip_crosswalk_df.head()


Path to dataset files: /root/.cache/kagglehub/datasets/danofer/zipcodes-county-fips-crosswalk/versions/1
Files in dataset: ['ZIP-COUNTY-FIPS_2017-06.csv']


Unnamed: 0,ZIP,COUNTYNAME,STATE,STCOUNTYFP,CLASSFP,fips
0,36003,Autauga County,AL,1001,H1,1001
1,36006,Autauga County,AL,1001,H1,1001
2,36067,Autauga County,AL,1001,H1,1001
3,36066,Autauga County,AL,1001,H1,1001
4,36703,Autauga County,AL,1001,H1,1001



## 2.7 Merge FEMA with ZIP (via FIPS)

In [26]:
fema_with_zip = fema_recent_df.merge(
    zip_crosswalk_df[['ZIP', 'fips']],
    on='fips',
    how='left'
)

# Drop FEMA rows with no ZIP match
#fema_with_zip = fema_with_zip.dropna(subset=['ZIP'])

# Preveiw
fema_with_zip.head()

Unnamed: 0,disasterNumber,state,designatedArea,declarationDate,fipsStateCode,fipsCountyCode,fips,ZIP
0,3611,VI,St. Croix (Island) (County-equivalent),2024-08-18 00:00:00+00:00,78,10,78010,850.0
1,3611,VI,St. Croix (Island) (County-equivalent),2024-08-18 00:00:00+00:00,78,10,78010,840.0
2,3611,VI,St. Croix (Island) (County-equivalent),2024-08-18 00:00:00+00:00,78,10,78010,820.0
3,5529,OR,Washington (County),2024-08-09 00:00:00+00:00,41,67,41067,97078.0
4,5529,OR,Washington (County),2024-08-09 00:00:00+00:00,41,67,41067,97133.0


In [27]:
print(f"Total FEMA records after 10-year filter: {fema_recent_df.shape[0]}")
print(f"FEMA records after ZIP merge: {fema_with_zip.shape[0]}")
print(f"Number of FEMA records with ZIP: {fema_with_zip['ZIP'].notna().sum()}")
print(f"Number of FEMA records missing ZIP: {fema_with_zip['ZIP'].isna().sum()}")


Total FEMA records after 10-year filter: 23427
FEMA records after ZIP merge: 389086
Number of FEMA records with ZIP: 388152
Number of FEMA records missing ZIP: 934


## 2.8: Merge Validation — FEMA and ZIP Mapping

In this section, we validate whether FEMA disaster records were successfully joined with ZIP codes using fips codes. We check the match rate, ZIP assignment distribution, and inspect records with missing ZIPs.



### 2.8.1 Match Rate Check
This checks whether the merge was successful for a significant portion of the data.

In [29]:
total_records = fema_with_zip.shape[0]
matched_zip_count = fema_with_zip['ZIP'].notna().sum()
missing_zip_count = fema_with_zip['ZIP'].isna().sum()
match_rate = matched_zip_count / total_records

print(f"Total FEMA records (10 yrs):     {total_records}")
print(f"Matched ZIPs:                    {matched_zip_count}")
print(f"Missing ZIPs:                    {missing_zip_count}")
print(f"ZIP Match Rate:                  {match_rate:.2%}")


Total FEMA records (10 yrs):     389086
Matched ZIPs:                    388152
Missing ZIPs:                    934
ZIP Match Rate:                  99.76%


### 2.8.2 ZIP Duplication per FIPS

One county (FIPS) may correspond to multiple ZIP codes. We inspect the distribution to ensure the ZIP mapping is reasonable.


In [30]:
zip_per_fips = (
    fema_with_zip[['fips', 'ZIP']]
    .drop_duplicates()
    .groupby('fips')
    .size()
    .reset_index(name='zip_count')
)

zip_per_fips['zip_count'].describe()


Unnamed: 0,zip_count
count,3277.0
mean,16.155935
std,18.770522
min,1.0
25%,7.0
50%,12.0
75%,18.0
max,494.0


###2.8.3 Missing ZIPs (Rows That Didn't Match)
Identify FEMA records from counties not covered in the ZIP mapping dataset.

In [31]:
missing_zip_df = fema_with_zip[fema_with_zip['ZIP'].isna()]
print(f"Number of FEMA records without ZIP after merge: {missing_zip_df.shape[0]}")
missing_zip_df[['state', 'designatedArea', 'fips']].drop_duplicates().head(10)


Number of FEMA records without ZIP after merge: 934


Unnamed: 0,state,designatedArea,fips
58,CA,Resighini Rancheria (Indian Reservation),6000
123,ID,Nez Perce Indian Reservation,16000
165,WA,Colville Indian Reservation,53000
198,WA,Yakama Reservation,53000
452,NM,Mescalero Tribe,35000
3083,MT,Crow Indian Reservation,30000
5627,OR,Warm Springs Indian Reservation,41000
6452,AZ,San Carlos Indian Reservation,4000
7091,CA,Morongo Indian Reservation,6000
11913,CA,Hopland Rancheria (Indian Reservation),6000


#### 2.8.3 ZIP Missing Analysis

After merging FEMA disaster data with ZIP codes using FIPS codes, **934** records could not be matched.

Top unmatched designatedArea examples:
*   Resighini Rancheria (Indian Reservation), CA
*   Nez Perce Indian Reservation, ID
*   Yakama Reservation, WA
*   Warm Springs Indian Reservation, OR
*   Crow Indian Reservation, MT
*   San Carlos Indian Reservation, AZ
*   Hopland Rancheria (Indian Reservation), CA



**Possible reasons:**
- Indian Reservations and tribal lands often don't align with standard county or ZIP boundaries

- Non-standard area names that don’t match ZIP datasets (e.g., Rancheria, Reservation)

**Action:** These records can be excluded


In [35]:
# Drop FEMA records with missing ZIPs
fema_with_zip['zip_matched'] = fema_with_zip['ZIP'].notna()
fema_matched_df = fema_with_zip[fema_with_zip['zip_matched']].copy().reset_index(drop=True)

# Preview
print(fema_matched_df.shape)
fema_matched_df.head()

(388152, 9)


Unnamed: 0,disasterNumber,state,designatedArea,declarationDate,fipsStateCode,fipsCountyCode,fips,ZIP,zip_matched
0,3611,VI,St. Croix (Island) (County-equivalent),2024-08-18 00:00:00+00:00,78,10,78010,850.0,True
1,3611,VI,St. Croix (Island) (County-equivalent),2024-08-18 00:00:00+00:00,78,10,78010,840.0,True
2,3611,VI,St. Croix (Island) (County-equivalent),2024-08-18 00:00:00+00:00,78,10,78010,820.0,True
3,5529,OR,Washington (County),2024-08-09 00:00:00+00:00,41,67,41067,97078.0,True
4,5529,OR,Washington (County),2024-08-09 00:00:00+00:00,41,67,41067,97133.0,True


###2.8.4 FEMA-ZIP Merge Summary

In [36]:
# FEMA-ZIP Merge Summary (auto-generated)

# Total rows
rows_before = fema_with_zip.shape[0]
rows_after = fema_matched_df.shape[0]

# ZIP matching stats
matched_rows = fema_with_zip['ZIP'].notna().sum()
missing_rows = fema_with_zip['ZIP'].isna().sum()
match_rate = matched_rows / rows_before
final_match_rate = 1.0  # because all rows in fema_matched_df have ZIPs

# ZIPs per FIPS (before filtering)
zip_per_fips = (
    fema_with_zip[['fips', 'ZIP']]
    .drop_duplicates()
    .groupby('fips')
    .size()
    .reset_index(name='zip_count')
)

# Summary stats
print("FEMA-ZIP Merge Summary:")
print(f"Total FEMA records (before filtering):   {rows_before}")
print(f"Total FEMA records (after filtering):    {rows_after}")
print(f"Matched ZIPs:                             {matched_rows}")
print(f"Missing ZIPs:                             {missing_rows}")
print(f"ZIP Match Rate:                           {match_rate:.2%}")
print(f"ZIP Match Rate (final dataset):           {final_match_rate:.2%}")
print(f"Unique county FIPS:                       {zip_per_fips.shape[0]}")
print(f"Median ZIPs per county:                   {zip_per_fips['zip_count'].median():.0f}")
print(f"Max ZIPs per county:                      {zip_per_fips['zip_count'].max()}")


FEMA-ZIP Merge Summary:
Total FEMA records (before filtering):   389086
Total FEMA records (after filtering):    388152
Matched ZIPs:                             388152
Missing ZIPs:                             934
ZIP Match Rate:                           99.76%
ZIP Match Rate (final dataset):           100.00%
Unique county FIPS:                       3277
Median ZIPs per county:                   12
Max ZIPs per county:                      494


##2.10 Disaster Count Aggregation by ZIP (Last 10 Years)

To support downstream analysis, we compute the number of FEMA disaster declarations in the past 10 years for each ZIP code. This gives a static risk exposure feature per ZIP that can be merged with housing data.

The aggregation is based on the cleaned FEMA dataset `fema_matched_df`, which includes only matched ZIP-level records between 2013 and 2025.

In [37]:
# Count the number of disasters per ZIP code
zip_disaster_counts = (
    fema_matched_df.groupby('ZIP')['disasterNumber']
    .nunique()
    .reset_index(name='disaster_count_10yrs')
)

# Format ZIPs as 5-digit strings
# zip_disaster_counts['ZIP'] = zip_disaster_counts['ZIP'].astype(int).astype(str).str.zfill(5)

# Make sure ZIP is stored as integer (drop decimals)
zip_disaster_counts['ZIP'] = zip_disaster_counts['ZIP'].astype(int)

# Reset index for cleanliness
zip_disaster_counts = zip_disaster_counts.reset_index(drop=True)

# Preview top ZIPs by disaster count
zip_disaster_counts.sort_values(by='disaster_count_10yrs', ascending=False).head(10)


Unnamed: 0,ZIP,disaster_count_10yrs
35885,91710,41
35884,91709,41
35934,91786,41
35920,91766,41
35623,90623,39
35625,90630,39
35626,90631,39
35630,90638,39
35680,90808,39
36711,93560,38


## Summary of Disaster Count Aggregation

The resulting table `zip_disaster_counts` contains the number of unique FEMA disaster declarations per ZIP code over the past 10 years. Each row represents a ZIP and its corresponding disaster exposure:

- **ZIP**: 5-digit ZIP code (int)
- **disaster_count_10yrs**: Total number of FEMA disaster declarations in that ZIP over the past 10 years

This table will serve as a **static risk feature** that can be joined with the main housing dataset using the ZIP code as key.
