# Severe Weather - Data Collection & Wrangling

Greg Welliver   

In [None]:
# Import relevant libraries and packages.
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import statsmodels.api as sm

from statsmodels.graphics.api import abline_plot
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn import linear_model, preprocessing 
import warnings
from scipy import stats
import re
from glob import glob, iglob
from datetime import datetime
import string


## Data Collection

- storm files were collected from the Iowa Environmental Mesonet: https://mesonet.agron.iastate.edu/nws/
- file location for downloads: 
    https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/

In [None]:
### working code, make markdown for now
### All annual storm data files are saved on my local machine.
### This cell gathers those files and combines them into one DataFrame.
filenames = glob('../Data/*.csv')
print("There is a total of {} files.".format(len(filenames)))

target_path = '../Data/all_storm_data.csv'

try:
    # Read the summary file if it exists (one read instead of many).
    all_storm_data = pd.read_csv(target_path)
except FileNotFoundError:
    # Only a missing summary file triggers the rebuild; any other failure
    # (bad encoding, parse error, interrupt) is allowed to surface.
    if not filenames:
        raise FileNotFoundError(
            "No storm CSV files were found in '../Data/'."
        ) from None

    # Read every yearly file and stack them. ignore_index gives the combined
    # frame one continuous row index instead of repeating 0..n per file.
    storm_data = [pd.read_csv(filepath) for filepath in filenames]
    all_storm_data = pd.concat(storm_data, ignore_index=True)

    # The summary file is intentionally not written here. To cache the
    # combined data for faster processing, run:
    #     all_storm_data.to_csv(target_path, index=False)

print("The total number of observations is {}.".format(len(all_storm_data)))
all_storm_data.head()

In [None]:
# Load the storm data from the Parquet file into df.
# The commented-out read_csv lines are alternative input files.
#df = pd.read_csv("../Data/StormEvents_details-ftp_v1.0_d2001_c20220425.csv")
df = pd.read_parquet("../Data/all_storm_data.pqt")
#df = pd.read_csv("../Data/all_storm_data4.csv")

In [None]:
# Print a summary of df via DataFrame.info().
df.info()

In [None]:
# Preview the first ten STATE_FIPS values, keeping only the text that comes
# before the first "." in each one.
for fips_text in df["STATE_FIPS"].head(10):
    print(fips_text.partition(".")[0])

In [None]:
# Drop columns that are not used in the rest of this notebook.
# STATE_FIPS, CZ_FIPS and END_DATE_TIME are deliberately NOT dropped here:
# later cells build ST_CT_FIPS from the two FIPS columns and compute DURATION
# from END_DATE_TIME, so removing them at this point breaks those cells.
unused_columns = [
    'CATEGORY', 'DATA_SOURCE',
    'BEGIN_RANGE', 'BEGIN_AZIMUTH', 'BEGIN_LOCATION',
    'END_RANGE', 'END_AZIMUTH', 'END_LOCATION',
    'TOR_OTHER_WFO', 'TOR_OTHER_CZ_STATE', 'TOR_OTHER_CZ_FIPS',
    'TOR_OTHER_CZ_NAME',
    'CZ_TIMEZONE', 'WFO', 'CZ_TYPE', 'DAMAGE_CROPS', 'CZ_NAME', 'SOURCE',
    'BEGIN_DAY', 'END_YEARMONTH', 'END_DAY', 'END_TIME',
    'EPISODE_ID', 'EVENT_ID',
]
df.drop(columns=unused_columns, inplace=True)

#### Replace nulls in columns with NA

In [None]:
# Columns whose nulls are replaced with the string 'NA'.
# The latitude/longitude columns are intentionally left out: they are
# subtracted from each other later to size the storm, and a text placeholder
# in a coordinate column would make that arithmetic fail. Their nulls stay NaN.
cols_na = [
    'EVENT_NARRATIVE',
    'EPISODE_NARRATIVE',
    'TOR_F_SCALE',
    'MAGNITUDE_TYPE',
    'FLOOD_CAUSE',
    'STATE',
    'STATE_FIPS',
]

In [None]:
# Print the number of missing values in each column listed in cols_na.
for column in cols_na:
    missing_count = df[column].isna().sum()
    print(missing_count)

In [None]:
# WORKING, MARKDOWN UNTIL FINAL
# Replace the missing values of every cols_na column with the string 'NA',
# one column at a time.
df = df.assign(**{column: df[column].fillna('NA') for column in cols_na})

In [None]:
# Print the missing-value count of each cols_na column again (after filling).
for column in cols_na:
    remaining_nulls = df[column].isna().sum()
    print(remaining_nulls)

#### Replace nulls in columns with 0

In [None]:
# Columns whose nulls are replaced with 0.
cols_0 = [
    'MAGNITUDE',
    'TOR_LENGTH',
    'TOR_WIDTH',
    'DAMAGE_PROPERTY',
    'INJURIES_DIRECT',
    'INJURIES_INDIRECT',
    'DEATHS_DIRECT',
    'DEATHS_INDIRECT',
]

In [None]:
# Print the number of missing values in each column listed in cols_0.
for column in cols_0:
    missing_count = df[column].isna().sum()
    print(missing_count)

In [None]:
# Replace the missing values of every cols_0 column with 0, one column at a
# time.
df = df.assign(**{column: df[column].fillna(0) for column in cols_0})

In [None]:
# Print the missing-value count of each cols_0 column again (after filling).
for column in cols_0:
    remaining_nulls = df[column].isna().sum()
    print(remaining_nulls)

In [None]:
# Remove rows that have no usable state FIPS code: either a real null or the
# "NA" placeholder string.
has_state_fips = df['STATE_FIPS'].notna() & (df['STATE_FIPS'] != "NA")
# drop=True renumbers the rows without adding the old labels as an extra
# "index" column.
df = df[has_state_fips].reset_index(drop=True)

# Convert STATE_FIPS to int so it can be used for lookups later.
# to_numeric parses text such as "1.0" that int() would reject; the result
# is assigned back to the column (rebinding a loop variable changes nothing).
df['STATE_FIPS'] = pd.to_numeric(df['STATE_FIPS']).astype(int)

In [None]:
# Cast the STATE_FIPS column to int.
df['STATE_FIPS'] = df['STATE_FIPS'].astype(int)


In [None]:
# Cast the STATE_FIPS column to the generic object dtype.
# NOTE(review): the reason for this cast is not evident here; the column is
# cast to str again before it is zero-padded — confirm this step is needed.
df['STATE_FIPS'] = df['STATE_FIPS'].astype(object)

In [None]:
# Cast CZ_FIPS to str so it can be zero-padded and concatenated as text.
# NOTE(review): if CZ_FIPS is stored as float, values become text like "1.0"
# and will not pad correctly — confirm the column's dtype before this cast.
df['CZ_FIPS'] = df['CZ_FIPS'].astype(str)

In [None]:
# Left-pad CZ FIPS with zeros to three characters ("7" -> "007",
# "42" -> "042") so that it can be used to match later. Values that are
# already three or more characters long are left unchanged.
# A single vectorized zfill replaces the row-by-row loop, whose chained
# assignment (df[col][i] = ...) is slow and is not guaranteed to write back
# into df.
df['CZ_FIPS'] = df['CZ_FIPS'].astype(str).str.zfill(3)

In [None]:
# Cast STATE_FIPS to str so it can be zero-padded and concatenated as text.
df['STATE_FIPS'] = df['STATE_FIPS'].astype(str)

In [None]:
# Left-pad state FIPS with a zero to two characters ("6" -> "06") so that it
# can be used to match later. Values that are already two or more characters
# long are left unchanged.
# A single vectorized zfill replaces the row-by-row loop, whose chained
# assignment (df[col][i] = ...) is slow and is not guaranteed to write back
# into df.
df['STATE_FIPS'] = df['STATE_FIPS'].astype(str).str.zfill(2)

In [None]:
# Build ST_CT_FIPS by joining the state FIPS text and the county/zone FIPS
# text into one column that can be used to match.
state_part = df['STATE_FIPS'].astype(str)
county_part = df['CZ_FIPS'].astype(str)
df['ST_CT_FIPS'] = state_part + county_part

In [None]:
# Convert DAMAGE_PROPERTY from text such as "10.00K", "2.5M" or "1B" into a
# plain number by applying the multiplier named by the trailing letter.
# The text is parsed with a regular expression rather than eval(): eval
# executes whatever is in the data file, and it raises SyntaxError on entries
# such as a bare "M" or an empty string.
damage_multipliers = {"K": 1000, "M": 1000000, "B": 1000000000}

raw_damage = df["DAMAGE_PROPERTY"]
# Work on a text copy so numeric entries (for example a 0 used to fill
# nulls) are parsed too, instead of being discarded.
damage_text = raw_damage.astype(str).str.strip().str.upper()

# Column 0 is the numeric part, column 1 the optional K/M/B suffix.
# Text that does not fit this shape yields NaN in both columns.
damage_parts = damage_text.str.extract(r"^(\d*\.?\d*)([KMB]?)$")
amount = pd.to_numeric(damage_parts[0], errors="coerce")

# A suffix with no digits in front of it (e.g. "K") counts as one unit.
bare_suffix = damage_parts[0].eq("") & damage_parts[1].isin(list(damage_multipliers))
amount = amount.mask(bare_suffix, 1.0)

# No suffix means the amount is used as-is.
scale = damage_parts[1].map(damage_multipliers).fillna(1)

# Missing entries count as zero damage; text that cannot be parsed stays NaN
# so it can be inspected instead of silently becoming zero.
df["DAMAGE_PROPERTY"] = (amount * scale).mask(raw_damage.isna(), 0.0)

In [None]:
# Parse the begin and end timestamp columns into pandas datetimes.
for date_column in ('BEGIN_DATE_TIME', 'END_DATE_TIME'):
    df[date_column] = pd.to_datetime(df[date_column])

In [None]:
# Storm duration in minutes: the end timestamp minus the begin timestamp,
# converted from a timedelta to seconds and then to minutes.
# The conversion is done on the whole column at once; writing floats row by
# row into a timedelta column through chained indexing (df[col][i] = ...) is
# slow and is not guaranteed to update df.
storm_length = df['END_DATE_TIME'] - df['BEGIN_DATE_TIME']
df['DURATION'] = storm_length.dt.total_seconds() / 60

In [None]:
# Estimate the coverage "area" of each storm from how far its end point lies
# from its begin point.

# Coerce the coordinates to numbers first, so placeholder text such as 'NA'
# becomes NaN instead of breaking the subtraction.
begin_lat = pd.to_numeric(df['BEGIN_LAT'], errors='coerce')
end_lat = pd.to_numeric(df['END_LAT'], errors='coerce')
begin_lon = pd.to_numeric(df['BEGIN_LON'], errors='coerce')
end_lon = pd.to_numeric(df['END_LON'], errors='coerce')

# Absolute difference between beginning and end latitude.
lat_diff = (end_lat - begin_lat).abs()

# Absolute difference between beginning and end longitude.
# (This must use BEGIN_LON; subtracting END_LON from itself is always 0.)
lon_diff = (end_lon - begin_lon).abs()

# Combine the two differences into the total size of the storm.
df['STORM_AREA'] = lon_diff + lat_diff

# The END_LAT and END_LON columns are not needed any more, so drop them.
df.drop(['END_LON', 'END_LAT'], axis=1, inplace=True)

## Combine Population Density, Home Price data

In [None]:
# Load the data file in its current state.
# This rebinds df, replacing the DataFrame built in the cells above.
df = pd.read_parquet("../Data/all_storm_data7.pqt")

In [None]:
# Show the first rows of df.
df.head()

In [None]:
# Show the first 20 rows transposed, so each column of df is listed as a row.
df.head(20).T

In [None]:
# Store ST_CT_FIPS as text, left-padded with zeros to five characters, then
# display the column.
df['ST_CT_FIPS'] = df['ST_CT_FIPS'].astype(str).str.zfill(5)
df['ST_CT_FIPS']

In [None]:
# Show the first rows of df after the ST_CT_FIPS padding.
df.head()

In [None]:
# Show the first rows transposed, so each column of df is listed as a row.
df.head().T

In [None]:
# Show the ST_CT_FIPS values sorted in descending order.
df.ST_CT_FIPS.sort_values(ascending=False)

In [None]:
# Load the other data files: PopDen from a CSV file and HomePrice from an
# Excel workbook.
PopDen = pd.read_csv("../Data/Average_Household_Size_and_Population_Density_-_County_merge.csv")
HomePrice = pd.read_excel("../Data/HPI_AT_BDL_county_merge.xlsx")

In [None]:
# Keep only the PopDen rows that have a FIPS_CODE, then renumber the rows
# from zero without keeping the old labels.
PopDen = PopDen.dropna(subset=['FIPS_CODE']).reset_index(drop=True)

In [None]:
# Normalize the FIPS key columns of both lookup tables to five-character,
# zero-padded text.

# PopDen: go through int first to get rid of decimals, then convert to text
# and pad with leading zeros.
popden_fips = PopDen['FIPS_CODE'].astype(int).astype(str)
PopDen['FIPS_CODE'] = popden_fips.str.zfill(5)

# HomePrice: convert to text and pad with leading zeros.
homeprice_fips = HomePrice['FIPS code'].astype(str)
HomePrice['FIPS code'] = homeprice_fips.str.zfill(5)

In [None]:
# Show the first 20 rows of PopDen transposed, one row per column.
PopDen.head(20).T

In [None]:
# Show the first 20 rows of HomePrice transposed, one row per column.
HomePrice.head(20).T

In [None]:
# Merge the population density data into the main dataframe.
# A left merge keeps every row of df; rows are matched by comparing df's
# ST_CT_FIPS column with the PopDen['FIPS_CODE'] values passed as right_on.
# NOTE(review): the right side is a single Series and its key is supplied
# separately as an array rather than as a column name — confirm the merged
# frame has the expected columns and row count.
df = df.merge(PopDen['B01001_calc_PopDensity'], how = 'left',
                left_on = 'ST_CT_FIPS', right_on = PopDen['FIPS_CODE'])


In [None]:
# Attach the county population density and population to each storm row by
# matching the storm's ST_CT_FIPS to PopDen's FIPS_CODE.
# pd.concat stacks its inputs as additional rows, so it cannot perform this
# lookup; a left merge keeps every storm row and adds the matching values.
population_columns = [
    column
    for column in ('B01001_calc_PopDensity', 'Population')
    if column not in df.columns  # skip anything that is already in df
]

if population_columns:
    # Keep one PopDen row per FIPS code so the merge cannot multiply storm rows.
    county_population = PopDen[['FIPS_CODE'] + population_columns].drop_duplicates(
        subset='FIPS_CODE'
    )
    df = df.merge(county_population, how='left',
                  left_on='ST_CT_FIPS', right_on='FIPS_CODE')
    # The right-hand key duplicates ST_CT_FIPS, so it is removed again.
    df = df.drop(columns='FIPS_CODE')


In [None]:
# Show the first 30 rows of df transposed, one row per column.
df.head(30).T

In [None]:
# Count how often each distinct Population value occurs in df.
df['Population'].value_counts()

In [None]:
# Left-join the home price index table onto the main dataframe, matching on
# county FIPS code and year, then discard the HomePrice columns listed below.
columns_to_discard = [
    'HPI with 2000 base',
    'HPI with 1990 base',
    'Annual Change (%)',
    'Year',
    'FIPS code',
    'County',
    'State',
]
df = df.merge(
    HomePrice,
    how='left',
    left_on=['ST_CT_FIPS', 'YEAR'],
    right_on=['FIPS code', 'Year'],
)
df = df.drop(columns=columns_to_discard)


In [None]:
# Show the first 30 rows of df transposed after the home price merge.
df.head(30).T

In [None]:
# Show the (rows, columns) dimensions of df.
df.shape

In [None]:
# Count the missing values in each column of df.
df.isna().sum()

In [None]:
# Print a summary of df via DataFrame.info().
df.info()

In [None]:
# Turn every HPI value into text and blank out entries that consist of
# nothing but punctuation (such as a lone "."), so that the column can
# ultimately be turned into a number.
# Only all-punctuation entries are blanked: deleting punctuation everywhere
# would also remove the decimal point inside real values ("123.45" would
# become "12345") and change their magnitude.
new_list = []
for raw_hpi in df['HPI']:
    hpi_text = str(raw_hpi)
    # strip() removes leading/trailing characters only, so the result is
    # empty exactly when the text holds no non-punctuation characters.
    is_placeholder = hpi_text.strip(string.punctuation) == ''
    new_list.append('' if is_placeholder else hpi_text)

In [None]:
# Store the cleaned HPI text in a new newHPI column, writing the literal
# text 'NaN' wherever the cleaned value is 'nan' or empty.
# The values are wrapped in a Series that carries df's own index so they are
# assigned row by row in order; a bare pd.DataFrame(new_list) gets a fresh
# 0..n-1 index and would be aligned by label, producing nulls wherever df's
# row labels differ.
hpi_text = pd.Series(new_list, index=df.index)
df['newHPI'] = hpi_text.replace(['nan', ''], 'NaN')

## Resources

CZ FIPS documentation: https://www.irsa.miami.edu/_assets/pdf/Documents/fips_statecounty_code.pdf

Population density: https://covid19.census.gov/datasets/21843f238cbb46b08615fc53e19e0daf_1/explore?location=2.632620%2C0.315550%2C1.00

Home price index: https://www.fhfa.gov/DataTools/Downloads/Pages/House-Price-Index-Datasets.aspx

maybe useful: https://www.nar.realtor/research-and-statistics/housing-statistics/county-median-home-prices-and-monthly-mortgage-payment
        
land values: https://www.nass.usda.gov/Publications/Todays_Reports/reports/land0822.pdf

data that I created:
 - concatenated state and county codes for identification
 - storm duration
 - storm area
 - county population density (pulled from other dataset)
 - land values (pulled from other dataset)