# Severe Weather Capstone - Data Collection & Wrangling

Greg Welliver   

In [1]:
# Import relevant libraries and packages.
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import statsmodels.api as sm
from statsmodels.graphics.api import abline_plot
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn import linear_model, preprocessing 
import warnings
from scipy import stats
import re
from glob import glob, iglob
from datetime import datetime



file location for downloads: 
    https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/

## Data Collection

- storm files were collected from the Iowa Environmental Mesonet: https://mesonet.agron.iastate.edu/nws/

### working code, make markdown for now
### All annual storm data files are saved on my local machine.  This code gathers all of the files and combines them into one file.
filenames = glob('../Data/*.csv')
print("There is a total of {} files.".format(len(filenames)))

target_path = '../Data/all_storm_data.csv'

try:
    # Read in Summary File is exists
    all_storm_data = pd.read_csv(target_path)
except:
    # Read in all Subfiles
    storm_data = [pd.read_csv(filepath) for filepath in filenames]
    all_storm_data = pd.concat(storm_data)
    
    # Create Summary File for faster processing
    hot100_all.to_csv(target_path,index=False)

print("The total number of observations is {}.".format(len(all_storm_data)))
all_storm_data.head()

In [2]:
# load data
#df = pd.read_csv("../Data/StormEvents_details-ftp_v1.0_d2001_c20220425.csv")
#df = pd.read_parquet("../Data/all_storm_data.pqt")
#df = pd.read_csv("../Data/all_storm_data4.csv")

In [3]:
#df.head()

In [4]:
#df.shape

In [5]:
#df.isna().sum()

In [6]:
#df.info()

for row in df["STATE_FIPS"][:10]:
    res = row.split(".", 1)[0]
    print(res)

# WORKING, MARKDOWN UNTIL FINAL
# drop unnecessary columns
df.drop(['CATEGORY', 'DATA_SOURCE', 'BEGIN_RANGE', 'BEGIN_AZIMUTH', 'BEGIN_LOCATION', 'END_RANGE', 'END_AZIMUTH', 'END_LOCATION', 'TOR_OTHER_WFO', 'TOR_OTHER_CZ_STATE', 'TOR_OTHER_CZ_FIPS', 'TOR_OTHER_CZ_NAME', 'CZ_TIMEZONE', 'WFO', 'CZ_TYPE', 'DAMAGE_CROPS'
], axis=1, inplace=True)

# WORKING, MARKDOWN UNTIL FINAL
df['STATE_FIPS'] = df['STATE_FIPS'].astype(object)

# WORKING, MARKDOWN UNTIL FINAL
# Columns to replace nulls with NA:
cols_na = ['EVENT_NARRATIVE', 'EPISODE_NARRATIVE', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON', 'TOR_F_SCALE', 'MAGNITUDE_TYPE', 'FLOOD_CAUSE', 'STATE', 'STATE_FIPS']

for x in cols_na:
    print(df[x].isna().sum())

# WORKING, MARKDOWN UNTIL FINAL
for x in cols_na:
    df[x] = df[x].fillna('NA')

for x in cols_na:
    print(df[x].isna().sum())

# WORKING, MARKDOWN UNTIL FINAL
# Columns to replace nulls with 0:

cols_0 = ['MAGNITUDE', 'TOR_LENGTH', 'TOR_WIDTH', 'DAMAGE_PROPERTY', 'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT', 'DEATHS_INDIRECT']


for x in cols_0:
    print(df[x].isna().sum())

# WORKING, MARKDOWN UNTIL FINAL
for x in cols_0:
    df[x] = df[x].fillna(0)

for x in cols_0:
    print(df[x].isna().sum())

In [7]:
#df.isna().sum()

In [8]:
#df.dtypes

# WORKING, MARKDOWN UNTIL FINAL
# remove NA values from state FIPS
df = df[df['STATE_FIPS'] != "NA"].reset_index()

# convert STATE_FIPS to INT so can use it for lookup later
for x in df['STATE_FIPS']:
    x = int(x)

df['STATE_FIPS'] = df['STATE_FIPS'].astype(int)


df['STATE_FIPS'] = df['STATE_FIPS'].astype(object)

df['STATE_FIPS']

df['CZ_FIPS'] = df['CZ_FIPS'].astype(str)

df['CZ_FIPS']

df['CZ_FIPS'][:5]

In [9]:
#df[249750:249760]

df['CZ_FIPS'][249750:249760]

# WORKING, MARKDOWN UNTIL FINAL
# add "0" or "00" to CZ FIPS so that it can be used to match later
for i in (range(len(df['CZ_FIPS']))):
    if len(df['CZ_FIPS'][i]) == 2:
#        df['CZ_FIPS'][i] = df['CZ_FIPS'][i].astype(str)
        df['CZ_FIPS'][i] = "0" + df['CZ_FIPS'][i]
#        print(df['CZ_FIPS'][i])
    elif len(df['CZ_FIPS'][i]) == 1:
        df['CZ_FIPS'][i] = "00" + df['CZ_FIPS'][i]
#        print(df['CZ_FIPS'][i])
#     else:
#         row

df['STATE_FIPS'] = df['STATE_FIPS'].astype(str)

# WORKING, MARKDOWN UNTIL FINAL
# add "0" to state FIPS so that it can be used to match later
for i in (range(len(df['STATE_FIPS']))):
    if len(df['STATE_FIPS'][i]) == 1:
#        df['CZ_FIPS'][i] = df['CZ_FIPS'][i].astype(str)
        df['STATE_FIPS'][i] = "0" + df['STATE_FIPS'][i]
#        print(df['CZ_FIPS'][i])

In [10]:
#df["CZ_FIPS"] = df.apply(lambda x: "0" + x if len(x) == 2)

# WORKING, MARKDOWN UNTIL FINAL
# concatenate STATE FIPS and CZ FIPS into one column so that it can be used to match
df['ST_CT_FIPS'] = df['STATE_FIPS'].astype(str) + df['CZ_FIPS'].astype(str)

# WORKING, MARKDOWN UNTIL FINAL
# remove all of the K's, M's, and B's in the DAMAGE_PROPERTY column and multiply them by appropriate values
d = {r"(\d)K$": r"\1*1000", r"M$": r"*1000000", r"B$": r"*1000000000", r"^K$": r"1000"}

#r stands for raw string
#dollar is end of the line

# for every key and value, run this code
for k,v in d.items():
     df["DAMAGE_PROPERTY"] = df["DAMAGE_PROPERTY"].str.replace(k, v, regex=True).fillna("0.0")
#df["DAMAGE_PROPERTY"].apply(eval)
df["DAMAGE_PROPERTY"] = df["DAMAGE_PROPERTY"].apply(eval)

# WORKING, MARKDOWN UNTIL FINAL
# convert date strings to datetimes
df['BEGIN_DATE_TIME'] =  pd.to_datetime(df['BEGIN_DATE_TIME'])
df['END_DATE_TIME'] =  pd.to_datetime(df['END_DATE_TIME'])

# WORKING, MARKDOWN UNTIL FINAL
# calculate duration of storm
df['DURATION'] = df['END_DATE_TIME'] - df['BEGIN_DATE_TIME']

# convert storm duration to minutes
for i in (range(len(df['DURATION']))):
    df['DURATION'][i] = df['DURATION'][i].total_seconds() / 60

## Part 2 start here

In [11]:
#load the data
# df = pd.read_csv("../Data/all_storm_data4.csv", index_col=[0])
# df.drop(['index'], axis=1, inplace=True)
df = pd.read_parquet("../Data/all_storm_data6.pqt")

In [12]:
df.head()

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,EPISODE_ID,EVENT_ID,STATE,STATE_FIPS,...,TOR_LENGTH,TOR_WIDTH,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EPISODE_NARRATIVE,EVENT_NARRATIVE,ST_CT_FIPS,DURATION
0,202202,20,2118,202202,20,2218,165464,999902,NEVADA,32,...,0.0,0.0,,,,,Strong winds increased ahead of an approaching...,"Station (UP994) 3.1 SE West Wendover, Elevatio...",32033,60.0
1,202202,21,800,202202,22,1000,165465,999903,NEVADA,32,...,0.0,0.0,,,,,A low centered over northern and central Nevad...,Thirteen inches fell at station (BCSN2) Big Cr...,32037,1560.0
2,202202,22,200,202202,22,900,165465,999904,NEVADA,32,...,0.0,0.0,,,,,A low centered over northern and central Nevad...,Fifteen inches fell at station (TJMN2) Toe Jam...,32031,420.0
3,202202,18,1609,202202,18,1609,165611,1001181,ATLANTIC SOUTH,87,...,0.0,0.0,30.05,-81.17,30.05,-81.17,Pre-frontal showers and thunderstorms moved so...,A brief waterspout was observed offshore of So...,87452,0.0
4,202202,2,0,202202,3,0,165668,1001527,AMERICAN SAMOA,97,...,0.0,0.0,-14.333,-170.7157,-14.3393,-170.7268,A surface trough over the Islands held the po...,"Over a 24-hour period, WSO Pago Pago recorded ...",97002,1440.0


In [32]:
# WORKING, MARKDOWN UNTIL FINAL
# drop unnecessary columns
df.drop(['CZ_NAME', 'SOURCE'], axis=1, inplace=True)

KeyError: "['CZ_NAME', 'SOURCE'] not found in axis"

In [None]:
df.head().T

In [None]:
df['BEGIN_LAT'] = df['BEGIN_LAT'].replace("NA", "")

In [15]:
df['BEGIN_LAT'] = df['BEGIN_LAT'].astype(float)
# df['BEGIN_LON'] = df['BEGIN_LON'].astype(float)
# df['END_LAT'] = df['END_LAT'].astype(float)
# df['END_LON'] = df['END_LON'].astype(float)

#### write to CSV
from pathlib import Path  
filepath = Path('/Users/gregwelliver/Desktop/springboard_files/Severe-Weather-Repo/Data/all_storm_data5.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df.to_csv(filepath)

#### write to parquet
parquet_file = 'example_pd.parquet'

df.to_parquet(parquet_file, engine = 'pyarrow', compression = 'gzip')

logging.info('Parquet file named "%s" has been written to disk', parquet_file)

#### write to parquet
from pathlib import Path  
filepath = Path('/Users/gregwelliver/Desktop/springboard_files/Severe-Weather-Repo/Data/all_storm_data6.pqt')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df.to_parquet(filepath)

resources

CZ FIPS documentation: https://www.irsa.miami.edu/_assets/pdf/Documents/fips_statecounty_code.pdf

Population density: https://covid19.census.gov/datasets/21843f238cbb46b08615fc53e19e0daf_1/explore?location=2.632620%2C0.315550%2C1.00

Home price index: https://www.fhfa.gov/DataTools/Downloads/Pages/House-Price-Index-Datasets.aspx

maybe useful: https://www.nar.realtor/research-and-statistics/housing-statistics/county-median-home-prices-and-monthly-mortgage-payment
        
land values: https://www.nass.usda.gov/Publications/Todays_Reports/reports/land0822.pdf

data that I created:
 - concatenated state and county codes for indentification
 - storm duration
 - storm distance (haven't done it yet)
 - county population density (haven't done it yet; pulled from other dataset)
 - land values ((haven't done it yet; pulled from other dataset)