### Data Wrangling: Storms and Severe Weather Events

In [21]:
import os

import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 100)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 100)

from sklearn import preprocessing, neighbors, model_selection

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [146]:
# Load the raw storm-event records from the gzip-compressed CSV.
# low_memory=False has pandas parse the file in one pass rather than in
# chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv(
    'dataset-06-storms.csv.gz',
    low_memory=False,
)

In [147]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,STATE__,BGN_DATE,BGN_TIME,TIME_ZONE,COUNTY,COUNTYNAME,STATE,EVTYPE,BGN_RANGE,BGN_AZI,BGN_LOCATI,END_DATE,END_TIME,COUNTY_END,COUNTYENDN,END_RANGE,END_AZI,END_LOCATI,LENGTH,WIDTH,F,MAG,FATALITIES,INJURIES,PROPDMG,PROPDMGEXP,CROPDMG,CROPDMGEXP,WFO,STATEOFFIC,ZONENAMES,LATITUDE,LONGITUDE,LATITUDE_E,LONGITUDE_,REMARKS,REFNUM
0,1,4/18/1950 0:00:00,130,CST,97,MOBILE,AL,TORNADO,0,,,,,0,,0,,,14.0,100,3,0,0,15,25.0,K,0,,,,,3040,8812,3051,8806,,1
1,1,4/18/1950 0:00:00,145,CST,3,BALDWIN,AL,TORNADO,0,,,,,0,,0,,,2.0,150,2,0,0,0,2.5,K,0,,,,,3042,8755,0,0,,2
2,1,2/20/1951 0:00:00,1600,CST,57,FAYETTE,AL,TORNADO,0,,,,,0,,0,,,0.1,123,2,0,0,2,25.0,K,0,,,,,3340,8742,0,0,,3
3,1,6/8/1951 0:00:00,900,CST,89,MADISON,AL,TORNADO,0,,,,,0,,0,,,0.0,100,2,0,0,2,2.5,K,0,,,,,3458,8626,0,0,,4
4,1,11/15/1951 0:00:00,1500,CST,43,CULLMAN,AL,TORNADO,0,,,,,0,,0,,,0.0,150,2,0,0,2,2.5,K,0,,,,,3412,8642,0,0,,5


In [148]:
# (number of rows, number of columns) of the raw frame.
df.shape

(902297, 37)

In [149]:
# How many COUNTYENDN values are missing?
df['COUNTYENDN'].isna().sum()

902297

In [150]:
# Missing-value count for every column, from the most complete column to the least.
df.isna().sum().sort_values(ascending=True)

STATE__            0
LONGITUDE_         0
LONGITUDE          0
CROPDMG            0
PROPDMG            0
INJURIES           0
FATALITIES         0
MAG                0
WIDTH              0
END_RANGE          0
COUNTY_END         0
LENGTH             0
COUNTY             0
BGN_DATE           0
BGN_TIME           0
TIME_ZONE          0
STATE              0
EVTYPE             0
BGN_RANGE          0
REFNUM             0
LATITUDE_E        40
LATITUDE          47
COUNTYNAME      1589
WFO           142069
END_TIME      238978
END_DATE      243411
STATEOFFIC    248769
REMARKS       287433
BGN_LOCATI    287743
PROPDMGEXP    465934
END_LOCATI    499225
BGN_AZI       547332
ZONENAMES     594029
CROPDMGEXP    618413
END_AZI       724837
F             843563
COUNTYENDN    902297
dtype: int64

In [151]:
# Dropping COUNTYENDN because every value is null.
# Dropping REMARKS because natural language processing has not been covered yet.
# Dropping COUNTY_END because it is 0 in every row.
#df = df.drop(['COUNTYENDN','REMARKS','COUNTY_END'], axis=1)
# Dropping the very specific location columns; they are not helpful for identifying patterns.
#df = df.drop(['LATITUDE','LONGITUDE','LATITUDE_E','LONGITUDE_'], axis=1)

In [152]:
# Keep only the event type plus the casualty and damage columns.
# .copy() makes df1 an independent frame, so later column assignments on it
# do not raise SettingWithCopyWarning.
df1 = df[
    ['EVTYPE', 'FATALITIES', 'INJURIES', 'PROPDMG', 'CROPDMG', 'PROPDMGEXP', 'CROPDMGEXP']
].copy()

In [153]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns of df1.
df1.describe()

Unnamed: 0,FATALITIES,INJURIES,PROPDMG,CROPDMG
count,902297.0,902297.0,902297.0,902297.0
mean,0.016785,0.155745,12.063101,1.527022
std,0.765283,5.431887,59.475851,22.17368
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.5,0.0
max,583.0,1700.0,5000.0,990.0


In [159]:
# List the distinct exponent codes attached to the property-damage figures.
df1['PROPDMGEXP'].unique()

array(['K', 'M', nan, 'B', 'm', '+', '0', '5', '6', '?', '4', '2', '3',
       'h', '7', 'H', '-', '1', '8'], dtype=object)

In [160]:
# Express PROPDMG in thousands of dollars using its exponent code
# (K = thousands -> x1, M = millions -> x1,000, B = billions -> x1,000,000).
# The code is upper-cased before the lookup so that 'm' is handled like 'M';
# a dict key written as `'M' or 'm'` collapses to just 'M' and would miss it.
# The value is recomputed from the raw `df` columns (same row index as df1),
# so re-running this cell does not scale the column a second time.
# Codes other than K/M/B (digits, '+', '-', '?', 'H', missing) map to NaN.
df1['PROPDMG'] = df['PROPDMG'] * df['PROPDMGEXP'].str.upper().map(
    {'K': 1., 'M': 1000., 'B': 1000000.}
)

In [None]:
# NOTE(review): unfinished placeholder cell (its execution count is None).
# As written, `{exponent_to_multiplier}` is a set literal wrapping the dict, which
# raises TypeError because dicts are unhashable; `.map(...)` needs the dict itself.
# `exponent_to_multiplier` is also only assigned in a later cell, so on a fresh
# kernel this line fails with NameError. PROPDMG has already been scaled by the
# previous cell, so enabling this one as-is would scale it a second time.
df1.PROPDMG *= df1.PROPDMGEXP.map({exponent_to_multiplier})

In [162]:
# List the distinct exponent codes attached to the crop-damage figures.
df1['CROPDMGEXP'].unique()

array([nan, 'M', 'K', 'm', 'B', '?', '0', 'k', '2'], dtype=object)

In [163]:
# Express CROPDMG in thousands of dollars using its exponent code
# (K = thousands -> x1, M = millions -> x1,000, B = billions -> x1,000,000).
# The code is upper-cased before the lookup so that 'k' and 'm' are handled like
# 'K' and 'M'; a dict key written as `'K' or 'k'` collapses to just 'K'.
# The value is recomputed from the raw `df` columns (same row index as df1),
# so re-running this cell does not scale the column a second time.
# Codes other than K/M/B (digits, '?', missing) map to NaN.
df1['CROPDMG'] = df['CROPDMG'] * df['CROPDMGEXP'].str.upper().map(
    {'K': 1., 'M': 1000., 'B': 1000000.}
)

In [164]:
# Show a bounded preview instead of rendering the whole (~900k-row) frame.
df1.head(10)

Unnamed: 0,EVTYPE,FATALITIES,INJURIES,PROPDMG,CROPDMG,PROPDMGEXP,CROPDMGEXP
0,TORNADO,0,15,25.0000,,K,
1,TORNADO,0,0,2.5000,,K,
2,TORNADO,0,2,25.0000,,K,
3,TORNADO,0,2,2.5000,,K,
4,TORNADO,0,2,2.5000,,K,
5,TORNADO,0,6,2.5000,,K,
6,TORNADO,0,1,2.5000,,K,
7,TORNADO,0,0,2.5000,,K,
8,TORNADO,1,14,25.0000,,K,
9,TORNADO,0,0,25.0000,,K,


In [None]:
# Multiplier for each damage-exponent code, expressed in thousands of dollars
# (so 'K' -> 1, 'M' -> 1,000, 'B' -> 1,000,000). Upper- and lower-case spellings
# are both included because the raw columns contain 'm' and 'k' as well as 'M' and 'K'.
# TODO: decide how to treat the remaining codes seen in the data
# (digits, '+', '-', '?', 'h'/'H'); they are deliberately left unmapped here.
exponent_to_multiplier = {
    code: multiplier
    for letter, multiplier in (('K', 1.), ('M', 1000.), ('B', 1000000.))
    for code in (letter, letter.lower())
}

## Drop all rows that have no fatalities, injuries, property damage, or crop damage

In [184]:
# Keep only the storms with some recorded impact: at least one fatality or
# injury, or a positive property/crop damage figure. A NaN damage value counts
# as "no damage" for that column because NaN > 0 is False.
df2 = df1[
    df1[['FATALITIES', 'INJURIES', 'PROPDMG', 'CROPDMG']].gt(0).any(axis=1)
]

In [185]:
# Order the impactful events by fatality count, lowest first.
df2.sort_values(by='FATALITIES', ascending=True)

Unnamed: 0,EVTYPE,FATALITIES,INJURIES,PROPDMG,CROPDMG,PROPDMGEXP,CROPDMGEXP
0,TORNADO,0,15,25.000000,,K,
629408,FLASH FLOOD,0,0,25.000000,0.0000,K,K
629409,FLASH FLOOD,0,0,25.000000,0.0000,K,K
629410,THUNDERSTORM WIND,0,0,10.000000,0.0000,K,K
629411,THUNDERSTORM WIND,0,0,1.000000,0.0000,K,K
629412,THUNDERSTORM WIND,0,0,1.000000,0.0000,K,K
629413,THUNDERSTORM WIND,0,0,1.000000,0.0000,K,K
629414,THUNDERSTORM WIND,0,0,1.000000,0.0000,K,K
629415,THUNDERSTORM WIND,0,0,25.000000,0.0000,K,K
629416,FLASH FLOOD,0,0,10.000000,0.0000,K,K


## Deal with EVTYPE

In [188]:
# List every distinct event-type label as recorded in the raw data.
df1['EVTYPE'].unique()

array(['TORNADO', 'TSTM WIND', 'HAIL', 'ICE STORM/FLASH FLOOD',
       'WINTER STORM', 'HURRICANE OPAL/HIGH WINDS', 'THUNDERSTORM WINDS',
       'HURRICANE ERIN', 'HURRICANE OPAL', 'HEAVY RAIN', 'LIGHTNING',
       'THUNDERSTORM WIND', 'DENSE FOG', 'RIP CURRENT',
       'THUNDERSTORM WINS', 'FLASH FLOODING', 'FLASH FLOOD', 'TORNADO F0',
       'THUNDERSTORM WINDS LIGHTNING', 'THUNDERSTORM WINDS/HAIL', 'HEAT',
       'HIGH WINDS', 'WIND', 'HEAVY RAINS', 'LIGHTNING AND HEAVY RAIN',
       'THUNDERSTORM WINDS HAIL', 'COLD', 'HEAVY RAIN/LIGHTNING',
       'FLASH FLOODING/THUNDERSTORM WI', 'FLOODING', 'WATERSPOUT',
       'EXTREME COLD', 'LIGHTNING/HEAVY RAIN', 'HIGH WIND', 'FREEZE',
       'RIVER FLOOD', 'HIGH WINDS HEAVY RAINS', 'AVALANCHE',
       'MARINE MISHAP', 'HIGH WIND/SEAS', 'HIGH WINDS/HEAVY RAIN',
       'HIGH SEAS', 'COASTAL FLOOD', 'SEVERE TURBULENCE',
       'RECORD RAINFALL', 'HEAVY SNOW', 'HEAVY SNOW/WIND', 'DUST STORM',
       'FLOOD', 'APACHE COUNTY', 'SLEET', 'DUST DEVIL

In [189]:
# Number of distinct event-type labels; dropna=False counts a missing label
# as its own value, exactly as len(unique()) would.
df1['EVTYPE'].nunique(dropna=False)

483

(254319, 7)