In [1]:
# Package import cell

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
import scipy
import scipy.spatial
from scipy.spatial import distance

In [2]:
# Read the merged tornado + income + density table into `data`.
# NOTE(review): the path below is absolute and machine-specific; it must be
# repointed before the notebook can be re-run on another machine.

MERGED_TORNADOES_CSV = '/users/Orion/NYU/tornado_v2/Merged-Tornadoes.csv'

# Lift the column display limit so wide frames are shown in full
pd.set_option('display.max_columns', None)
data = pd.read_csv(MERGED_TORNADOES_CSV)

In [3]:
# Discard the columns that are not used later in this notebook, then
# index the rows by EVENT_ID

unused_columns = [
    'Unnamed: 0', 'EPISODE_ID', 'EVENT_TYPE', 'WFO', 'SOURCE', 'MAGNITUDE',
    'MAGNITUDE_TYPE', 'FLOOD_CAUSE', 'CATEGORY', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS',
    'TOR_OTHER_WFO', 'TOR_OTHER_CZ_STATE', 'TOR_OTHER_CZ_FIPS', 'TOR_OTHER_CZ_NAME',
    'BEGIN_AZIMUTH', 'END_AZIMUTH', 'BEGIN_LOCATION', 'END_LOCATION',
    'EPISODE_NARRATIVE', 'DATA_SOURCE', 'BEGIN_YEARMONTH', 'END_YEARMONTH',
    'MONTH_NAME', 'CZ_TIMEZONE', 'YEAR', 'STATE_FIPS', 'CZ_TYPE', 'CZ_FIPS',
    'State FIPS Code', 'Name', 'Geographic area', 'Geographic area.1',
    'Population', 'Housing units', 'STATE', 'CZ_NAME', 'END_TIME',
]
data = data.drop(columns=unused_columns).set_index('EVENT_ID')

In [4]:
# Give the long density/area/income/time headers short snake_case names

column_aliases = {
    'Density per square mile of land area - Population': 'population_density',
    'Density per square mile of land area - Housing units': 'housing_units_density',
    'Area in square miles - Total area': 'total_area',
    'Area in square miles - Land area': 'land_area',
    'BEGIN_TIME': 'begin_time',
    'Median Household Income': 'median_income',
}
data = data.rename(columns=column_aliases)

In [5]:
# Parse the begin and end timestamps and derive each tornado's duration
# in minutes; the end timestamp is dropped once the duration is known

data['BEGIN_DATE_TIME'] = pd.to_datetime(data['BEGIN_DATE_TIME'])
data['END_DATE_TIME'] = pd.to_datetime(data['END_DATE_TIME'])
# total_seconds() covers the whole timedelta. The `.dt.seconds` attribute is
# only the within-day component (0..86399), so it silently drops whole days
# and turns a negative span (end before begin) into a large positive one.
elapsed = data['END_DATE_TIME'] - data['BEGIN_DATE_TIME']
data['duration'] = elapsed.dt.total_seconds()/60
data = data.drop(columns=['END_DATE_TIME'])

In [6]:
# Convert 'BEGIN_DATE_TIME' to datetime; define
# function to convert to integer 1 through 365 and
# convert using this function; for each day 
# determine if weekend or weekday

def date_to_nth_day(date, format='%Y%m%d'):
    """Return the 1-based ordinal day of the year for ``date``.

    1 January maps to 1; any time-of-day component is ignored because only
    whole elapsed days are counted.

    Args:
        date: a datetime-like value exposing ``.year`` and supporting
            subtraction of a ``datetime.datetime``.
        format: accepted for call compatibility; not used by the body.

    Returns:
        int: day of the year, starting at 1.
    """
    jan_first = datetime.datetime(date.year, 1, 1)
    elapsed = date - jan_first
    return elapsed.days + 1

begin_timestamps = data['BEGIN_DATE_TIME']
data['day_of_year'] = begin_timestamps.map(date_to_nth_day)
data['day_of_week'] = begin_timestamps.dt.weekday
# Weekday codes 0-4 map to 0 and codes 5-6 map to 1 (weekend flag)
weekend_by_weekday = {weekday: int(weekday >= 5) for weekday in range(7)}
data['weekend'] = data['day_of_week'].map(weekend_by_weekday)

In [7]:
# Convert time features cyclically

def convert_to_mins(time_in_24_hours):
    """Convert an HHMM-style clock value to minutes since midnight.

    The value is rendered with ``str()`` and split by length: the last two
    characters are the minutes and whatever precedes them is the hour.
    Renderings shorter than three characters are taken as minutes only
    (e.g. ``5`` -> 5, ``930`` -> 570, ``1530`` -> 930).

    Args:
        time_in_24_hours: int or str clock value such as ``1530``. Floats
            holding a whole number (e.g. ``1530.0``, as produced when an
            integer column is upcast to float) are accepted as well.

    Returns:
        int: minutes elapsed since midnight.

    Raises:
        AssertionError: if the rendered value is longer than four characters.
        ValueError: if the characters are not parseable as integers.
    """
    # A float-upcast column renders as '1530.0', which would be rejected on
    # length alone; fold whole-number floats back to int first.
    if isinstance(time_in_24_hours, float) and time_in_24_hours.is_integer():
        time_in_24_hours = int(time_in_24_hours)
    stringtime = str(time_in_24_hours)

    if len(stringtime) > 4:
        # Raised explicitly instead of `assert False`: asserts are stripped
        # under `python -O`, which previously let execution fall through to
        # an unbound local. The exception type is unchanged for callers.
        raise AssertionError('Bad Data: {!r}'.format(time_in_24_hours))
    if len(stringtime) < 3:
        return int(stringtime)
    # 3 or 4 characters: split off the trailing two as minutes
    return int(stringtime[:-2])*60 + int(stringtime[-2:])

# Begin time becomes minutes since midnight before it is encoded
data['begin_time'] = data['begin_time'].apply(convert_to_mins)

minutes_in_a_day = 24*60
days_in_a_year = 365

# Map each periodic quantity onto an angle so that its sine/cosine pair
# places the end of a cycle next to its start
time_angle = 2*np.pi*data['begin_time']/minutes_in_a_day
date_angle = 2*np.pi*data['day_of_year']/days_in_a_year

data['sin_time'] = np.sin(time_angle)
data['cos_time'] = np.cos(time_angle)
data['sin_date'] = np.sin(date_angle)
data['cos_date'] = np.cos(date_angle)

# The raw time/date columns are no longer needed once encoded
encoded_source_columns = ['begin_time', 'day_of_year', 'BEGIN_DAY',
                          'END_DAY', 'day_of_week', 'BEGIN_DATE_TIME']
data = data.drop(columns=encoded_source_columns)

In [8]:
# Make the four injury/death counts numeric and add them up into a single
# 'casualties' column; the component columns are dropped afterwards

casualty_columns = ["INJURIES_INDIRECT", "INJURIES_DIRECT",
                    "DEATHS_DIRECT", "DEATHS_INDIRECT"]
for column in casualty_columns:
    data[column] = pd.to_numeric(data[column])

# Accumulate with plain `+` so a missing component still yields NaN
running_total = data[casualty_columns[0]]
for column in casualty_columns[1:]:
    running_total = running_total + data[column]
data["casualties"] = running_total

data = data.drop(columns=casualty_columns)

In [9]:
# Convert EF_Scale to binary (1 for 2+, 0 for 0,1)

def makeEFBinary(ef):
    """Collapse a numeric EF rating into a binary intensity flag.

    Args:
        ef: numeric rating.

    Returns:
        int: 0 when ``ef`` is 1 or below, 1 when it is above 1 and at most 5.

    Raises:
        AssertionError: when ``ef`` satisfies neither bound (above 5, or a
            value such as NaN for which both comparisons are false).
    """
    if ef <= 1:
        return 0
    if ef <= 5:
        return 1
    # Raised explicitly instead of `assert False`: asserts are stripped under
    # `python -O`, which made this branch silently return None. The exception
    # type is unchanged for callers.
    raise AssertionError('EF rating out of range: {!r}'.format(ef))

# Rows rated 'EFU' are excluded. The explicit .copy() makes the filtered
# frame independent of the original, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
data = data[data['TOR_F_SCALE'] != 'EFU'].copy()
# lstrip('EF') removes any leading 'E'/'F' characters, leaving the digits
data['TOR_F_SCALE'] = data['TOR_F_SCALE'].map(lambda x: int(x.lstrip('EF')))
data["tornado_intensity"] = data["TOR_F_SCALE"].map(makeEFBinary)
data = data.drop(columns='TOR_F_SCALE')

In [10]:
# Calculate tornado area as path length x path width

# Keep fractional path lengths: the previous per-value int() cast truncated
# toward zero (e.g. 0.4 -> 0), which zeroed the area of every path shorter
# than one length unit. pd.to_numeric still parses numeric strings.
# NOTE(review): length and width may be recorded in different units in the
# upstream storm-events schema — confirm before interpreting the product.
path_length = pd.to_numeric(data['TOR_LENGTH'])
data['tornado_area'] = path_length*data['TOR_WIDTH']
data = data.drop(columns=['TOR_LENGTH', 'TOR_WIDTH'])

In [11]:
# Row-wise mean and minimum of 'BEGIN_RANGE' and 'END_RANGE', used as a
# rough proxy for distance from a population center

range_pair = data[['BEGIN_RANGE', 'END_RANGE']]
data['average_range'] = range_pair.mean(axis=1)
data['minimum_range'] = range_pair.min(axis=1)
data = data.drop(columns=list(range_pair.columns))

In [12]:
# Midpoint of the begin/end coordinates, one column per axis

data['average_latitude'] = data['BEGIN_LAT'].add(data['END_LAT']).div(2)
data["average_longitude"] = data['BEGIN_LON'].add(data['END_LON']).div(2)

endpoint_columns = ['BEGIN_LAT', 'END_LAT', 'BEGIN_LON', 'END_LON']
data = data.drop(columns=endpoint_columns)

In [13]:
# Land area as a fraction of total area (a ratio, not scaled to 0-100)

land_fraction = data['land_area'].div(data['total_area'])
data = data.assign(percent_land=land_fraction).drop(columns=['land_area', 'total_area'])

In [14]:
# Flag events whose EVENT_NARRATIVE mentions a multi-vortex tornado

# One vectorised, case-sensitive substring search for the three phrasings
# 'multi-vortex', 'multiple vortex' and 'multiple vortices', replacing three
# row-wise apply() passes. na=False treats a missing narrative as "no
# mention" instead of raising TypeError on the `in` test.
multi_vortex_pattern = 'multi-vortex|multiple vortex|multiple vortices'
mentions_multi_vortex = data['EVENT_NARRATIVE'].str.contains(
    multi_vortex_pattern, regex=True, na=False)
data['multi_vortex'] = mentions_multi_vortex.astype(int)
data = data.drop(columns=['EVENT_NARRATIVE'])

In [15]:
# Cast median income, population density, housing units density and
# percent land to float, then replace NaNs with the column mean.
# (No rescaling is applied here; values stay in their original units.)

# Median income arrives as text with thousands separators
income_values = data['median_income'].str.replace(",", "").astype(float)
mean_Median_Household_Income = income_values.mean(skipna = True)
data['median_income'] = income_values.fillna(mean_Median_Household_Income)

population_values = data['population_density'].astype(float)
mean_Pop_Density = population_values.mean(skipna = True)
data['population_density'] = population_values.fillna(mean_Pop_Density)

housing_values = data['housing_units_density'].astype(float)
mean_Housing_Units_Density = housing_values.mean(skipna = True)
data['housing_units_density'] = housing_values.fillna(mean_Housing_Units_Density)

land_values = data['percent_land'].astype(float)
mean_Percent_Land = land_values.mean(skipna = True)
data['percent_land'] = land_values.fillna(mean_Percent_Land)

In [16]:
# Derive two label columns from the casualty count: a binary flag
# (any casualty) and a three-way bin (0, 1-19, 20+)

casualty_counts = data['casualties']

has_casualty = casualty_counts >= 1
data['binary_casualties'] = np.where(has_casualty, 1, 0)

# Outer test wins: counts above 19 are class 2, remaining positive counts
# are class 1, everything else stays class 0
data['multiclass_casualties'] = np.where(
    casualty_counts > 19, 2,
    np.where(casualty_counts > 0, 1, 0))

In [17]:
# Build one frame per modelling task; each keeps exactly one of the three
# casualty targets (binary flag, three-way bin, raw count)

target_columns = ('casualties', 'binary_casualties', 'multiclass_casualties')

def _without_other_targets(frame, kept_target):
    """Return ``frame`` minus every target column except ``kept_target``."""
    return frame.drop(columns=[col for col in target_columns if col != kept_target])

data_binary = _without_other_targets(data, 'binary_casualties')
data_multiclass = _without_other_targets(data, 'multiclass_casualties')
data_regress = _without_other_targets(data, 'casualties')

In [18]:
# Write each task-specific frame to its own CSV in the working directory

csv_exports = (
    ('tornadoes-binary.csv', data_binary),
    ('tornadoes-multiclass.csv', data_multiclass),
    ('tornadoes-nonbinary.csv', data_regress),
)
for csv_name, frame in csv_exports:
    frame.to_csv(csv_name)