In [1]:
# Import Dependancies
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read Tornado data
csv_file = "data/1950-2019_all_tornadoes.csv"
df = pd.read_csv(csv_file)
df

Unnamed: 0,om,yr,mo,dy,date,time,tz,st,stf,stn,...,len,wid,ns,sn,sg,f1,f2,f3,f4,fc
0,1,1950,1,3,1950-01-03,11:00:00,3,MO,29,1,...,9.50,150,2,0,1,0,0,0,0,0
1,1,1950,1,3,1950-01-03,11:00:00,3,MO,29,1,...,6.20,150,2,1,2,189,0,0,0,0
2,1,1950,1,3,1950-01-03,11:10:00,3,IL,17,1,...,3.30,100,2,1,2,119,0,0,0,0
3,2,1950,1,3,1950-01-03,11:55:00,3,IL,17,2,...,3.60,130,1,1,1,135,0,0,0,0
4,3,1950,1,3,1950-01-03,16:00:00,3,OH,39,1,...,0.10,10,1,1,1,161,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66383,618537,2019,12,29,2019-12-29,16:03:00,3,MS,28,0,...,7.70,900,1,1,1,7,0,0,0,0
66384,618538,2019,12,29,2019-12-29,16:13:00,3,MS,28,0,...,3.82,200,1,1,1,19,0,0,0,0
66385,618539,2019,12,29,2019-12-29,16:32:00,3,MS,28,0,...,2.61,200,1,1,1,105,0,0,0,0
66386,618540,2019,12,29,2019-12-29,17:13:00,3,MS,28,0,...,3.23,125,1,1,1,101,0,0,0,0


## Data Cleaning

In [3]:
df.dtypes

om         int64
yr         int64
mo         int64
dy         int64
date      object
time      object
tz         int64
st        object
stf        int64
stn        int64
mag        int64
inj        int64
fat        int64
loss     float64
closs    float64
slat     float64
slon     float64
elat     float64
elon     float64
len      float64
wid        int64
ns         int64
sn         int64
sg         int64
f1         int64
f2         int64
f3         int64
f4         int64
fc         int64
dtype: object

In [4]:
df.columns

Index(['om', 'yr', 'mo', 'dy', 'date', 'time', 'tz', 'st', 'stf', 'stn', 'mag',
       'inj', 'fat', 'loss', 'closs', 'slat', 'slon', 'elat', 'elon', 'len',
       'wid', 'ns', 'sn', 'sg', 'f1', 'f2', 'f3', 'f4', 'fc'],
      dtype='object')

In [5]:
# Select tornado data for desired years
filtered_df = df.loc[df.yr >= 1996, :]
filtered_df

Unnamed: 0,om,yr,mo,dy,date,time,tz,st,stf,stn,...,len,wid,ns,sn,sg,f1,f2,f3,f4,fc
36003,1,1996,1,1,1996-01-01,11:25:00,3,FL,12,1,...,0.50,35,1,1,1,9,0,0,0,0
36004,859,1996,1,2,1996-01-02,18:10:00,3,SC,45,1,...,0.50,50,1,1,1,75,0,0,0,0
36005,860,1996,1,2,1996-01-02,18:20:00,3,SC,45,2,...,0.30,50,1,1,1,75,0,0,0,0
36006,4,1996,1,3,1996-01-03,07:07:00,3,FL,12,2,...,7.00,40,1,1,1,25,0,0,0,0
36007,5,1996,1,3,1996-01-03,07:40:00,3,FL,12,3,...,0.40,10,1,1,1,11,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66383,618537,2019,12,29,2019-12-29,16:03:00,3,MS,28,0,...,7.70,900,1,1,1,7,0,0,0,0
66384,618538,2019,12,29,2019-12-29,16:13:00,3,MS,28,0,...,3.82,200,1,1,1,19,0,0,0,0
66385,618539,2019,12,29,2019-12-29,16:32:00,3,MS,28,0,...,2.61,200,1,1,1,105,0,0,0,0
66386,618540,2019,12,29,2019-12-29,17:13:00,3,MS,28,0,...,3.23,125,1,1,1,101,0,0,0,0


In [6]:
# Add column totaling injuries and fatalities
filtered_df["casualty"] = filtered_df["inj"] + filtered_df["fat"]
filtered_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["casualty"] = filtered_df["inj"] + filtered_df["fat"]


Unnamed: 0,om,yr,mo,dy,date,time,tz,st,stf,stn,...,wid,ns,sn,sg,f1,f2,f3,f4,fc,casualty
36003,1,1996,1,1,1996-01-01,11:25:00,3,FL,12,1,...,35,1,1,1,9,0,0,0,0,0
36004,859,1996,1,2,1996-01-02,18:10:00,3,SC,45,1,...,50,1,1,1,75,0,0,0,0,0
36005,860,1996,1,2,1996-01-02,18:20:00,3,SC,45,2,...,50,1,1,1,75,0,0,0,0,0
36006,4,1996,1,3,1996-01-03,07:07:00,3,FL,12,2,...,40,1,1,1,25,0,0,0,0,9
36007,5,1996,1,3,1996-01-03,07:40:00,3,FL,12,3,...,10,1,1,1,11,0,0,0,0,0


In [7]:
# Select derired columns
df1 = filtered_df[['yr', 'mo', 'date', 'time', 'st', 'mag', 'inj', 'fat',\
                   'loss', 'closs', 'len', 'wid', 'slat', 'slon', 'casualty']]
df1

Unnamed: 0,yr,mo,date,time,st,mag,inj,fat,loss,closs,len,wid,slat,slon,casualty
36003,1996,1,1996-01-01,11:25:00,FL,0,0,0,0.04,0.0,0.50,35,28.0800,-80.6000,0
36004,1996,1,1996-01-02,18:10:00,SC,0,0,0,0.03,0.0,0.50,50,33.5000,-80.8700,0
36005,1996,1,1996-01-02,18:20:00,SC,0,0,0,0.00,0.0,0.30,50,33.5000,-80.8500,0
36006,1996,1,1996-01-03,07:07:00,FL,1,9,0,1.20,0.0,7.00,40,25.6800,-80.4200,9
36007,1996,1,1996-01-03,07:40:00,FL,0,0,0,0.10,0.0,0.40,10,26.0000,-80.2300,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66383,2019,12,2019-12-29,16:03:00,MS,1,0,0,75000.00,0.0,7.70,900,33.1628,-89.4323,0
66384,2019,12,2019-12-29,16:13:00,MS,1,0,0,10000.00,0.0,3.82,200,33.2598,-89.2778,0
66385,2019,12,2019-12-29,16:32:00,MS,0,0,0,5000.00,0.0,2.61,200,33.4720,-89.0315,0
66386,2019,12,2019-12-29,17:13:00,MS,1,0,0,150000.00,0.0,3.23,125,32.5268,-89.1628,0


In [8]:
df1.reset_index(inplace = True, drop = True)
df1

Unnamed: 0,yr,mo,date,time,st,mag,inj,fat,loss,closs,len,wid,slat,slon,casualty
0,1996,1,1996-01-01,11:25:00,FL,0,0,0,0.04,0.0,0.50,35,28.0800,-80.6000,0
1,1996,1,1996-01-02,18:10:00,SC,0,0,0,0.03,0.0,0.50,50,33.5000,-80.8700,0
2,1996,1,1996-01-02,18:20:00,SC,0,0,0,0.00,0.0,0.30,50,33.5000,-80.8500,0
3,1996,1,1996-01-03,07:07:00,FL,1,9,0,1.20,0.0,7.00,40,25.6800,-80.4200,9
4,1996,1,1996-01-03,07:40:00,FL,0,0,0,0.10,0.0,0.40,10,26.0000,-80.2300,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30380,2019,12,2019-12-29,16:03:00,MS,1,0,0,75000.00,0.0,7.70,900,33.1628,-89.4323,0
30381,2019,12,2019-12-29,16:13:00,MS,1,0,0,10000.00,0.0,3.82,200,33.2598,-89.2778,0
30382,2019,12,2019-12-29,16:32:00,MS,0,0,0,5000.00,0.0,2.61,200,33.4720,-89.0315,0
30383,2019,12,2019-12-29,17:13:00,MS,1,0,0,150000.00,0.0,3.23,125,32.5268,-89.1628,0


In [9]:
# States Data
data = {'StName':['Alabama',
'Alaska',
'Arizona',
'Arkansas',
'California',
'Colorado',
'Connecticut',
'Delaware',
'Florida',
'Georgia',
'Hawaii',
'Idaho',
'Illinois',
'Indiana',
'Iowa',
'Kansas',
'Kentucky',
'Louisiana',
'Maine',
'Maryland',
'Massachusetts',
'Michigan',
'Minnesota',
'Mississippi',
'Missouri',
'Montana',
'Nebraska',
'Nevada',
'New Hampshire',
'New Jersey',
'New Mexico',
'New York',
'North Carolina',
'North Dakota',
'Ohio',
'Oklahoma',
'Oregon',
'Pennsylvania',
'Rhode Island',
'South Carolina',
'South Dakota',
'Tennessee',
'Texas',
'Utah',
'Vermont',
'Virginia',
'Washington',
'West Virginia',
'Wisconsin',
'Wyoming']}

states_df = pd.DataFrame(data)
cw_location = 'http://app02.clerk.org/menu/ccis/Help/CCIS%20Codes/'
cw_filename = 'state_codes.html'

states = pd.read_html(cw_location + cw_filename)[0]
state_code_map = dict(zip(states['Description'], 
                          states['Code']))
states_df['StAbbr'] = states_df['StName'].map(state_code_map)
code_state_map = dict(zip(states['Code'],
                          states['Description']))
states_df['StNameAgain'] = states_df['StAbbr'].map(code_state_map)
states_df = states_df.rename(columns={'stName': 'State_Name', 'StAbbr': 'st'})
states_df.head()

Unnamed: 0,StName,st,StNameAgain
0,Alabama,AL,Alabama
1,Alaska,AK,Alaska
2,Arizona,AZ,Arizona
3,Arkansas,AR,Arkansas
4,California,CA,California


In [10]:
states_df.columns

Index(['StName', 'st', 'StNameAgain'], dtype='object')

In [11]:
# Merge dataframes
new_df2 = pd.merge(df1, states_df, on='st', how='left')
# new_df2
df3 = new_df2[['yr', 'mo', 'date', 'time', 'StName', 'mag', 'inj', 'fat',\
                   'loss', 'closs', 'len', 'wid', 'slat', 'slon', 'casualty']]
df3

Unnamed: 0,yr,mo,date,time,StName,mag,inj,fat,loss,closs,len,wid,slat,slon,casualty
0,1996,1,1996-01-01,11:25:00,Florida,0,0,0,0.04,0.0,0.50,35,28.0800,-80.6000,0
1,1996,1,1996-01-02,18:10:00,South Carolina,0,0,0,0.03,0.0,0.50,50,33.5000,-80.8700,0
2,1996,1,1996-01-02,18:20:00,South Carolina,0,0,0,0.00,0.0,0.30,50,33.5000,-80.8500,0
3,1996,1,1996-01-03,07:07:00,Florida,1,9,0,1.20,0.0,7.00,40,25.6800,-80.4200,9
4,1996,1,1996-01-03,07:40:00,Florida,0,0,0,0.10,0.0,0.40,10,26.0000,-80.2300,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30380,2019,12,2019-12-29,16:03:00,Mississippi,1,0,0,75000.00,0.0,7.70,900,33.1628,-89.4323,0
30381,2019,12,2019-12-29,16:13:00,Mississippi,1,0,0,10000.00,0.0,3.82,200,33.2598,-89.2778,0
30382,2019,12,2019-12-29,16:32:00,Mississippi,0,0,0,5000.00,0.0,2.61,200,33.4720,-89.0315,0
30383,2019,12,2019-12-29,17:13:00,Mississippi,1,0,0,150000.00,0.0,3.23,125,32.5268,-89.1628,0


In [12]:
# Rename Columns
df3 = df3.rename(columns={'yr': 'Year', 'mo': 'Month', 'date': 'Date', 'time': 'Time', 'mag': 'Magnitude',\
                          'inj': 'Injuries', 'fat': 'Fatalities', 'loss': 'Loss (Dollars)',\
                          'len':'Length (Miles)', 'wid': 'Width (Yards)', 'casualty': 'Total Casualty',\
                          'StName': 'State_Name', 'StAbbr': 'St', 'slat': 'Latitude', 'slon': 'Longitude'})
df3

Unnamed: 0,Year,Month,Date,Time,State_Name,Magnitude,Injuries,Fatalities,Loss (Dollars),closs,Length (Miles),Width (Yards),Latitude,Longitude,Total Casualty
0,1996,1,1996-01-01,11:25:00,Florida,0,0,0,0.04,0.0,0.50,35,28.0800,-80.6000,0
1,1996,1,1996-01-02,18:10:00,South Carolina,0,0,0,0.03,0.0,0.50,50,33.5000,-80.8700,0
2,1996,1,1996-01-02,18:20:00,South Carolina,0,0,0,0.00,0.0,0.30,50,33.5000,-80.8500,0
3,1996,1,1996-01-03,07:07:00,Florida,1,9,0,1.20,0.0,7.00,40,25.6800,-80.4200,9
4,1996,1,1996-01-03,07:40:00,Florida,0,0,0,0.10,0.0,0.40,10,26.0000,-80.2300,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30380,2019,12,2019-12-29,16:03:00,Mississippi,1,0,0,75000.00,0.0,7.70,900,33.1628,-89.4323,0
30381,2019,12,2019-12-29,16:13:00,Mississippi,1,0,0,10000.00,0.0,3.82,200,33.2598,-89.2778,0
30382,2019,12,2019-12-29,16:32:00,Mississippi,0,0,0,5000.00,0.0,2.61,200,33.4720,-89.0315,0
30383,2019,12,2019-12-29,17:13:00,Mississippi,1,0,0,150000.00,0.0,3.23,125,32.5268,-89.1628,0


In [13]:
df3.drop(['Injuries', 'Fatalities','closs'], axis=1, inplace=True)
df3

Unnamed: 0,Year,Month,Date,Time,State_Name,Magnitude,Loss (Dollars),Length (Miles),Width (Yards),Latitude,Longitude,Total Casualty
0,1996,1,1996-01-01,11:25:00,Florida,0,0.04,0.50,35,28.0800,-80.6000,0
1,1996,1,1996-01-02,18:10:00,South Carolina,0,0.03,0.50,50,33.5000,-80.8700,0
2,1996,1,1996-01-02,18:20:00,South Carolina,0,0.00,0.30,50,33.5000,-80.8500,0
3,1996,1,1996-01-03,07:07:00,Florida,1,1.20,7.00,40,25.6800,-80.4200,9
4,1996,1,1996-01-03,07:40:00,Florida,0,0.10,0.40,10,26.0000,-80.2300,0
...,...,...,...,...,...,...,...,...,...,...,...,...
30380,2019,12,2019-12-29,16:03:00,Mississippi,1,75000.00,7.70,900,33.1628,-89.4323,0
30381,2019,12,2019-12-29,16:13:00,Mississippi,1,10000.00,3.82,200,33.2598,-89.2778,0
30382,2019,12,2019-12-29,16:32:00,Mississippi,0,5000.00,2.61,200,33.4720,-89.0315,0
30383,2019,12,2019-12-29,17:13:00,Mississippi,1,150000.00,3.23,125,32.5268,-89.1628,0


In [14]:
df_clean = df3.copy()
df_clean

Unnamed: 0,Year,Month,Date,Time,State_Name,Magnitude,Loss (Dollars),Length (Miles),Width (Yards),Latitude,Longitude,Total Casualty
0,1996,1,1996-01-01,11:25:00,Florida,0,0.04,0.50,35,28.0800,-80.6000,0
1,1996,1,1996-01-02,18:10:00,South Carolina,0,0.03,0.50,50,33.5000,-80.8700,0
2,1996,1,1996-01-02,18:20:00,South Carolina,0,0.00,0.30,50,33.5000,-80.8500,0
3,1996,1,1996-01-03,07:07:00,Florida,1,1.20,7.00,40,25.6800,-80.4200,9
4,1996,1,1996-01-03,07:40:00,Florida,0,0.10,0.40,10,26.0000,-80.2300,0
...,...,...,...,...,...,...,...,...,...,...,...,...
30380,2019,12,2019-12-29,16:03:00,Mississippi,1,75000.00,7.70,900,33.1628,-89.4323,0
30381,2019,12,2019-12-29,16:13:00,Mississippi,1,10000.00,3.82,200,33.2598,-89.2778,0
30382,2019,12,2019-12-29,16:32:00,Mississippi,0,5000.00,2.61,200,33.4720,-89.0315,0
30383,2019,12,2019-12-29,17:13:00,Mississippi,1,150000.00,3.23,125,32.5268,-89.1628,0


In [None]:
import seaborn as sns
sns.countplot(df_clean["Magnitude"], label="Count")
plt.show()

In [15]:
# Select the rows where magnitude >= 0
df_magfilt = df_clean[df_clean['Magnitude'] >= 0]
df_magfilt

Unnamed: 0,Year,Month,Date,Time,State_Name,Magnitude,Loss (Dollars),Length (Miles),Width (Yards),Latitude,Longitude,Total Casualty
0,1996,1,1996-01-01,11:25:00,Florida,0,0.04,0.50,35,28.0800,-80.6000,0
1,1996,1,1996-01-02,18:10:00,South Carolina,0,0.03,0.50,50,33.5000,-80.8700,0
2,1996,1,1996-01-02,18:20:00,South Carolina,0,0.00,0.30,50,33.5000,-80.8500,0
3,1996,1,1996-01-03,07:07:00,Florida,1,1.20,7.00,40,25.6800,-80.4200,9
4,1996,1,1996-01-03,07:40:00,Florida,0,0.10,0.40,10,26.0000,-80.2300,0
...,...,...,...,...,...,...,...,...,...,...,...,...
30380,2019,12,2019-12-29,16:03:00,Mississippi,1,75000.00,7.70,900,33.1628,-89.4323,0
30381,2019,12,2019-12-29,16:13:00,Mississippi,1,10000.00,3.82,200,33.2598,-89.2778,0
30382,2019,12,2019-12-29,16:32:00,Mississippi,0,5000.00,2.61,200,33.4720,-89.0315,0
30383,2019,12,2019-12-29,17:13:00,Mississippi,1,150000.00,3.23,125,32.5268,-89.1628,0


In [None]:
sns.countplot(df_magfilt["Magnitude"], label="Count")
plt.show()

In [16]:
df_magfilt['Adjusted Magnitude'] = df_magfilt['Magnitude']
df_magfilt = df_magfilt.replace({'Adjusted Magnitude': {3: 2, 4: 2, 5:2}})
df_magfilt

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_magfilt['Adjusted Magnitude'] = df_magfilt['Magnitude']


Unnamed: 0,Year,Month,Date,Time,State_Name,Magnitude,Loss (Dollars),Length (Miles),Width (Yards),Latitude,Longitude,Total Casualty,Adjusted Magnitude
0,1996,1,1996-01-01,11:25:00,Florida,0,0.04,0.50,35,28.0800,-80.6000,0,0
1,1996,1,1996-01-02,18:10:00,South Carolina,0,0.03,0.50,50,33.5000,-80.8700,0,0
2,1996,1,1996-01-02,18:20:00,South Carolina,0,0.00,0.30,50,33.5000,-80.8500,0,0
3,1996,1,1996-01-03,07:07:00,Florida,1,1.20,7.00,40,25.6800,-80.4200,9,1
4,1996,1,1996-01-03,07:40:00,Florida,0,0.10,0.40,10,26.0000,-80.2300,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30380,2019,12,2019-12-29,16:03:00,Mississippi,1,75000.00,7.70,900,33.1628,-89.4323,0,1
30381,2019,12,2019-12-29,16:13:00,Mississippi,1,10000.00,3.82,200,33.2598,-89.2778,0,1
30382,2019,12,2019-12-29,16:32:00,Mississippi,0,5000.00,2.61,200,33.4720,-89.0315,0,0
30383,2019,12,2019-12-29,17:13:00,Mississippi,1,150000.00,3.23,125,32.5268,-89.1628,0,1


In [17]:
# df_magfilt.to_csv('data/cleaned_data.csv')

In [18]:
df_magfilt["Adjusted Magnitude"].value_counts()

0    17731
1     8939
2     3417
Name: Adjusted Magnitude, dtype: int64

In [19]:
df_magfilt["Magnitude"].max()

5

In [20]:
df_magfilt.columns

Index(['Year', 'Month', 'Date', 'Time', 'State_Name', 'Magnitude',
       'Loss (Dollars)', 'Length (Miles)', 'Width (Yards)', 'Latitude',
       'Longitude', 'Total Casualty', 'Adjusted Magnitude'],
      dtype='object')

# Machine Learning

In [21]:
cleaned_df = df_magfilt
cleaned_df

Unnamed: 0,Year,Month,Date,Time,State_Name,Magnitude,Loss (Dollars),Length (Miles),Width (Yards),Latitude,Longitude,Total Casualty,Adjusted Magnitude
0,1996,1,1996-01-01,11:25:00,Florida,0,0.04,0.50,35,28.0800,-80.6000,0,0
1,1996,1,1996-01-02,18:10:00,South Carolina,0,0.03,0.50,50,33.5000,-80.8700,0,0
2,1996,1,1996-01-02,18:20:00,South Carolina,0,0.00,0.30,50,33.5000,-80.8500,0,0
3,1996,1,1996-01-03,07:07:00,Florida,1,1.20,7.00,40,25.6800,-80.4200,9,1
4,1996,1,1996-01-03,07:40:00,Florida,0,0.10,0.40,10,26.0000,-80.2300,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30380,2019,12,2019-12-29,16:03:00,Mississippi,1,75000.00,7.70,900,33.1628,-89.4323,0,1
30381,2019,12,2019-12-29,16:13:00,Mississippi,1,10000.00,3.82,200,33.2598,-89.2778,0,1
30382,2019,12,2019-12-29,16:32:00,Mississippi,0,5000.00,2.61,200,33.4720,-89.0315,0,0
30383,2019,12,2019-12-29,17:13:00,Mississippi,1,150000.00,3.23,125,32.5268,-89.1628,0,1


In [None]:
# data is not balanced
from pandas.plotting import scatter_matrix 

scatter_matrix(cleaned_df.drop('Loss (Dollars)', axis=1), figsize=(10,5))
plt.show()

# Predict Tornado Category (magnitude)

## Examine potential inputs (features)

In [None]:
cleaned_df.info()

#### 1. Feature - Month

In [None]:
# scatter plot vs month
plt.scatter(cleaned_df["Adjusted Magnitude"], cleaned_df["Month"])

There is no trend here. Drop 'Month' feature

#### 2. Feature - Length (Miles)

In [None]:
# scatter plot vs length
plt.scatter(cleaned_df["Adjusted Magnitude"], cleaned_df["Length (Miles)"])

In [None]:
# Length distribution
cleaned_df["Length (Miles)"].hist(bins=7, figsize=(6,4))
plt.title("Tornado Length Distribution")
plt.xlabel("Tornado Length in miles")
plt.ylabel("Tornado Count")
plt.show()

In [None]:
# Average length by magnitude
groupEFscale = cleaned_df.groupby(by='Magnitude')
groupEFscale['Length (Miles)'].mean()

##### There is a positive trend with 'Length (Miles)'

#### 3. Feature - Width (Yards)

In [None]:
# scatter plot vs width
plt.scatter(cleaned_df["Adjusted Magnitude"], cleaned_df["Width (Yards)"])

In [None]:
# Width distribution
cleaned_df["Width (Yards)"].hist(bins=7, figsize=(6,4))
plt.title("Tornado Width Distribution")
plt.xlabel("Tornado Width in yards")
plt.ylabel("Tornado Count")
plt.show()

In [None]:
# Average width by magnitude
groupEFscale['Width (Yards)'].mean()

##### There is a positive trend with 'Width (Yards)'

#### 4. Feature - Loss (Dollars)

In [None]:
# scatter plot vs loss
plt.scatter(cleaned_df["Adjusted Magnitude"], cleaned_df["Loss (Dollars)"])

In [None]:
groupEFscale['Width (Yards)'].mean()

##### There is a positive trend with 'Loss (Dollars)'

#### 5. Feature - Total Casualty

In [None]:
# scatter plot vs total casualty
plt.scatter(cleaned_df["Adjusted Magnitude"], cleaned_df["Total Casualty"])

In [None]:
# Total Casualty distribution
cleaned_df["Total Casualty"].hist(bins=7, figsize=(6,4))
plt.title("Total Casualty Distribution")
plt.xlabel("Total Casualties")
plt.ylabel("Tornado Count")
plt.show()

In [None]:
groupEFscale['Total Casualty'].mean()

##### There is a positive trend with 'Total Casualty'


#### 6. Feature - Latitude

In [None]:
# scatter plot vs latitude
plt.scatter(cleaned_df["Adjusted Magnitude"], cleaned_df["Latitude"])

##### There is a weak trend with 'Latitude'

#### 7. Feature - Longitude

In [None]:
# scatter plot vs longitude
plt.scatter(cleaned_df["Adjusted Magnitude"], cleaned_df["Longitude"])

##### There is a weak trend with 'Longitude'

### Choose our features (inputs)

In [None]:
X = cleaned_df[["Total Casualty", "Length (Miles)", "Width (Yards)",\
                "Latitude", "Longitude", 'Loss (Dollars)']]
X

In [None]:
# Choose our label
y = cleaned_df['Magnitude']
y

## Preprocessing

### Create training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

### Apply Scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Summary of models
Inputs: 
length, width, latitude, longitude, financial loss, causalties

Potential:
* Decision Tree max depth 5 - moderate training, high testing
* Random Forest - possible overfitting problem (high training), but also high testing

Rejected:
* Logistic Regression - data is not linear
* Decision Tree - overfitting (high on training, low on testing)
* K-nearest neighbor - low on testing
* Linear Discriminant Analysis - data is not linear, low on training and testing
* Gaussian Naive Bayes - low on testing and training
* Support Vector Machine - low on testing and training

## Build Models

In [None]:
## Logistic Regression
# Our data is not linear - Reject model

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

print('Accuracy of Logistic Regression on training data', logreg.score(X_train_scaled, y_train))
print('Accuracy of Logistic Regression on testing data', logreg.score(X_test_scaled, y_test))

In [None]:
## Decision Tree
# high on training low on testing shows problem of overfitting - reject model

from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train_scaled, y_train)

print('Accuracy of Decision Tree on training', dt.score(X_train_scaled, y_train))
print('Accuracy of Decision Tree on testing', dt.score(X_test_scaled, y_test))

In [None]:
# Setting max decision tree depth to help avoid overfitting
# Improved over previous - additional depth may improve

dt2 = DecisionTreeClassifier(max_depth=3)
dt2.fit(X_train_scaled, y_train)
print('Accuracy of Decision tree on training', dt2.score(X_train_scaled, y_train))
print('Accuracy of Decision tree on testing', dt2.score(X_test_scaled, y_test))

In [None]:
# Decision tree - max depth 5

# Improved over previous - potential model

dt5 = DecisionTreeClassifier(max_depth=5)
dt5.fit(X_train_scaled, y_train)
print('Accuracy of Decision tree on training', dt2.score(X_train_scaled, y_train))
print('Accuracy of Decision tree on testing', dt2.score(X_test_scaled, y_test))

In [None]:
# K-nearest neighbor 
# Low on testing - rejected

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)
print('Accuracy of KNN on training', knn.score(X_train_scaled, y_train))
print('Accuracy of KNN on testing', knn.score(X_test_scaled, y_test))

In [None]:
# Linear Discriminant Analysis 
# Our data is not linear, low on training and testing - rejected

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train_scaled,y_train)
print('Accuracy of Linear Discriminant Analysis on training', lda.score(X_train_scaled, y_train))
print('Accuracy of Linear Discriminant Analysis on testing', lda.score(X_test_scaled, y_test))

In [None]:
# Gaussian Naive Bayes 
# low on testing and training - rejected

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_scaled, y_train)
print('Accuracy of GNB on training', gnb.score(X_train_scaled, y_train))
print('Accuracy of GNB on testing', gnb.score(X_test_scaled, y_test))

In [None]:
# Support Vector Machine
# low on testing and training - rejected

from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train_scaled, y_train)
print('Accuracy of SVM on training', svm.score(X_train_scaled, y_train))
print('Accuracy of SVM on testing', svm.score(X_test_scaled, y_test))

### Save and load the best model

In [None]:
import pickle

In [None]:
pickle.dump(dt5, open('data/model.pkl','wb'))

In [None]:
# Loading model to compare the results
model = pickle.load( open('data/model.pkl','rb'))

### Make Prediction on tornado category (magnitude)

In [None]:
user_input = [[13, 107.70, 140, 33.1628, -89.4323, 7308099]]
inputdf = pd.DataFrame(user_input,columns=["Total Casualty",\
                    "Length (Miles)", "Width (Yards)",\
                    "Latitude", "Longitude", 'Loss (Dollars)'])
original_df = X.copy()
reference = original_df.append(inputdf).reset_index()
new_df = reference[["Total Casualty", "Length (Miles)",\
                    "Width (Yards)", "Latitude", "Longitude",\
                    'Loss (Dollars)']]
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(new_df)
[list(scaled_df[len(scaled_df)-1])]
# scaled_df

In [None]:
model.predict([list(scaled_df[len(scaled_df)-1])])

## Predict Loss (Dollars)

In [None]:
cleaned_df['Loss (Dollars)'] = cleaned_df.loc[:,'Loss (Dollars)'].multiply(1000)
cleaned_df

In [None]:
# newcleaned = cleaned_df[cleaned_df['Loss (Dollars)'] < 6000000]
# newcleaned = cleaned_df
exploss = cleaned_df[cleaned_df['Loss (Dollars)'] >= 0.000]
exploss

In [None]:
plt.scatter(exploss["Loss (Dollars)"], exploss["Magnitude"])

In [None]:
# scatter plot vs length
plt.scatter(exploss["Loss (Dollars)"], exploss["Length (Miles)"])

In [None]:
plt.scatter(exploss["Loss (Dollars)"], exploss["Width (Yards)"])

In [None]:
plt.scatter(exploss["Loss (Dollars)"], exploss["Total Casualty"])

In [None]:
df_losspred = exploss[["Total Casualty", "Length (Miles)",\
                          "Width (Yards)","Latitude",\
                          "Longitude", 'Magnitude', "Loss (Dollars)"]]
df_losspred

In [None]:
df_losspred.dtypes

In [None]:
targety = df_losspred[["Loss (Dollars)"]].astype(int)
targety

In [None]:
# # # Convert datatypes from float64 to int64
# converted = df_losspred[["Loss (Dollars)"]].astype(int)
# converted

In [None]:
df_losspred.dtypes

In [None]:
# Choose our features
X2 = df_losspred[["Length (Miles)","Width (Yards)"]]
X2

In [None]:
y2= targety
y2

In [None]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=0)

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X2_train_scaled = scaler.fit_transform(X2_train)
X2_test_scaled = scaler.transform(X2_test)

In [None]:
## Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X2_train_scaled, y2_train)

print('Accuracy of Logistic Regression on training', logreg.score(X2_train_scaled, y2_train))
print('Accuracy of Logistic Regression on testing', logreg.score(X2_test_scaled, y2_test))

In [None]:
## Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X2_train_scaled, y2_train)

print('Accuracy of Decision tree on training', dt.score(X2_train_scaled, y2_train))
print('Accuracy of Decision tree on testing', dt.score(X2_test_scaled, y2_test))

In [None]:
# Setting max decision tree depth to help avoid overfitting
dt2 = DecisionTreeClassifier(max_depth=5)
dt2.fit(X2_train_scaled, y2_train)
print('Accuracy of Decision tree on training', dt2.score(X2_train_scaled, y2_train))
print('Accuracy of Decision tree on testing', dt2.score(X2_test_scaled, y2_test))

In [None]:
# K-nearest neighbor 
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X2_train_scaled, y2_train)
print('Accuracy of KNN on training', knn.score(X2_train_scaled, y2_train))
print('Accuracy of KNN on testing', knn.score(X2_test_scaled, y2_test))

In [None]:
# Linear Discriminant Analysis 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X2_train_scaled,y2_train)
print('Accuracy of Linear Discriminant Analysis on training', lda.score(X2_train_scaled, y2_train))
print('Accuracy of Linear Discriminant Analysis on testing', lda.score(X2_test_scaled, y2_test))

In [None]:
# Gaussian Naive Bayes 
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X2_train_scaled, y2_train)
print('Accuracy of GNB on training', gnb.score(X2_train_scaled, y2_train))
print('Accuracy of GNB on testing', gnb.score(X2_test_scaled, y2_test))

In [None]:
# # Support Vector Machine
from sklearn.svm import SVC
svm = SVC()
svm.fit(X2_train_scaled, y2_train)
print('Accuracy of SVM on training', svm.score(X2_train_scaled, y2_train))
print('Accuracy of SVM on testing', svm.score(X2_test_scaled, y2_test))

#### Loss prediction models give poor results. This could be because of the diffrent methods used to record the tornado data over the years.

## Change input features to check if better Loss-Prediction

In [None]:
X2 = df_losspred[["Length (Miles)","Width (Yards)", "Magnitude", "Total Casualty"]]
X2

In [None]:
y2= targety
y2

In [None]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=0)

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X2_train_scaled = scaler.fit_transform(X2_train)
X2_test_scaled = scaler.transform(X2_test)

In [None]:
## Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X2_train_scaled, y2_train)

print('Accuracy of Logistic Regression on training', logreg.score(X2_train_scaled, y2_train))
print('Accuracy of Logistic Regression on testing', logreg.score(X2_test_scaled, y2_test))

In [None]:
## Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X2_train_scaled, y2_train)

print('Accuracy of Decision tree on training', dt.score(X2_train_scaled, y2_train))
print('Accuracy of Decision tree on testing', dt.score(X2_test_scaled, y2_test))

In [None]:
# Setting max decision tree depth to help avoid overfitting
dt2 = DecisionTreeClassifier(max_depth=5)
dt2.fit(X2_train_scaled, y2_train)
print('Accuracy of Decision tree on training', dt2.score(X2_train_scaled, y2_train))
print('Accuracy of Decision tree on testing', dt2.score(X2_test_scaled, y2_test))

In [None]:
# K-nearest neighbor 
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X2_train_scaled, y2_train)
print('Accuracy of KNN on training', knn.score(X2_train_scaled, y2_train))
print('Accuracy of KNN on testing', knn.score(X2_test_scaled, y2_test))

In [None]:
# Gaussian Naive Bayes 
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X2_train_scaled, y2_train)
print('Accuracy of GNB on training', gnb.score(X2_train_scaled, y2_train))
print('Accuracy of GNB on testing', gnb.score(X2_test_scaled, y2_test))

In [None]:
# # Support Vector Machine
from sklearn.svm import SVC
svm = SVC()
svm.fit(X2_train_scaled, y2_train)
print('Accuracy of SVM on training', svm.score(X2_train_scaled, y2_train))
print('Accuracy of SVM on testing', svm.score(X2_test_scaled, y2_test))

#### Changing the input features did not result in better accuracies. Hence, no 'Loss-Prediction' possible for such dataset.

In [None]:
pickle.dump(dt, open('model2.pkl','wb'))

In [None]:
# Loading model to compare the results
lossmodel = pickle.load( open('model2.pkl','rb'))

### Make Loss Prediction

In [None]:
user_input = [[15,500,3,7]]
inputdf = pd.DataFrame(user_input,columns=["Length (Miles)","Width (Yards)", "Magnitude", "Total Casualty"])
original_df = X2.copy()
reference = original_df.append(inputdf).reset_index()
new_df = reference[["Length (Miles)","Width (Yards)", "Magnitude", "Total Casualty"]]
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(new_df)
[list(scaled_df[len(scaled_df)-1])]
# scaled_df

In [None]:
loss = lossmodel.predict([list(scaled_df[len(scaled_df)-1])])
loss