Analysis of intensity: Using the available data, has there been a change in intensity since 1851?

In [1]:
# Add the dependencies.
import json
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
import os
import re

from sqlalchemy import create_engine
import psycopg2

# from config import db_password

import time

In [2]:
# File to load: the Atlantic hurricane/typhoon CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs where this directory exists.
atlantic = os.path.join("C:/Users/Carlos/BootCamp/Final_Project/Resources", "Hurricanes_Typhoons_1851_2014_Atlantic.csv")

# Read the Atlantic storm data file and store it in a Pandas DataFrame.
#atlantic_df = pd.read_csv(atlantic, index_col=False)
#atlantic_df.set_index('ID', inplace=True)
atlantic_df = pd.read_csv(atlantic)
# Show a random sample of 10 rows as a quick check of the load.
atlantic_df.sample(10)

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure,...,Low Wind SW,Low Wind NW,Moderate Wind NE,Moderate Wind SE,Moderate Wind SW,Moderate Wind NW,High Wind NE,High Wind SE,High Wind SW,High Wind NW
34869,AL061983,DEAN,19830930,0,,TS,36.4N,74.0W,55,1008,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
31301,AL091974,BECKY,19740831,1200,,HU,40.3N,56.0W,100,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
11869,AL111909,UNNAMED,19091006,1200,,TD,10.7N,76.0W,30,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
41093,AL141999,JOSE,19991024,600,,TS,28.0N,62.2W,60,990,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
10463,AL101903,UNNAMED,19031124,600,,HU,37.7N,41.7W,70,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
35440,AL051985,ELENA,19850903,1200,,TD,33.2N,93.7W,25,1006,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
12251,AL051911,UNNAMED,19110920,1800,,EX,38.7N,66.7W,25,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
29745,AL151970,UNNAMED,19701007,600,,TD,16.8N,70.5W,30,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
17054,AL061933,UNNAMED,19330823,0,,HU,34.1N,73.7W,90,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999
21701,AL091949,UNNAMED,19490921,0,,TS,26.6N,92.8W,40,-999,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,-999


Start preparing data to be entered into the DB

In [3]:
# Check if there are any missing (null) values: count() reports the non-null entries per column.
# Note: this dataset marks missing measurements with sentinel numbers (-999 / -99) rather than nulls,
# so those are NOT detected by this check.
atlantic_df.count()

ID                  49105
Name                49105
Date                49105
Time                49105
Event               49105
Status              49105
Latitude            49105
Longitude           49105
Maximum Wind        49105
Minimum Pressure    49105
Low Wind NE         49105
Low Wind SE         49105
Low Wind SW         49105
Low Wind NW         49105
Moderate Wind NE    49105
Moderate Wind SE    49105
Moderate Wind SW    49105
Moderate Wind NW    49105
High Wind NE        49105
High Wind SE        49105
High Wind SW        49105
High Wind NW        49105
dtype: int64

In [4]:
# AL_WR_df keeps only the rows whose wind radii are recorded (-999 marks "not recorded"),
# to confirm that the data follows the PDF data description:
# "Wind Radii – These values have been best tracked since 2004 and are thus available here from that year forward
# with a resolution to the nearest 5 nm"
has_wind_radii = atlantic_df['High Wind NE'].ne(-999)
AL_WR_df = atlantic_df[has_wind_radii]

AL_WR_df

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure,...,Low Wind SW,Low Wind NW,Moderate Wind NE,Moderate Wind SE,Moderate Wind SW,Moderate Wind NW,High Wind NE,High Wind SE,High Wind SW,High Wind NW
43104,AL012004,ALEX,20040731,1800,,TD,30.3N,78.3W,25,1010,...,0,0,0,0,0,0,0,0,0,0
43105,AL012004,ALEX,20040801,0,,TD,31.0N,78.8W,25,1009,...,0,0,0,0,0,0,0,0,0,0
43106,AL012004,ALEX,20040801,600,,TD,31.5N,79.0W,25,1009,...,0,0,0,0,0,0,0,0,0,0
43107,AL012004,ALEX,20040801,1200,,TD,31.6N,79.1W,30,1009,...,0,0,0,0,0,0,0,0,0,0
43108,AL012004,ALEX,20040801,1800,,TS,31.6N,79.2W,35,1009,...,50,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49100,AL122015,KATE,20151112,1200,,EX,41.3N,50.4W,55,981,...,180,120,120,120,60,0,0,0,0,0
49101,AL122015,KATE,20151112,1800,,EX,41.9N,49.9W,55,983,...,180,120,120,120,60,0,0,0,0,0
49102,AL122015,KATE,20151113,0,,EX,41.5N,49.2W,50,985,...,200,220,120,120,60,0,0,0,0,0
49103,AL122015,KATE,20151113,600,,EX,40.8N,47.5W,45,985,...,180,220,0,0,0,0,0,0,0,0


In [5]:
# Confirm that no wind radii record is dated before 2004-01-01 (Date is an integer in YYYYMMDD form).
int((AL_WR_df['Date'] < 20040101).sum())

0

In [6]:
# See how many records are affected by missing wind radii data (-999 is the "missing" marker).
# Based on the PDF, Low Wind NE is used for the count because records should not carry partial wind radii data.
atlantic_df['Low Wind NE'].value_counts()

-999    43184
 0       2084
 60       364
 90       304
 120      285
        ...  
 195        1
 280        1
 165        1
 620        1
 710        1
Name: Low Wind NE, Length: 65, dtype: int64

In [7]:
# 43184 records have no wind radii data; the 5921 records that do are all from 2004 onward.
# The analysis needs the full historical range, so AL_Hurricane_df is built without the wind radii columns.
wind_radii_columns = [
    "Low Wind NE", "Low Wind SE", "Low Wind SW", "Low Wind NW",
    "Moderate Wind NE", "Moderate Wind SE", "Moderate Wind SW", "Moderate Wind NW",
    "High Wind NE", "High Wind SE", "High Wind SW", "High Wind NW",
]
AL_Hurricane_df = atlantic_df.drop(columns=wind_radii_columns)
AL_Hurricane_df.sample(10)

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure
30290,AL161971,UNNAMED,19710910,1800,,TD,16.0N,18.0W,20,-999
30360,AL191971,JANICE,19710921,600,,TD,11.5N,38.8W,25,-999
43907,AL062005,FRANKLIN,20050726,1800,,TS,32.2N,68.3W,35,1001
32411,AL191976,UNNAMED,19761006,600,,TD,16.2N,55.0W,25,-999
31542,AL181974,UNNAMED,19741006,1200,,SS,25.0N,78.7W,40,1006
35803,AL051986,CHARLEY,19860827,600,,EX,55.5N,5.0E,45,983
27938,AL121967,CHLOE,19670906,0,,TD,16.4N,26.0W,30,-999
19544,AL021941,UNNAMED,19410920,1200,,TS,24.1N,88.0W,60,-999
20158,AL091943,UNNAMED,19431011,1200,,TS,12.7N,61.0W,50,-999
3334,AL051878,UNNAMED,18780912,1800,,TS,34.0N,80.1W,60,-999


In [8]:
# Maximum wind is the main determiner of storm intensity.
# Per PDF: Maximum sustained surface wind: This is defined as the maximum 1-min average wind associated with the 
# tropical cyclone at an elevation of 10 m with an unobstructed exposure. Values are given to the nearest 10 kt 
# for the years 1851 through 1885 and to the nearest 5 kt from 1886 onward. A value is assigned for every cyclone 
# at every best track time. Note that the non-developing tropical depressions of 1967 did not have intensities assigned
# to them in the b-decks. These are indicated as “-99” currently.

# Tally every Maximum Wind value; the -99 row of the result is the number of records with no wind assigned.
AL_Hurricane_df['Maximum Wind'].value_counts()

 30     5900
 40     4582
 35     4515
 25     4432
 50     4225
 45     3413
 60     3016
 70     2875
 55     2124
 65     2098
 80     1838
 90     1634
 85     1485
 75     1442
 20     1237
 100     773
 95      669
 105     652
 110     508
-99      338
 115     318
 120     300
 15      193
 125     173
 130     112
 140      64
 10       61
 135      56
 145      30
 150      26
 155       9
 160       5
 165       1
 32        1
Name: Maximum Wind, dtype: int64

In [9]:
# The counts above show 338 records with no maximum wind (-99). AL_MW_df collects them
# so the rest of their data can be inspected.
missing_wind = AL_Hurricane_df['Maximum Wind'].eq(-99)
AL_MW_df = AL_Hurricane_df[missing_wind]
AL_MW_df

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure
27768,AL011967,UNNAMED,19670610,1200,,TD,18.0N,85.0W,-99,-999
27769,AL011967,UNNAMED,19670610,1800,,TD,18.0N,85.2W,-99,-999
27770,AL011967,UNNAMED,19670611,0,,TD,18.0N,85.5W,-99,-999
27771,AL011967,UNNAMED,19670611,600,,TD,18.0N,85.8W,-99,-999
27772,AL011967,UNNAMED,19670611,1200,,TD,17.9N,86.0W,-99,-999
...,...,...,...,...,...,...,...,...,...,...
35746,AL041986,UNNAMED,19860805,1800,,TD,28.0N,97.8W,-99,-999
35848,AL071986,UNNAMED,19860904,1200,,TD,22.9N,99.0W,-99,-999
36142,AL081987,UNNAMED,19870908,1800,,TD,14.0N,83.9W,-99,-999
36148,AL091987,UNNAMED,19870908,600,,TD,34.8N,78.5W,-99,-999


In [10]:
# Distribution of Minimum Pressure among the records with missing (-99) Maximum Wind; -999 marks a missing pressure.
AL_MW_df['Minimum Pressure'].value_counts()

-999    338
Name: Minimum Pressure, dtype: int64

In [11]:
# Distribution of Status among the records with missing (-99) Maximum Wind.
AL_MW_df['Status'].value_counts()

 TD    338
Name: Status, dtype: int64

In [12]:
# Distribution of Event among the records with missing (-99) Maximum Wind.
AL_MW_df['Event'].value_counts()

      338
Name: Event, dtype: int64

In [13]:
# The Minimum Pressure, Status and Event counts above show that all 338 records without a
# maximum wind are TD (Tropical depression) records.
# Because the sample is small, those records are dropped.

wind_recorded = AL_Hurricane_df['Maximum Wind'].ne(-99)
AL_Hurricane_df = AL_Hurricane_df[wind_recorded]
AL_Hurricane_df

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure
0,AL011851,UNNAMED,18510625,0,,HU,28.0N,94.8W,80,-999
1,AL011851,UNNAMED,18510625,600,,HU,28.0N,95.4W,80,-999
2,AL011851,UNNAMED,18510625,1200,,HU,28.0N,96.0W,80,-999
3,AL011851,UNNAMED,18510625,1800,,HU,28.1N,96.5W,80,-999
4,AL011851,UNNAMED,18510625,2100,L,HU,28.2N,96.8W,80,-999
...,...,...,...,...,...,...,...,...,...,...
49100,AL122015,KATE,20151112,1200,,EX,41.3N,50.4W,55,981
49101,AL122015,KATE,20151112,1800,,EX,41.9N,49.9W,55,983
49102,AL122015,KATE,20151113,0,,EX,41.5N,49.2W,50,985
49103,AL122015,KATE,20151113,600,,EX,40.8N,47.5W,45,985


In [14]:
# Tally Minimum Pressure values to confirm how many records are missing it (-999).
AL_Hurricane_df['Minimum Pressure'].value_counts()

-999     30331
 1005      883
 1008      846
 1006      808
 1009      800
         ...  
 902         1
 899         1
 889         1
 888         1
 907         1
Name: Minimum Pressure, Length: 130, dtype: int64

In [15]:
# Based on the above counts there are 30331 records with Minimum Pressure missing (-999).
# AL_MP_df keeps the other records, i.e. those that DO have a Minimum Pressure value (!= -999),
# to see what kind of data they carry.
# NOTE(review): the original comment described this frame as the records w/ -999 Minimum Pressure, but the
# filter below selects the opposite set; confirm which was intended.
AL_MP_df = AL_Hurricane_df[AL_Hurricane_df['Minimum Pressure'] != -999]
AL_MP_df

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure
127,AL011852,UNNAMED,18520826,600,L,HU,30.2N,88.6W,100,961
252,AL031853,UNNAMED,18530903,1200,,HU,19.7N,56.2W,130,924
346,AL031854,UNNAMED,18540907,1200,,HU,28.0N,78.6W,110,938
351,AL031854,UNNAMED,18540908,1800,,HU,31.6N,81.1W,100,950
352,AL031854,UNNAMED,18540908,2000,L,HU,31.7N,81.1W,100,950
...,...,...,...,...,...,...,...,...,...,...
49100,AL122015,KATE,20151112,1200,,EX,41.3N,50.4W,55,981
49101,AL122015,KATE,20151112,1800,,EX,41.9N,49.9W,55,983
49102,AL122015,KATE,20151113,0,,EX,41.5N,49.2W,50,985
49103,AL122015,KATE,20151113,600,,EX,40.8N,47.5W,45,985


In [16]:
# According to SECOORA (Southeast Coastal Ocean Observing Regional Association), lower central pressure creates a
# stronger gradient from outside to inside the system. The stronger this pressure gradient is, the greater the
# maximum wind speeds around the eye wall (https://secoora.org/hurricane_glossary/)

# Our PDF states that the Minimum Pressure column is the Central Pressure. These values are given to the nearest
# millibar. Originally, central pressure best track values were only included if there was a specific
# observation that could be used explicitly. Missing central pressure values are noted as “-999”.

# Based on this, only the records that have a Minimum Pressure value are kept.
pressure_recorded = AL_Hurricane_df['Minimum Pressure'].ne(-999)
AL_Hurricane_df = AL_Hurricane_df[pressure_recorded]
AL_Hurricane_df

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure
127,AL011852,UNNAMED,18520826,600,L,HU,30.2N,88.6W,100,961
252,AL031853,UNNAMED,18530903,1200,,HU,19.7N,56.2W,130,924
346,AL031854,UNNAMED,18540907,1200,,HU,28.0N,78.6W,110,938
351,AL031854,UNNAMED,18540908,1800,,HU,31.6N,81.1W,100,950
352,AL031854,UNNAMED,18540908,2000,L,HU,31.7N,81.1W,100,950
...,...,...,...,...,...,...,...,...,...,...
49100,AL122015,KATE,20151112,1200,,EX,41.3N,50.4W,55,981
49101,AL122015,KATE,20151112,1800,,EX,41.9N,49.9W,55,983
49102,AL122015,KATE,20151113,0,,EX,41.5N,49.2W,50,985
49103,AL122015,KATE,20151113,600,,EX,40.8N,47.5W,45,985


In [17]:
# Check the data type of each column.
AL_Hurricane_df.dtypes

ID                  object
Name                object
Date                 int64
Time                 int64
Event               object
Status              object
Latitude            object
Longitude           object
Maximum Wind         int64
Minimum Pressure     int64
dtype: object

In [18]:
# I have 18,436 records with significant data. I now need to finish cleaning the df to prepare for ML.
# The ID column is needed, but it is an object because its first two characters are letters. Per PDF, AL means
# Atlantic basin. Since we are only working with Atlantic Ocean hurricanes, the letters are replaced with "",
# i.e. removed from the column. The rest of the ID works as explained in PDF:
# AL (Spaces 1 and 2) – Basin – Atlantic -> THIS IS REMOVED WITH REPLACE METHOD
# 09 (Spaces 3 and 4) – ATCF cyclone number for that year -> THIS IS KEPT FOR ID
# 2011 (Spaces 5-8, before first comma) – Year -> THIS IS KEPT FOR ID
AL_Hurricane_df["ID"] = AL_Hurricane_df["ID"].str.replace("AL","")
AL_Hurricane_df

Unnamed: 0,ID,Name,Date,Time,Event,Status,Latitude,Longitude,Maximum Wind,Minimum Pressure
127,011852,UNNAMED,18520826,600,L,HU,30.2N,88.6W,100,961
252,031853,UNNAMED,18530903,1200,,HU,19.7N,56.2W,130,924
346,031854,UNNAMED,18540907,1200,,HU,28.0N,78.6W,110,938
351,031854,UNNAMED,18540908,1800,,HU,31.6N,81.1W,100,950
352,031854,UNNAMED,18540908,2000,L,HU,31.7N,81.1W,100,950
...,...,...,...,...,...,...,...,...,...,...
49100,122015,KATE,20151112,1200,,EX,41.3N,50.4W,55,981
49101,122015,KATE,20151112,1800,,EX,41.9N,49.9W,55,983
49102,122015,KATE,20151113,0,,EX,41.5N,49.2W,50,985
49103,122015,KATE,20151113,600,,EX,40.8N,47.5W,45,985


In [19]:
# Re-check the dtypes after stripping the AL prefix from ID.
AL_Hurricane_df.dtypes

ID                  object
Name                object
Date                 int64
Time                 int64
Event               object
Status              object
Latitude            object
Longitude           object
Maximum Wind         int64
Minimum Pressure     int64
dtype: object

In [20]:
# The AL prefix is removed but ID is still an object, so convert it to a number.
# Note: the numeric conversion drops the leading zero (e.g. "011852" becomes 11852), and errors='coerce'
# turns any value that cannot be parsed into NaN instead of raising.
AL_Hurricane_df["ID"] = pd.to_numeric(AL_Hurricane_df["ID"], errors='coerce')
AL_Hurricane_df.dtypes

ID                   int64
Name                object
Date                 int64
Time                 int64
Event               object
Status              object
Latitude            object
Longitude           object
Maximum Wind         int64
Minimum Pressure     int64
dtype: object

In [21]:
# Name is left as is for now. New Year, Month and Day columns are wanted because they could be useful later
# for visualization purposes. To build them, the int64 Date column is replaced with its string version.
AL_Hurricane_df['Date'] = AL_Hurricane_df['Date'].astype(str)


In [22]:
# With Date available as a YYYYMMDD string, slice out the year, month and day parts.
# astype(str) is a no-op when Date is already a string and keeps this cell runnable on its own.
date_text = AL_Hurricane_df['Date'].astype(str)

# Convert each part from its OWN slice so Month and Day hold the real month/day values
# (not a copy of Year). errors='coerce' turns malformed dates into NaN instead of raising.
AL_Hurricane_df['Year'] = pd.to_numeric(date_text.str[0:4], errors='coerce')
AL_Hurricane_df['Month'] = pd.to_numeric(date_text.str[4:6], errors='coerce')
AL_Hurricane_df['Day'] = pd.to_numeric(date_text.str[6:8], errors='coerce')


def _to_signed_degrees(coords):
    """Convert coordinate strings such as '28.0N' or '94.8W' to signed decimal degrees.

    The raw columns carry a hemisphere letter, so a plain pd.to_numeric(..., errors='coerce')
    would turn every value into NaN. Here the letter is parsed instead:
    N and E stay positive, S and W become negative.

    Values that are already numeric (e.g. when this cell is re-run) pass through unchanged;
    anything that cannot be parsed becomes NaN.

    Parameters
    ----------
    coords : pandas.Series
        Latitude or longitude values, as strings with a trailing N/S/E/W or as numbers.

    Returns
    -------
    pandas.Series
        float64 values in signed decimal degrees, aligned to the input index.
    """
    text = coords.astype(str).str.strip().str.upper()
    # Group 0: the (possibly already signed) magnitude; group 1: optional hemisphere letter.
    parts = text.str.extract(r'^(-?\d+(?:\.\d*)?)\s*([NSEW]?)$')
    magnitude = pd.to_numeric(parts[0], errors='coerce')
    sign = parts[1].isin(['S', 'W']).map({True: -1.0, False: 1.0})
    return magnitude * sign


# Latitude and Longitude become float64 signed degrees (south / west are negative).
AL_Hurricane_df["Latitude"] = _to_signed_degrees(AL_Hurricane_df["Latitude"])
AL_Hurricane_df["Longitude"] = _to_signed_degrees(AL_Hurricane_df["Longitude"])

# Confirm the new date columns are int64 and the coordinates are float64.
AL_Hurricane_df.dtypes

ID                    int64
Name                 object
Date                 object
Time                  int64
Event                object
Status               object
Latitude            float64
Longitude           float64
Maximum Wind          int64
Minimum Pressure      int64
Year                  int64
Month                 int64
Day                   int64
dtype: object

In [24]:
from sqlalchemy import create_engine

In [25]:
from config import db_password

In [26]:
# db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/movie_data"

In [27]:
# engine = create_engine(db_string)