In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
import csv
import datetime

In [2]:
FINAL_YEAR_IN_DATASET = int(datetime.datetime.now().date().strftime("%Y"))

FINAL_YEAR_IN_DATASET

2022

In [3]:
year_urls = {}
for year in range(1800, 1850, 10):
    year_urls[year] = "https://en.wikipedia.org/wiki/%ss_Atlantic_hurricane_seasons" % (str(year))
for year in range(1850, FINAL_YEAR_IN_DATASET+1):
    year_urls[year] = "https://en.wikipedia.org/wiki/%s_Atlantic_hurricane_season" % (str(year))
    
len(year_urls)

178

In [4]:
year_pages = {}
for year in year_urls:
    r = requests.get(year_urls[year])
    page = BS(r.text, "html.parser")
    year_pages[year] = page
len(year_pages)

178

In [5]:
hurricane_urls = []
for year in year_pages:
    page = year_pages[year]
    for url in page.find_all("div", {"class": "hatnote navigation-not-searchable"}):
        if 'main article' in url.get_text().lower():
            hurr_url = "https://en.wikipedia.org" + url.find('a')['href']
            if hurr_url not in hurricane_urls:
                hurricane_urls.append(hurr_url)
                
len(hurricane_urls)

622

In [6]:
hurricane_urls

['https://en.wikipedia.org/wiki/1804_Antigua%E2%80%93Charleston_hurricane',
 'https://en.wikipedia.org/wiki/1804_Snow_hurricane',
 'https://en.wikipedia.org/wiki/1806_Great_Coastal_hurricane',
 'https://en.wikipedia.org/wiki/1812_Louisiana_hurricane',
 'https://en.wikipedia.org/wiki/1815_North_Carolina_hurricane',
 'https://en.wikipedia.org/wiki/Great_September_Gale_of_1815',
 'https://en.wikipedia.org/wiki/1821_Norfolk_and_Long_Island_hurricane',
 'https://en.wikipedia.org/wiki/1826_Canary_Islands_storm',
 'https://en.wikipedia.org/wiki/1827_North_Carolina_hurricane',
 'https://en.wikipedia.org/wiki/Great_Barbados_hurricane',
 'https://en.wikipedia.org/wiki/Racer%27s_hurricane',
 'https://en.wikipedia.org/wiki/1842_Atlantic_hurricane_season',
 'https://en.wikipedia.org/wiki/Great_Havana_Hurricane_of_1846',
 'https://en.wikipedia.org/wiki/1848_Tampa_Bay_hurricane',
 'https://en.wikipedia.org/wiki/1856_Last_Island_hurricane',
 'https://en.wikipedia.org/wiki/1867_San_Narciso_hurricane',


In [7]:
pages = {}
for url in hurricane_urls:
    r = requests.get(url)
    pages[url] = BS(r.text, "html.parser")

In [8]:
hurr_data = {}
for url in pages:
    page = pages[url]
    title_full = page.find('title').get_text()
    title = title_full.split("-")[0].strip()
    table = page.find('table')
    fields = {}
    for tr in page.find_all("tr"):
        tds = tr.find_all(["td", "th"])
        tds = [td.get_text().strip().lower() for td in tds]
        if len(tds) != 2:
            continue
        fields[tds[0]] = tds[1].strip().lower().replace(",", "")
    hurr_data[title] = fields

len(hurr_data)

622

In [9]:
def clean_name(hurr, formed):
    name = hurr.replace('Tropical Storm', 'Hurricane').replace('Tropical storms', 'Hurricane').split('Hurricane')[-1].strip()
    if name == '':
        name = hurr
    name = name.split('(')[0].strip()
    for hurr_type in ['subtropical storm', 'tropical depression', 'tropical cyclone', 'potential tropical cyclone']:
        if name.lower().startswith(hurr_type):
            name = formed[-4:] + ' ' + name
    return name

In [10]:
def clean_date(date):
    date = date.split('(')[0].replace('\xa0', ' ').replace('  ', ' ').strip()
    if len(date.split()) != 3:
        return None
    month_list = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
    months = {}
    for month in range(len(month_list)):
        mm = str(month)
        if len(mm) < 2:
            mm = '0' + mm
        months[month_list[month]] = mm
    month, dd, yyyy = date.split()
    if dd in month_list:
        month, dd = dd, month
    mm = months[month]
    if len(dd) < 2:
            dd = '0' + dd
    return '%s/%s/%s' % (mm[:2], dd[:2], yyyy[:4])

In [11]:
def clean_damage(damage):
    if damage in ['unknown', 'millions']:
        return None
    elif damage in ['minimal', 'none']:
        return '0'
    damage = damage.split("$")[-1].replace('\xa0', ' ').split('–')[-1].split(" ")
    num = float(damage[0])
    if int(num) == num:
        num = int(num)
    num = str(num)
    if len(damage) > 1 and damage[1] == "million":
        final_damage = num + "M"
    elif len(damage) > 1 and damage[1] == "billion":
        final_damage = num + "B"
    elif str(damage[0])[-3:] == "000":
        final_damage = num[:-3] + "K"
    else:
        final_damage = num
    return final_damage

In [12]:
def clean_deaths(deaths):
    if 'no ' in deaths or 'none' in deaths:
        return '0'
    deaths = deaths.replace('\xa0', ' ')
    deaths = deaths.replace('≥', '').replace('at least', '').replace('up to', '').replace('over', '')
    deaths = deaths.replace('c.', '').replace('+', '').replace('>', '').replace('~', '')
    if '–' in deaths:
        deaths = deaths.split('–')[1]
    elif '-' in deaths:
        deaths = deaths.split('-')[1]
    deaths = deaths.replace('indirect', 'total').replace('direct', 'total').replace('all', 'total')
    deaths = deaths.replace('reported', 'total').replace('related', 'total').replace('confirmed', 'total')
    deaths = deaths.replace('deaths', 'total').replace('dead', 'total').replace('overall', 'total')
    deaths = deaths.split('[')[0].split()[0]
    deaths = deaths.split('total')[0].strip('( ')
    try:
        return str(int(deaths))
    except:
        return None

In [13]:
full_data_hurr = []
for hurr in hurr_data:
    bad_hurr = False
    fields = list(hurr_data[hurr].keys())
    for key in ['formed', 'dissipated', 'highest winds', 'fatalities', 'damage']:
        if key not in fields:
            bad_hurr = True
    if bad_hurr == False:
        formed = clean_date(hurr_data[hurr]['formed'])
        dissipated = clean_date(hurr_data[hurr]['dissipated'])
        mph = int(hurr_data[hurr]['highest winds'].split(":")[-1].split('mph')[0].split('(')[-1].strip())
        damage = clean_damage(hurr_data[hurr]['damage'])
        deaths = clean_deaths(hurr_data[hurr]['fatalities'])
        if formed != None and dissipated != None and damage != None and deaths != None:
            final_hurr_data = {}
            final_hurr_data['name'] = clean_name(hurr, formed)
            final_hurr_data['formed'] = formed
            final_hurr_data['dissipated'] = dissipated
            final_hurr_data['mph'] = mph
            final_hurr_data['damage'] = damage
            final_hurr_data['deaths'] = deaths
            full_data_hurr.append(final_hurr_data)
        
len(full_data_hurr)

547

In [14]:
df = pd.DataFrame(full_data_hurr)
df.to_csv("hurricanes.csv", index=False)
print("Open 'hurricanes.csv' to find the extracted data")
df

Open 'hurricanes.csv' to find the extracted data


Unnamed: 0,name,formed,dissipated,mph,damage,deaths
0,1804 New England hurricane,09/04/1804,09/11/1804,110,100K,16
1,1806 Great Coastal hurricane,07/17/1806,07/25/1806,110,171K,24
2,1812 Louisiana hurricane,07/15/1812,07/20/1812,115,6M,100
3,1821 Norfolk and Long Island hurricane,08/01/1821,08/04/1821,130,200K,22
4,1848 Tampa Bay hurricane,08/23/1848,08/28/1848,130,20K,0
...,...,...,...,...,...,...
542,Mindy,08/08/2021,08/11/2021,60,75.169M,23
543,Nicholas,08/12/2021,08/20/2021,75,1.1B,4
544,Alex,05/05/2022,05/14/2022,70,104K,4
545,Bonnie,06/01/2022,06/11/2022,115,0,5
