# Pacific Hurricanes 1975 extractor
Web scraper for the 1975 Pacific hurricane season Wikipedia page

![](1975_Pacific_hurricane_season_summary_map_new.png)

## Fetch and parse the page

In [1]:
import functions as f

In [2]:
# Source page: the Wikipedia article for the 1975 Pacific hurricane season.
url = "https://en.wikipedia.org/wiki/1975_Pacific_hurricane_season"
# Hand the URL to the project helper; the object it returns is queried with
# find()/find_all() in the cells below.
soup = f.get_url_content(url)

Successfully fetched the requested page.


## Hurricane/storm names

In [3]:
# Each section heading of interest is a <div class="mw-heading mw-heading3">
# wrapping an <h3>; the stripped text of that <h3> is kept as the name.
div_element = soup.find_all('div', {'class': 'mw-heading mw-heading3'})
hurricane_storm_name = [heading.find('h3').text.strip() for heading in div_element]

## Start/end dates

In [4]:
# find_all() matches any table that carries the "infobox" class, including
# tables with additional classes; keep only those whose class list is
# exactly ["infobox"].
table_element = soup.find_all('table', {'class': 'infobox'})
exact_class_tables = [table for table in table_element if table['class'] == ['infobox']]
start_date = []
end_date = []
for infobox in exact_class_tables:
    # The first "infobox-data" cell is read as "<start> – <end>".
    # NOTE(review): assumes that first cell is always the duration row - confirm.
    duration_text = infobox.find('td', {'class': 'infobox-data'}).text.strip()
    # Split on the en dash alone and trim each side. str.strip() also removes
    # the non-breaking space (\xa0) Wikipedia puts before the dash, so the
    # result no longer depends on the exact whitespace around the separator.
    first_day, _dash, last_day = duration_text.partition('–')
    first_day = first_day.strip()
    # When there is no dash (or nothing after it), treat the entry as a
    # single-day range instead of raising IndexError.
    last_day = last_day.strip() or first_day
    start_date.append(first_day)
    end_date.append(last_day)

## Number of deaths and list of affected areas

In [5]:
# Extract the descriptive text of each hurricane/storm section.
all_h3 = soup.find_all('div', {'class': 'mw-heading mw-heading3'})
# A section ends at the next heading of the same (h3) or a higher (h2) level.
section_end_classes = {'mw-heading2', 'mw-heading3'}
info_text = []
for h3 in all_h3:
    content_between_h3 = []
    # Walk the tags that follow this heading at the same nesting level.
    # find_next_siblings() has no `recursive` parameter (an unknown keyword is
    # interpreted as an attribute filter), so pass True to match every tag.
    for sibling in h3.find_next_siblings(True):
        # Stop at the next section heading. Test class membership rather than
        # comparing the whole attrs dict so an extra attribute on the heading
        # <div> cannot hide the boundary.
        if sibling.name == 'div' and section_end_classes.intersection(sibling.get('class') or []):
            break
        # Collect the text of every <p> inside the section.
        if sibling.name == 'p':
            content_between_h3.append(sibling.text.strip())
    # Join with a space so the last word of one paragraph is not fused with
    # the first word of the next.
    info_text.append(' '.join(content_between_h3))
# Drop the last list item.
# NOTE(review): presumably the final h3 heading is not a storm section - confirm.
info_text = info_text[:-1]

In [6]:
# Send each section's text to the project's extraction helper and keep the
# text of the first choice of every reply. Duration ~15 s (the original note
# says the helper uses gpt-3.5-turbo). The token usage reported on each
# response is accumulated so it can be logged at the end of the notebook.
result = []
tokens_used = prompt_tokens = completion_tokens = 0
for section_text in info_text:
    reply = f.extract_info_from_text(section_text)
    result.append(reply.choices[0].message.content)
    usage = reply.usage
    tokens_used += usage.total_tokens
    prompt_tokens += usage.prompt_tokens
    completion_tokens += usage.completion_tokens

In [7]:
# # parse the required information from the scraped text using gpt-3.5-turbo. Duration~15s
# result=[]
# for text in info_text:
#     result.append(f.extract_info_from_text(text))

In [8]:
import ast
import json


def _parse_llm_reply(raw):
    """Convert one textual reply into a Python dictionary.

    The reply may be strict JSON or a Python-style dict literal with single
    quotes. Blindly swapping every apostrophe for a double quote corrupts
    values that contain an apostrophe (e.g. "Mexico's coast"), so that swap
    is only used as a last resort.

    Raises:
        json.JSONDecodeError: if the text cannot be parsed by any strategy.
    """
    try:
        # Preferred: the reply is already valid JSON.
        return json.loads(raw)
    except json.JSONDecodeError:
        pass
    try:
        # Python literal syntax (single quotes, None/True/False), parsed
        # safely without evaluating arbitrary code.
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Legacy behaviour, kept for replies such as {'a': null} that are
        # neither valid JSON nor a valid Python literal.
        return json.loads(raw.replace("'", '"'))


deaths = []
affected_areas = []
for item in result:
    # Convert the parsed info into a Python dictionary.
    info = _parse_llm_reply(item)
    # Pull the two fields of interest out of the dictionary.
    deaths.append(info['number_of_deaths'])
    affected_areas.append(info['areas_affected'])

## Clean the data and export it to a CSV file

In [9]:
import pandas as pd

# Pair every output column name with the list scraped for it. The name list
# is trimmed by one entry, mirroring the trim applied to the section texts.
# NOTE(review): presumably the final h3 heading is not a storm - confirm.
column_names = (
    'hurricane_storm_name',
    'date_start',
    'date_end',
    'number_of_deaths',
    'list_of_areas_affected',
)
column_values = (
    hurricane_storm_name[:-1],
    start_date,
    end_date,
    deaths,
    affected_areas,
)
columns = dict(zip(column_names, column_values))

# Build the table; pandas raises if the lists differ in length.
df = pd.DataFrame(data=columns)

In [10]:
# Run both date columns through the project's convert_date helper.
for date_column in ('date_start', 'date_end'):
    df[date_column] = df[date_column].apply(f.convert_date)

In [11]:
def _areas_to_text(areas):
    """Render one storm's affected areas as a single comma-separated string.

    A value that is already a string is returned unchanged: joining a string
    would split it into single characters ("M, e, x, ..."), which is what
    happened when this cell was executed a second time or when the parsed
    reply held a bare string instead of a list. None becomes an empty string.
    """
    if areas is None:
        return ''
    if isinstance(areas, str):
        return areas
    return ', '.join(str(area) for area in areas)


# Join the areas from each area list into one string per row.
df['list_of_areas_affected'] = df['list_of_areas_affected'].apply(_areas_to_text)

In [12]:
# Write the table to disk; index=False leaves the row index out of the file.
output_csv = 'hurricanes_1975.csv'
df.to_csv(output_csv, index=False)

In [16]:
# Record the accumulated token counters, one "name: value" pair per line
# (no trailing newline after the last pair).
usage_lines = [
    'tokens_used: ' + str(tokens_used),
    'prompt_tokens: ' + str(prompt_tokens),
    'completion_tokens: ' + str(completion_tokens),
]
with open('log.txt', 'w') as log_file:
    log_file.write('\n'.join(usage_lines))