# Tide scraper 

## Import modules

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from datetime import datetime

import wget

import zipfile

import platform

import os.path

import json

import pandas as pd

## Setup Input locations

In [2]:
location_l = [
    "Half Moon Bay, California",
    "Huntington Beach, California",
    "Providence, Rhode Island",
    "Wrightsville Beach, North Carolina",
]

In [3]:
home_url = "https://www.tide-forecast.com/"

## Setup driver

### If running on widows download windows chromedriver 

In [4]:
web_driver_path = './chromedriver.exe'

In [5]:
if( os.path.exists(web_driver_path)  ):
    print("chrome driver is in current directory")
else:
    print("chrome driver not in current directory, will try to download")

    if( 'Windows' in platform.platform() ):

        filename = wget.download('https://chromedriver.storage.googleapis.com/100.0.4896.60/chromedriver_win32.zip')

        print(filename)

        with zipfile.ZipFile(filename, 'r') as zip_ref:
            try:
                zip_ref.extractall('./')
            except:
                print("Issue found extracting driver to current folder")

    else:
        print("Please place chromdrive in current directory")

chrome driver is in current directory


### Initialize driver

In [6]:
options = webdriver.ChromeOptions()
options.add_argument('--ignore-ssl-errors=yes')
options.add_argument('--ignore-certificate-errors')

driver = webdriver.Chrome(web_driver_path, options=options)

### Initialize json to capture data

## Run webscraper to get data

In [7]:
# Initialize data object to collect data from webpage
j = {}
# Loop over input locations 
for location_i in location_l:
    # Always start at the home page
    driver.get(home_url)
    # initialize data objects
    location_json = {}
    days = {}
    # Get location name for list 
    loc_city = location_i.split(",")[0].strip()
    loc_state = location_i.split(",")[1].strip()
    #
    print("Searching for {}, {}".format(loc_city, loc_state))
    location_found = False
    # Search for location using search bar
    driver.find_element_by_id("homepage-mast__location").send_keys( location_i )
    el_s = driver.find_element_by_xpath("/html/body/main/div/div[1]/div[1]/div[1]/div/div[1]/div[1]/form/div/div[2]/input")
    el_s.click()
    # If location page is found 
    if( loc_city in  driver.title ):
        location_found = True
    else:
        # Search for location using search bar region_id
        driver.find_element_by_id("region_id").send_keys( loc_state )
        driver.find_element_by_id("location_filename_part").send_keys( loc_city )
        el_s = driver.find_element_by_xpath("/html/body/header/div[2]/div/nav/div/div/form/div/div/div[4]/div/input")
        el_s.click()        
        # If location page is found 
        if( loc_city in  driver.title ):
            location_found = True     
    if( location_found ):
        print("{} page found".format(location_i) )
        # Loop over days on the page
        for tideday in driver.find_elements_by_class_name('tide-day'):
            # print(tideday)          
            # Get date for day 
            for tideday__date in tideday.find_elements_by_class_name('tide-day__date'):
                # Parse date from string 
                day_title = tideday__date.text
                day_date = day_title.split(':')[1].strip()
                # print(day_date)
                # Initialize lists for tide information 
                day ={}
                tide_l = []
                tide_dt_l = []
                tide_h_l = []
                sr_l = []
                sr_dt_l = []
                # Loop over tide table
                for tideday__tables in tideday.find_elements_by_class_name('tide-day__tables'):
                    for row in tideday__tables.find_elements_by_css_selector('tr'):
                        cell_l = row.find_elements_by_tag_name('td') 
                        # ToDo: 
                        #   Use class table classes "tide-day-tides" and "not-in-print tide-day__sun-moon"
                        #   to split table reads 
                        # Check if it is a tide table
                        if( ('Tide' in row.text) & (len(cell_l) == 3 ) ):
                            # print(row.text)
                            tide_l.append(cell_l[0].text)
                            tide_dt_l.append(cell_l[1].text)
                            tide_h_l.append(cell_l[2].text)
                        # Check if it is a sunrise sunset table
                        elif( len(cell_l) == 4  ):
                            for cell in cell_l:
                                sr_text_l = cell.text.split('\n')
                                if( len(sr_text_l) > 1 ):
                                    sr_l.append(sr_text_l[0])
                                    sr_dt_l.append(sr_text_l[1])
                # Place lists in a dictionary 
                day['tides'] = {'Tide':tide_l,'Datetime':tide_dt_l,'Height':tide_h_l}
                # Place lists in a dictionary 
                day['sr'] = {'Event':sr_l,'Datetime':sr_dt_l}
                # Add day to days dictionary
                days[day_date] = day
        # ToDo: Add scraper for today's infromation
        # Add days to location dictionary 
        location_json['days'] = days
        location_json['city'] = loc_city
        location_json['state'] = loc_state
        j[location_i] = location_json
    else:
        print("Error {} not found".format(loc_city) )

Searching for Half Moon Bay, California
Half Moon Bay, California page found
Searching for Huntington Beach, California
Huntington Beach, California page found
Searching for Providence, Rhode Island
Providence, Rhode Island page found
Searching for Wrightsville Beach, North Carolina
Wrightsville Beach, North Carolina page found


In [8]:
driver.close()

## Process data scrapped from website

In [9]:
def tide_check(day, day_data):
    '''Process day data and find low tide during day light hours
    
    Args:
        day (str): date ('%A %d %B %Y')
        day_data (dic): dictionary containing tide and sunrise/sunset data from web.

    Returns:
        df (DataFrame): time series of sunrise, sunset and tide events 
 
    '''
    # Create dataframes
    df_sr = pd.DataFrame( day_data['sr'] )
    df_tides = pd.DataFrame( day_data['tides'] )
    # Clean names 
    df_sr['Event'] = df_sr['Event'].str.replace(":","")
    # Clean times 
    df_sr['Datetime'] = df_sr['Datetime'].str.replace('00','12')
    df_tides['Datetime'] = df_tides['Datetime'].str.replace('00','12')
     # Create datetime objeccts 
    df_sr['Datetime_obj'] = df_sr['Datetime'].apply(lambda v:  datetime.strptime("{} {}".format(day, v), '%A %d %B %Y %I:%M%p' ) )
    df_tides['Datetime_obj'] = df_tides['Datetime'].apply(lambda v:  datetime.strptime("{} {}".format(day, v.split('\n')[0]), '%A %d %B %Y %I:%M %p' ) )
   # Find sunrise and sunset 
    sunrise = df_sr.loc[ df_sr['Event']== 'Sunrise', 'Datetime_obj' ].iloc[0]
    sunset = df_sr.loc[ df_sr['Event']== 'Sunset', 'Datetime_obj' ].iloc[0]
    print("{} Sunrise:{} and Sunset:{}".format(day, sunrise,sunset))
    # Filter low tide during the day light hours 
    df_tides['daylight_lowtide'] = (df_tides['Tide'] == 'Low Tide') & (df_tides['Datetime_obj'] >= sunrise ) & (df_tides['Datetime_obj'] <= sunset )
    df_sr['daylight_lowtide'] = False
    # Set type 
    df_tides['type'] = 'tide'
    df_sr['type'] = 'sr'
    # Set day 
    day_obj  =  datetime.strptime(day,'%A %d %B %Y')
    df_tides['day_obj'] = day_obj
    df_sr['day_obj'] = day_obj
    # Combine sr and tide data into single time series 
    df_tides = df_tides.rename(columns={'Tide':'Event'})
    return df_tides.append( df_sr )

In [11]:
# Loop over extracted location data 
for location_i, location_data in j.items():
    print(location_i,location_data['city'],location_data['state'])

Half Moon Bay, California Half Moon Bay California
Huntington Beach, California Huntington Beach California
Providence, Rhode Island Providence Rhode Island
Wrightsville Beach, North Carolina Wrightsville Beach North Carolina


In [12]:
# Initialize dataframe object 
df = pd.DataFrame()
# Loop over extracted location data 
for location_i, location_data in j.items():
    print(location_i,location_data['city'],location_data['state'])
    # Loop over days in and create time series for  sunrise, sunset and tide events 
    for day, day_data in location_data['days'].items():
        df_day = tide_check(day, day_data)
        df_day['City'] = location_data['city']
        df_day['State'] = location_data['state']
        df = df.append(df_day)

Half Moon Bay, California Half Moon Bay California
Thursday 05 May 2022 Sunrise:2022-05-05 06:10:00 and Sunset:2022-05-05 20:03:00
Friday 06 May 2022 Sunrise:2022-05-06 06:09:00 and Sunset:2022-05-06 20:04:00
Saturday 07 May 2022 Sunrise:2022-05-07 06:08:00 and Sunset:2022-05-07 20:05:00
Sunday 08 May 2022 Sunrise:2022-05-08 06:07:00 and Sunset:2022-05-08 20:06:00
Monday 09 May 2022 Sunrise:2022-05-09 06:06:00 and Sunset:2022-05-09 20:07:00
Tuesday 10 May 2022 Sunrise:2022-05-10 06:05:00 and Sunset:2022-05-10 20:08:00
Wednesday 11 May 2022 Sunrise:2022-05-11 06:04:00 and Sunset:2022-05-11 20:09:00
Thursday 12 May 2022 Sunrise:2022-05-12 06:03:00 and Sunset:2022-05-12 20:10:00
Friday 13 May 2022 Sunrise:2022-05-13 06:02:00 and Sunset:2022-05-13 20:10:00
Saturday 14 May 2022 Sunrise:2022-05-14 06:01:00 and Sunset:2022-05-14 20:11:00
Sunday 15 May 2022 Sunrise:2022-05-15 06:12:00 and Sunset:2022-05-15 20:12:00
Monday 16 May 2022 Sunrise:2022-05-16 05:59:00 and Sunset:2022-05-16 20:13:00
T

Sunday 29 May 2022 Sunrise:2022-05-29 06:01:00 and Sunset:2022-05-29 20:16:00
Monday 30 May 2022 Sunrise:2022-05-30 06:01:00 and Sunset:2022-05-30 20:16:00
Tuesday 31 May 2022 Sunrise:2022-05-31 06:12:00 and Sunset:2022-05-31 20:17:00
Wednesday 01 June 2022 Sunrise:2022-06-01 06:12:00 and Sunset:2022-06-01 20:17:00


In [13]:
df = df.reset_index()

## Export data

In [14]:
now_str = datetime.now().strftime("%F_%R").replace(":","_")

In [15]:
with open('{}_web_data.json'.format(now_str), 'w') as outfile:
    json.dump(j, outfile)

In [16]:
df.to_csv('{}_proc_data.csv'.format(now_str))