In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
# strftime pattern used when formatting the 'Date' field of each record
# (compact year-month-day, e.g. 20240101).
date_format= '%Y%m%d'

In [45]:
def get_html(url, timeout=10):
    """Fetch the body of ``url`` and return it as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The response body as a string, or ``None`` when the request fails
        (connection problem, timeout, or an HTTP error status). The error is
        printed rather than raised.
    """
    # Browser-like headers sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'TE': 'Trailers'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn HTTP error statuses into an exception so they are reported below.
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they end up here as well.
        print(f"Error during request: {e}")
        return None
#############
def extract_data_from_row(row):
    """Convert one holiday table row into a record dict.

    The row's ``<td>`` cells are read by position: index 1 holds a ``<time>``
    element whose ``datetime`` attribute is parsed as YYYY-MM-DD, index 2 the
    holiday name, index 3 the holiday type and index 4 the comments. The
    weekday name is derived from the parsed date, not from the first cell.

    Args:
        row: A parsed table-row element exposing ``find_all``.

    Returns:
        dict with the keys 'Day', 'Date', 'Holiday', 'Type', 'Is Holiday'
        and 'Comments'.

    Raises:
        IndexError: if the row has fewer than five ``<td>`` cells.
        ValueError: if the ``datetime`` attribute is not in YYYY-MM-DD form.
    """
    cells = row.find_all('td')
    holiday_cell = cells[2]

    raw_date = cells[1].find('time')['datetime']

    # The name is normally wrapped in a link; fall back to the cell's own text
    # when it is not, instead of failing on a missing <a> element.
    link = holiday_cell.find('a')
    name_source = link if link is not None else holiday_cell
    holiday = name_source.text.strip()

    holiday_type = cells[3].text.strip()
    comments = cells[4].text.strip()

    # Parse once; weekday name and formatted date both come from this value.
    date = datetime.strptime(raw_date, '%Y-%m-%d')

    return {
        'Day': date.strftime('%A'),
        'Date': date.strftime(date_format),
        'Holiday': holiday,
        'Type': holiday_type,
        'Is Holiday': check_is_holiday(date, holiday_type),
        'Comments': comments
    }
##########################
def check_is_holiday(date, holiday_type):
    """Decide whether ``date`` counts as a day off, given its holiday type.

    Rules, applied in order:
      * type is exactly "National Holiday" or "Extended Weekend" -> True
      * type contains "Compensated" -> False
      * type contains "Not A Public Holiday" -> True only on Saturday/Sunday
      * anything else -> False
    """
    if holiday_type in ("National Holiday", "Extended Weekend"):
        return True

    if "Compensated" in holiday_type:
        return False

    if "Not A Public Holiday" not in holiday_type:
        return False

    # weekday() numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
    parsed = pd.to_datetime(date, errors='coerce')
    return parsed.weekday() >= 5

In [88]:

def scrape_holidays_for_year(year):
    """Download the Taiwan holiday listing for ``year`` and return it as a DataFrame.

    The frame always carries the columns Day, Date, Holiday, Type, Is Holiday
    and Comments; it has no rows when no page content was retrieved.
    """
    column_names = ('Day', 'Date', 'Holiday', 'Type', 'Is Holiday', 'Comments')
    table = {name: [] for name in column_names}

    page = get_html(f"https://www.officeholidays.com/countries/taiwan/{year}")
    if not page:
        # Fetch produced nothing usable: hand back the empty, correctly-shaped frame.
        return pd.DataFrame(table)

    parsed = BeautifulSoup(page, 'html.parser')
    # Every body row of the page's tables is treated as one holiday entry.
    for table_row in parsed.select('tbody tr'):
        record = extract_data_from_row(table_row)
        for field, value in record.items():
            table[field].append(value)
    return pd.DataFrame(table)

In [108]:
# Scrape the 2024 listing; as the cell's last expression, the resulting frame is displayed.
scrape_holidays_for_year(2024)

Unnamed: 0,Day,Date,Holiday,Type,Is Holiday,Comments
0,Monday,20240101,Republic Day,National Holiday,True,
1,Thursday,20240208,Lunar New Year Holiday,National Holiday,True,Compensated by Sat 17 February
2,Friday,20240209,Lunar New Year's Eve,National Holiday,True,
3,Saturday,20240210,Lunar New Year,National Holiday,True,1st day of 1st lunar month
4,Sunday,20240211,Lunar New Year Holiday,National Holiday,True,
5,Monday,20240212,Lunar New Year Holiday,National Holiday,True,
6,Tuesday,20240213,Lunar New Year Holiday,National Holiday,True,
7,Wednesday,20240214,Lunar New Year Holiday,National Holiday,True,
8,Saturday,20240224,Lantern Festival,Not A Public Holiday,True,15th Day of 1st lunar month
9,Wednesday,20240228,Peace Memorial Day,National Holiday,True,
