In [2]:
import time
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
def extract_table_data_from_page(html_content, get_content_from_links=False,
                                 base_url="https://hike.taiwan.gov.tw/en/"):
    """Extract the rows of the listing table from a single page of HTML.

    Args:
        html_content: HTML text of one listing page.
        get_content_from_links: When True, every row link is passed to
            ``extract_content_from_link`` and the returned text is stored
            under the 'Link Content' key; otherwise that key stays "".
        base_url: URL against which each row's href is resolved.

    Returns:
        A list of dicts, one per table row that has at least four ``<td>``
        cells, with the keys 'Released by', 'Modified Date', 'Type', 'Title',
        'Link' and 'Link Content'. An empty list is returned when the table
        is not present in the page.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the table with the specified classes
    table = soup.find('table', class_='table table-bordered table-striped rwd-table text-center')
    if not table:
        return []

    # Markup without an explicit <tbody> makes find() return None; fall back
    # to the table itself so its rows are still read instead of raising.
    body = table.find('tbody')
    if body is None:
        body = table

    page_data = []
    for row in body.find_all('tr'):
        cells = row.find_all('td')

        # Header rows (only <th>) and malformed rows have fewer than 4 cells.
        if len(cells) < 4:
            continue

        title_cell = cells[3]
        link_tag = title_cell.find('a')
        href = link_tag.get('href') if link_tag else None

        link = ""
        link_content = ""
        if href:
            # urljoin resolves relative hrefs against base_url and leaves
            # absolute / root-relative hrefs well-formed, unlike plain
            # string concatenation.
            link = urljoin(base_url, href)
            if get_content_from_links:
                link_content = extract_content_from_link(link)

        page_data.append({
            'Released by': cells[0].get_text(strip=True),
            'Modified Date': cells[1].get_text(strip=True),
            'Type': cells[2].get_text(strip=True),
            'Title': title_cell.get_text(strip=True),
            'Link': link,
            'Link Content': link_content
        })

    return page_data

def extract_content_from_link(url):
    """Fetch ``url`` and return the text of its alert boxes.

    The text of every ``<div class="alert alert-light border">`` is collected
    (nested text joined with single spaces) and the boxes are separated by a
    blank line. Any exception is reported with ``print`` and the fixed string
    "Error: Could not extract content" is returned instead.
    """
    # Browser-like User-Agent sent with the request
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        page = requests.get(url, headers=browser_headers, timeout=10)
        page.raise_for_status()

        parsed = BeautifulSoup(page.text, 'html.parser')

        # One text chunk per alert box, joined with a blank line in between
        return '\n\n'.join(
            box.get_text(separator=' ', strip=True)
            for box in parsed.find_all('div', class_='alert alert-light border')
        )
    except Exception as e:
        print(f"Error extracting content from {url}: {e}")
        return "Error: Could not extract content"

def find_next_page_url(html_content, base_url="https://hike.taiwan.gov.tw"):
    """Return the absolute URL of the next listing page, or None.

    Args:
        html_content: HTML text of the current listing page.
        base_url: URL against which the "Next" link's href is resolved.

    Returns:
        The resolved URL of the ``<a id="con_lnkNext">`` link, or None when
        the link is missing, has no href, or its href is not a navigable
        page (a bare fragment or a ``javascript:`` pseudo-URL).
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Look for the Next button
    next_link = soup.find('a', id='con_lnkNext')
    href = next_link.get('href') if next_link else None
    if not href:
        return None

    # A fragment or javascript: href does not lead to another page.
    if href.strip().lower().startswith(('#', 'javascript:')):
        return None

    # urljoin handles root-relative, relative and absolute hrefs alike,
    # whereas string concatenation only works for root-relative ones.
    return urljoin(base_url, href)

def scrape_all_pages(start_url, max_pages=100, get_content_from_links=True, timeout=10):
    """Scrape the listing table from ``start_url`` and every following page.

    Args:
        start_url: URL of the first listing page.
        max_pages: Upper bound on the number of pages requested.
        get_content_from_links: Forwarded to ``extract_table_data_from_page``;
            when True each row's linked page is fetched as well.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the loop indefinitely.

    Returns:
        A pandas DataFrame built from the records collected so far. Scraping
        stops at the first page that raises (the error is printed), when no
        next-page link is found, or after ``max_pages`` pages.
    """
    all_data = []
    current_url = start_url
    page_count = 0

    # Set headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    while current_url and page_count < max_pages:
        page_count += 1
        print(f"Scraping page {page_count}: {current_url}")

        try:
            response = requests.get(current_url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Raise exception for HTTP errors

            # Extract data from this page
            page_data = extract_table_data_from_page(response.text, get_content_from_links)
            all_data.extend(page_data)

            # Find the URL for the next page
            current_url = find_next_page_url(response.text)

        except Exception as e:
            print(f"Error scraping page {current_url}: {e}")
            break

        # Pause between requests to be respectful to the server, but only
        # when another page is actually going to be fetched.
        if current_url and page_count < max_pages:
            time.sleep(1)

    # Convert all collected data to a DataFrame
    return pd.DataFrame(all_data)

# Example usage:
# start_url = "https://hike.taiwan.gov.tw/en/news_7.aspx"
# results = scrape_all_pages(start_url, get_content_from_links=True)
# print(f"Total records found: {len(results)}")
# results.to_csv("taiwan_hiking_data.csv", index=False)

In [4]:
# Entry page of the listing; following pages are discovered through the "Next" link.
start_url = "https://hike.taiwan.gov.tw/en/news_7.aspx"
# get_content_from_links defaults to True, so each row's linked page is fetched too.
results = scrape_all_pages(start_url)
print(f"Total records found: {len(results)}")

Scraping page 1: https://hike.taiwan.gov.tw/en/news_7.aspx
Scraping page 2: https://hike.taiwan.gov.tw/en/news_7.aspx?Page=2
Scraping page 3: https://hike.taiwan.gov.tw/en/news_7.aspx?Page=3
Scraping page 4: https://hike.taiwan.gov.tw/en/news_7.aspx?Page=4
Scraping page 5: https://hike.taiwan.gov.tw/en/news_7.aspx?Page=5
Scraping page 6: https://hike.taiwan.gov.tw/en/news_7.aspx?Page=6
Scraping page 7: https://hike.taiwan.gov.tw/en/news_7.aspx?Page=7
Scraping page 8: https://hike.taiwan.gov.tw/en/news_7.aspx?Page=8
Scraping page 9: https://hike.taiwan.gov.tw/en/news_7.aspx?Page=9
Scraping page 10: https://hike.taiwan.gov.tw/en/news_7.aspx?Page=10
Total records found: 92


In [5]:
# Show the scraped DataFrame as this cell's output.
results

Unnamed: 0,Released by,Modified Date,Type,Title,Link,Link Content
0,Taroko National Park Headquarters,2024/08/12,C01-Application Regulations,《Important Notices on Application for Entry in...,https://hike.taiwan.gov.tw/en/news_7_1.aspx?ID...,Modified Date: 2024/08/12\r\n \...
1,CPAMI,2018/09/10,C15-Online Application System,Instructions,https://hike.taiwan.gov.tw/en/news_7_1.aspx?ID...,Modified Date: 2018/09/10\r\n \...
2,Taroko National Park Headquarters,2019/04/01,C01-Application Regulations,What is a Mountain Entry Permit? What is a Par...,https://hike.taiwan.gov.tw/en/news_7_1.aspx?ID...,Modified Date: 2019/04/01\r\n \...
3,Yushan National Park Headquarters,2024/11/27,01-Application Regulations,How to Apply for Yushan National Park Ecologic...,https://hike.taiwan.gov.tw/en/news_7_1.aspx?ID...,Modified Date: 2024/11/27\r\n \...
4,Yushan National Park Headquarters,2024/12/13,01-Application Regulations,A brief introduction of how to apply for Paiyu...,https://hike.taiwan.gov.tw/en/news_7_1.aspx?ID...,Modified Date: 2024/12/13\r\n \...
...,...,...,...,...,...,...
87,Shei-pa National Park Headquarters,2024/09/23,S02-Trail Classification,How are the hiking routes within the ecologica...,https://hike.taiwan.gov.tw/en/news_7_1.aspx?ID...,Modified Date: 2024/09/23\r\n \...
88,Shei-pa National Park Headquarters,2024/09/23,S02-Trail Classification,How are the routes classified during the snow ...,https://hike.taiwan.gov.tw/en/news_7_1.aspx?ID...,Modified Date: 2024/09/23\r\n \...
89,Shei-pa National Park Headquarters,2024/09/23,S02-Trail Classification,What experience is required when applying for ...,https://hike.taiwan.gov.tw/en/news_7_1.aspx?ID...,Modified Date: 2024/09/23\r\n \...
90,Shei-pa National Park Headquarters,2024/09/23,S02-Trail Classification,What equipment is required when entering the p...,https://hike.taiwan.gov.tw/en/news_7_1.aspx?ID...,Modified Date: 2024/09/23\r\n \...


In [6]:
# Normalize column labels to snake_case: spaces become underscores, then lower-case.
results.columns = ['_'.join(label.split(' ')).lower() for label in results.columns]

In [7]:
import hashlib

def create_row_hash(row):
    """
    Build a short hash ID for a row from the string form of all its values.

    The values in ``row.values`` are converted with ``str`` and concatenated
    without a separator; the first 16 hex characters of the SHA-256 digest of
    that string are returned.
    """
    joined_values = ''.join(map(str, row.values))
    full_digest = hashlib.sha256(joined_values.encode()).hexdigest()

    # Truncated digest keeps the ID short
    return full_digest[:16]

In [8]:
# Hash only the scraped columns, in their original order. Hashing the whole
# row would fold an existing 'id' (or a changed column order) into the new
# value, so re-running this cell would silently produce different IDs.
hash_columns = ['released_by', 'modified_date', 'type', 'title', 'link', 'link_content']
results['id'] = results[hash_columns].apply(create_row_hash, axis=1)

print(results[['id']].head())

                 id
0  c51b0dba91466efa
1  418c732ed38f0842
2  da02d5155b7a573f
3  9c82ab7be73b3827
4  e9ae2c9ab22c477b


In [9]:
# Sanity check: every row should have received a distinct ID.
id_column = results['id']
num_ids = id_column.size
num_unique_ids = id_column.drop_duplicates().size
print(f"Total rows: {num_ids}")
print(f"Unique IDs: {num_unique_ids}")
print(f"Duplicate IDs: {num_ids - num_unique_ids}")

Total rows: 92
Unique IDs: 92
Duplicate IDs: 0


In [10]:
# Preview the first rows, now with snake_case column names and the 'id' column.
results.head()

Unnamed: 0,released_by,modified_date,type,title,link,link_content,id
0,Taroko National Park Headquarters,2024/08/12,C01-Application Regulations,《Important Notices on Application for Entry in...,https://hike.taiwan.gov.tw/en/news_7_1.aspx?ID...,Modified Date: 2024/08/12\r\n \...,c51b0dba91466efa
1,CPAMI,2018/09/10,C15-Online Application System,Instructions,https://hike.taiwan.gov.tw/en/news_7_1.aspx?ID...,Modified Date: 2018/09/10\r\n \...,418c732ed38f0842
2,Taroko National Park Headquarters,2019/04/01,C01-Application Regulations,What is a Mountain Entry Permit? What is a Par...,https://hike.taiwan.gov.tw/en/news_7_1.aspx?ID...,Modified Date: 2019/04/01\r\n \...,da02d5155b7a573f
3,Yushan National Park Headquarters,2024/11/27,01-Application Regulations,How to Apply for Yushan National Park Ecologic...,https://hike.taiwan.gov.tw/en/news_7_1.aspx?ID...,Modified Date: 2024/11/27\r\n \...,9c82ab7be73b3827
4,Yushan National Park Headquarters,2024/12/13,01-Application Regulations,A brief introduction of how to apply for Paiyu...,https://hike.taiwan.gov.tw/en/news_7_1.aspx?ID...,Modified Date: 2024/12/13\r\n \...,e9ae2c9ab22c477b


In [11]:
# Put the ID first, then the descriptive fields, then source metadata
results = results.loc[
    :,
    ['id', 'type', 'title', 'link_content', 'link', 'released_by', 'modified_date'],
]

In [13]:
# to_csv does not create missing directories, so make sure the output
# folder exists before writing (needed on a fresh checkout).
output_path = Path("data/taiwan_hiking_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
results.to_csv(output_path, index=False)