In [1]:
import os
import re
import time
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup, Tag
from tqdm import tqdm

In [None]:
base_url = 'https://flyairpeace.com/'
path = '../data/flights'
bs_parser = 'html.parser'
delay_sec = 5

if not os.path.exists(path):
    os.makedirs(path)

In [None]:
def fetch_page(url):
    """Fetch the page and return a BeautifulSoup object."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        else:
            print(f"Failed to fetch {url} with status code {response.status_code}")
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
    return None

In [None]:
def get_article_links(html_content):
    """Extract HTML content."""
    links = []
    soup = BeautifulSoup(html_content, 'html.parser')  # Convert string to BeautifulSoup object here
    articles = soup.find_all('div', class_='post-item style2 no-padding')
    for article in articles:
        link = article.find('a', href=True)
        if link:
            links.append(link['href'])
    return links

In [None]:
def extract_content(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    data = {}

    # Extract the title
    title_section = soup.find('h1', class_='page-title')
    data['title'] = title_section.get_text(strip=True) if title_section else 'No Title Found'

    # Initialize the body content
    body_content = []

    # Process the main content section
    main_content = soup.find('div', class_='post-desc')
    if main_content:
        # Collect text from all paragraphs until 'Schools' section
        for element in main_content.next_elements:
            if isinstance(element, Tag):
                if element.name == 'h2' and "Real Estate" in element.text:
                    break  # Stop processing if 'Real Estate' section is reached
                if element.name == 'p':
                    body_content.append(element.get_text(strip=True))

    data['body'] = ' '.join(body_content)
    return data

In [None]:

def save_content_to_markdown(data, filename):
    """Convert extracted data to markdown and save to a file."""
    title_md = md(f"# {data['title']}")
    body_md = md(data['body'])
    markdown_content = f"{title_md}\n\n{body_md}"
    with open(os.path.join(path, f"{filename}.md"), 'w') as file:
        file.write(markdown_content)

In [None]:
def process_article_links(links):
    for link in links:
        html_content = fetch_page(link)
        if html_content:
            content_data = extract_content(html_content)
            save_content_to_markdown(content_data, link.split('/')[-1])
            print(f"Data from {link}:")
            print(content_data)
        else:
            print(f"Failed to process {link}")

In [None]:
def main():
    base_url = 'https://nigeriapropertycentre.com/area-guides'
    page_links = [f"{base_url}?page={i}" for i in range(1, 11)]  # Pages 1 to 10
    all_links = []
    
    for page_link in page_links:
        soup = fetch_page(page_link)
        if soup:
            article_links = get_article_links(soup)
            all_links.extend(article_links)
            print(f"Found {len(article_links)} links on {page_link}")

    # Display all unique links gathered
    unique_links = list(set(all_links))  # Remove duplicates if any
    print(f"Total unique links found: {len(unique_links)}")
    for link in unique_links:
        print(link)

    process_article_links(unique_links)

if __name__ == '__main__':
    main()