In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

# Configuration for a single site (for example: "Visit Chattanooga")
SITE = {
    "name": "CHA Guide Events",
    "url": "https://www.cha.guide/events"
}

def fetch_page(url):
    options = Options()
    options.add_argument("--headless")  # Run in headless mode (no GUI)
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    
    try:
        driver.get(url)
        time.sleep(5)  # Wait for JavaScript to load content
        return driver.page_source
    except Exception as e:
        print(f"Error fetching the page: {e}")
        return None
    finally:
        driver.quit()

def parse_html(html_content):
    return BeautifulSoup(html_content, 'html.parser')

def log_parsed_html(parsed_content, site_name):
    with open(f"{site_name}_parsed_log.html", "w", encoding='utf-8') as file:
        file.write(str(parsed_content))

def main():
    site_name = SITE["name"]
    url = SITE["url"]
    
    print(f"Fetching and parsing {site_name}")
    html_content = fetch_page(url)  # This should be using Selenium
    if html_content:
        parsed_content = parse_html(html_content)
        print(f"Length of parsed content: {len(str(parsed_content))}")
        log_parsed_html(parsed_content, site_name)
        print(f"Parsed HTML content has been logged to {site_name}_parsed_log.html")
    else:
        print(f"Skipping {site_name} due to fetch error")

if __name__ == "__main__":
    main()


Fetching and parsing CHA Guide Events


OSError: [Errno 8] Exec format error: '/home/garner/.wdm/drivers/chromedriver/linux64/127.0.6533.88/chromedriver-linux64/THIRD_PARTY_NOTICES.chromedriver'

In [4]:
pip install chromedriver-autoinstaller

Collecting chromedriver-autoinstaller
  Downloading chromedriver_autoinstaller-0.6.4-py3-none-any.whl.metadata (2.1 kB)
Downloading chromedriver_autoinstaller-0.6.4-py3-none-any.whl (7.6 kB)
Installing collected packages: chromedriver-autoinstaller
Successfully installed chromedriver-autoinstaller-0.6.4
Note: you may need to restart the kernel to use updated packages.


In [18]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import chromedriver_autoinstaller
from bs4 import BeautifulSoup
from datetime import datetime
import time

# Configuration
SITES = {

    "Visit Chattanooga": {
        "url": "https://www.visitchattanooga.com/events/",
        "content_list_class": {"div": "content grid"},
        "item_attr": {"div": {"data-type": "events"}},
        "title": {"a": {"class": "title truncate"}},
        "date": {"span": {"class": "mini-date-container"}},
        "month": {"span": {"class": "month"}},
        "day": {"span": {"class": "day"}},
        "img": {
            "img": {"class": "thumb"},
            "parse_method": "lazy-src"
        },
        "location": {"li": {"class": "locations truncate"}},
        "recurrence": {"li": {"class": "recurrence"}}
    },
    # "Nooga Today": {
    #     "url": "https://noogatoday.6amcity.com/events#/", # preloaded_lightbox blocking site
    #     "content_list_class": "",
    #     "item_attr": {},
    #     "title_tag": "a",
    #     "title_class": "",
    #     "date_class": "",
    #     "month_class": "",
    #     "day_class": "",
    #     "img_class": "",
    #     "location_class": "",
    #     "recurrence_class": ""
    # },
    # "Choose Chatt": {
    #     "url": "https://choosechatt.com/chattanooga-events/", # dialog-lightbox-message blocking site
    #     "content_list_class": "",
    #     "item_attr": {},
    #     "title_tag": "a",
    #     "title_class": "",
    #     "date_class": "",
    #     "month_class": "",
    #     "day_class": "",
    #     "img_class": "",
    #     "location_class": "",
    #     "recurrence_class": ""
    # },
    "CHA Guide Events": {
        "url": "https://www.cha.guide/events",
        "content_list_class": {"div": "flex-table w-dyn-items"},
        "item_attr": {"div": {"role": "listitem"}},
        "title": {"h3": {"class": "event-title"}},
        "date": {"div": {"class": "event-date-div"}},
        "month": {"div": {"class": "event-month"}},
        "day": {"div": {"class": "event-card-date"}},
        "img": {
            "parse_method": "none",
        },
        "location": {"div": {"class": "location-2"}},
        "recurrence": {"div": {"class": "event---category-circle"}}
        # "category": {"div": {"class": "event---category-circle"}}
        # "details": {"div": {"class": "smaller-text bottom-margin---5px"}}
    },
    "Chattanooga Pulse": {
        "url": "https://www.chattanoogapulse.com/search/event/the-pulse-event-search/#page=1",
        "content_list_class": {"div": {"id": "event_list_div", "class": "results"}},
        "item_attr": {"div": {"class": "event_result"}},
        "title": {"h4": {"class": "event_title"}},
        "date": {"p": {"class": "event_date"}},
        # modify our SITES dictionary to allow for more detailed date extraction instructions per site
        "month_class": "",
        "day_class": "",
        "img": {
            "container": {"div": {"class": "event_thumb"}},
            "tag": "img",
            "attr": "srcset",
            "parse_method": "srcset_220w"
        },
        "location": {"a": {}},
        "recurrence": {"p": {"class": "cats"}}
        # Add a description or details section

    },
    
    # "Times Free Press": {
    #     "url": "https://www.timesfreepress.com/tfpevents/?_evDiscoveryPath=/",
    #     "content_list_class": "",
    #     "item_attr": {},
    #     "title_tag": "a",
    #     "title_class": "",
    #     "date_class": "",
    #     "month_class": "",
    #     "day_class": "",
    #     "img_class": "",
    #     "location_class": "",
    #     "recurrence_class": ""
    # },
    # "CHA Guide Weekly": {
    #     "url": "https://www.cha.guide/explore/things-to-do-in-chattanooga-this-week",
    #     "content_list_class": "",
    #     "item_attr": {},
    #     "title_tag": "a",
    #     "title_class": "",
    #     "date_class": "",
    #     "month_class": "",
    #     "day_class": "",
    #     "img_class": "",
    #     "location_class": "",
    #     "recurrence_class": ""
    # },
    # "Chattanooga Chamber": {
    #     "url": "https://chattanoogachamber.com/",
    #     "content_list_class": "",
    #     "item_attr": {},
    #     "title_tag": "a",
    #     "title_class": "",
    #     "date_class": "",
    #     "month_class": "",
    #     "day_class": "",
    #     "img_class": "",
    #     "location_class": "",
    #     "recurrence_class": ""
    # },
    # "Chattanooga Library": {
    #     "url": "https://www.chattlibrary.org/events/",
    #     "content_list_class": "",
    #     "item_attr": {},
    #     "title_tag": "a",
    #     "title_class": "",
    #     "date_class": "",
    #     "month_class": "",
    #     "day_class": "",
    #     "img_class": "",
    #     "location_class": "",
    #     "recurrence_class": ""
    # }
}

def fetch_page(url):
    options = Options()
    options.add_argument("--headless")  # Run in headless mode (no GUI)
    service = Service(ChromeDriverManager().install())
    chromedriver_autoinstaller.install()
    driver = webdriver.Chrome()
    
    try:
        driver.get(url)
        time.sleep(5)  # Wait for JavaScript to load content
        return driver.page_source
    except Exception as e:
        print(f"Error fetching the page: {e}")
        return None
    finally:
        driver.quit()

def parse_html(html_content):
    return BeautifulSoup(html_content, 'html.parser')

def extract_image_url(img_config, item):
    if img_config['parse_method'] == 'lazy-src':
        img_tag, img_attrs = next(iter(img_config.items()))
        img_element = item.find(img_tag, **img_attrs)
        return img_element.get('data-lazy-src') or img_element.get('src') if img_element else None

    elif img_config['parse_method'] == 'srcset_220w':
        container_tag, container_attrs = next(iter(img_config['container'].items()))
        container = item.find(container_tag, **container_attrs)
        if not container:
            return None

        img_element = container.find(img_config['tag'])
        if not img_element or img_config['attr'] not in img_element.attrs:
            return None

        srcset = img_element[img_config['attr']]
        urls = srcset.split(', ')
        for url in urls:
            if '220w' in url:
                return url.split(' ')[0]
        return urls[0].split(' ')[0] if urls else None

    elif img_config['parse_method'] == 'none':
        return None
    
    # Add other parsing methods here if needed
    
    return None

def extract_events(parsed_content, config):
    events = []
    print(f"Searching for content list with: {config['content_list_class']}")
    
    # Unpack the content_list_class dictionary
    tag, class_name = next(iter(config['content_list_class'].items()))
    content_list = parsed_content.find(tag, class_=class_name)
    
    if not content_list:
        print("Couldn't find content list")
        return events

    # Unpack the item_attr dictionary
    item_tag, item_attrs = next(iter(config['item_attr'].items()))
    items = content_list.find_all(item_tag, **item_attrs)
    
    print(f"Found {len(items)} items with specified attributes")
    
    for item in items:
        event = {}
        
        # Extract title and URL
        title_tag, title_attrs = next(iter(config['title'].items()))
        title_element = item.find(title_tag, **title_attrs)
        if title_element:
            event['title'] = title_element.text.strip()
            url_element = title_element if title_element.name == 'a' else title_element.find_parent('a')
            event['url'] = config["url"] + url_element['href'] if url_element else ''
        else:
            print("Couldn't find title element")
            continue
        
        # Extract date
        date_tag, date_attrs = next(iter(config['date'].items()))
        date_element = item.find(date_tag, **date_attrs)
        if date_element:
            month_tag, month_attrs = next(iter(config['month'].items()))
            day_tag, day_attrs = next(iter(config['day'].items()))
            month = date_element.find(month_tag, **month_attrs)
            day = date_element.find(day_tag, **day_attrs)
            if month and day:
                event_date = f"{month.text.strip()} {day.text.strip()}, {datetime.now().year}"
                event['date'] = event_date
            else:
                event['date'] = date_element.text.strip()
        else:
            event['date'] = None
        
        # Extract image URL
        if config.get("img"):
            event['image_url'] = extract_image_url(config["img"], item)
        else:
            event['image_url'] = None
        
        # Extract location
        location_tag, location_attrs = next(iter(config['location'].items()))
        location_element = item.find(location_tag, **location_attrs)
        event['location'] = location_element.text.strip() if location_element else None
        
        # Extract recurrence information
        recurrence_tag, recurrence_attrs = next(iter(config['recurrence'].items()))
        recurrence_element = item.find(recurrence_tag, **recurrence_attrs)
        event['recurrence'] = recurrence_element.text.strip() if recurrence_element else None
        
        events.append(event)
    
    return events

def main():
    for site_name, config in SITES.items():
        url = config["url"]
        print(f"Fetching and parsing {site_name}")
        html_content = fetch_page(url)  # This should be using Selenium
        if html_content:
            parsed_content = parse_html(html_content)
            print(f"Length of parsed content: {len(str(parsed_content))}")
            events = extract_events(parsed_content, config)
            print(f"Extracted {len(events)} events")
            for event in events:
                print(f"Title: {event.get('title')}")
                print(f"URL: {event.get('url')}")
                print(f"Date: {event.get('date')}")
                print(f"Image URL: {event.get('image_url')}")
                print(f"Location: {event.get('location')}")
                print(f"Recurrence: {event.get('recurrence')}")
                print("-" * 40)
        else:
            print(f"Skipping {site_name} due to fetch error")
        time.sleep(1)  # Be polite, wait a second between requests

if __name__ == "__main__":
    main()


Fetching and parsing Visit Chattanooga
Length of parsed content: 502334
Searching for content list with: {'div': 'content grid'}
Found 12 items with specified attributes
Extracted 12 events
Title: Chickamauga Dam Lock Through Kayak Tour with Chattanooga Guided Adventures
URL: https://www.visitchattanooga.com/events//event/chickamauga-dam-lock-through-kayak-tour-with-chattanooga-guided-adventures/19807/
Date: Aug 05, 2024
Image URL: https://assets.simpleviewinc.com/simpleview/image/upload/c_fill,h_227,q_75,w_340/v1/crm/chattanooga/CGA-Chickamanga_17047717-561C-4434-B479C10F432C4937_42bd9946-ca66-4b83-9259ff2418b75f33.jpg
Location: Hubert Fry Center Boatramp
Recurrence: Recurring daily until November 10, 2024
----------------------------------------
Title: Deadpool & Wolverine in IMAX!
URL: https://www.visitchattanooga.com/events//event/deadpool-%26-wolverine-in-imax!/20885/
Date: Aug 05, 2024
Image URL: https://assets.simpleviewinc.com/simpleview/image/upload/c_fill,h_227,q_75,w_340/v1/

# Latest working code:

In [41]:
from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import chromedriver_autoinstaller
from bs4 import BeautifulSoup
from datetime import datetime
from dateutil import parser
import time
import re

# Configuration
SITES = {

    "Visit Chattanooga": {
        "url": "https://www.visitchattanooga.com/events/",
        "content_list_class": {"div": {"class": "content grid"}},
        "item_attr": {"div": {"data-type": "events"}},
        "title": {"a": {"class": "title truncate"}},
        "date": {"span": {"class": "mini-date-container"}},
        "img": {
            "img": {"class": "thumb"},
            "parse_method": "lazy-src"
        },
        "location": {"li": {"class": "locations truncate"}},
        "recurrence": {"li": {"class": "recurrence"}}
    },

    "CHA Guide Events": {
        "url": "https://www.cha.guide/events",
        "content_list_class": {"div": {"class": "flex-table w-dyn-items"}},
        "item_attr": {"div": {"role": "listitem"}},
        "title": {"h3": {"class": "event-title"}},
        "date": {"div": {"class": "event-date-div"}},
        "img": {
            "parse_method": "none",
        },
        "location": {"div": {"class": "location-2"}},
        "recurrence": {"div": {"class": "event---category-circle"}}
        # "category": {"div": {"class": "event---category-circle"}}
        # "details": {"div": {"class": "smaller-text bottom-margin---5px"}}
    },
    
    "Chattanooga Pulse": {
        "url": "https://www.chattanoogapulse.com/search/event/the-pulse-event-search/#page=1",
        "content_list_class": {"div": {"id": "event_list_div"}},
        "item_attr": {"div": {"class": "event_result"}},
        "title": {"h4": {"class": "event_title"}},
        "date": {"p": {"class": "event_date"}},
        "img": {
            "container": {"div": {"class": "event_thumb"}},
            "tag": "img",
            "attr": "srcset",
            "parse_method": "srcset_220w"
        },
        "location": {"a": {}}, # outputs title instead of location
        "recurrence": {"p": {"class": "cats"}}
        # Add a description or details section

    },
    
    # "Nooga Today": {
    #     "url": "https://noogatoday.6amcity.com/events#/", # preloaded_lightbox blocking site
    #     },
    
    # "Choose Chatt": {
    #     "url": "https://choosechatt.com/chattanooga-events/", # dialog-lightbox-message blocking site
    # },

    # "Times Free Press": {
    #     "url": "https://www.timesfreepress.com/tfpevents/?_evDiscoveryPath=/",
    # },

    # "CHA Guide Weekly": {
    #     "url": "https://www.cha.guide/explore/things-to-do-in-chattanooga-this-week",
    # },

    # "Chattanooga Chamber": {
    #     "url": "https://chattanoogachamber.com/",
    # },

    # "Chattanooga Library": {
    #     "url": "https://www.chattlibrary.org/events/",
    # }

}

def fetch_page(url):
    options = Options()
    options.add_argument("--headless")  # Run in headless mode (no GUI)
    service = Service(ChromeDriverManager().install())
    chromedriver_autoinstaller.install()
    driver = webdriver.Chrome()
    
    try:
        driver.get(url)
        time.sleep(5)  # Wait for JavaScript to load content
        return driver.page_source
    except Exception as e:
        print(f"Error fetching the page: {e}")
        return None
    finally:
        driver.quit()

def parse_html(html_content):
    return BeautifulSoup(html_content, 'html.parser')

def extract_image_url(img_config, item):
    if img_config['parse_method'] == 'lazy-src':
        img_tag, img_attrs = next(iter(img_config.items()))
        img_element = item.find(img_tag, **img_attrs)
        return img_element.get('data-lazy-src') or img_element.get('src') if img_element else None

    elif img_config['parse_method'] == 'srcset_220w':
        container_tag, container_attrs = next(iter(img_config['container'].items()))
        container = item.find(container_tag, **container_attrs)
        if not container:
            return None

        img_element = container.find(img_config['tag'])
        if not img_element or img_config['attr'] not in img_element.attrs:
            return None

        srcset = img_element[img_config['attr']]
        urls = srcset.split(', ')
        for url in urls:
            if '220w' in url:
                return url.split(' ')[0]
        return urls[0].split(' ')[0] if urls else None

    elif img_config['parse_method'] == 'none':
        return None
    
    # Add other parsing methods here if needed
    
    return None

def parse_date_range(date_text):
    # Regular expression to match date, start time, and end time
    pattern = r'(\w+ \d+, \d{4})(?: (\d+:\d+ [AP]M))?(?: - (\d+:\d+ [AP]M))?'
    match = re.match(pattern, date_text)
    
    if match:
        date_str, start_time, end_time = match.groups()
        
        try:
            # Parse the date
            parsed_date = parser.parse(date_str)
            date = parsed_date.strftime("%Y-%m-%d")
            
            # Parse start time if available
            start = None
            if start_time:
                start = parser.parse(f"{date_str} {start_time}").strftime("%H:%M")
            
            # Parse end time if available
            end = None
            if end_time:
                end = parser.parse(f"{date_str} {end_time}").strftime("%H:%M")
            
            return {
                'date': date,
                'start_time': start,
                'end_time': end
            }
        except ValueError:
            print(f"Couldn't parse date: {date_text}")
    
    return {
        'date': None,
        'start_time': None,
        'end_time': None
    }

def extract_events(parsed_content, config):
    events = []
    print(f"Searching for content list with: {config['content_list_class']}")
    
    # Unpack the content_list_class dictionary
    tag, class_name = next(iter(config['content_list_class'].items()))
    
    content_list = None
    
    # Try to find content list by id first
    try:
        if 'id' in class_name:
            content_list = parsed_content.find(tag, id=class_name['id'])
            print(f"Attempting to find content list by id: {class_name['id']}")
    except Exception as e:
        print(f"Error when searching by id: {e}")

    # If not found by id, try to find by class
    if not content_list:
        try:
            if 'class' in class_name:
                content_list = parsed_content.find(tag, class_=class_name['class'])
                print(f"Attempting to find content list by class: {class_name['class']}")
        except Exception as e:
            print(f"Error when searching by class: {e}")

    # If still not found, try to find without any attributes
    if not content_list:
        try:
            content_list = parsed_content.find(tag)
            print(f"Attempting to find content list by tag: {tag}")
        except Exception as e:
            print(f"Error when searching by tag: {e}")

    if not content_list:
        print("Couldn't find content list")
        return events

    # print(f"Found content list: {content_list}")

    # Unpack the item_attr dictionary
    item_tag, item_attrs = next(iter(config['item_attr'].items()))
    try:
        items = content_list.find_all(item_tag, **item_attrs)
        print(f"Found {len(items)} items with specified attributes")
    except Exception as e:
        print(f"Error when finding items: {e}")
        return events

    for item in items:
        event = {}
        
        # Extract title and URL
        try:
            title_tag, title_attrs = next(iter(config['title'].items()))
            title_element = item.find(title_tag, **title_attrs)
            if title_element:
                event['title'] = title_element.text.strip()
                url_element = title_element if title_element.name == 'a' else title_element.find_parent('a')
                event['url'] = config["url"] + url_element['href'] if url_element else ''
            else:
                print("Couldn't find title element")
                continue
        except Exception as e:
            print(f"Error extracting title: {e}")
            continue

        # Extract date and times
        try:
            date_tag, date_attrs = next(iter(config['date'].items()))
            date_element = item.find(date_tag, **date_attrs)
            if date_element:
                date_text = date_element.text.strip()
                date_info = parse_date_range(date_text)
                event.update(date_info)
            else:
                event.update({
                    'date': None,
                    'start_time': None,
                    'end_time': None
                })
        except Exception as e:
            print(f"Error extracting date: {e}")

        # Extract image URL
        try:
            if config.get("img"):
                event['image_url'] = extract_image_url(config["img"], item)
            else:
                event['image_url'] = None
        except Exception as e:
            print(f"Error extracting image URL: {e}")
            event['image_url'] = None

        # Extract location
        try:
            location_tag, location_attrs = next(iter(config['location'].items()))
            location_element = item.find(location_tag, **location_attrs)
            event['location'] = location_element.text.strip() if location_element else None
        except Exception as e:
            print(f"Error extracting location: {e}")
            event['location'] = None

        # Extract recurrence information
        try:
            recurrence_tag, recurrence_attrs = next(iter(config['recurrence'].items()))
            recurrence_element = item.find(recurrence_tag, **recurrence_attrs)
            event['recurrence'] = recurrence_element.text.strip() if recurrence_element else None
        except Exception as e:
            print(f"Error extracting recurrence: {e}")
            event['recurrence'] = None

        events.append(event)
    
    return events

def main():
    for site_name, config in SITES.items():
        url = config["url"]
        print(f"Fetching and parsing {site_name}")
        html_content = fetch_page(url)
        if html_content:
            parsed_content = parse_html(html_content)
            print(f"Length of parsed content: {len(str(parsed_content))}")
            events = extract_events(parsed_content, config)
            print(f"Extracted {len(events)} events")
            for event in events:
                print(f"Title: {event.get('title')}")
                print(f"URL: {event.get('url')}")
                print(f"Date: {event.get('date')}")
                print(f"Start Time: {event.get('start_time')}")
                print(f"End Time: {event.get('end_time')}")
                print(f"Image URL: {event.get('image_url')}")
                print(f"Location: {event.get('location')}")
                print(f"Recurrence: {event.get('recurrence')}")
                print("-" * 40)
        else:
            print(f"Skipping {site_name} due to fetch error")
        time.sleep(1)  # Be polite, wait a second between requests

if __name__ == "__main__":
    main()


Fetching and parsing Visit Chattanooga
Length of parsed content: 502460
Searching for content list with: {'div': {'class': 'content grid'}}
Attempting to find content list by class: content grid
Found 12 items with specified attributes
Extracted 12 events
Title: Chickamauga Dam Lock Through Kayak Tour with Chattanooga Guided Adventures
URL: https://www.visitchattanooga.com/events//event/chickamauga-dam-lock-through-kayak-tour-with-chattanooga-guided-adventures/19807/
Date: None
Start Time: None
End Time: None
Image URL: https://assets.simpleviewinc.com/simpleview/image/upload/c_fill,h_227,q_75,w_340/v1/crm/chattanooga/CGA-Chickamanga_17047717-561C-4434-B479C10F432C4937_42bd9946-ca66-4b83-9259ff2418b75f33.jpg
Location: Hubert Fry Center Boatramp
Recurrence: Recurring daily until November 10, 2024
----------------------------------------
Title: Deadpool & Wolverine in IMAX!
URL: https://www.visitchattanooga.com/events//event/deadpool-%26-wolverine-in-imax!/20885/
Date: None
Start Time: N