In [3]:
import datetime
import pandas as pd
from requests_html import AsyncHTMLSession
from bs4 import BeautifulSoup
import re

# Cities to keep: listings whose parsed city is not in this list are skipped
# by the scraper.
interested_cities = [
    "Mountain View",
    "Palo Alto",
    "San Jose",
    "Santa Clara",
    "Sunnyvale",
]

# Human-readable category label -> slug interpolated into the search URL.
categories = {
    "Business": "business",
    "Science & Tech": "science-and-tech",
}

# US ZIP code, optionally in ZIP+4 form.
_POSTAL_CODE_RE = re.compile(r'\b\d{5}(?:-\d{4})?\b')


def _split_street_and_postal(location_text):
    """Split an address line into ``(street_address, postal_code)``.

    The address is only trusted when its last comma-separated segment
    contains a ZIP code; otherwise ``(None, None)`` is returned. The street
    address is every segment before the last one, re-joined with ``', '``,
    or ``None`` when there is no such segment.
    """
    if not location_text:
        return None, None

    *leading, trailing = location_text.split(',')
    # Look for the ZIP in the trailing segment itself rather than assuming its
    # last word is the ZIP: a 5-digit street number elsewhere in the text must
    # not cause an unrelated trailing word to be reported as the postal code.
    match = _POSTAL_CODE_RE.search(trailing)
    if match is None:
        return None, None

    street = ', '.join(part.strip() for part in leading)
    return (street or None), match.group()


async def get_event_details(event_url):
    """Fetch an event page and extract its description and address parts.

    The page is downloaded and rendered with a headless browser, then parsed
    for the ``structured-content-rich-text`` div (description) and the first
    ``<p>`` inside the ``detail__content`` div (address line).

    Args:
        event_url: URL of the event detail page.

    Returns:
        A dict with the keys ``'description'``, ``'streetAddress'`` and
        ``'postalCode'``; each value is ``None`` when it could not be found.
    """
    session = AsyncHTMLSession()
    try:
        response = await session.get(event_url)
        await response.html.arender(timeout=20)
        soup = BeautifulSoup(response.html.html, 'html.parser')
    finally:
        # Closing the session also shuts down the browser started by
        # arender(); without this every call leaks one.
        await session.close()

    description_element = soup.find('div', {'class': 'structured-content-rich-text'})
    description = description_element.text.strip() if description_element else None

    location_text = None
    location_element = soup.find('div', class_='detail__content')
    if location_element:
        location_paragraph = location_element.find('p')
        if location_paragraph:
            location_text = location_paragraph.text.strip()

    streetAddress, postalCode = _split_street_and_postal(location_text)

    return {'description': description, 'streetAddress': streetAddress, 'postalCode': postalCode}

def parse_location(location_str):
    """Split a listing location such as ``'Venue • City, ST'`` into its parts.

    Args:
        location_str: Location text taken from a search result card.

    Returns:
        A ``(place_name, city, state)`` tuple. ``state`` is ``None`` when the
        part after the bullet has no comma. ``city`` and ``state`` are both
        ``None`` when the string has no ``'•'`` separator at all, in which
        case the whole stripped string is returned as ``place_name``.
    """
    parts = location_str.split('•')
    place_name = parts[0].strip()

    # Strings without the bullet separator carry no city information;
    # indexing parts[1] here would raise IndexError.
    if len(parts) < 2:
        return place_name, None, None

    city_state = parts[1].strip().split(',')
    city = city_state[0].strip()
    state = city_state[1].strip() if len(city_state) > 1 else None
    return place_name, city, state

# Column order of the DataFrame returned by get_event_data().
_EVENT_COLUMNS = [
    'start_date', 'end_date', 'time', 'category', 'name', 'url', 'description',
    'addressCountry', 'addressRegion', 'addressLocality', 'streetAddress',
    'postalCode', 'place_name',
]

# Full class string of the element holding a card's date text.
_DATE_CLASS = (
    'eds-event-card-content__sub-title eds-text-color--primary-brand '
    'eds-l-pad-bot-1 eds-l-pad-top-2 eds-text-weight--heavy eds-text-bm'
)


def _card_text(event_card, tag, css_class):
    """Return the stripped text of the first matching element, or None."""
    element = event_card.find(tag, class_=css_class)
    return element.text.strip() if element else None


def _resolve_relative_day(event_date, now):
    """Normalize a card's date text and pick the year it should be parsed in.

    Drops anything after a ``'+'``, rewrites a leading ``'Today'`` /
    ``'Tomorrow'`` into ``'%a, %b %d'`` form and turns ``' at '`` into
    ``', '``. Returns ``(text, year)``; the year is that of the resolved day
    for relative dates and ``now.year`` otherwise.
    """
    text = event_date.split('+')[0].strip()
    year = now.year
    for label, offset in (('Tomorrow', 1), ('Today', 0)):
        if text.startswith(label):
            day = now + datetime.timedelta(days=offset)
            text = text.replace(label, day.strftime('%a, %b %d'))
            # "Tomorrow" on Dec 31 belongs to the next year.
            year = day.year
            break
    return text.replace(" at ", ", "), year


async def _build_event_row(event_card, category_name):
    """Turn one search result card into a row dict, or None if it is skipped."""
    event_name = _card_text(event_card, 'div', 'eds-event-card-content__primary-content')

    link = event_card.find('a', class_='eds-event-card-content__action-link')
    event_url = link['href'] if link else None

    event_date = _card_text(event_card, 'div', _DATE_CLASS)
    event_location = _card_text(event_card, 'div', 'card-text--truncated__one')

    # Without a location the city filter cannot be applied. Skip the card
    # instead of reusing the previous card's city (or hitting an unbound name).
    if not event_location:
        return None
    try:
        place_name, addressLocality, addressRegion = parse_location(event_location)
    except IndexError:
        # Location text that does not follow the "Place • City, ST" layout.
        return None
    if addressLocality not in interested_cities:
        return None

    # Cards with no date, or whose date text starts with '+', are ignored.
    if not event_date or event_date.startswith('+'):
        return None

    event_date, year = _resolve_relative_day(event_date, datetime.datetime.now())
    try:
        # Parse with an explicit year: without one strptime assumes 1900,
        # which rejects Feb 29.
        starts_at = datetime.datetime.strptime(
            f'{event_date}, {year}', '%a, %b %d, %I:%M %p, %Y'
        )
    except ValueError:
        print(f"Unable to parse date and time for event: {event_name}, {event_date}")
        return None

    if event_url:
        event_details = await get_event_details(event_url)
    else:
        # No link on the card: keep the listing data, leave the details empty.
        event_details = {'description': None, 'streetAddress': None, 'postalCode': None}

    start_date = starts_at.strftime('%Y-%m-%d')
    return {
        'start_date': start_date,
        'end_date': start_date,
        'time': starts_at.strftime('%I:%M %p'),
        'name': event_name,
        'category': category_name,
        'url': event_url,
        'description': event_details['description'],
        'addressCountry': 'US',
        'addressRegion': addressRegion,
        'addressLocality': addressLocality,
        'streetAddress': event_details['streetAddress'],
        'postalCode': event_details['postalCode'],
        'place_name': place_name,
    }


async def get_event_data(max_pages=56):
    """Scrape Eventbrite search results for every configured category.

    For each entry in ``categories`` the result pages are rendered one by one
    until a page yields no event cards or ``max_pages`` pages were read. Only
    events whose city is in ``interested_cities`` and whose date can be parsed
    are kept; each kept event triggers one extra request for its detail page.

    Args:
        max_pages: Maximum number of result pages to read per category.

    Returns:
        A ``pandas.DataFrame`` with one row per kept event and the columns
        ``start_date``, ``end_date``, ``time``, ``category``, ``name``,
        ``url``, ``description``, ``addressCountry``, ``addressRegion``,
        ``addressLocality``, ``streetAddress``, ``postalCode``, ``place_name``.
    """
    rows = []

    # One session (and therefore one headless browser) serves all listing
    # pages and is closed even if a request fails.
    session = AsyncHTMLSession()
    try:
        for category_name, category_slug in categories.items():
            for page_number in range(1, max_pages + 1):
                url = f'https://www.eventbrite.com/d/ca--palo-alto/{category_slug}--events/silicon-valley/?page={page_number}'
                response = await session.get(url)
                await response.html.arender(timeout=20)
                soup = BeautifulSoup(response.html.html, 'html.parser')
                event_cards = soup.find_all('div', class_='search-event-card-wrapper')

                # If no event cards are found, stop paging this category
                if not event_cards:
                    break

                for event_card in event_cards:
                    row = await _build_event_row(event_card, category_name)
                    if row is not None:
                        rows.append(row)
    finally:
        await session.close()

    # Build the frame once; concatenating inside the loop copies all previous
    # rows on every append.
    return pd.DataFrame(rows, columns=_EVENT_COLUMNS)

async def main():
    """Collect the event table, print it, and save it to an Excel workbook."""
    scraped_events = await get_event_data()
    print(scraped_events)
    scraped_events.to_excel("Final_event_data.xlsx", index=False)

# Run the script
if __name__ == '__main__':
    import asyncio

    import nest_asyncio

    # nest_asyncio lets asyncio.run() be called while an event loop is already
    # running (as in a Jupyter kernel), so the same entry point works both in
    # a notebook cell and as a plain script. A bare top-level `await` is only
    # valid under IPython.
    nest_asyncio.apply()
    asyncio.run(main())

     start_date    end_date      time        category  \
0    2023-04-03  2023-04-03  06:00 PM        Business   
1    2023-03-29  2023-03-29  07:00 PM        Business   
2    2023-04-20  2023-04-20  02:00 PM        Business   
3    2023-03-30  2023-03-30  08:00 PM        Business   
4    2023-04-14  2023-04-14  08:00 AM        Business   
..          ...         ...       ...             ...   
157  2023-05-30  2023-05-30  09:00 AM  Science & Tech   
158  2023-03-29  2023-03-29  01:00 PM  Science & Tech   
159  2023-04-24  2023-04-24  04:00 PM  Science & Tech   
160  2023-04-04  2023-04-04  09:00 AM  Science & Tech   
161  2023-06-26  2023-06-26  08:30 AM  Science & Tech   

                                                  name  \
0    Silicon Valley: Past, Present and Future by Da...   
1    Silicon Valley Entrepreneurs ForumSilicon Vall...   
2    Silicon Valley Funding ForumSilicon Valley Fun...   
3    Silicon Valley Thursday Night Networking (In-P...   
4    Silicon Valley Propt