# Google Places Api

In [None]:
# Part 1: Setup and Initialization
import requests
import csv
import time


# Constants
API_KEY = 'AIzaSyCjrwh-MIXJDbM-9K7NmMF2TyfTZMuNgPA'  # Replace with your actual API key
LOCATION = '-25.2637,-57.5759'  # Asunción coordinates
RADIUS = 15000  # 5 km radius # Categories to fetch
OUTPUT_FILE = 'places_details.csv'

# Headers for the CSV file
CSV_HEADERS = ['place_id', 'place_name', 'category', 'rating', 'location_lat', 'location_lng', 'formatted_address', 'opening_hours', 'phone_number', 'website_url', 'facebook', 'instagram', 'twitter', 'timestamp']

# # Function to write data to CSV
# def write_to_csv(data, file):
#     with open(file, mode='a', newline='', encoding='utf-8') as f:
#         writer = csv.writer(f)
#         writer.writerow(data)

# # Initialize CSV with headers
# with open(OUTPUT_FILE, mode='w', newline='', encoding='utf-8') as f:
#     writer = csv.writer(f)
#     writer.writerow(CSV_HEADERS)

# print("Setup complete and CSV initialized.")


In [None]:
PLACE_TYPES = [
    'accounting', 'airport', 'amusement_park', 'aquarium', 'art_gallery', 'atm', 'bakery', 'bank', 'bar',
    'beauty_salon', 'bicycle_store', 'book_store', 'bowling_alley', 'bus_station', 'cafe', 'campground',
    'car_dealer', 'car_rental', 'car_repair', 'car_wash', 'casino', 'cemetery', 'church', 'city_hall',
    'clothing_store', 'convenience_store', 'courthouse', 'dentist', 'department_store', 'doctor', 'electrician',
    'electronics_store', 'embassy', 'fire_station', 'florist', 'funeral_home', 'furniture_store', 'gas_station',
    'gym', 'hair_care', 'hardware_store', 'hindu_temple', 'home_goods_store', 'hospital', 'insurance_agency',
    'jewelry_store', 'laundry', 'lawyer', 'library', 'light_rail_station', 'liquor_store', 'local_government_office',
    'locksmith', 'lodging', 'meal_delivery', 'meal_takeaway', 'mosque', 'movie_rental', 'movie_theater',
    'moving_company', 'museum', 'night_club', 'painter', 'park', 'parking', 'pet_store', 'pharmacy', 'physiotherapist',
    'plumber', 'police', 'post_office', 'real_estate_agency', 'restaurant', 'roofing_contractor', 'rv_park', 'school',
    'shoe_store', 'shopping_mall', 'spa', 'stadium', 'storage', 'store', 'subway_station', 'supermarket', 'synagogue',
    'taxi_stand', 'train_station', 'transit_station', 'travel_agency', 'university', 'veterinary_care', 'zoo'
]


In [None]:
def get_place_details(place_id):
    place_details_url = f"https://maps.googleapis.com/maps/api/place/details/json?place_id={place_id}&fields=name,rating,formatted_address,geometry,opening_hours,international_phone_number,website&key={API_KEY}"

    # Debugging: Print the place details request URL
    print(f"Fetching details for place_id: {place_id}")
    print(f"Details URL: {place_details_url}")

    response = requests.get(place_details_url)

    # Debugging: Check the status code and response
    print(f"Response Status Code for Place Details: {response.status_code}")

    if response.status_code == 200:
        return response.json().get('result', {})
    else:
        print(f"Error fetching details for place_id {place_id}. Status code: {response.status_code}")
        return {}

# Function to get nearby places with debugging
def get_nearby_places():
    places = []
    for place_type in PLACE_TYPES:
        nearby_search_url = f"https://maps.googleapis.com/maps/api/place/nearbysearch/json?location={LOCATION}&radius={RADIUS}&type={place_type}&key={API_KEY}"

        # Debugging: Print the URL to ensure it's correct
        print(f"Fetching places of type: {place_type}")
        print(f"Request URL: {nearby_search_url}")

        response = requests.get(nearby_search_url)

        # Debugging: Check the status code of the response
        print(f"Response Status Code: {response.status_code}")

        if response.status_code == 200:
            data = response.json()

            # Debugging: Print the full response to understand the content
            print("Response JSON:")
            print(data)

            if data.get('results'):
                places.extend(data['results'])
            else:
                print(f"No places found for type: {place_type}")
        else:
            # If the request failed, print the error
            print(f"Error fetching places for type {place_type}. Status code: {response.status_code}")

    return places

# Function to write data to CSV
def write_to_csv(data, file):
    with open(file, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(data)

# Initialize CSV with headers
with open(OUTPUT_FILE, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(CSV_HEADERS)

# Main process: Fetch nearby places, then get details and write to CSV
def main():
    places = get_nearby_places()

    print(f"Total places fetched: {len(places)}")

    if len(places) > 0:
        for place in places:
            place_id = place.get('place_id')
            details = get_place_details(place_id)

            # Extract relevant fields from place details
            place_name = details.get('name', 'N/A')
            rating = details.get('rating', 'N/A')
            formatted_address = details.get('formatted_address', 'N/A')
            location_lat = details['geometry']['location']['lat']
            location_lng = details['geometry']['location']['lng']
            opening_hours = details.get('opening_hours', {}).get('weekday_text', 'N/A')
            phone_number = details.get('international_phone_number', details.get('national_phone_number', 'N/A'))
            website_url = details.get('website', 'N/A')
            timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())

            # Determine the category (place type)
            categories = place.get('types', [])
            category = categories[0] if categories else 'N/A'

            # Write the row to the CSV (without social media links)
            row = [place_id, place_name, category, rating, location_lat, location_lng, formatted_address, opening_hours, phone_number, website_url, timestamp]
            write_to_csv(row, OUTPUT_FILE)

    else:
        print("No places were fetched.")

# Run the script
if __name__ == "__main__":
    main()

In [None]:
# Part 3: Fetch Place Details and Extract Social Media Links

# Function to get detailed information about each place
def get_place_details(place_id):
    place_details_url = f"https://maps.googleapis.com/maps/api/place/details/json?place_id={place_id}&fields=name,rating,formatted_address,geometry,opening_hours,international_phone_number,website&key={API_KEY}"
    response = requests.get(place_details_url)
    return response.json().get('result', {})

# Function to extract social media links from a website
def extract_social_media_links(website_url):
    social_media = {'facebook': None, 'instagram': None, 'twitter': None}

    try:
        response = requests.get(website_url, timeout=5)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            # Look for links in the website's content
            for link in soup.find_all('a', href=True):
                href = link['href']
                if 'facebook.com' in href:
                    social_media['facebook'] = href
                elif 'instagram.com' in href:
                    social_media['instagram'] = href
                elif 'twitter.com' in href:
                    social_media['twitter'] = href
    except Exception as e:
        print(f"Error fetching website {website_url}: {e}")

    return social_media

# Process each place
for place in places:
    place_id = place.get('place_id')
    details = get_place_details(place_id)

    # Extract relevant fields
    place_name = details.get('name', 'N/A')
    rating = details.get('rating', 'N/A')
    formatted_address = details.get('formatted_address', 'N/A')
    location_lat = details['geometry']['location']['lat']
    location_lng = details['geometry']['location']['lng']
    opening_hours = details.get('opening_hours', {}).get('weekday_text', 'N/A')
    phone_number = details.get('international_phone_number', details.get('national_phone_number', 'N/A'))
    website_url = details.get('website', 'N/A')
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())

    # Determine the category (place type)
    categories = place.get('types', [])
    category = categories[0] if categories else 'N/A'

    # Extract social media links (if a website is available)
    facebook, instagram, twitter = None, None, None
    if website_url != 'N/A':
        social_media_links = extract_social_media_links(website_url)
        facebook = social_media_links.get('facebook')
        instagram = social_media_links.get('instagram')
        twitter = social_media_links.get('twitter')

    # Write the row to the CSV
    row = [place_id, place_name, category, rating, location_lat, location_lng, formatted_address, opening_hours, phone_number, website_url, facebook, instagram, twitter, timestamp]
    write_to_csv(row, OUTPUT_FILE)

print(f"Data written to {OUTPUT_FILE}")


# Ticketea Scraping


In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
from datetime import datetime
import json
import os
from pathlib import Path

def fetch_event_links():
    base_url = "https://ticketea.com.py"

    try:
        response = requests.get(base_url)
        print(f"Status code for main page: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching main page: {e}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch the main page. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    print(f"Page content length: {len(response.content)}")

    event_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith("/events/"):
            full_url = f"{base_url}{href}"
            event_links.append(full_url)
            print(f"Found event link: {full_url}")

    event_links = list(set(event_links))
    print(f"Total unique event links: {len(event_links)}")
    return event_links

def extract_meta_property(soup, property_name):
    meta_tag = soup.find('meta', property=property_name)
    return meta_tag.get('content') if meta_tag else None

def extract_coordinates_from_iframe(soup):
    iframe = soup.find('iframe', src=lambda x: x and 'google.com/maps/embed' in x)
    if iframe and iframe.get('src'):
        src = iframe['src']
        if 'q=' in src:
            coords = src.split('q=')[-1].split('&')[0]
            try:
                lat, lon = map(float, coords.split(','))
                return {'latitude': lat, 'longitude': lon}
            except:
                pass
    return {'latitude': None, 'longitude': None}

def fetch_event_details(event_url):
    print(f"\nFetching details for event: {event_url}")

    try:
        response = requests.get(event_url)
        print(f"Status code for event page {event_url}: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching event page {event_url}: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch event details for {event_url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    print(f"Event page content length: {len(response.content)}")

    # Extract meta properties
    meta_data = {
        'id': extract_meta_property(soup, 'og:id'),
        'category': extract_meta_property(soup, 'og:product:category'),
        'price': extract_meta_property(soup, 'og:product:price:amount'),
        'currency': extract_meta_property(soup, 'og:product:price:currency'),
        'brand': extract_meta_property(soup, 'og:product:brand'),
        'availability': extract_meta_property(soup, 'og:product:availability'),
        'condition': extract_meta_property(soup, 'og:product:condition')
    }

    # Extract title from different possible locations
    title = soup.find('title').text.strip() if soup.find('title') else None
    if title and "Entradas para" in title:
        title = title.replace("Entradas para", "").replace("- Ticketea.com.py", "").strip()

    # Extract coordinates
    coordinates = extract_coordinates_from_iframe(soup)

    # Extract event card details
    event_card = soup.find("div", {"class": "card card-event-config"})
    if event_card:
        print("Found the event card.")

        # Extract event image attributes
        image_tag = event_card.find("img")
        img_alt = image_tag.get('alt', 'No Alt') if image_tag else "No Alt"
        img_title = image_tag.get('title', 'No Title') if image_tag else "No Title"
        img_src = image_tag.get('src', 'No Image') if image_tag else "No Image"

        # Extract event info
        event_info = event_card.find_all("p", {"class": "size-md m-b-0 line-height-20"})
        event_hours = event_info[0].text.strip() if len(event_info) > 0 else "No Hours"
        event_location = event_info[1].text.strip() if len(event_info) > 1 else "No Location"
        event_address = event_info[2].text.strip() if len(event_info) > 2 else "No Address"

        # Extract description
        details_elem = soup.find("div", {"class": "event__detalle p-x"})
        additional_details = []
        if details_elem:
            for p in details_elem.find_all("p"):
                additional_details.append(p.text.strip())
        additional_details_text = " | ".join(additional_details)

        # Compile all extracted data
        event_data = {
            "title": title,
            "image_alt": img_alt,
            "image_title": img_title,
            "image_url": img_src,
            "event_url": event_url,
            "event_hours": event_hours,
            "event_location": event_location,
            "event_address": event_address,
            "additional_details": additional_details_text,
            "coordinates": coordinates,
            "meta": meta_data
        }

        return event_data
    else:
        print(f"No event card found for {event_url}.")
        return None

def write_to_csv(events, timestamp):
    if not events:
        print("No event data available to write.")
        return

    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)

    csv_path = os.path.join(output_dir, f"ticketea_events_{timestamp}.csv")
    json_path = os.path.join(output_dir, f"ticketea_events_{timestamp}.json")

    # Write CSV
    with open(csv_path, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=events[0].keys())
        writer.writeheader()
        writer.writerows(events)
    print(f"CSV file saved: {csv_path}")

    # Write JSON
    with open(json_path, 'w', encoding='utf-8') as file:
        json.dump(events, file, ensure_ascii=False, indent=2)
    print(f"JSON file saved: {json_path}")

def main():
    print("\n=== Starting event scraping process... ===\n")

    event_links = fetch_event_links()
    print(f"\nFound {len(event_links)} event links.")

    events = []
    for event_url in event_links:
        print(f"\nScraping event: {event_url}")
        event_data = fetch_event_details(event_url)

        if event_data:
            events.append(event_data)
            print(f"Successfully scraped: {event_data['title']}")

        time.sleep(2)  # Sleep between requests

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    if events:
        write_to_csv(events, timestamp)
    else:
        print("No events to write to files.")

    print(f"\n=== Scraping completed. Total events scraped: {len(events)} ===\n")

if __name__ == "__main__":
    main()


=== Starting event scraping process... ===

Status code for main page: 200
Page content length: 138857
Found event link: https://ticketea.com.py/events/duki-ameri-world-tour-2025-dia-2
Found event link: https://ticketea.com.py/events/rock-al-puerto
Found event link: https://ticketea.com.py/events/chayanne-bailemos-otra-vez-tour-2025
Found event link: https://ticketea.com.py/events/river-beats
Found event link: https://ticketea.com.py/events/conferencia-mario-alonso-puig
Found event link: https://ticketea.com.py/events/paraguari-music-fest-2
Found event link: https://ticketea.com.py/events/kany-garcia-en-paraguay
Found event link: https://ticketea.com.py/events/congreso-diamante-renuevo-el-pacto
Found event link: https://ticketea.com.py/events/un-mentes-expertas-de-victor-kuppers-vivir-con-actitud
Found event link: https://ticketea.com.py/events/ventas-salvajes
Found event link: https://ticketea.com.py/events/retro-fest
Found event link: https://ticketea.com.py/events/el-retrovisor-dia

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from typing import Dict, Optional, Tuple
from googlesearch import search
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_coordinates(html_content: str) -> Optional[Tuple[float, float]]:
    """Extract coordinates from the map link in the HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')
    map_link = soup.find('a', {'aria-label': 'View larger map'})

    if map_link and 'll=' in map_link.get('href', ''):
        href = map_link['href']
        coords_match = re.search(r'll=([-\d.]+),([-\d.]+)', href)
        if coords_match:
            lat, lng = map(float, coords_match.groups())
            return (lat, lng)
    return None

def determine_event_category(event_data: Dict) -> str:
    """
    Determine event category based on predefined local categories.
    """
    # Define category keywords in both Spanish and English
    category_keywords = {
        'Capacitaciones': [
            'capacitación', 'taller', 'curso', 'workshop', 'training',
            'aprende', 'aprendizaje', 'certificación', 'diploma'
        ],
        'Poesía': [
            'poesía', 'poetry', 'poeta', 'verso', 'recital',
            'poema', 'lírica', 'literario'
        ],
        'Conferencias': [
            'conferencia', 'charla', 'ponencia', 'seminario', 'congreso',
            'lecture', 'conference', 'talk', 'symposium'
        ],
        'Festivales': [
            'festival', 'feria', 'fiesta', 'celebration', 'carnaval',
            'fair', 'fest'
        ],
        'Cine': [
            'cine', 'película', 'film', 'cinema', 'movie',
            'proyección', 'screening', 'premiere', 'estreno'
        ],
        'Música': [
            'música', 'music', 'concierto', 'concert', 'show',
            'banda', 'band', 'cantante', 'singer', 'dj', 'recital',
            'orquesta', 'orchestra', 'musical'
        ],
        'Danza': [
            'danza', 'dance', 'baile', 'ballet', 'coreografía',
            'choreography', 'bailarín', 'dancer'
        ],
        'Teatro': [
            'teatro', 'theater', 'obra', 'play', 'drama',
            'actuación', 'acting', 'theatrical', 'teatral'
        ],
        'Arte': [
            'arte', 'art', 'exposición', 'exhibition', 'galería',
            'gallery', 'artista', 'artist', 'muestra'
        ],
        'Fotografía': [
            'fotografía', 'photography', 'foto', 'photo', 'imagen',
            'fotógrafo', 'photographer', 'photoshoot'
        ],
        'Gastronomía': [
            'gastronomía', 'gastronomy', 'comida', 'food', 'cocina',
            'cooking', 'culinario', 'culinary', 'restaurante', 'degustación',
            'tasting', 'chef'
        ],
        'Deportes': [
            'deporte', 'sports', 'partido', 'match', 'torneo',
            'tournament', 'campeonato', 'championship', 'competencia',
            'competition', 'juego', 'game'
        ],
        'Turismo': [
            'turismo', 'tourism', 'viaje', 'travel', 'excursión',
            'tour', 'visita', 'visit', 'aventura', 'adventure'
        ]
    }

    try:
        # Combine event name and description for analysis
        event_text = f"{event_data['eventName']} {event_data.get('eventDescription', '')}".lower()

        # Search for keywords in the event text
        category_scores = {}
        for category, keywords in category_keywords.items():
            # Count matches for each keyword
            score = sum(1 for keyword in keywords if keyword.lower() in event_text)
            # Add extra weight for keywords found in the event name
            name_score = sum(3 for keyword in keywords
                           if keyword.lower() in event_data['eventName'].lower())
            category_scores[category] = score + name_score

        # Get additional context from web search
        search_query = f"{event_data['eventName']} {event_data.get('eventDescription', '')[:100]}"
        search_results = []
        for result in search(search_query, num_results=3):
            search_results.append(result)

        # Add scores from search results
        search_text = ' '.join(search_results).lower()
        for category, keywords in category_keywords.items():
            search_score = sum(1 for keyword in keywords if keyword.lower() in search_text)
            category_scores[category] += search_score * 0.5  # Weight search results less than direct matches

        # Return the category with the highest score
        max_score = max(category_scores.values())
        if max_score > 0:
            return max(category_scores.items(), key=lambda x: x[1])[0]
        return 'Otros'  # Default category if no clear match

    except Exception as e:
        print(f"Error in category determination: {e}")
        return 'Otros'

def enrich_event_data(events_df: pd.DataFrame) -> pd.DataFrame:
    """
    Enrich events dataframe with coordinates and categories.
    """
    enriched_df = events_df.copy()
    enriched_df['latlng'] = None
    enriched_df['eventCategory'] = None

    for idx, row in enriched_df.iterrows():
        try:
            # Fetch the event page again to get coordinates
            response = requests.get(row['event_url'])
            if response.status_code == 200:
                # Extract coordinates
                coords = extract_coordinates(response.text)
                if coords:
                    enriched_df.at[idx, 'latlng'] = str(coords)

                # Determine category
                event_data = row.to_dict()
                category = determine_event_category(event_data)
                enriched_df.at[idx, 'eventCategory'] = category

            print(f"Enriched data for event: {row['eventName']}")

        except Exception as e:
            print(f"Error enriching data for event {row['eventName']}: {e}")

        # Add delay between requests
        time.sleep(2)

    return enriched_df

def save_enriched_data(df: pd.DataFrame, output_path: str):
    """Save enriched data to both CSV and JSON formats."""
    # Save CSV
    df.to_csv(f"{output_path}_enriched.csv", index=False)

    # Save JSON
    df.to_json(f"{output_path}_enriched.json", orient='records', indent=2)

def main():
    # Load the original scraped data
    input_path = "output/ticketea_events_latest"  # Adjust path as needed
    df = pd.read_csv(f"{input_path}.csv")

    # Enrich the data
    print("Starting data enrichment process...")
    enriched_df = enrich_event_data(df)

    # Save enriched data
    save_enriched_data(enriched_df, input_path)
    print("Data enrichment completed and saved!")

if __name__ == "__main__":
    main()

Starting data enrichment process...
Error in category determination: 'eventName'


KeyError: 'eventName'

# Asukeai

In [None]:
!pip install selenium
!pip install beautifulsoup4
!pip install requests
!apt-get update
!apt install chromium-chromedriver
!pip install webdriver_manager

import json
from bs4 import BeautifulSoup
import re
from urllib.parse import unquote
import uuid
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def extract_google_maps_coordinates(url):
    """
    Extract coordinates from Google Maps URL by following redirects and parsing the final URL.
    """
    try:
        response = requests.get(url, allow_redirects=True)
        final_url = response.url

        patterns = [
            r'@(-?\d+\.\d+),(-?\d+\.\d+)',  # Pattern for @lat,lng
            r'll=(-?\d+\.\d+),(-?\d+\.\d+)',  # Pattern for ll=lat,lng
            r'q=(-?\d+\.\d+),(-?\d+\.\d+)'    # Pattern for q=lat,lng
        ]

        for pattern in patterns:
            match = re.search(pattern, final_url)
            if match:
                return {
                    "latitude": float(match.group(1)),
                    "longitude": float(match.group(2))
                }

        print(f"Could not extract coordinates from URL: {final_url}")
        return {
            "latitude": -25.2867,
            "longitude": -57.3333
        }

    except Exception as e:
        print(f"Error extracting coordinates: {str(e)}")
        return {
            "latitude": -25.2867,
            "longitude": -57.3333
        }

def clean_text(text):
    """Clean and normalize text content."""
    if not text:
        return ""
    return " ".join(text.strip().split())

def extract_price(price_text):
    """Extract numerical price from text."""
    if not price_text:
        return "0"
    numbers = re.findall(r'\d+', price_text.replace('.', ''))
    return numbers[0] if numbers else "0"

def parse_event_block(block):
    """Parse a single event block and return structured data."""
    soup = BeautifulSoup(block, 'html.parser')

    event = {
        "title": "",
        "image_url": "",
        "image_alt": "",
        "image_title": "",
        "event_url": "",
        "event_hours": "",
        "event_location": "",
        "event_address": "",
        "additional_details": "",
        "coordinates": {"latitude": 0, "longitude": 0},
        "meta": {
            "id": str(uuid.uuid4()),
            "category": "",
            "price": "0",
            "currency": "PYG",
            "brand": "Notion Events",
            "availability": "in stock",
            "condition": "new"
        }
    }

    # Extract title
    title_div = soup.find('div', attrs={'data-content-editable-leaf': 'true'})
    if title_div:
        event["title"] = clean_text(title_div.text)
        event["image_alt"] = event["title"]
        event["image_title"] = event["title"]

    # Extract image
    img_tag = soup.find('img')
    if img_tag and 'src' in img_tag.attrs:
        src = img_tag['src']
        if src.startswith('/image/'):
            src = unquote(src[7:].split('?')[0])
        event["image_url"] = src

    # Extract date and time
    date_div = soup.find('div', text=re.compile(r'\w+\s+\d+,\s+\d{4}'))
    time_div = soup.find('div', text=re.compile(r'\d{1,2}:\d{2}\s*Hs\.'))

    if date_div and time_div:
        event["event_hours"] = f"{clean_text(date_div.text)}, {clean_text(time_div.text)}"

    # Extract location and coordinates
    location_link = soup.find('a', href=re.compile(r'maps\.google\.com|goo\.gl'))
    if location_link:
        event["event_location"] = clean_text(location_link.text)
        event["event_address"] = event["event_location"]
        event["coordinates"] = extract_google_maps_coordinates(location_link['href'])

    # Extract category
    category_span = soup.find('span', text=re.compile(r'Música|Teatro|Arte|Cine|Capacitación'))
    if category_span:
        event["meta"]["category"] = clean_text(category_span.text)

    # Extract price
    price_span = soup.find('span', text=re.compile(r'Gs\.\s*[\d\.]+'))
    if price_span:
        event["meta"]["price"] = extract_price(price_span.text)

    # Extract description
    description_spans = soup.find_all('span', recursive=True)
    descriptions = []
    for span in description_spans:
        if len(clean_text(span.text)) > 30:  # Assume longer texts are descriptions
            descriptions.append(clean_text(span.text))
    if descriptions:
        event["additional_details"] = " | ".join(descriptions)

    return event

def parse_notion_events(html_content):
    """Parse all event blocks from the Notion page."""
    soup = BeautifulSoup(html_content, 'html.parser')
    event_blocks = soup.find_all('div', attrs={'data-block-id': True, 'class': 'notion-selectable notion-page-block notion-collection-item'})

    events = []
    for block in event_blocks:
        try:
            event = parse_event_block(str(block))
            events.append(event)
        except Exception as e:
            print(f"Error parsing block: {e}")
            continue

    return events

# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Setup the driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# Load the page
url = "https://asukeai.notion.site/f3e4701380844f29a8775995fd9f9747?v=3c1dfc5a47d346d5a65ccc8c596c843c"
driver.get(url)

# Wait for the content to load
wait = WebDriverWait(driver, 20)
try:
    # Wait for events to load
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "notion-collection-item")))
    # Give additional time for everything to render
    time.sleep(5)

    # Get the page source after JavaScript has rendered
    html_content = driver.page_source

    # Parse events
    events = parse_notion_events(html_content)

    # Save to JSON file
    with open('events.json', 'w', encoding='utf-8') as f:
        json.dump(events, f, indent=2, ensure_ascii=False)

    print("Events have been saved to events.json")
    print(f"Total events extracted: {len(events)}")

except Exception as e:
    print(f"Error loading page: {e}")

finally:
    driver.quit()

Collecting selenium
  Downloading selenium-4.27.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.28.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.27.1-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.28.0-py3-none-any.whl (486 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.3/486.3 kB[0m [31m37.6 MB/s

WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally.
  (unknown error: DevToolsActivePort file doesn't exist)
  (The process started from chrome location /usr/bin/chromium-browser is no longer running, so ChromeDriver is assuming that Chrome has crashed.)
Stacktrace:
#0 0x59c5f8f6e4e3 <unknown>
#1 0x59c5f8c9dc76 <unknown>
#2 0x59c5f8cc6d78 <unknown>
#3 0x59c5f8cc3029 <unknown>
#4 0x59c5f8d01ccc <unknown>
#5 0x59c5f8d0147f <unknown>
#6 0x59c5f8cf8de3 <unknown>
#7 0x59c5f8cce2dd <unknown>
#8 0x59c5f8ccf34e <unknown>
#9 0x59c5f8f2e3e4 <unknown>
#10 0x59c5f8f323d7 <unknown>
#11 0x59c5f8f3cb20 <unknown>
#12 0x59c5f8f33023 <unknown>
#13 0x59c5f8f011aa <unknown>
#14 0x59c5f8f576b8 <unknown>
#15 0x59c5f8f57847 <unknown>
#16 0x59c5f8f67243 <unknown>
#17 0x7aca406acac3 <unknown>


# Tuti Scraping


In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import time
from datetime import datetime
import json
import os
import re
from pathlib import Path
from urllib.parse import urljoin

def extract_meta_property(soup, property_name):
    """Helper function to extract meta tag properties."""
    meta_tag = soup.find('meta', property=property_name)
    return meta_tag.get('content') if meta_tag else None

def fetch_event_links(base_url="https://tuti.com.py"):
    """Extract all event links from the main page."""
    print(f"\nFetching events from: {base_url}")

    try:
        response = requests.get(base_url)
        print(f"Status code for main page: {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching main page: {e}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch main page. Status code: {response.status_code}")
        return []

    return extract_events(response.content)

def fetch_event_details(event_url, basic_info):
    """Fetch detailed information for a single event."""
    print(f"\nFetching details for event: {event_url}")

    try:
        response = requests.get(event_url)
        print(f"Status code: {response.status_code}")
        soup = BeautifulSoup(response.content, 'html.parser')

        # Initialize details dictionary
        details = {}

        # Extract map coordinates if available
        map_iframe = soup.find('iframe', {'src': lambda x: x and 'maps.google.com/maps' in x})
        if map_iframe:
            src = map_iframe.get('src', '')
            coords_match = re.search(r'q=([-\d\.]+),([-\d\.]+)', src)
            if coords_match:
                details['coordinates'] = {
                    'latitude': coords_match.group(1),
                    'longitude': coords_match.group(2)
                }

        # Extract meta properties
        basic_info['meta'] = {
            'id': extract_meta_property(soup, 'og:id'),
            'category': extract_meta_property(soup, 'og:product:category'),
            'price': extract_meta_property(soup, 'og:product:price:amount'),
            'currency': extract_meta_property(soup, 'og:product:price:currency'),
            'brand': extract_meta_property(soup, 'og:product:brand'),
            'availability': extract_meta_property(soup, 'og:product:availability'),
            'description': extract_meta_property(soup, 'og:description')
        }

        # Extract event information
        info_div = soup.find('div', {'class': 'informacion'})
        if info_div:
            event_info = []
            info_header = info_div.find('h4')
            if info_header and info_header.text.strip() == 'Información del evento':
                paragraphs = info_div.find_all('p')
                for p in paragraphs:
                    text = p.get_text(strip=True)
                    if text:
                        event_info.append(text)
            if event_info:
                details['event_information'] = event_info

        # Extract sectors and prices
        sectors_div = soup.find('section', {'class': 'sectores_disponibles'})
        if sectors_div:
            sectors = []
            sector_rows = sectors_div.find_all('div', {'class': 'caja_sectores_fila'})
            for row in sector_rows:
                sector_name = row.find('div', {'class': 'caja_sectores_nombre'})
                sector_price = row.find('div', {'class': 'caja_sectores_precio'})
                if sector_name and sector_price:
                    sectors.append({
                        'name': sector_name.text.strip(),
                        'price': sector_price.text.strip()
                    })
            if sectors:
                details['sectors'] = sectors

        # Get venue information
        location_details = soup.find("div", class_="info-event")
        if location_details:
            details['venue'] = location_details.get_text(strip=True)

        basic_info['details'] = details
        return basic_info

    except Exception as e:
        print(f"Error fetching event details: {e}")
        return basic_info

def extract_events(html):
    """Extract events from the main page HTML"""
    soup = BeautifulSoup(html, 'html.parser')
    events = []

    event_blocks = soup.find_all("div", class_="un_evento")

    for block in event_blocks:
        try:
            # Extract basic info
            link = block.find("a", href=True)['href']
            title = block.find("h2", class_="un_evento_title").text.strip()

            info_spans = block.find_all("span")
            date = info_spans[0].text.strip() if info_spans else "No date"
            location = info_spans[1].text.strip() if len(info_spans) > 1 else "No location"

            price_btn = block.find("div", class_="un_evento_info_bottom").find("a", class_="btn-primary")
            if price_btn:
                price_text = price_btn.text.strip()
                # Remove "Comprá desde" and extract just the number
                price = price_text.replace("Comprá desde", "").replace("PYG", "").strip()
            else:
                price = "No price"

            img_tag = block.find("img")
            img_url = img_tag.get('src', '') if img_tag else ''

            # Check if the link already has the full URL
            if link.startswith('http'):
                full_url = link
            else:
                full_url = urljoin("https://tuti.com.py", link)

            event_info = {
                "title": title,
                "url": full_url,
                "date": date,
                "location": location,
                "price": price,
                "image_url": img_url
            }

            events.append(event_info)
            print(f"Found event: {title}")

        except Exception as e:
            print(f"Error processing event block: {e}")
            continue

    return events

def write_to_files(events, timestamp):
    """Write scraped data to both CSV and JSON files."""
    if not events:
        print("No event data available to write.")
        return

    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)

    # Write JSON
    json_path = os.path.join(output_dir, f"tuti_events_{timestamp}.json")
    with open(json_path, 'w', encoding='utf-8') as file:
        json.dump(events, file, ensure_ascii=False, indent=2)
    print(f"JSON file saved: {json_path}")

    # Write CSV - updated to include new fields
    csv_path = os.path.join(output_dir, f"tuti_events_{timestamp}.csv")
    with open(csv_path, mode="w", newline="", encoding="utf-8") as file:
        fieldnames = [
            'title', 'url', 'date', 'location', 'price', 'image_url',
            'meta_id', 'meta_category', 'meta_price', 'meta_currency',
            'meta_brand', 'meta_availability', 'description', 'venue',
            'latitude', 'longitude'
        ]
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        for event in events:
            coordinates = event.get('details', {}).get('coordinates', {})
            flat_event = {
                'title': event.get('title', ''),
                'url': event.get('url', ''),
                'date': event.get('date', ''),
                'location': event.get('location', ''),
                'price': event.get('price', ''),
                'image_url': event.get('image_url', ''),
                'meta_id': event.get('meta', {}).get('id', ''),
                'meta_category': event.get('meta', {}).get('category', ''),
                'meta_price': event.get('meta', {}).get('price', ''),
                'meta_currency': event.get('meta', {}).get('currency', ''),
                'meta_brand': event.get('meta', {}).get('brand', ''),
                'meta_availability': event.get('meta', {}).get('availability', ''),
                'description': event.get('meta', {}).get('description', ''),
                'venue': event.get('details', {}).get('venue', ''),
                'latitude': coordinates.get('latitude', ''),
                'longitude': coordinates.get('longitude', '')
            }
            writer.writerow(flat_event)

    print(f"CSV file saved: {csv_path}")

def main():
    print("\n=== Starting TUTI event scraping process... ===\n")

    # Get all event links from main page
    events = fetch_event_links()
    print(f"\nFound {len(events)} events on main page.")

    # Get detailed information for each event
    for event in events:
        print(f"\nScraping details for: {event['title']}")
        event_details = fetch_event_details(event['url'], event)
        if event_details:
            print(f"Successfully scraped details for: {event['title']}")

        # Sleep between requests to avoid overloading the server
        time.sleep(2)

    # Generate timestamp and save data
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    write_to_files(events, timestamp)

    print(f"\n=== Scraping completed. Total events processed: {len(events)} ===\n")

if __name__ == "__main__":
    main()


=== Starting TUTI event scraping process... ===


Fetching events from: https://tuti.com.py
Status code for main page: 200

Found 0 events on main page.
No event data available to write.

=== Scraping completed. Total events processed: 0 ===



In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
from datetime import datetime
import json
import os
from pathlib import Path
import pandas as pd
import re
from typing import Dict, Optional, Tuple, List

def parse_spanish_date(date_str: str) -> Optional[str]:
    """Parse Spanish date string to ISO format timestamp."""
    try:
        spanish_months = {
            'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
            'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
            'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12
        }

        # Remove day name and split at comma
        main_parts = date_str.split(',')[0].split()
        time_part = date_str.split(',')[1].strip()

        # Extract day and month
        day = int([x for x in main_parts if x.isdigit()][0])
        month_str = next(x.lower() for x in main_parts if x.lower() in spanish_months)
        month = spanish_months[month_str]

        # Extract time
        time_match = re.search(r'(\d{1,2}):(\d{2})', time_part)
        if time_match:
            hour = int(time_match.group(1))
            minute = int(time_match.group(2))

            # Get current year or next year if the date has passed
            current_date = datetime.now()
            year = current_date.year
            event_date = datetime(year, month, day, hour, minute)

            # If the date has passed, use next year
            if event_date < current_date:
                event_date = datetime(year + 1, month, day, hour, minute)

            return event_date.isoformat()
        return None
    except Exception as e:
        print(f"Error parsing date {date_str}: {e}")
        return None

def extract_coordinates_from_schema(html_content: str) -> Optional[Tuple[float, float]]:
    """Extract coordinates from schema.org metadata."""
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        script_tags = soup.find_all('script', type='application/ld+json')

        for script in script_tags:
            try:
                data = json.loads(script.string)
                if isinstance(data, dict) and data.get('@type') == 'Event':
                    location = data.get('location', {})
                    if location.get('geo'):
                        lat = float(location['geo'].get('latitude'))
                        lng = float(location['geo'].get('longitude'))
                        return (lat, lng)
            except json.JSONDecodeError:
                continue
        return None
    except Exception as e:
        print(f"Error extracting coordinates from schema: {e}")
        return None

def format_coordinates(coords: Tuple[float, float]) -> List[str]:
    """Format coordinates as [latitude° S/N, longitude° E/W]."""
    if not coords:
        return []

    lat, lng = coords
    lat_hemisphere = 'S' if lat < 0 else 'N'
    lng_hemisphere = 'W' if lng < 0 else 'E'

    lat_str = f"{abs(lat)}° {lat_hemisphere}"
    lng_str = f"{abs(lng)}° {lng_hemisphere}"

    return [lat_str, lng_str]

def extract_price_from_text(text: str) -> Optional[int]:
    """Extract the lowest price from text looking for Gs. pattern."""
    try:
        price_pattern = r'(?:Gs\.?\s*(\d{1,3}(?:\.\d{3})*)|(\d{1,3}(?:\.\d{3})*)\s*Gs\.?)'
        matches = re.findall(price_pattern, text)

        if matches:
            prices = []
            for match in matches:
                price_str = match[0] if match[0] else match[1]
                price = int(price_str.replace('.', ''))
                prices.append(price)
            return min(prices) if prices else None
        return None
    except Exception as e:
        print(f"Error extracting price: {e}")
        return None

def clean_event_title(title: str) -> str:
    """Clean event title by removing website suffix and 'Entradas para' prefix."""
    title = re.sub(r'\s*-\s*Ticketea\.com\.py$', '', title)
    title = re.sub(r'^Entradas\s+para\s+', '', title)
    return title.strip()

def fetch_event_links():
    """Fetch all event links from the main page."""
    base_url = "https://ticketea.com.py"
    try:
        response = requests.get(base_url)
        if response.status_code != 200:
            print(f"Failed to fetch main page. Status code: {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, 'html.parser')
        event_links = [f"{base_url}{link['href']}" for link in soup.find_all('a', href=True)
                      if link['href'].startswith("/events/")]

        unique_links = list(set(event_links))
        print(f"Found {len(unique_links)} unique event links")
        return unique_links
    except Exception as e:
        print(f"Error fetching main page: {e}")
        return []

def fetch_event_details(event_url: str) -> Optional[Dict]:
    """Fetch and parse event details from the event page."""
    print(f"\nFetching details for event: {event_url}")

    try:
        response = requests.get(event_url)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.content, 'html.parser')

        title = soup.find('meta', {'property': 'og:title'})
        title = clean_event_title(title['content']) if title else ""

        image = soup.find('meta', {'property': 'og:image'})
        image_url = image['content'] if image else ""

        description_div = soup.find('div', {'class': 'description-content'})
        if description_div:
            description_text = description_div.get_text()
            price = extract_price_from_text(description_text)
        else:
            description = soup.find('meta', {'property': 'og:description'})
            description_text = description['content'] if description else ""
            price = 0

        coords = extract_coordinates_from_schema(response.text)
        latlong = format_coordinates(coords) if coords else []

        date_time_elem = soup.find('p', {'class': 'size-md m-b-0 line-height-20'})
        event_datetime = date_time_elem.text.strip() if date_time_elem else None
        parsed_datetime = parse_spanish_date(event_datetime) if event_datetime else datetime.now().isoformat()

        address_elem = soup.find('meta', {'property': 'og:description'})
        if address_elem:
            address_parts = address_elem['content'].split(',')
            street = address_parts[0].strip() if len(address_parts) > 0 else ""
            city = address_parts[1].strip() if len(address_parts) > 1 else ""
        else:
            street, city = "", ""

        event_data = {
            "eventId": event_url.split('/')[-1],
            "eventName": title,
            "eventPhoto": image_url,
            "createdTime": datetime.now().isoformat(),
            "eventStartTime": parsed_datetime,
            "eventEndTime": parsed_datetime,
            "priceTicket": int(price) if price else 0,
            "registeredUsers": "",
            "street": street,
            "city": city,
            "zipcode": "",
            "country": "PY",
            "uid": "/users/rBU6BGqtw1QAMeXOxUEktFtfVan1",
            "eventDescription": description_text,
            "eventCategory": None,
            "latlong": latlong,
            "eventPrice": int(price) if price else 0,
            "name": city
        }

        event_data['eventCategory'] = determine_event_category(event_data)

        print(f"Successfully processed: {title}")
        print(f"Date: {parsed_datetime}")
        print(f"Price: {price}")
        print(f"Coordinates: {latlong}")

        return event_data

    except Exception as e:
        print(f"Error processing event {event_url}: {e}")
        return None

def determine_event_category(event_data: Dict) -> str:
    """Determine event category based on predefined local categories."""
    category_keywords = {
        'Capacitaciones': ['capacitación', 'taller', 'curso', 'workshop'],
        'Música': ['música', 'concierto', 'festival', 'show', 'banda'],
        'Teatro': ['teatro', 'obra', 'drama'],
        'Deportes': ['deporte', 'partido', 'torneo']
    }

    try:
        text = f"{event_data['eventName']} {event_data.get('eventDescription', '')}".lower()
        scores = {cat: sum(1 for kw in kws if kw.lower() in text)
                 for cat, kws in category_keywords.items()}

        if scores:
            return max(scores.items(), key=lambda x: x[1])[0]
        return 'Otros'

    except Exception as e:
        print(f"Error determining category: {e}")
        return 'Otros'

def save_results(events, timestamp):
    """Save results in a format compatible with FlutterFlow import."""
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)

    csv_path = os.path.join(output_dir, f"ticketea_events_{timestamp}.csv")

    # Clean and standardize the data for CSV
    csv_events = []
    for event in events:
        csv_event = event.copy()
        # Convert latlong to string if it's a list
        if isinstance(csv_event['latlong'], list):
            csv_event['latlong'] = ', '.join(csv_event['latlong'])
        csv_events.append(csv_event)

    df = pd.DataFrame(csv_events)
    df.to_csv(csv_path,
              index=False,
              encoding='utf-8',
              quoting=csv.QUOTE_MINIMAL,
              na_rep='')

    return csv_path

def main():
    print("Starting scraping process...")
    event_links = fetch_event_links()
    events = []

    for event_url in event_links:
        try:
            event_data = fetch_event_details(event_url)
            if event_data:
                events.append(event_data)
                print(f"Successfully added event: {event_data['eventName']}")
            time.sleep(2)
        except Exception as e:
            print(f"Error processing event {event_url}: {e}")
            continue

    if events:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        csv_path = save_results(events, timestamp)

        print(f"\nScraping completed!")
        print(f"Total events processed: {len(events)}")
        print(f"File saved: {csv_path}")
    else:
        print("No events were successfully processed")

if __name__ == "__main__":
    main()

Starting scraping process...
Found 35 unique event links

Fetching details for event: https://ticketea.com.py/events/juli-hernandez-en-asuncion
Successfully processed: JULI HERNANDEZ EN ASUNCION
Date: 2025-07-11T22:00:00
Price: None
Coordinates: ['25.2908566° S', '57.6177523° W']
Successfully added event: JULI HERNANDEZ EN ASUNCION

Fetching details for event: https://ticketea.com.py/events/pisada-indie
Successfully processed: PISADA INDIE
Date: 2025-07-04T20:00:00
Price: None
Coordinates: ['25.2872649° S', '57.62565° W']
Successfully added event: PISADA INDIE

Fetching details for event: https://ticketea.com.py/events/barca-academy-campus-minga-guazu-2025
Successfully processed: BARÇA ACADEMY CAMPUS MINGA GUAZÚ 2025
Date: 2025-07-21T08:30:00
Price: 3120000
Coordinates: ['25.4871801° S', '54.782938° W']
Successfully added event: BARÇA ACADEMY CAMPUS MINGA GUAZÚ 2025

Fetching details for event: https://ticketea.com.py/events/bootcamp-los-secretos-detras-de-la-magia
Successfully process

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# ETL

# Update To Firestore


In [None]:
import json
from datetime import datetime
import re

def determine_category(event):
    """Determine proper category based on event data."""
    # Get category data from different possible sources
    meta_category = (event.get('meta', {}).get('category') or '').lower()
    description = (event.get('eventDescription', '') or event.get('additional_details', '') or '').lower()
    title = (event.get('title') or '').lower()

    # Define category mappings
    category_map = {
        'conciertos': ['concierto', 'música', 'music', 'show', 'festival', 'recital', 'banda', 'cantante'],
        'teatro': ['teatro', 'obra', 'drama', 'comedia', 'musical'],
        'deportes': ['carrera', 'deporte', 'partido', 'torneo', 'competencia', '5k', 'maratón'],
        'festivales': ['festival', 'fiesta', 'party', 'carnaval', 'patronales'],
        'educación': ['curso', 'taller', 'workshop', 'capacitación', 'seminario', 'congreso', 'colación'],
        'arte': ['arte', 'exposición', 'galería', 'museum', 'exhibición'],
        'familiar': ['familiar', 'family', 'niños', 'kids'],
        'urbano': ['dj', 'electrónica', 'reggaeton', 'rap', 'trap', 'hip hop'],
        'cultural': ['cultural', 'tradición', 'folklore']
    }

    # Text to check
    text_to_check = f"{title} {description} {meta_category}"

    # Check each category's keywords
    for category, keywords in category_map.items():
        if any(keyword in text_to_check for keyword in keywords):
            return category

    # Default category
    return 'otros'

def parse_date_string(date_str):
    """Parse various date string formats and return datetime object."""
    if not date_str:
        return datetime.now()

    try:
        # Clean up the date string
        date_str = date_str.lower()

        # Remove day names
        date_str = re.sub(r'^(lunes|martes|miércoles|jueves|viernes|sábado|domingo)\s+', '', date_str)

        # Fix common scraping issues in month names
        date_str = re.sub(r'februaryruary|februaryuary', 'february', date_str)
        date_str = re.sub(r'marchch', 'march', date_str)
        date_str = re.sub(r'julyy', 'july', date_str)
        date_str = re.sub(r'septembertember', 'september', date_str)

        # Convert Spanish months to English
        month_map = {
            'enero': 'january', 'febrero': 'february', 'marzo': 'march',
            'abril': 'april', 'mayo': 'may', 'junio': 'june',
            'julio': 'july', 'agosto': 'august', 'septiembre': 'september',
            'octubre': 'october', 'noviembre': 'november', 'diciembre': 'december',
            'ene': 'january', 'feb': 'february', 'mar': 'march',
            'abr': 'april', 'may': 'may', 'jun': 'june',
            'jul': 'july', 'ago': 'august', 'sep': 'september',
            'oct': 'october', 'nov': 'november', 'dic': 'december'
        }

        for spanish, english in month_map.items():
            date_str = re.sub(rf'\b{spanish}\b', english, date_str)

        # Remove 'de' and other connecting words
        date_str = re.sub(r'\sde\s', ' ', date_str)

        # Extract date components
        match = re.search(r'(\d{1,2})\s+([a-z]+)(?:\s+(\d{4}))?\s*(?:-|,)?\s*(\d{1,2}):(\d{2})', date_str)

        if match:
            day = int(match.group(1))
            month = match.group(2)
            year = int(match.group(3)) if match.group(3) else 2025
            hour = int(match.group(4))
            minute = int(match.group(5))
        else:
            match = re.search(r'(\d{1,2})\s+([a-z]+)(?:\s+(\d{4}))?', date_str)
            if not match:
                return datetime.now()

            day = int(match.group(1))
            month = match.group(2)
            year = int(match.group(3)) if match.group(3) else 2025
            hour = 0
            minute = 0

        month_nums = {
            'january': 1, 'february': 2, 'march': 3, 'april': 4,
            'may': 5, 'june': 6, 'july': 7, 'august': 8,
            'september': 9, 'october': 10, 'november': 11, 'december': 12
        }

        month_num = month_nums.get(month.lower())
        if not month_num:
            raise ValueError(f"Invalid month: {month}")

        return datetime(year, month_num, day, hour, minute)
    except Exception as e:
        print(f"Error parsing date '{date_str}': {str(e)}")
        return datetime.now()

def transform_event(event):
    """Transform event data to Firestore format."""
    # Parse dates
    start_time = parse_date_string(event.get('date') or event.get('event_hours', ''))

    # Clean up location data
    location = event.get('event_location', '') or event.get('location', '')
    if location == 'No Location':
        location = 'Asunción'

    # Extract coordinates
    lat = -25.2867  # Default coordinates for Asunción
    lng = -57.3333

    # For Tuti events
    if 'details' in event and event['details'].get('coordinates'):
        coordinates = event['details']['coordinates']
        if coordinates.get('latitude') and coordinates.get('longitude'):
            try:
                lat = float(coordinates['latitude'])
                lng = float(coordinates['longitude'])
            except (ValueError, TypeError):
                pass

    # For Ticketea events
    elif event.get('coordinates'):
        coordinates = event['coordinates']
        if coordinates.get('latitude') and coordinates.get('longitude'):
            try:
                lat = float(coordinates['latitude'])
                lng = float(coordinates['longitude'])
            except (ValueError, TypeError):
                pass

    # Ensure numeric values
    try:
        raw_price = event.get('price', '') or event.get('meta', {}).get('price', '0')
        price = int(float(str(raw_price).replace('PYG', '').replace('.', '').strip()))
    except (ValueError, TypeError):
        price = 0

    # Get category
    event_category = determine_category(event)

    transformed = {
        "eventId": event.get('meta', {}).get('id', '') or event.get('url', '').split('/')[-1],
        "eventName": event.get('title', ''),
        "eventPhoto": event.get('image_url', ''),
        "createdTime": "SERVER_TIMESTAMP",
        "eventStartTime": start_time,
        "eventEndTime": start_time,  # Same as start time since we don't have end time
        "priceTicket": price,
        "registeredUsers": [],
        "street": event.get('event_address', ''),
        "city": location,
        "zipcode": "",
        "country": "PY",
        "uid": "rBU6BGqtw1QAMeXOxUEktFtfVan1",
        "eventDescription": event.get('additional_details', '') or event.get('meta', {}).get('description', ''),
        "eventCategory": event_category,
        "latlong": {"latitude": lat, "longitude": lng},
        "eventPrice": price,
        "name": event.get('title', ''),
        "source": "ticketea" if 'ticketea' in str(event.get('url', '')) else "tuti"
    }

    print(f"Processing {transformed['eventName']} - Category: {event_category}")
    return transformed

def process_events(json_file):
    """Process events from JSON file and transform to Firestore format."""
    with open(json_file, 'r', encoding='utf-8') as f:
        events = json.load(f)
    return [transform_event(event) for event in events]

def main():
    """Main process to transform events and save output."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Process both Ticketea and Tuti events
    tuti_path = '/content/output/tuti_events_20250112_235505.json'
    ticketea_path = '/content/output/ticketea_events_20250112_235058.json'

    # Load and transform Tuti events
    tuti_events = process_events(tuti_path)
    ticketea_events = process_events(ticketea_path)

    # Combine all events
    all_events = tuti_events + ticketea_events

    # Save transformed events
    output_path = f'/content/output/processed_events_{timestamp}.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_events, f, ensure_ascii=False, indent=2, default=str)

    print(f"Successfully processed {len(all_events)} events")
    print(f"Output saved to: {output_path}")

if __name__ == "__main__":
    main()

Processing Super Copa - Category: otros
Processing Combos Navideños - Category: otros
Processing Carnaval Encarnaceno 2025 - Category: festivales
Processing Conferencia de Sanidad Emocional "Izzanami Martinez" - Category: otros
Processing DUKI en Concierto 5 de Abril - Category: conciertos
Processing Carrera de las chicas 5 K - Category: deportes
Processing FIESTA RETROVISOR VERANO - Category: festivales
Processing So Pra Contrariar - Category: otros
Processing Super Copa - Category: otros
Processing Combos Navideños - Category: otros
Processing Carnaval Encarnaceno 2025 - Category: festivales
Processing Conferencia de Sanidad Emocional "Izzanami Martinez" - Category: otros
Processing DUKI en Concierto 5 de Abril - Category: conciertos
Processing Festival del Ykua Bolaños - Category: conciertos
Processing Sepultura - Category: otros
Processing Carrera de las chicas 5 K - Category: deportes
Processing Fansrock - Category: otros
Processing FIESTA RETROVISOR VERANO - Category: festivales


In [None]:
os.environ['GOOGLE_CLOUD_PROJECT'] = 'vamospy-discovery-vf7nj4'

In [None]:
import json
from datetime import datetime
import re

def parse_date_string(date_str):
    """Parse various date string formats and return datetime object."""
    if not date_str:
        return datetime.now()

    try:
        # Clean up the date string
        date_str = date_str.lower()

        # Remove day names
        date_str = re.sub(r'^(lunes|martes|miércoles|jueves|viernes|sábado|domingo)\s+', '', date_str)

        # Fix common scraping issues in month names
        date_str = re.sub(r'februaryruary|februaryuary', 'february', date_str)
        date_str = re.sub(r'marchch', 'march', date_str)
        date_str = re.sub(r'julyy', 'july', date_str)
        date_str = re.sub(r'septembertember', 'september', date_str)

        # Convert Spanish months to English
        month_map = {
            'enero': 'january',
            'febrero': 'february',
            'marzo': 'march',
            'abril': 'april',
            'mayo': 'may',
            'junio': 'june',
            'julio': 'july',
            'agosto': 'august',
            'septiembre': 'september',
            'octubre': 'october',
            'noviembre': 'november',
            'diciembre': 'december',
            'ene': 'january',
            'feb': 'february',
            'mar': 'march',
            'abr': 'april',
            'may': 'may',
            'jun': 'june',
            'jul': 'july',
            'ago': 'august',
            'sep': 'september',
            'oct': 'october',
            'nov': 'november',
            'dic': 'december'
        }

        for spanish, english in month_map.items():
            date_str = re.sub(rf'\b{spanish}\b', english, date_str)

        # Remove 'de' and other connecting words
        date_str = re.sub(r'\sde\s', ' ', date_str)

        # Extract date components
        match = re.search(r'(\d{1,2})\s+([a-z]+)(?:\s+(\d{4}))?\s*(?:-|,)?\s*(\d{1,2}):(\d{2})', date_str)

        if match:
            day = int(match.group(1))
            month = match.group(2)
            year = int(match.group(3)) if match.group(3) else 2025
            hour = int(match.group(4))
            minute = int(match.group(5))
        else:
            match = re.search(r'(\d{1,2})\s+([a-z]+)(?:\s+(\d{4}))?', date_str)
            if not match:
                return datetime.now()

            day = int(match.group(1))
            month = match.group(2)
            year = int(match.group(3)) if match.group(3) else 2025
            hour = 0
            minute = 0

        month_nums = {
            'january': 1, 'february': 2, 'march': 3, 'april': 4,
            'may': 5, 'june': 6, 'july': 7, 'august': 8,
            'september': 9, 'october': 10, 'november': 11, 'december': 12
        }

        month_num = month_nums.get(month.lower())
        if not month_num:
            raise ValueError(f"Invalid month: {month}")

        return datetime(year, month_num, day, hour, minute)
    except Exception as e:
        print(f"Error parsing date '{date_str}': {str(e)}")
        return datetime.now()

def transform_event(event):
    """Transform event data to Firestore format."""
    # Parse dates
    start_time = parse_date_string(event.get('date') or event.get('event_hours', ''))

    # Clean up location data
    location = event.get('event_location', '') or event.get('location', '')
    if location == 'No Location':
        location = 'Asunción'

    # Extract coordinates
    lat = -25.2867  # Default coordinates for Asunción
    lng = -57.3333

    # For Tuti events
    if 'details' in event and event['details'].get('coordinates'):
        coordinates = event['details']['coordinates']
        if coordinates.get('latitude') and coordinates.get('longitude'):
            lat = float(coordinates['latitude'])
            lng = float(coordinates['longitude'])

    # For Ticketea events
    elif event.get('coordinates'):
        coordinates = event['coordinates']
        if coordinates.get('latitude') and coordinates.get('longitude'):
            lat = float(coordinates['latitude'])
            lng = float(coordinates['longitude'])

    # Ensure numeric values
    try:
        raw_price = event.get('price', '') or event.get('meta', {}).get('price', '0')
        price = int(float(str(raw_price).replace('PYG', '').replace('.', '').strip()))
    except (ValueError, TypeError):
        price = 0

    transformed = {
        "eventId": event.get('meta', {}).get('id', '') or event.get('url', '').split('/')[-1],
        "eventName": event.get('title', ''),
        "eventPhoto": event.get('image_url', ''),
        "createdTime": "SERVER_TIMESTAMP",
        "eventStartTime": start_time,
        "eventEndTime": start_time,  # Same as start time since we don't have end time
        "priceTicket": price,
        "registeredUsers": [],
        "street": event.get('event_address', ''),
        "city": location,
        "zipcode": "",
        "country": "PY",
        "uid": "users/rBU6BGqtw1QAMeXOxUEktFtfVan1",
        "eventDescription": event.get('additional_details', '') or event.get('meta', {}).get('description', ''),
        "eventCategory": event.get('meta', {}).get('category', ''),
        "latlong": {"latitude": lat, "longitude": lng},
        "eventPrice": price,
        "name": event.get('title', ''),
        "source": "ticketea" if 'ticketea' in str(event.get('url', '')) else "tuti"
    }

    print(f"Processing {transformed['eventName']} - Coordinates: {lat}, {lng}")
    return transformed

def process_events(json_file):
    """Process events from JSON file and transform to Firestore format."""
    with open(json_file, 'r', encoding='utf-8') as f:
        events = json.load(f)
    return [transform_event(event) for event in events]

def main():
    """Main process to transform events and save output."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Process both Ticketea and Tuti events
    tuti_path = '/content/output/tuti_events_20250107_223214.json'
    ticketea_path = '/content/output/ticketea_events_20250107_214401.json'

    # Load and transform Tuti events
    tuti_events = process_events(tuti_path)
    ticketea_events = process_events(ticketea_path)

    # Combine all events
    all_events = tuti_events + ticketea_events

    # Save transformed events
    output_path = f'/content/output/processed_events_{timestamp}.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_events, f, ensure_ascii=False, indent=2, default=str)

    print(f"Successfully processed {len(all_events)} events")
    print(f"Output saved to: {output_path}")

if __name__ == "__main__":
    main()

AttributeError: 'NoneType' object has no attribute 'lower'

In [None]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import json
import os
import logging
from datetime import datetime

def prepare_firestore_data(event_data):
    """Prepare data for Firestore by converting data types appropriately."""
    data = event_data.copy()

    # Convert datetime strings to datetime objects
    if isinstance(data.get('eventStartTime'), str):
        try:
            data['eventStartTime'] = datetime.fromisoformat(data['eventStartTime'])
        except (ValueError, TypeError):
            data['eventStartTime'] = datetime.now()

    if isinstance(data.get('eventEndTime'), str):
        try:
            data['eventEndTime'] = datetime.fromisoformat(data['eventEndTime'])
        except (ValueError, TypeError):
            data['eventEndTime'] = datetime.now()

    # Handle ServerTimestamp
    if data.get('createdTime') == 'SERVER_TIMESTAMP':
        data['createdTime'] = firestore.SERVER_TIMESTAMP

    # Convert latlong to GeoPoint if it's a dict
    if isinstance(data.get('latlong'), dict):
        lat = float(data['latlong'].get('latitude', -25.2867))
        lng = float(data['latlong'].get('longitude', -57.3333))
        data['latlong'] = firestore.GeoPoint(lat, lng)

    # Ensure registeredUsers is a list
    if not isinstance(data.get('registeredUsers'), list):
        data['registeredUsers'] = []

    # Ensure numeric fields are integers
    try:
        data['priceTicket'] = int(data.get('priceTicket', 0))
    except (ValueError, TypeError):
        data['priceTicket'] = 0

    try:
        data['eventPrice'] = int(data.get('eventPrice', 0))
    except (ValueError, TypeError):
        data['eventPrice'] = 0

    return data

def upload_to_firestore(json_path, cred_path):
    """
    Upload processed events to Firestore.

    Args:
        json_path: Path to the processed events JSON file
        cred_path: Path to the Firebase credentials JSON file
    """
    # Configure logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
    logger = logging.getLogger(__name__)

    try:
        # Set project ID
        os.environ['GOOGLE_CLOUD_PROJECT'] = 'vamospy-discovery-vf7nj4'

        # Initialize Firebase
        if not firebase_admin._apps:
            cred = credentials.Certificate(cred_path)
            firebase_admin.initialize_app(cred)

        db = firestore.client()
        logger.info("Successfully connected to Firestore")

        # Read the processed events
        with open(json_path, 'r', encoding='utf-8') as f:
            events = json.load(f)
        logger.info(f"Loaded {len(events)} events from {json_path}")

        # Process events in batches
        batch_size = 400
        total_uploaded = 0
        total_errors = 0
        batch = db.batch()
        count = 0

        for event in events:
            try:
                # Prepare data for Firestore
                event_data = prepare_firestore_data(event)

                # Get document ID
                doc_id = event_data.get('eventId')
                if not doc_id:
                    logger.warning(f"Skipping event with missing eventId: {event}")
                    continue

                # Create reference and add to batch
                doc_ref = db.collection('events').document(doc_id)
                batch.set(doc_ref, event_data)
                count += 1
                total_uploaded += 1

                # Commit batch when limit reached
                if count >= batch_size:
                    batch.commit()
                    logger.info(f"Committed batch of {count} events. Total uploaded: {total_uploaded}")
                    batch = db.batch()
                    count = 0

            except Exception as e:
                total_errors += 1
                logger.error(f"Error processing event {event.get('eventId', 'UNKNOWN')}: {str(e)}")
                if total_errors > 50:
                    logger.error("Too many errors. Stopping upload.")
                    break
                continue

        # Commit remaining batch
        if count > 0:
            batch.commit()
            logger.info(f"Committed final batch of {count} events")

        logger.info(f"Upload completed. Total events uploaded: {total_uploaded}, Total errors: {total_errors}")
        return total_uploaded, total_errors

    except Exception as e:
        logger.error(f"Critical error during upload: {str(e)}")
        return 0, 1

if __name__ == "__main__":
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    json_path = '/content/output/processed_events_20250108_001716.json'
    cred_path = '/content/vamospy-discovery-vf7nj4-89f04f350c3f.json'

    if not os.path.exists(cred_path):
        print(f"Error: Service account key not found at {cred_path}")
    elif not os.path.exists(json_path):
        print(f"Error: Processed events file not found at {json_path}")
    else:
        total_uploaded, total_errors = upload_to_firestore(json_path, cred_path)
        print(f"Upload complete: {total_uploaded} uploaded, {total_errors} errors")

Upload complete: 61 uploaded, 0 errors


In [None]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import logging
import os

def update_uids_in_firestore():
    """Update all UIDs in Firestore events collection."""
    try:
        # Get db reference
        db = firestore.client()

        # Get all documents
        events_ref = db.collection('events')
        docs = events_ref.stream()

        # Process updates directly (without batch)
        updated = 0
        for doc in docs:
            try:
                events_ref.document(doc.id).set({
                    'uid': '/users/rBU6BGqtw1QAMeXOxUEktFtfVan1'
                }, merge=True)
                updated += 1
                print(f"Updated document {doc.id}")
            except Exception as e:
                print(f"Error updating document {doc.id}: {e}")
                continue

        print(f"Successfully updated {updated} documents")

    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    # Set project ID
    os.environ['GOOGLE_CLOUD_PROJECT'] = 'vamospy-discovery-vf7nj4'

    # Initialize Firebase
    if not firebase_admin._apps:
        cred = credentials.Certificate('/content/vamospy-discovery-vf7nj4-89f04f350c3f.json')
        firebase_admin.initialize_app(cred)

    update_uids_in_firestore()

Updated document 011e5e95-a5d3-47e1-9e08-0e905a1fa444
Updated document 01c43b58-e459-4252-a958-493bbb9f6824
Updated document 0329c5c0-f703-47ed-a032-e77c2685aa8f
Updated document 1054284f-0bbe-4238-bbf9-81c8cff4d7f8
Updated document 118c6b5d-b1d0-4280-8fd2-22a9a8df1d06
Updated document 11b27e4e-0732-4a97-aba5-df57d8357ec4
Updated document 15eb83c8-fa03-4c14-bad0-5ba2303f9368
Updated document 27c97dbb-2c58-45c3-9b7b-1fa4c7495016
Updated document 43410aa5-ed86-4f77-a935-feed745e0f81
Updated document 50-aniversario-credivill
Updated document 57fe698e-9f5b-43e2-b154-29f9a4da8a61
Updated document 59845c62-2bee-4336-8078-58c8e043406c
Updated document 6aadce01-6c3d-44ea-b2d2-086b5bfbd59c
Updated document 708ddbfe-64c7-4a4f-9a0f-bc6acefa949f
Updated document 7113c965-4ec0-4fbd-8460-9cdf2229448e
Updated document 8tdibgsOzODv46hM2Hmk
Updated document 909a56bd-498c-4ccf-be9c-65a8b7269f58
Updated document 9452cf0a-8123-4b65-9d1a-f305c37232f3
Updated document 9825849c-0da3-4a47-9c84-811521889d4e
Up

In [None]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import os

def update_all_documents():
    """Update all documents with the working UID format."""
    try:
        db = firestore.client()

        # First get the working format from our known working document
        working_doc = db.collection('events').document('a4c068f5-845d-42ea-9c90-e1ac78570347').get()
        working_uid = working_doc.get('uid')

        print("Using UID format:", working_uid)

        # Process in batches for efficiency
        batch_size = 450  # Firestore limit is 500
        batch = db.batch()
        count = 0
        total_updated = 0

        # Get all documents
        docs = db.collection('events').stream()

        for doc in docs:
            try:
                if doc.id != 'a4c068f5-845d-42ea-9c90-e1ac78570347':  # Skip our reference document
                    doc_ref = db.collection('events').document(doc.id)
                    batch.update(doc_ref, {'uid': working_uid})
                    count += 1
                    total_updated += 1

                    # Commit batch when size limit reached
                    if count >= batch_size:
                        batch.commit()
                        print(f"Updated batch of {count} documents. Total: {total_updated}")
                        batch = db.batch()
                        count = 0

            except Exception as e:
                print(f"Error updating document {doc.id}: {e}")
                continue

        # Commit any remaining documents
        if count > 0:
            batch.commit()
            print(f"Updated final batch of {count} documents")

        print(f"\nSuccessfully updated {total_updated} documents")

    except Exception as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    # Set project ID
    os.environ['GOOGLE_CLOUD_PROJECT'] = 'vamospy-discovery-vf7nj4'

    # Initialize Firebase
    if not firebase_admin._apps:
        cred = credentials.Certificate('/content/vamospy-discovery-vf7nj4-89f04f350c3f.json')
        firebase_admin.initialize_app(cred)

    update_all_documents()

Using UID format: <google.cloud.firestore_v1.document.DocumentReference object at 0x7aa1f049d240>
Updated final batch of 94 documents

Successfully updated 94 documents
