# Buenos Aires Cultural Events Dataset

### BUSINESS CHALLENGE:

Create a product that allows us to see a quick view of current cultural events in Buenos Aires classified by type of event and venue. It should provide information about all events and related links from different venues in Buenos Aires, so we can easily pick what we want to do this week. 

In [24]:
import os
print("cwd:", os.getcwd())
print("src exists:", os.path.exists(os.path.abspath("../src")))
print("scraper exists:", os.path.exists(os.path.abspath("../src/scraper.py")))

cwd: /Users/victoriayuzova/Data-Science-Projects/ba-events-recommender/notebooks
src exists: True
scraper exists: True


In [34]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt
import os
import sys
import json
import importlib

from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI

# Force the notebook to import the *local* ../src/scraper.py (not the pip package named `scraper`)
src_path = os.path.abspath("../src")
if src_path not in sys.path:
    sys.path.insert(0, src_path)
if "scraper" in sys.modules:
    del sys.modules["scraper"]

scraper = importlib.import_module("scraper")
fetch_website_links = scraper.fetch_website_links
fetch_website_contents = scraper.fetch_website_contents

In [35]:
homepage_urls = [
    "https://complejoteatral.gob.ar/",
    "https://malba.org.ar/",
    "https://www.teatrocervantes.gob.ar/",
    "https://turismo.buenosaires.gob.ar/es/article/que-hacer-esta-semana",
    "https://www.bellasartes.gob.ar/agenda/",
]

In [37]:
import pandas as pd

rows = []
for page_url in homepage_urls:
    event_url = fetch_website_links(page_url)
    rows.extend([{"page_url": page_url, "event_url": url} for url in event_url])
df = pd.DataFrame(rows)
df.head()

Unnamed: 0,page_url,event_url
0,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar
1,https://complejoteatral.gob.ar/,http://buenosaires.gob.ar/
2,https://complejoteatral.gob.ar/,#programacion
3,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar/agenda?fecha=18...
4,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar/pdf/temporada20...


In [40]:
df.describe()

Unnamed: 0,page_url,event_url
count,576,576
unique,5,297
top,https://malba.org.ar/,#
freq,187,38


In [41]:
df = df.drop_duplicates(subset=["page_url", "event_url"]).reset_index(drop=True)

In [42]:
df

Unnamed: 0,page_url,event_url
0,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar
1,https://complejoteatral.gob.ar/,http://buenosaires.gob.ar/
2,https://complejoteatral.gob.ar/,#programacion
3,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar/agenda?fecha=18...
4,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar/pdf/temporada20...
...,...,...
296,https://www.bellasartes.gob.ar/agenda/,/publicaciones
297,https://www.bellasartes.gob.ar/agenda/,/cdn-cgi/l/email-protection
298,https://www.bellasartes.gob.ar/agenda/,https://museoartedecorativo.cultura.gob.ar/
299,https://www.bellasartes.gob.ar/agenda/,https://museohistoriconacional.cultura.gob.ar/


In [46]:
mask = ~df["event_url"].astype(str).str.match(r"^https?://", na=False)
df[mask][["page_url", "event_url"]]

Unnamed: 0,page_url,event_url
2,https://complejoteatral.gob.ar/,#programacion
8,https://complejoteatral.gob.ar/,/cdn-cgi/l/email-protection#ddb4b3bbb29dbeb2b0...
13,https://complejoteatral.gob.ar/,/agenda?fecha=18-02-2026
14,https://complejoteatral.gob.ar/,/noticias/
15,https://complejoteatral.gob.ar/,#
...,...,...
293,https://www.bellasartes.gob.ar/agenda/,/coleccion/arte-argentino/
294,https://www.bellasartes.gob.ar/agenda/,/coleccion/obras-maestras/
295,https://www.bellasartes.gob.ar/agenda/,/noticias
296,https://www.bellasartes.gob.ar/agenda/,/publicaciones


In [47]:
# drop anchors + cloudflare email protection
df = df[~df["event_url"].astype(str).str.startswith("#", na=False)]
df = df[~df["event_url"].astype(str).str.contains("cdn-cgi/l/email-protection", na=False)]

In [48]:
df.describe()
df.head()
df.tail()
df.info()
df.isnull().sum()
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
Index: 284 entries, 0 to 300
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   page_url   284 non-null    object
 1   event_url  284 non-null    object
dtypes: object(2)
memory usage: 6.7+ KB


page_url     0
event_url    0
dtype: int64

In [49]:
pd.set_option("display.max_rows", 200)      # or None (careful)
pd.set_option("display.max_colwidth", None)
df

Unnamed: 0,page_url,event_url
0,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar
1,https://complejoteatral.gob.ar/,http://buenosaires.gob.ar/
3,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar/agenda?fecha=18-02-2026
4,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar/pdf/temporada2026.pdf
5,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar/ver/visitas_guiadas_al_teatro_san_martín
...,...,...
295,https://www.bellasartes.gob.ar/agenda/,/noticias
296,https://www.bellasartes.gob.ar/agenda/,/publicaciones
298,https://www.bellasartes.gob.ar/agenda/,https://museoartedecorativo.cultura.gob.ar/
299,https://www.bellasartes.gob.ar/agenda/,https://museohistoriconacional.cultura.gob.ar/


In [51]:
df.to_csv("links.csv", index=False)

In [29]:
# Initialize and constants

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Default model you can use elsewhere
MODEL = 'gpt-5-nano'

# Use a steadier model for link selection (avoid long hangs/timeouts)
LINK_MODEL = 'gpt-4.1-mini'

# Add a client-side timeout so a single slow request doesn't hang the notebook
openai = OpenAI(timeout=180)

In [None]:
import re
from urllib.parse import urljoin, urldefrag

df2 = df.copy()
df2["event_url"] = df2["event_url"].astype(str).str.strip()
df2 = df2[df2["event_url"].ne("")]

# normalize relative -> absolute + drop fragments (#...)
def normalize(row):
    u = row["event_url"]
    if not re.match(r"^https?://", u):
        u = urljoin(row["page_url"], u)
    u, _frag = urldefrag(u)
    return u

df2["event_url"] = df2.apply(normalize, axis=1)

# drop obvious junk
drop_pat = r"(cdn-cgi/l/email-protection|^javascript:|instagram\.com|facebook\.com|x\.com|twitter\.com|youtube\.com|tiktok\.com|flickr\.com|politicas-de-privacidad|aviso-legal|privacy|terms|cookies|/contacto/?$)"
df2 = df2[~df2["event_url"].str.contains(drop_pat, case=False, na=False)]

# keep likely event pages (adjust as you learn)
keep_pat = r"(/agenda|/evento/|/eventos/|/obra/|/calendario|/programacion|/proximamente|/ver/|/pdf/|entradasba\.buenosaires\.gob\.ar/evento/)"
df2 = df2[df2["event_url"].str.contains(keep_pat, case=False, na=False)]

df2 = df2.drop_duplicates(subset=["page_url", "event_url"]).reset_index(drop=True)
df2


  df2 = df2[~df2["event_url"].str.contains(drop_pat, case=False, na=False)]
  df2 = df2[df2["event_url"].str.contains(keep_pat, case=False, na=False)]


Unnamed: 0,page_url,event_url
0,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar/agenda?fecha=18-02-2026
1,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar/pdf/temporada2026.pdf
2,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar/ver/visitas_guiadas_al_teatro_san_martín
3,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar/ver/la_gaviota
4,https://complejoteatral.gob.ar/,https://entradasba.buenosaires.gob.ar/evento/d90f82ed-ec8f-46cf-b8b2-48e665a36fc3
5,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar/ver/los-pilares-de-la-sociedad
6,https://complejoteatral.gob.ar/,https://entradasba.buenosaires.gob.ar/evento/42523311-973e-4a52-b2ee-63a59c48a2b7
7,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar/ver/baco-polaco
8,https://complejoteatral.gob.ar/,https://entradasba.buenosaires.gob.ar/evento/f94c1c9a-151e-49a8-8ed8-c9aa9d0464c3
9,https://complejoteatral.gob.ar/,https://complejoteatral.gob.ar/ver/invasiones-1


In [54]:
grouped = df2.groupby("page_url")["event_url"].apply(list)
grouped

page_url
https://complejoteatral.gob.ar/           [https://complejoteatral.gob.ar/agenda?fecha=18-02-2026, https://complejoteatral.gob.ar/pdf/temporada2026.pdf, https://complejoteatral.gob.ar/ver/visitas_guiadas_al_teatro_san_martín, https://complejoteatral.gob.ar/ver/la_gaviota, https://entradasba.buenosaires.gob.ar/evento/d90f82ed-ec8f-46cf-b8b2-48e665a36fc3, https://complejoteatral.gob.ar/ver/los-pilares-de-la-sociedad, https://entradasba.buenosaires.gob.ar/evento/42523311-973e-4a52-b2ee-63a59c48a2b7, https://complejoteatral.gob.ar/ver/baco-polaco, https://entradasba.buenosaires.gob.ar/evento/f94c1c9a-151e-49a8-8ed8-c9aa9d0464c3, https://complejoteatral.gob.ar/ver/invasiones-1, https://entradasba.buenosaires.gob.ar/evento/e1281314-a634-47a0-a2fa-89f0c3c88b0b, https://complejoteatral.gob.ar/ver/buenas-palabras, https://entradasba.buenosaires.gob.ar/evento/67826298-9dd3-4680-bf3f-d3d652a513c0, https://complejoteatral.gob.ar/ver/chau-macoco, https://complejoteatral.gob.ar/ver/jazz_bue

In [56]:
# transform grouped into payloads - its a more efficient way to pass the data to the LLM

payloads = [
    {"homepage_url": page_url, "institution": None, "links": links}
    for page_url, links in grouped.items()
]

import json
payload_json_links = json.dumps(payloads, ensure_ascii=False, indent=2)
payload_json_links

'[\n  {\n    "homepage_url": "https://complejoteatral.gob.ar/",\n    "institution": null,\n    "links": [\n      "https://complejoteatral.gob.ar/agenda?fecha=18-02-2026",\n      "https://complejoteatral.gob.ar/pdf/temporada2026.pdf",\n      "https://complejoteatral.gob.ar/ver/visitas_guiadas_al_teatro_san_martín",\n      "https://complejoteatral.gob.ar/ver/la_gaviota",\n      "https://entradasba.buenosaires.gob.ar/evento/d90f82ed-ec8f-46cf-b8b2-48e665a36fc3",\n      "https://complejoteatral.gob.ar/ver/los-pilares-de-la-sociedad",\n      "https://entradasba.buenosaires.gob.ar/evento/42523311-973e-4a52-b2ee-63a59c48a2b7",\n      "https://complejoteatral.gob.ar/ver/baco-polaco",\n      "https://entradasba.buenosaires.gob.ar/evento/f94c1c9a-151e-49a8-8ed8-c9aa9d0464c3",\n      "https://complejoteatral.gob.ar/ver/invasiones-1",\n      "https://entradasba.buenosaires.gob.ar/evento/e1281314-a634-47a0-a2fa-89f0c3c88b0b",\n      "https://complejoteatral.gob.ar/ver/buenas-palabras",\n      "http

## Step 1. Use LLM to pick relevant links

### We will call an LLM so it picks only relevant links with events

We will use "one shot prompting" in which we provide an example of how it should respond in the prompt.

This is an excellent use case for an LLM, because it requires nuanced understanding - hard coding each scenario would take us quite some time.

In [30]:
link_system_prompt = """
You are selecting event-related links for a Buenos Aires cultural events scraper.

You will receive a JSON object as input with keys:
- homepage_url: string
- institution: string or null
- links: array of absolute URLs (strings)

Your job:
- Pick ONLY links that are relevant for finding current/upcoming cultural events (agenda/listings, event detail pages, ticket purchase pages, program PDFs, calendars).
- Exclude terms/privacy, contact/about, donations/sponsors, newsletters, login, generic navigation, accessibility pages, and any social media.
- Do NOT invent new URLs. Every returned URL must come from input.links.
- Return at most 30 links.

Return ONLY valid JSON (no markdown, no prose) with this schema:

{
  "homepage_url": "<string>",
  "institution": "<string or null>",
  "links": [
    {
      "url": "<string>"
    }
  ]
}
"""

In [57]:
def get_links_user_prompt(payload_json_links):
    user_prompt = f"""
Here is the list of links on the website in json format: {payload_json_links} -
Please decide which of these are relevant web links for a brochure listing current cultural
evens in Buenos Aires.
Do not include Terms of Service, Privacy, email, social media links, or general descriptions of the theater that´s not related to any event.

Links (some might be relative links):

"""
    links = fetch_website_links(payload_json_links)
    user_prompt += "\n".join(links)
    return user_prompt

In [61]:
def select_relevant_links(payload_json_links):
    print(f"Selecting relevant links for {payload_json_links} by calling {MODEL}")
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": payload_json_links}
        ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    links = json.loads(result)
    print(f"Found {len(links['links'])} relevant links")
    return links
    

In [60]:
outs = [select_relevant_links(p) for p in payloads]  # payloads is list of dicts or json strings

df_selected = pd.concat(
    [pd.DataFrame(o["links"]).assign(homepage_url=o.get("homepage_url"), institution=o.get("institution"))
     for o in outs],
    ignore_index=True
)

df_selected

Selecting relevant links for {'homepage_url': 'https://complejoteatral.gob.ar/', 'institution': None, 'links': ['https://complejoteatral.gob.ar/agenda?fecha=18-02-2026', 'https://complejoteatral.gob.ar/pdf/temporada2026.pdf', 'https://complejoteatral.gob.ar/ver/visitas_guiadas_al_teatro_san_martín', 'https://complejoteatral.gob.ar/ver/la_gaviota', 'https://entradasba.buenosaires.gob.ar/evento/d90f82ed-ec8f-46cf-b8b2-48e665a36fc3', 'https://complejoteatral.gob.ar/ver/los-pilares-de-la-sociedad', 'https://entradasba.buenosaires.gob.ar/evento/42523311-973e-4a52-b2ee-63a59c48a2b7', 'https://complejoteatral.gob.ar/ver/baco-polaco', 'https://entradasba.buenosaires.gob.ar/evento/f94c1c9a-151e-49a8-8ed8-c9aa9d0464c3', 'https://complejoteatral.gob.ar/ver/invasiones-1', 'https://entradasba.buenosaires.gob.ar/evento/e1281314-a634-47a0-a2fa-89f0c3c88b0b', 'https://complejoteatral.gob.ar/ver/buenas-palabras', 'https://entradasba.buenosaires.gob.ar/evento/67826298-9dd3-4680-bf3f-d3d652a513c0', 'http

Unnamed: 0,url,homepage_url,institution
0,https://complejoteatral.gob.ar/agenda?fecha=18-02-2026,https://complejoteatral.gob.ar/,
1,https://complejoteatral.gob.ar/pdf/temporada2026.pdf,https://complejoteatral.gob.ar/,
2,https://complejoteatral.gob.ar/ver/la_gaviota,https://complejoteatral.gob.ar/,
3,https://complejoteatral.gob.ar/ver/los-pilares-de-la-sociedad,https://complejoteatral.gob.ar/,
4,https://complejoteatral.gob.ar/ver/baco-polaco,https://complejoteatral.gob.ar/,
5,https://complejoteatral.gob.ar/ver/invasiones-1,https://complejoteatral.gob.ar/,
6,https://complejoteatral.gob.ar/ver/buenas-palabras,https://complejoteatral.gob.ar/,
7,https://complejoteatral.gob.ar/ver/chau-macoco,https://complejoteatral.gob.ar/,
8,https://complejoteatral.gob.ar/ver/jazz_buenos_aires,https://complejoteatral.gob.ar/,
9,https://complejoteatral.gob.ar/ver/festival_el_tornillo,https://complejoteatral.gob.ar/,


## Second step: lets classify those links

Assemble all the details into another prompt to GPT-5-nano

In [None]:
def fetch_page_and_all_relevant_links(payload_json_links):
    contents = fetch_website_contents(payload_json_links)
    relevant_links = select_relevant_links(payload_json_links)
    result = f"## Landing Page:\n\n{contents}\n## Relevant Links:\n"
    for link in relevant_links['links']:
        result += f"\n\n### Link: {link['type']}\n"
        result += fetch_website_contents(link["url"])
    return result

In [None]:
print(fetch_page_and_all_relevant_links("https://complejoteatral.gob.ar/"))

In [None]:
brochure_system_prompt = """
You are an assistant that analyzes the contents of several relevant pages from a cultural institution website
and creates a short brochure about the events that are happening in Buenos Aires.
Respond in markdown without code blocks.
Include event name, type, short description, date and time of the event, link to the event if available.
"""

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# brochure_system_prompt = """
# You are an assistant that analyzes the contents of several relevant pages from a company website
# and creates a short, humorous, entertaining, witty brochure about the company for prospective customers, investors and recruits.
# Respond in markdown without code blocks.
# Include details of company culture, customers and careers/jobs if you have the information.
# """


In [None]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"""
You are looking at a cultural institution called: {company_name}
Here are the contents of its pages that contain information about current cultural events in Buenos Aires;
use this information to build a short brochure about the events that are happening in Buenos Aires.\n\n
"""
    user_prompt += fetch_page_and_all_relevant_links(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [None]:
get_brochure_user_prompt("Teatro San Martín", "https://complejoteatral.gob.ar/")

In [None]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[
            {"role": "system", "content": brochure_system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [None]:
create_brochure("Teatro San Martín", "https://complejoteatral.gob.ar/")

## Finally - a minor improvement

With a small adjustment, we can change this so that the results stream back from OpenAI,
with the familiar typewriter animation

In [None]:
def stream_brochure(company_name, url):
    stream = openai.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[
            {"role": "system", "content": brochure_system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        update_display(Markdown(response), display_id=display_handle.display_id)