In [None]:
from pydantic import BaseModel, ConfigDict, BeforeValidator, Field
from enum import Enum
from datetime import datetime
from typing import Optional, Annotated

# String-typed identifier: BeforeValidator(str) runs str() on the raw input
# before pydantic validates the field as `str`.
# NOTE(review): presumably this exists so MongoDB ObjectId values can be loaded
# through the `_id` alias on NewConcert — confirm.
PyObjectId = Annotated[str, BeforeValidator(str)]

class ConcertStatus(Enum):
    """Status codes for a concert, each mapped to a small integer.

    NewConcert is configured with ``use_enum_values=True``, so a validated
    model holds the integer value (e.g. ``1``) rather than the enum member.
    Keep the numbers stable: changing one changes what models carry.
    """
    TENTATIVE = 0
    CONFIRMED = 1
    ON_SALE = 2
    SOLD_OUT = 3
    POSTPONED = 4
    CANCELED = 5
    COMPLETED = 6

class NewConcert(BaseModel):
    """Pydantic model describing one concert record.

    Fields:
        id: Optional identifier, populated from the ``_id`` alias and coerced
            to ``str`` by ``PyObjectId``. Defaults to ``None``.
        uid: Application-level unique id (callers in this notebook pass a UUID4 string).
        artist: Performing artist name.
        tour_name: Optional tour name; defaults to ``None`` when omitted.
        venue: Venue name.
        location: Free-form location label (e.g. ``'Austin, TX, USA'``).
        concert_datetime: When the concert takes place.
        status: Optional ``ConcertStatus`` member or raw int; defaults to ``None``.
        update_datetime: When this record was created/refreshed; defaults to
            ``datetime.now()`` at construction time.

    Because ``use_enum_values=True``, a ``ConcertStatus`` passed as ``status``
    is held on the model as its integer value.
    """
    id: Optional[PyObjectId] = Field(alias='_id', default=None) # primary key
    uid: str
    artist: str
    # In pydantic v2 an Optional annotation without a default is still a
    # *required* field, so explicit defaults are needed to make these optional.
    tour_name: Optional[str] = None
    venue: str
    location: str
    concert_datetime: datetime
    status: Optional[ConcertStatus | int] = None
    # default_factory is evaluated per instance, so each record gets its own
    # timestamp and callers that do not pass update_datetime still validate.
    update_datetime: datetime = Field(default_factory=datetime.now)

    model_config = ConfigDict(use_enum_values=True)

In [31]:
import requests
from bs4 import BeautifulSoup
from uuid import uuid4
from datetime import datetime

def _parse_listing_datetime(text: str, now: datetime) -> datetime:
    """Turn a year-less listing date such as ``'Tue Sep 16•7:00 PM'`` into a datetime.

    The listing text carries no year, so one has to be inferred. The current
    year is tried first, then the next one; the first candidate that is not
    more than 24 hours before ``now`` wins. This assumes the page lists only
    upcoming events (so a January date seen in December belongs to next year).

    The year is put into the string *before* parsing because ``strptime``
    without a year assumes 1900, which rejects "Feb 29".

    Raises:
        ValueError: if the text does not match ``'%a %b %d %I:%M %p'``.
    """
    cleaned = text.replace('•', ' ')
    candidates = []
    parse_error = None
    for year in (now.year, now.year + 1):
        try:
            candidates.append(datetime.strptime(f'{year} {cleaned}', '%Y %a %b %d %I:%M %p'))
        except ValueError as exc:
            # Either a malformed string (fails for both years) or Feb 29 in a
            # non-leap year (the other year may still succeed).
            parse_error = exc

    if not candidates:
        raise parse_error

    for candidate in candidates:
        # (now - candidate).days < 1 means the candidate is in the future or
        # less than one day old; the slack tolerates shows earlier today.
        if (now - candidate).days < 1:
            return candidate

    # Only reachable when the sole parseable candidate is already in the past;
    # fall back to the current-year reading.
    return candidates[0]


def get_concert_data(url:str, location:str, timeout:float = 30.0) -> list:
    """Scrape one concert listing page and return its events as NewConcert objects.

    Args:
        url: Listing page to download.
        location: Location label stored on every returned record.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of NewConcert instances, one per (card, venue caption) pair
        found on the page. Pairs are formed with ``zip``, so if the two
        selectors return different counts the extras are dropped.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
        ValueError: if a card's date text is not in the expected format.
    """
    # Without a timeout requests can block indefinitely; without the status
    # check an error page would silently yield an empty result.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): these class names look build-generated and may change when
    # the site is redeployed — check here first if the result comes back empty.
    m_boxes = soup.find_all('div', class_='MuiCardContent-root styles_cardContent__nFnOO styles_small__RCnB2 mui-15q2cw4')
    cap = soup.find_all(attrs={'data-testid': 'card-venue-name-0'})

    # One timestamp per page so all records share the same update_datetime and
    # year inference uses a single consistent reference point.
    scraped_at = datetime.now()
    data = []

    for box, caption in zip(m_boxes, cap):
        date_text = box.div.find_next_sibling("div").find_all("div")[0].text
        data.append(NewConcert(
            uid=str(uuid4()),
            artist=box.div.span.text,
            tour_name=None,
            venue=caption.text,
            location=location,
            concert_datetime=_parse_listing_datetime(date_text, scraped_at),
            status=ConcertStatus.CONFIRMED,
            # NewConcert has an update_datetime field; stamp it with the scrape time.
            update_datetime=scraped_at,
        ))

    return data


In [32]:
# (listing URL, location label) pairs, one per Texas city to scrape.
url_list = [
    ('https://www.vividseats.com/geo/usa/tx/austin/concerts', 'Austin, TX, USA'),
    ('https://www.vividseats.com/geo/usa/tx/houston/concerts', 'Houston, TX, USA'),
    ('https://www.vividseats.com/geo/usa/tx/san-antonio/concerts', 'San Antonio, TX, USA'),
    ('https://www.vividseats.com/geo/usa/tx/dallas/concerts', 'Dallas, TX, USA'),
    ('https://www.vividseats.com/geo/usa/tx/el-paso/concerts', 'El Paso, TX, USA'),
]

# Start from an empty list on every run so re-executing the cell does not
# accumulate duplicates, then append each city's results in order.
concert_data = []
for url, location in url_list:
    concert_data += get_concert_data(url, location)

In [20]:
# Preview the scraped records.
# NOTE(review): the saved output below shows a `datetime=` field and
# `tour_name=''`, while NewConcert now declares `concert_datetime` and
# get_concert_data passes `tour_name=None`. The cell's execution count (20) is
# also lower than the scraping cells' (31, 32). The output appears stale —
# restart the kernel and re-run to refresh it.
concert_data

[NewConcert(id=None, uid='d9a2cab0-89aa-4f6d-8ece-a46baa1c3fb1', artist='Tate McRae', tour_name='', venue='Moody Center ATX', location='Austin, TX, USA', datetime=datetime.datetime(2025, 9, 16, 19, 0), status=1),
 NewConcert(id=None, uid='7853ec15-9e9b-4273-b129-5b1eaa7cf5fe', artist='Lainey Wilson', tour_name='', venue='Moody Center ATX', location='Austin, TX, USA', datetime=datetime.datetime(2025, 9, 18, 19, 0), status=1),
 NewConcert(id=None, uid='09e855d9-8a46-4000-9bed-e936274c3606', artist='Lorde', tour_name='', venue='Moody Center ATX', location='Austin, TX, USA', datetime=datetime.datetime(2025, 9, 17, 19, 0), status=1),
 NewConcert(id=None, uid='bee75b11-d53a-4f46-9bca-282b0f6d8fab', artist='SuicideBoys', tour_name='', venue='Germania Insurance Amphitheater', location='Austin, TX, USA', datetime=datetime.datetime(2025, 9, 12, 18, 30), status=1),
 NewConcert(id=None, uid='30a68002-dc6c-48c4-950e-f707b3aeacd5', artist='Mumford and Sons', tour_name='', venue='Moody Center ATX', l

In [None]:
import os
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# Read the database password and URI template from the environment
# (load_dotenv() first pulls in any values defined in a local .env file).
load_dotenv()
db_password = os.getenv('MONGO_PASSWORD')
uri_template = os.getenv('DATABASE_URI')

# os.getenv returns None for an unset variable, which would otherwise surface
# as an opaque AttributeError/TypeError on the .replace() call below.
missing_vars = [
    name
    for name, value in (('MONGO_PASSWORD', db_password), ('DATABASE_URI', uri_template))
    if not value
]
if missing_vars:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_vars)}"
    )

# DATABASE_URI contains the literal placeholder 'db_password', which is
# swapped for the real password here.
# NOTE(review): the password is inserted verbatim; if it contains characters
# that are reserved in a URI it presumably must already be percent-encoded — confirm.
uri = uri_template.replace('db_password', db_password)

# create the client for the server described by the URI
# NOTE(review): no ping is issued in this cell, so a bad URI or unreachable
# server may only surface on the first real database operation — confirm.
client = MongoClient(uri, server_api=ServerApi('1'))

# select the database; the insert below is currently disabled
# NOTE(review): `concert.__dict__` is keyed by field name (`id`, not the `_id`
# alias) — check how records should be serialized before re-enabling.
db = client['bigcat']
# for concert in concert_data:
#     db['concert'].insert_one(concert.__dict__)



In [None]:
# Disabled maintenance cell: finds documents in db['concert'] that share the
# same artist/tour/venue/location/datetime/status, keeps the first uid of each
# group and deletes the rest.
# NOTE(review): the grouping key reads '$datetime', while NewConcert names the
# field `concert_datetime` — verify which name the stored documents use.
# NOTE(review): `counter` is incremented once per duplicate *group*, so the
# final message reports groups processed, not documents removed.
# pipeline = [
#     {
#         '$group': {
#             '_id': {'artist': '$artist', 'tour_name': '$tour_name', 'venue': '$venue', 'location': '$location', 'datetime': '$datetime', 'status': '$status'},
#             'duplicate_ids': {'$push': '$uid'},
#             'count': {'$sum': 1}
#         }
#     },
#     {
#         '$match': {
#             'count': {'$gt': 1}
#         }
#     }
# ]

# duplicates = list(db['concert'].aggregate(pipeline))
# print(f'{len(duplicates)} duplicates found.')

# counter = 0

# for doc in duplicates:
#         ids_to_remove = doc['duplicate_ids'][1:]  # Keep the first, remove the rest
#         if ids_to_remove:
#             db['concert'].delete_many({'uid': {'$in': ids_to_remove}})
#         counter += 1

# print(f"{counter} duplicate documents removed.")

['fac3bfdd-3d27-43ec-81fe-13654edfe7c9']
['80f21e86-79db-415d-8cd1-d8391ff9b9be']
['cace0725-46e4-402b-b578-2dca477cac43']
['cae57419-86ec-4e6c-ba24-3c725be42e88']
['169058a6-cbb0-4206-9db2-4afb8f8eb39e']
['c8fad048-2436-4033-957f-d981082709df', '59de29a8-b1a1-4b4f-a2e3-8d3d9b312eb3']
['e4fc8d33-7e29-4cf7-9322-d2cd539c18ab']
['4e0ec9f4-9d99-4682-9645-c63db2bbc210']
['ae9d9ae5-4f4c-4263-a5e8-d15db431d204']
['05b9be65-0e25-4a9a-a26c-baeac47092e5']
['9aca81db-4ef1-4201-8948-eb01f375a617']
['8600d4bf-92a2-47a7-8c24-dfe2d6870dda']
['ef65448e-fc3f-4add-b9fc-9cf339afb4a8']
['46885503-1f71-4766-a7fe-225ede9fa3a0']
['a18796c0-ebcf-45c5-bec7-ec0a8edc6233']
['d4781fc0-2585-4f44-8402-43a527daa8af', '218bdf0e-f3dd-488b-9e63-a2918784f7e2']
['1427eeae-983c-490c-b152-c840eaabc824']
['55c5ee9a-8a6f-423b-bdc0-7f4d86741926']
['0c7ab01c-3181-4f2f-b226-a0b5e8c15427']
['37785226-ea13-48e6-8687-11cc07866fdb', '33a66b45-7e13-46ca-820b-e107798ac003']
['58b416b2-9213-4ec0-9730-7ee925423e48']
['b406d9ec-7afd-42a

In [35]:
# Distinct venue names among the concerts scraped for Austin.
{concert.venue for concert in concert_data if concert.location == 'Austin, TX, USA'}

# moody@admin.bigcat.com
# antones@admin.bigcat.com
# germania@admin.bigcat.com
# paramount@admin.bigcat.com

{'3TEN Austin City Limits Live',
 "Antone's - Austin",
 'Austin City Limits Live at The Moody Theater',
 'Brauntex Performing Arts Theatre',
 'Cheatham Street Warehouse',
 'Empire - Control Room',
 'Germania Insurance Amphitheater',
 'Gruene Hall',
 'Long Center for the Performing Arts - Dell Hall',
 'Moody Amphitheater',
 'Moody Center ATX',
 'Paramount Theatre Austin',
 'Scoot Inn',
 'Stubbs BarBQ',
 'The Concourse Project',
 'Zilker Park'}