In [1]:
from pydantic import BaseModel, ConfigDict, BeforeValidator, Field
from enum import Enum
from datetime import datetime
from typing import Optional, Annotated

PyObjectId = Annotated[str, BeforeValidator(str)]

class ConcertStatus(Enum):
    TENTATIVE = 0
    CONFIRMED = 1
    ON_SALE = 2
    SOLD_OUT = 3
    POSTPONED = 4
    CANCELED = 5
    COMPLETED = 6

class NewConcert(BaseModel):
    id: Optional[PyObjectId] = Field(alias='_id', default=None) # primary key
    uid: str
    artist: str
    tour_name: Optional[str] 
    venue: str
    location: str
    concert_datetime: datetime
    status: Optional[ConcertStatus | int]
    update_datetime: datetime

    model_config = ConfigDict(use_enum_values=True)

In [2]:
import requests
from bs4 import BeautifulSoup
from uuid import uuid4
from datetime import datetime

def get_concert_data(url:str, location:str) -> list:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    m_boxes = soup.find_all('div', class_='MuiCardContent-root styles_cardContent__nFnOO styles_small__RCnB2 mui-15q2cw4')
    cap = soup.find_all(attrs={'data-testid': 'card-venue-name-0'})
    data = []

    for box, caption in zip(m_boxes, cap):
        date = box.div.find_next_sibling("div").find_all("div")[0].text.replace('•', ' ')
        date = datetime.strptime(date, '%a %b %d %I:%M %p')
        date = date.replace(year=datetime.now().year)
        data.append(NewConcert(
            uid=str(uuid4()),
            artist=box.div.span.text,
            tour_name=None,
            venue=caption.text,
            location=location,
            concert_datetime=date,
            status=ConcertStatus.CONFIRMED,
            update_datetime=datetime.now()
        ))

    return data


In [3]:
concert_data = []
url_list = [('https://www.vividseats.com/geo/usa/tx/austin/concerts', 'Austin, TX, USA'),
            ('https://www.vividseats.com/geo/usa/tx/houston/concerts', 'Houston, TX, USA'),
            ('https://www.vividseats.com/geo/usa/tx/san-antonio/concerts', 'San Antonio, TX, USA'),
            ('https://www.vividseats.com/geo/usa/tx/dallas/concerts', 'Dallas, TX, USA'),
            ('https://www.vividseats.com/geo/usa/tx/el-paso/concerts', 'El Paso, TX, USA')]

for url, location in url_list:
    concert_data.extend(get_concert_data(url, location))

In [4]:
concert_data

[NewConcert(id=None, uid='578ac440-a2da-419d-b317-1b29e74d96db', artist='Laufey', tour_name=None, venue='Moody Center ATX', location='Austin, TX, USA', concert_datetime=datetime.datetime(2025, 9, 21, 19, 30), status=1, update_datetime=datetime.datetime(2025, 9, 20, 7, 38, 39, 960569)),
 NewConcert(id=None, uid='0061b2cd-1601-4418-8b99-3c00db113a53', artist='Tate McRae', tour_name=None, venue='Moody Center ATX', location='Austin, TX, USA', concert_datetime=datetime.datetime(2025, 10, 31, 19, 0), status=1, update_datetime=datetime.datetime(2025, 9, 20, 7, 38, 39, 960684)),
 NewConcert(id=None, uid='8674dcd6-0efc-4e03-ae6e-52086998d2db', artist='Mumford and Sons', tour_name=None, venue='Moody Center ATX', location='Austin, TX, USA', concert_datetime=datetime.datetime(2025, 10, 24, 19, 30), status=1, update_datetime=datetime.datetime(2025, 9, 20, 7, 38, 39, 960742)),
 NewConcert(id=None, uid='c944e31b-113c-48ea-b965-dee30d60e58a', artist='Dom Dolla (18+ Event)', tour_name=None, venue='Germ

In [5]:
import os
from dotenv import load_dotenv
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# get password and uri for database connection
load_dotenv()
db_password = os.getenv('MONGO_PASSWORD')
uri = os.getenv('DATABASE_URI').replace('db_password', db_password)

# create client and connect to server
client = MongoClient(uri, server_api=ServerApi('1'))

# ping for confirmation and update databases
db = client['bigcat']
for concert in concert_data:
    db['concert'].insert_one(concert.__dict__)

print('Done!')


Done!


In [6]:
pipeline = [
    {
        '$group': {
            '_id': {'artist': '$artist', 'tour_name': '$tour_name', 'venue': '$venue', 'location': '$location', 'datetime': '$datetime', 'status': '$status'},
            'duplicate_ids': {'$push': '$uid'},
            'count': {'$sum': 1}
        }
    },
    {
        '$match': {
            'count': {'$gt': 1}
        }
    }
]

duplicates = list(db['concert'].aggregate(pipeline))
print(f'{len(duplicates)} duplicates found.')

counter = 0

for doc in duplicates:
        ids_to_remove = doc['duplicate_ids'][1:]  # Keep the first, remove the rest
        if ids_to_remove:
            db['concert'].delete_many({'uid': {'$in': ids_to_remove}})
        counter += 1

print(f"{counter} duplicate documents removed.")

98 duplicates found.
98 duplicate documents removed.


In [8]:
set([concert.venue for concert in concert_data if concert.location == 'Austin, TX, USA'])

# moody@admin.bigcat.com
# antones@admin.bigcat.com
# germania@admin.bigcat.com
# paramount@admin.bigcat.com

{'3TEN Austin City Limits Live',
 "Antone's - Austin",
 'Austin City Limits Live at The Moody Theater',
 'Come and Take It Live',
 'Empire - Garage',
 'Germania Insurance Amphitheater',
 'Gruene Hall',
 "Johnny's Steaks and BBQ",
 'Moody Amphitheater',
 'Moody Center ATX',
 'Paramount Theatre Austin',
 'Scoot Inn',
 'Stubbs BarBQ',
 'The Concourse Project',
 'Zilker Park'}