In [100]:
import csv
import json
import random
import time
from io import StringIO
from urllib.parse import quote_plus

import pandas as pd
from selenium.common.exceptions import WebDriverException, NoSuchElementException
from selenium.webdriver.common.by import By
import undetected_chromedriver as webdriver

In [130]:
# Small sample of artist names; each one is searched on the site to find its band page
artist_list = [
    "Coldplay",
    "Radiohead",
    "Red Hot Chili Peppers",
    "Rihanna",
    "Eminem",
]

In [131]:
# Initialize webdriver: start a Chrome session through undetected_chromedriver
# (imported above as `webdriver`). The resulting `driver` is the browser handle
# used by the scraping cells that follow.

driver = webdriver.Chrome()

In [133]:
# Search artist name on website, pull URL to first search result
# Write page URL to file every 50 artists pulled in case bot detection kicks in


def _flush_band_pages(rows, path="concert_band_page.csv"):
    """Append ``[artist, url]`` rows to the CSV at ``path``.

    Does nothing when ``rows`` is empty, so it is safe to call unconditionally.
    """
    if not rows:
        return
    # newline="" lets the csv module control line endings (the csv docs
    # require it; without it Windows output gets blank rows between records).
    with open(path, "a", newline="") as tfile:
        csv.writer(tfile).writerows(rows)


band_page = []  # Pending [artist, URL] rows that have not been written to disk yet
try:
    for artist in artist_list:
        # quote_plus encodes spaces as "+" (same as before) and also escapes
        # characters such as "&", "/", "+" or "#" that would otherwise corrupt
        # the query string.
        search_query = f"https://www.concertarchives.org/bands?search={quote_plus(artist)}"
        driver.get(search_query)

        try:
            href_element = driver.find_element(By.XPATH, "//strong/a").get_attribute("href")
        except NoSuchElementException:  # No search result found for this artist
            href_element = None

        if href_element:  # Skip artists without a usable link
            band_page.append([artist, href_element])

        if len(band_page) >= 50:  # Write to file every 50 artists
            _flush_band_pages(band_page)
            band_page = []

        # Randomly wait 0.5 - 1.5 seconds before the next artist. This runs
        # even when the search had no result, so misses are throttled too.
        time.sleep(random.random() + 0.5)
finally:
    # Save remaining scraped band page URLs to file, including when the loop
    # is interrupted (e.g. the site blocks the session mid-run).
    _flush_band_pages(band_page)

In [135]:
# Reload the saved [artist, URL] rows from disk so this step can run
# independently of the search cell above
band_page = []
with open("concert_band_page.csv", "r") as tfile:
    band_page.extend(csv.reader(tfile))

# Preview the first few rows
band_page[:5]

[['Coldplay', 'https://www.concertarchives.org/bands/coldplay'],
 ['Radiohead', 'https://www.concertarchives.org/bands/radiohead'],
 ['Red Hot Chili Peppers',
  'https://www.concertarchives.org/bands/red-hot-chili-peppers'],
 ['Rihanna', 'https://www.concertarchives.org/bands/rihanna'],
 ['Eminem', 'https://www.concertarchives.org/bands/eminem']]

In [128]:
# Initialize webdriver again if you took a break between getting
# artist URL and scraping artist page.
# NOTE: this rebinds `driver` to a brand-new Chrome session; it does not close
# a session created earlier, so call `driver.quit()` first if one is still open.

driver = webdriver.Chrome()

In [137]:
# Loop through artists, get concerts info
#
# For every [artist, URL] row in `band_page`, request the first MAX_PAGES pages
# of the artist's concert list, parse the concert table on each page, and
# append that artist's rows (plus an "Artist" column) to concerts.csv.
# `master_df` ends up holding every row scraped in this run (None if nothing
# was scraped).

MAX_PAGES = 5  # Number of result pages requested per artist


def _parse_concert_tables(table_html):
    """Parse the concert table HTML into a single DataFrame.

    ``pd.read_html`` may return several frames for one element; they are all
    given the first frame's column labels so they concatenate cleanly.
    Raises ``ValueError`` if no table is found or the column counts differ.
    """
    # Wrap in StringIO: passing literal HTML to read_html is deprecated.
    frames = pd.read_html(StringIO(table_html))
    first = frames[0]
    for extra in frames[1:]:
        extra.columns = first.columns  # Format columns for concat
    return pd.concat(frames)


artist_frames = []  # One combined frame per artist that yielded data
for artist, artist_page in band_page:
    page_frames = []  # Frames for this artist only; reset for every artist

    for page in range(1, MAX_PAGES + 1):
        query = f"{artist_page}?page={page}"
        try:
            driver.get(query)
            table_element = driver.find_element(By.ID, "band-show-table-condensed")
            table_html = table_element.get_attribute("outerHTML")
            page_frames.append(_parse_concert_tables(table_html))
        except (WebDriverException, ValueError) as err:
            # Best effort: a missing/unparseable page is reported and skipped
            # instead of silently re-using the previous page's data.
            print(f"Skipping {query}: {type(err).__name__}")

        # Same 0.5 - 1.5 second random pause used when searching for artists
        time.sleep(random.random() + 0.5)

    if not page_frames:  # Nothing scraped for this artist
        continue

    artist_df = pd.concat(page_frames, ignore_index=True)
    artist_df["Artist"] = artist
    # Same file layout as before: index column first, no header row, appended.
    artist_df.to_csv("concerts.csv", mode="a+", header=False)
    artist_frames.append(artist_df)

master_df = pd.concat(artist_frames, ignore_index=True) if artist_frames else None