In [3]:
import requests
import time
import re
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from urllib.request import urlopen
from urllib.parse import quote
from bs4 import BeautifulSoup
pd.options.mode.chained_assignment = None

In [38]:
url = "https://www.ibdb.com/shows/#current"
soup = BeautifulSoup(urlopen(url), 'html.parser')
showlinks = soup.find("div", {"id": "current"}).find_all("a", href=re.compile("/broadway-production/"))
shows = pd.DataFrame()
for showlink in showlinks:
    shows = pd.concat([shows, pd.DataFrame([[showlink.find("i").text, "https://www.ibdb.com/"+quote(showlink['href'])+"#OpeningNightCast"]])], ignore_index=True)
shows.columns = ["show","IBDB page"]
shows


Unnamed: 0,show,IBDB page
0,& Juliet,https://www.ibdb.com//broadway-production/-jul...
1,"A Beautiful Noise, The Neil Diamond Musical",https://www.ibdb.com//broadway-production/a-be...
2,Aladdin,https://www.ibdb.com//broadway-production/alad...
3,Camelot,https://www.ibdb.com//broadway-production/came...
4,Chicago,https://www.ibdb.com//broadway-production/chic...
5,Fat Ham,https://www.ibdb.com//broadway-production/fat-...
6,Funny Girl,https://www.ibdb.com//broadway-production/funn...
7,"Good Night, Oscar",https://www.ibdb.com//broadway-production/good...
8,Grey House,https://www.ibdb.com//broadway-production/grey...
9,Hadestown,https://www.ibdb.com//broadway-production/hade...


In [62]:
shows.to_csv("shows_with_opening_nights.csv")

In [39]:
# get all performers
def getperformers(show, page):
    soup=BeautifulSoup(urlopen(page),"html.parser")
    cast = soup.find("div", {"id": "OpeningNightCast"})
    if cast == None:
        return pd.DataFrame(["",""])

    castlinks = cast.find_all("a", href=re.compile("/broadway-cast-staff/"))
    names = set(i.get_text() for i in castlinks)
    performersdf = pd.DataFrame([name, show] for name in names)
    return performersdf
    
performers = pd.concat([getperformers(x, y) for x, y in zip(shows['show'], shows['IBDB page'])], ignore_index=True)
performers.columns = ["actor","show"]
performers.head()

Unnamed: 0,actor,show
0,Joomin Hwang,& Juliet
1,Alaina Vi Maderal,& Juliet
2,Betsy Wolfe,& Juliet
3,Megan Kane,& Juliet
4,Virgil Gadson,& Juliet


In [40]:
performers


Unnamed: 0,actor,show
0,Joomin Hwang,& Juliet
1,Alaina Vi Maderal,& Juliet
2,Betsy Wolfe,& Juliet
3,Megan Kane,& Juliet
4,Virgil Gadson,& Juliet
...,...,...
778,Melissa Fahn,Wicked
779,Corinne McFadden,Wicked
780,Kisha Howard,Wicked
781,Kristin Chenoweth,Wicked


In [42]:
def getwikipedias(name):
    subject = quote(name.replace(" ", "_").replace(".",""))
    link = 'https://en.wikipedia.org/wiki/'+subject

    if requests.get(link).status_code != 200:
        return ""

    summary = requests.get("https://en.wikipedia.org/api/rest_v1/page/summary/"+subject).json()['extract']
    if any([x in summary for x in ["actor", "actress"]]):
        return link
    else:
        return ""

performers['wikipedia link'] = performers['actor'].apply(lambda x: getwikipedias(x))
performers.head()


Unnamed: 0,actor,show,wikipedia link
0,Joomin Hwang,& Juliet,
1,Alaina Vi Maderal,& Juliet,
2,Betsy Wolfe,& Juliet,https://en.wikipedia.org/wiki/Betsy_Wolfe
3,Megan Kane,& Juliet,
4,Virgil Gadson,& Juliet,


In [44]:
performers.shape

(783, 3)

In [45]:
performers.to_csv("performer_list.csv")

In [49]:
perf = performers

def getdetails(link, taglist):
    if requests.get(link).status_code != 200:
        return ""

    soup = BeautifulSoup(urlopen(link),"html.parser")
    infobox = soup.find("table", class_="infobox")

    if infobox == None:
        return ""
    
    info = {}
    for th, td in zip(infobox.select('th.infobox-label'), infobox.select('td.infobox-data')):
        info[th.text.replace("\xa0"," ")] = td.text.replace("\xa0"," ")

    for key in info.keys():
        if key.lower() in taglist:
            return(info[key])
            
    return ""

schooltags = ['education', 'alma mater', 'school']
parenttags = ['parents']
relativestags = ['relatives']

perf['school'] = perf['wikipedia link'].apply(lambda x: getdetails(x, schooltags) if x!="" else "")
perf['parents'] = perf['wikipedia link'].apply(lambda x: getdetails(x, parenttags) if x!="" else "")
perf['relatives'] = perf['wikipedia link'].apply(lambda x: getdetails(x, relativestags) if x!="" else "")

perf

Unnamed: 0,actor,show,wikipedia link,school,parents,relatives
0,Joomin Hwang,& Juliet,,,,
1,Alaina Vi Maderal,& Juliet,,,,
2,Betsy Wolfe,& Juliet,https://en.wikipedia.org/wiki/Betsy_Wolfe,University of Cincinnati (BFA),,
3,Megan Kane,& Juliet,,,,
4,Virgil Gadson,& Juliet,,,,
...,...,...,...,...,...,...
778,Melissa Fahn,Wicked,https://en.wikipedia.org/wiki/Melissa_Fahn,California State University at Long Beach[3],,
779,Corinne McFadden,Wicked,,,,
780,Kisha Howard,Wicked,,,,
781,Kristin Chenoweth,Wicked,https://en.wikipedia.org/wiki/Kristin_Chenoweth,,,


In [51]:
def getcredits(link, credittags):
    if requests.get(link).status_code != 200:
        return ""

    sections = requests.get("https://en.wikipedia.org/api/rest_v1/page/mobile-sections-remaining/"+link.split("/")[-1]).json()['sections']
    for section in sections:
        if section['line'].lower().split(" ")[0] in credittags and "<table" in section['text']:
            table = BeautifulSoup(section['text'], "html.parser").find("table")
            return len(table.find_all("i"))

            # v = []
            # for tr in soup.find_all("tr")[1:]:
            #     # print(tr)
            #     # for td in tr.find_all('td'):
            #     row = [i.text for i in tr]
            #     print(row)
            #     print("_________")
            #     v.append(row)
            # print(len(v))
            # df = pd.DataFrame(v)
    return ""

filmtags = ["film"]
tvtags = ["television"]
theatretags = ["theatre","theater","stage"]

perf['film credits'] = perf['wikipedia link'].apply(lambda x: getcredits(x, filmtags) if x!="" else "")
perf['tv credits'] = perf['wikipedia link'].apply(lambda x: getcredits(x, tvtags) if x!="" else "")
perf['theater credits'] = perf['wikipedia link'].apply(lambda x: getcredits(x, theatretags) if x!="" else "")

perf

Unnamed: 0,actor,show,wikipedia link,school,parents,relatives,film credits,tv credits,theater credits
0,Joomin Hwang,& Juliet,,,,,,,
1,Alaina Vi Maderal,& Juliet,,,,,,,
2,Betsy Wolfe,& Juliet,https://en.wikipedia.org/wiki/Betsy_Wolfe,University of Cincinnati (BFA),,,4,2,22
3,Megan Kane,& Juliet,,,,,,,
4,Virgil Gadson,& Juliet,,,,,,,
...,...,...,...,...,...,...,...,...,...
778,Melissa Fahn,Wicked,https://en.wikipedia.org/wiki/Melissa_Fahn,California State University at Long Beach[3],,,2,,1
779,Corinne McFadden,Wicked,,,,,,,
780,Kisha Howard,Wicked,,,,,,,
781,Kristin Chenoweth,Wicked,https://en.wikipedia.org/wiki/Kristin_Chenoweth,,,,31,45,


In [52]:
perf.to_csv("performer_details.csv")

In [None]:
# # !pip install plotly
# import plotly.express as px
# school_data = perf[perf.school!=""].school.value_counts()
# fig = px.scatter(school_data)
# fig.write_html("schools.html")
# fig.show()

In [4]:
shows = pd.read_csv("shows.csv")
# shows['opening year'] = 

Unnamed: 0.1,Unnamed: 0,show,IBDB page
0,0,& Juliet,https://www.ibdb.com//broadway-production/-jul...
1,1,"A Beautiful Noise, The Neil Diamond Musical",https://www.ibdb.com//broadway-production/a-be...
2,2,Aladdin,https://www.ibdb.com//broadway-production/alad...
3,3,Camelot,https://www.ibdb.com//broadway-production/came...
4,4,Chicago,https://www.ibdb.com//broadway-production/chic...
5,5,Fat Ham,https://www.ibdb.com//broadway-production/fat-...
6,6,Funny Girl,https://www.ibdb.com//broadway-production/funn...
7,7,"Good Night, Oscar",https://www.ibdb.com//broadway-production/good...
8,8,Grey House,https://www.ibdb.com//broadway-production/grey...
9,9,Hadestown,https://www.ibdb.com//broadway-production/hade...


In [53]:

def getopeningyear(page):
    soup = BeautifulSoup(urlopen(page), 'html.parser')
    txt = soup.find("div", class_="production-info-panel").find("div", class_="xt-info-block").find_all("div", class_="col")

    for i in txt:
        if "Opening Date".lower() in i.get_text().lower():
            return i.select("div.xt-main-title")[0].text.split(',')[-1]

shows['opening night'] = shows['IBDB page'].apply(lambda x: getopeningyear(x) if x!="" else "")
# shows.drop(["Unnamed: 0"], axis=1)

shows


# import lxml.html
# dom = lxml.html.fromstring(requests.get(page).content)
# dom.cssselect(".production-info-panel")[0].text_content().strip()


Unnamed: 0.1,Unnamed: 0,show,IBDB page,opening night
0,0,& Juliet,https://www.ibdb.com//broadway-production/-jul...,2022
1,1,"A Beautiful Noise, The Neil Diamond Musical",https://www.ibdb.com//broadway-production/a-be...,2022
2,2,Aladdin,https://www.ibdb.com//broadway-production/alad...,2014
3,3,Camelot,https://www.ibdb.com//broadway-production/came...,2023
4,4,Chicago,https://www.ibdb.com//broadway-production/chic...,1996
5,5,Fat Ham,https://www.ibdb.com//broadway-production/fat-...,2023
6,6,Funny Girl,https://www.ibdb.com//broadway-production/funn...,2022
7,7,"Good Night, Oscar",https://www.ibdb.com//broadway-production/good...,2023
8,8,Grey House,https://www.ibdb.com//broadway-production/grey...,2023
9,9,Hadestown,https://www.ibdb.com//broadway-production/hade...,2019


In [161]:
perf = pd.read_csv("performer_details.csv")
mergeddf = pd.merge(shows, perf, on="show").drop(["Unnamed: 0_x", "Unnamed: 0_y"], axis=1)


In [181]:
screencredittags = ['film', 'television', 'filmography']

def getscreencreditsbefore(link, year):
    if type(link)!=str or requests.get(link).status_code != 200:
        return ""

    counter = 0
    sections = requests.get("https://en.wikipedia.org/api/rest_v1/page/mobile-sections-remaining/"+link.split("/")[-1]).json()['sections']
    for section in sections:
        if section['line'].lower().split(" ")[0] in screencredittags and "<table" in section['text']:
            table = BeautifulSoup(section['text'], "html.parser").find("table")
            for tr in table.find_all("tr"):
                td = tr.find("td")
                if td:
                    showyr = td.text.split('-')[0]
                    if showyr.isdigit() and int(showyr) < int(year):
                        counter+=1
            # l = []
            # for tr in table.find_all("tr"):
            #     td = tr.find_all('td')
            #     row = [tr.text for tr in td]
            #     l.append(row[0:2])
            # print(pd.DataFrame(l, columns=["year", "name"]))
    return counter


def getscreencreditsafter(link, year):
    if type(link)!=str or requests.get(link).status_code != 200:
        return ""

    counter = 0
    sections = requests.get("https://en.wikipedia.org/api/rest_v1/page/mobile-sections-remaining/"+link.split("/")[-1]).json()['sections']
    for section in sections:
        if section['line'].lower().split(" ")[0] in screencredittags and "<table" in section['text']:
            table = BeautifulSoup(section['text'], "html.parser").find("table")
            for tr in table.find_all("tr"):
                td = tr.find("td")
                if td:
                    showyr = td.text.split('-')[0]
                    if showyr.isdigit() and int(showyr) > int(year):
                        counter+=1

    return counter

# mergeddf['screen credits before opening night'] = 
# mergeddf.apply(lambda x: getscreencreditsbefore(x['wikipedia link'], x['opening night']), axis=1)
mergeddf['screen credits before opening night'] = mergeddf.apply(lambda x: getscreencreditsbefore(x['wikipedia link'], x['opening night']), axis=1)
mergeddf['screen credits after opening night'] = mergeddf.apply(lambda x: getscreencreditsafter(x['wikipedia link'], x['opening night']), axis=1)





In [188]:
mergeddf
# mergeddf.to_csv("performer_details_with_timed_credits.csv")


Unnamed: 0,show,IBDB page,opening night,actor,wikipedia link,school,parents,relatives,film credits,tv credits,theater credits,screen credits before opening night,screen credits after opening night
0,& Juliet,https://www.ibdb.com//broadway-production/-jul...,2022,Joomin Hwang,,,,,,,,,
1,& Juliet,https://www.ibdb.com//broadway-production/-jul...,2022,Alaina Vi Maderal,,,,,,,,,
2,& Juliet,https://www.ibdb.com//broadway-production/-jul...,2022,Betsy Wolfe,https://en.wikipedia.org/wiki/Betsy_Wolfe,University of Cincinnati (BFA),,,4.0,2.0,22.0,7,0
3,& Juliet,https://www.ibdb.com//broadway-production/-jul...,2022,Megan Kane,,,,,,,,,
4,& Juliet,https://www.ibdb.com//broadway-production/-jul...,2022,Virgil Gadson,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
778,Wicked,https://www.ibdb.com//broadway-production/wick...,2003,Melissa Fahn,https://en.wikipedia.org/wiki/Melissa_Fahn,California State University at Long Beach[3],,,2.0,,1.0,4,10
779,Wicked,https://www.ibdb.com//broadway-production/wick...,2003,Corinne McFadden,,,,,,,,,
780,Wicked,https://www.ibdb.com//broadway-production/wick...,2003,Kisha Howard,,,,,,,,,
781,Wicked,https://www.ibdb.com//broadway-production/wick...,2003,Kristin Chenoweth,https://en.wikipedia.org/wiki/Kristin_Chenoweth,,,,31.0,45.0,,4,28


In [186]:
perf_details = pd.read_csv("performer_EDA.csv")
perf_details


Unnamed: 0,actor,show,wikipedia link,terminal school,school (2),relatives,film credits,tv credits,theater credits
0,Anthony Ramos,Hamilton,https://en.wikipedia.org/wiki/Anthony_Ramos,American Musical and Dramatic Academy,,,0.0,8.0,9.0
1,Colton Ryan,"New York, New York",https://en.wikipedia.org/wiki/Colton_Ryan,Baldwin Wallace University,,,4.0,6.0,8.0
2,Jasmine Cephas Jones,Hamilton,https://en.wikipedia.org/wiki/Jasmine_Cephas_J...,Berklee College of Music,,Ron Cephas Jones (father)\nKim Lesley Hartman ...,12.0,8.0,3.0
3,Jennifer Simard,Once Upon a One More Time,https://en.wikipedia.org/wiki/Jennifer_Simard,Boston Conservatory,,,6.0,6.0,11.0
4,Myles Frost,MJ The Musical,https://en.wikipedia.org/wiki/Myles_Frost,Bowie State University,Belmont University,,1.0,4.0,
...,...,...,...,...,...,...,...,...,...
762,Winsome Brown,Grey House,,,,,,,
763,Wonza Johnson,MJ The Musical,,,,,,,
764,Yurel Echezarreta,Aladdin,,,,,,,
765,Yvette Gonzalez-Nacer,Hadestown,,,,,,,


In [202]:
mergeddf_t = mergeddf[["actor", "show", "opening night", "screen credits before opening night", "screen credits after opening night"]]
perf_details_t = perf_details.drop_duplicates()
final_perf_details = pd.merge(perf_details, mergeddf_t, on=["actor", "show"])

final_perf_details.to_csv("final_perf_details.csv")