<span style="font-size:3.0em">
    Wikipedia Scrape – Tony Nominees
    <br>
</span> 

<i> Some neat, smooth functions for grabbing data from Wikipedia. There are some inadequacies; skip down to section 6 to read what they are.</i>

# Import Functions and Dependencies

## Prereqs

In [1]:
# ! pip install lxml setuptools requests bs4 pandas numpy IPython

## Load Packages

In [2]:
import requests
import pandas as pd
from IPython.display import display_html
from bs4 import BeautifulSoup
import numpy as np
import re
import os

## Function to get tables from url

<pre>
get_tables_from_url(url)
</pre>

In [3]:

def get_tables_from_url(url, timeout=30):
    """
    Download a page and return every table carrying the "wikitable" class.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout, requests can block indefinitely.

    Returns
    -------
    list
        The matching BeautifulSoup table elements, in document order.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns a 4xx/5xx status.
    """

    # Request the document
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of parsing it into zero tables.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')

    table_classes = {"class": ["wikitable"]}
    return soup.find_all("table", table_classes)

## Function to get a DataFrame from a table (improved)
<pre>
get_df_from_table(table_in)
</pre>

In [43]:
def get_df_from_table(table_in):
    """
    Convert one parsed award table into a DataFrame of nominee records.

    Parameters
    ----------
    table_in : BeautifulSoup table element
        Must support ``find_all('tr')``; its rows must support
        ``find_all``, ``find`` and ``text``.

    Returns
    -------
    pandas.DataFrame
        One row per non-ceremony table row. Columns are ``Winner`` (bool),
        the header names taken from the first row's styled ``th`` cells,
        and ``Link`` (absolute URL of the row's first anchor, when present).
        An empty frame is returned when the table has no rows.

    Notes
    -----
    Rows whose text matches "<N>th Tony Awards" are treated as ceremony
    headers: they set the year for the rows that follow and produce no
    record. Rows seen before the first ceremony header get a year of None.
    """

    ### SET UP REGEX
    # Raw strings: "\d" inside a plain literal is an invalid escape sequence.
    year_re = re.compile(r"\d{4}")
    ceremony_re = re.compile(r"\d{1,2}(th|rd|st|nd) Tony Awards")

    # Get Winners! They are the rows carrying this exact inline style.
    winner_vals = [
        [cell.text.strip() for cell in row.find_all('td')]
        for row in table_in.find_all('tr', attrs={'style': 'background:#B0C4DE'})
    ]

    rows = table_in.find_all('tr')
    if not rows:
        return pd.DataFrame()

    # Column names come from the styled header cells of the first row.
    variables = [cell.text.strip() for cell in rows[0].find_all('th', style=True)]

    year = None  # stays None until the first ceremony header row is seen
    record_list = []
    for row in rows[1:]:
        row_text = row.text

        # Ceremony header row: remember its year and move on.
        if ceremony_re.search(row_text):
            year_match = year_re.search(row_text)
            year = year_match.group(0) if year_match else None
            continue

        values = [cell.text.strip() for cell in row.find_all('td')]

        # Compare to winners (before the year is prepended to the values).
        record = {'Winner': values in winner_vals}

        values.insert(0, year)
        # Short rows are padded by repeating their last value so that every
        # header gets a value.
        while len(values) < len(variables):
            values.append(values[-1])
        for k, v in zip(variables, values):
            record[k] = v

        anchor = row.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href:
            record['Link'] = 'https://en.wikipedia.org' + href

        record_list.append(record)

    return pd.DataFrame(record_list)

# Get Musicals
Scrape the Best Musical nominees from Wikipedia.

In [58]:
# Source page for the Best Musical scrape.
url = "https://en.wikipedia.org/wiki/Tony_Award_for_Best_Musical"

In [59]:
# Parse every wikitable on the page into its own frame.
tables = get_tables_from_url(url)
sub_dfs = [get_df_from_table(table) for table in tables]
# DataFrame.append was removed in pandas 2.0; build the list and concat once.
df = pd.concat(sub_dfs, ignore_index=True) if sub_dfs else pd.DataFrame()

# The CSV is named after the last path segment of the URL.
name_root = url.split('/')[-1]
df_name = 'Wikipedia_scrape_' + name_root + '.csv'
df = df[['Year','Winner','Musical','Music','Book','Lyrics','Link']]
df.to_csv(df_name)
df.head()

Unnamed: 0,Year,Winner,Musical,Music,Book,Lyrics,Link
0,1949,True,"Kiss Me, Kate",Cole Porter,Bella & Samuel Spewack,Cole Porter,"https://en.wikipedia.org/wiki/Kiss_Me,_Kate"
1,1950,True,South Pacific †,Richard Rodgers,Oscar Hammerstein II & Joshua Logan,Hammerstein,https://en.wikipedia.org/wiki/South_Pacific_(m...
2,1951,True,Guys and Dolls,Frank Loesser,Abe Burrows & Jo Swerling,Frank Loesser,https://en.wikipedia.org/wiki/Guys_and_Dolls
3,1952,True,The King and I,Richard Rodgers,Oscar Hammerstein II,Hammerstein,https://en.wikipedia.org/wiki/The_King_and_I
4,1953,True,Wonderful Town,Leonard Bernstein,Jerome Chodorov & Joseph Fields,Betty Comden & Adolph Green,https://en.wikipedia.org/wiki/Wonderful_Town


# Get Plays
Scrape the Best Play nominees from Wikipedia.

In [60]:
# Source page for the Best Play scrape.
url = "https://en.wikipedia.org/wiki/Tony_Award_for_Best_Play"

In [62]:
# Parse every wikitable on the page into its own frame.
tables = get_tables_from_url(url)
sub_dfs = [get_df_from_table(table) for table in tables]
# DataFrame.append was removed in pandas 2.0; build the list and concat once.
df = pd.concat(sub_dfs, ignore_index=True) if sub_dfs else pd.DataFrame()

# The CSV is named after the last path segment of the URL.
name_root = url.split('/')[-1]
df_name = 'Wikipedia_scrape_' + name_root + '.csv'
df = df[['Year','Winner','Production','Author','Link']]
df.to_csv(df_name)
df.head()

Unnamed: 0,Year,Winner,Production,Author,Link
0,1948,True,Mister Roberts,Thomas Heggen & Joshua Logan,https://en.wikipedia.org/wiki/Mister_Roberts_(...
1,1949,True,Death of a Salesman †,Arthur Miller,https://en.wikipedia.org/wiki/Death_of_a_Salesman
2,1950,True,The Cocktail Party,T. S. Eliot,https://en.wikipedia.org/wiki/The_Cocktail_Party
3,1951,True,The Rose Tattoo,Tennessee Williams,https://en.wikipedia.org/wiki/The_Rose_Tattoo
4,1952,True,The Fourposter,Jan de Hartog,https://en.wikipedia.org/wiki/The_Fourposter


# Get Revival of a Musical
Scrape the Best Revival of a Musical nominees from Wikipedia.

In [63]:
# Source page for the Best Revival of a Musical scrape.
url = "https://en.wikipedia.org/wiki/Tony_Award_for_Best_Revival_of_a_Musical"

In [68]:
# Parse every wikitable on the page into its own frame.
tables = get_tables_from_url(url)
sub_dfs = [get_df_from_table(table) for table in tables]
# DataFrame.append was removed in pandas 2.0; build the list and concat once.
df = pd.concat(sub_dfs, ignore_index=True) if sub_dfs else pd.DataFrame()

# The CSV is named after the last path segment of the URL.
name_root = url.split('/')[-1]
df_name = 'Wikipedia_scrape_' + name_root + '.csv'
df = df[['Year','Winner','Musical','Music','Book','Lyrics','Link']]
df.to_csv(df_name)
df.head()

Unnamed: 0,Year,Winner,Musical,Music,Book,Lyrics,Link
0,1994,True,Carousel,Richard Rodgers,Oscar Hammerstein II,Hammerstein,https://en.wikipedia.org/wiki/Carousel_(musical)
1,1994,False,Damn Yankees,Richard Adler,George Abbott & Douglass Wallop,Jerry Ross,https://en.wikipedia.org/wiki/Damn_Yankees
2,1994,False,Grease,Warren Casey & Jim Jacobs,Warren Casey & Jim Jacobs,Warren Casey & Jim Jacobs,https://en.wikipedia.org/wiki/Grease_(musical)
3,1994,False,She Loves Me,Jerry Bock,Joe Masteroff,Sheldon Harnick,https://en.wikipedia.org/wiki/She_Loves_Me
4,1995,True,Show Boat,Jerome Kern,Oscar Hammerstein II,Hammerstein & P. G. Wodehouse,https://en.wikipedia.org/wiki/Show_Boat


# Get Revival of Plays
Scrape the Best Revival of a Play nominees from Wikipedia.

In [69]:
# Source page for the Best Revival of a Play scrape.
url = "https://en.wikipedia.org/wiki/Tony_Award_for_Best_Revival_of_a_Play"

In [72]:
# Parse every wikitable on the page into its own frame.
tables = get_tables_from_url(url)
sub_dfs = [get_df_from_table(table) for table in tables]
# DataFrame.append was removed in pandas 2.0; build the list and concat once.
df = pd.concat(sub_dfs, ignore_index=True) if sub_dfs else pd.DataFrame()

# The CSV name is derived from the last path segment of the URL.
name_root = url.split('/')[-1]
df_name = 'Wikipedia_scrape_' + name_root + '.csv'
df = df[['Year','Winner','Play','Author','Link']]
# NOTE(review): unlike the other three scrape cells, the save is disabled
# here, so no CSV is written for this category. Confirm this is intentional.
# df.to_csv(df_name)
df.head()

Unnamed: 0,Year,Winner,Play,Author,Link
0,1994,True,An Inspector Calls,J. B. Priestley,https://en.wikipedia.org/wiki/An_Inspector_Calls
1,1994,False,Abe Lincoln in Illinois,Robert E. Sherwood,https://en.wikipedia.org/wiki/Abe_Lincoln_in_I...
2,1994,False,Medea,Euripides,https://en.wikipedia.org/wiki/Medea_(play)
3,1994,False,Timon of Athens,William Shakespeare,https://en.wikipedia.org/wiki/Timon_of_Athens
4,1995,True,The Heiress,Augustus Goetz and Ruth Goetz,https://en.wikipedia.org/wiki/The_Heiress_(194...


# Finished!
Done! Each is in its own df.

<hr>
<i>
    Maybe make a single df with a column representing the category of each Tony nominee? Also, it would be nice to include which ones are winners. This can be done in the next iteration.
    </i>
    <hr>

## Load df
<i>
If you want to load the files you just downloaded... 
</i>
<pre>
get_files(dir_in = os.getcwd(), csv_in = True)
</pre>

In [73]:
import os

def get_files(dir_in = os.getcwd(), csv_in = True):
    """
    List the absolute paths of the entries in a directory.

    Parameters
    ----------
    dir_in : str, optional
        Directory to list. The default is the working directory captured
        when this function was defined.
    csv_in : bool, optional
        When exactly ``False``, every entry is returned; otherwise only
        paths ending in '.csv' are returned (default True).

    Returns
    -------
    list of str
        Absolute paths, in the order given by ``os.listdir``.

    Requires the ``os`` module to be loaded.
    """
    # Join with dir_in before resolving: abspath on a bare name would
    # resolve it against the current working directory instead.
    file_paths = [
        os.path.abspath(os.path.join(dir_in, name))
        for name in os.listdir(dir_in)
    ]

    if csv_in is False:
        return file_paths
    return [path for path in file_paths if path.endswith('.csv')]
        

In [74]:
import pandas as pd

# Look for the Best Musical scrape among the CSV paths that get_files returns.
csvs = get_files()
for x in csvs:
    if x.endswith('Wikipedia_scrape_Tony_Award_for_Best_Musical.csv'):
        df = pd.read_csv(x,index_col=0)
        print ('file is here!')
        print(df.head(5))
        break
else:
    # Runs only when no path matched, so the message is printed once
    # rather than once per unrelated CSV.
    print ('the file you want isn\'t here')

file is here!
   Year  Winner          Musical              Music  \
0  1949    True    Kiss Me, Kate        Cole Porter   
1  1950    True  South Pacific †    Richard Rodgers   
2  1951    True   Guys and Dolls      Frank Loesser   
3  1952    True   The King and I    Richard Rodgers   
4  1953    True   Wonderful Town  Leonard Bernstein   

                                  Book                       Lyrics  \
0               Bella & Samuel Spewack                  Cole Porter   
1  Oscar Hammerstein II & Joshua Logan                  Hammerstein   
2            Abe Burrows & Jo Swerling                Frank Loesser   
3                 Oscar Hammerstein II                  Hammerstein   
4      Jerome Chodorov & Joseph Fields  Betty Comden & Adolph Green   

                                                Link  
0        https://en.wikipedia.org/wiki/Kiss_Me,_Kate  
1  https://en.wikipedia.org/wiki/South_Pacific_(m...  
2       https://en.wikipedia.org/wiki/Guys_and_Dolls  
3       