In [6]:
'''
MAIN PROMPT: Investigating if the age of a person has a higher chance of 
winning an Oscar or not for each category
'''
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as urllib2
import chardet


df = pd.read_csv("oscarnominees.csv", encoding="latin1")
df


Unnamed: 0,Title,Winner,Award,Year
0,Nomadland,Y,Picture,2020
1,Judas and the Black Messiah,N,Picture,2020
2,Mank,N,Picture,2020
3,Minari,N,Picture,2020
4,Promising Young Woman,N,Picture,2020
...,...,...,...,...
2339,Janet Gaynor,Y,Actress,1928
2340,Louise Dresser,N,Actress,1928
2341,Janet Gaynor,N,Actress,1928
2342,Janet Gaynor,N,Actress,1928


In [7]:
# Drop Picture Category
df = df.drop(df[df["Award"] == "Picture"].index)
df

Unnamed: 0,Title,Winner,Award,Year
8,Anthony Hopkins,Y,Actor,2020
9,Riz Ahmed,N,Actor,2020
10,Chadwick Boseman,N,Actor,2020
11,Gary Oldman,N,Actor,2020
12,Steven Yeun,N,Actor,2020
...,...,...,...,...
2339,Janet Gaynor,Y,Actress,1928
2340,Louise Dresser,N,Actress,1928
2341,Janet Gaynor,N,Actress,1928
2342,Janet Gaynor,N,Actress,1928


In [8]:
print(set(df["Award"]))

{'Supporting Actor', 'Actor', 'Actress', 'Supporting Actress'}


In [9]:
# Loads the page of a URL and downloads it HTML contents to be used
def loadPage(url):
    page = urllib2.urlopen(url)
    data = BeautifulSoup(page, "html.parser")

    return data

In [10]:
# Search for a person given a URL and find info about their DOB
def searchActor(url):
    try:
        # Load HTML page
        data = loadPage(url)

        # Find bday class from <span> element
        spanElement = data.find("span", {"class" : "bday"})

        if spanElement is None:
            # If it can't find the bday class, the birthday is in the <p> element 
            spanElement = data.find_all("p")[1]
        
        if "Nick_Nolte" in url:
            # Get birth year
            birthYear = int(spanElement.text[38:42])

        else:
            birthYear = int(spanElement.text[:4])

    except ValueError as e:
        # Value error incase it doesn't give a numerical age
        return "None"

    return birthYear

bestActorData = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor")
def findActor(name, year):
    # Decode name as name contains special characters that are corrupt in the dataset
    name = name.encode("iso-8859-1").decode("utf-8")

    # Find the "Winners and nominees" table in wikipedia HTML
    tables = bestActorData.find_all("table", {"class": "wikitable"})

    for table in tables:
        # Loop through all elements with <tr> element
        tableRows = table.find_all("tr")

        for tableRow in tableRows:
            # Loop through all elements with <td> element
            tableCells = tableRow.find_all("td")

            for tableCell in tableCells:
                # Find all <a> elements as they contain hrefs to the person's wikipedia
                aElement = tableCell.find("a", href=True)
                
                # Check if name exists in <a> element as they contain the person's name in the link
                if aElement is not None and name in aElement:
                    # Search for actor's age from wikipedia link
                    birthYear = searchActor("https://en.wikipedia.org" + aElement.get("href"))
                    
                    print(name, ":", year - birthYear)
                    # Return age
                    return year - birthYear

    print(name, ":", "Fail!")
    return 1


actors = df.loc[df["Award"] == "Actor"]
actors["Age"] = actors.apply(lambda r: findActor(r["Title"], r["Year"]), axis=1)


Anthony Hopkins : 83
Riz Ahmed : 38
Chadwick Boseman : 44
Gary Oldman : 62
Steven Yeun : 37
Joaquin Phoenix : 45
Leonardo DiCaprio : 45
Adam Driver : 36
Antonio Banderas : 59
Jonathan Pryce : 72
Rami Malek : 37
Christian Bale : 44
Bradley Cooper : 43
Willem Dafoe : 63
Viggo Mortensen : 60
Gary Oldman : 59
Timothée Chalamet : 22
Daniel Day-Lewis : 60
Daniel Kaluuya : 28
Denzel Washington : 63
Casey Affleck : 41
Andrew Garfield : 33
Ryan Gosling : 36
Viggo Mortensen : 58
Denzel Washington : 62
Leonardo DiCaprio : 41
Bryan Cranston : 59
Matt Damon : 45
Michael Fassbender : 38
Eddie Redmayne : 33
Eddie Redmayne : 32
Benedict Cumberbatch : 38
Bradley Cooper : 39
Michael Keaton : 63
Steve Carell : 52
Matthew McConaughey : 44
Christian Bale : 39
Bruce Dern : 77
Leonardo DiCaprio : 39
Chiwetel Ejiofor : 36
Bradley Cooper : 37
Daniel Day-Lewis : 55
Hugh Jackman : 44
Joaquin Phoenix : 38
Denzel Washington : 58
Jean Dujardin : 39
Demián Bichir : 48
George Clooney : 50
Gary Oldman : 53
Brad Pitt :

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actors["Age"] = actors.apply(lambda r: findActor(r["Title"], r["Year"]), axis=1)


In [11]:
def searchActress(url):
    try:
        data = loadPage(url)

        spanElement = data.find("span", {"class": "bday"})

        if spanElement is None:
            spanElement = data.find_all("div", {"class" : "shortdescription nomobile noexcerpt noprint searchaux"})[0]

        if "Rosamund_Pike" in url:
            birthYear = int(spanElement.text[-5:-1])

        elif "Grace_Moore" in url or "Joan_Crawford" in url:
            birthYear = int(spanElement.text[-10:-6])

        else:
            birthYear = int(spanElement.text[:4])
        
        return birthYear
    except ValueError as e:
        return "None"


bestActressData = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actress")
def findActress(name, year):
    name = name.encode("iso-8859-1").decode("utf-8")

    # Find the "Winners and nominees" table
    tables = bestActressData.find_all("table", {"class" : "wikitable"})

    for table in tables:
        tableRows = table.find_all("tr")

        for tableRow in tableRows:
            tableCells = tableRow.find_all("td")

            for tableCell in tableCells:
                aElement = tableCell.find("a", href=True)

                if aElement is not None and name in aElement.text:
                    birthYear = searchActress("https://en.wikipedia.org" + aElement.get("href"))

                    # In case a person's birth year can't be found, we return "None" to be dropped later
                    if birthYear != "None":
                        print(name, ":", year - birthYear)
                        return year - birthYear
                    
                    return birthYear

    print(name, ":", "Fail!")
    return 1

actress = df.loc[df["Award"] == "Actress"]
actress["Age"] = actress.apply(lambda r: findActress(r["Title"], r["Year"]), axis=1)

Frances McDormand : 63
Viola Davis : 55
Andra Day : 36
Vanessa Kirby : 32
Carey Mulligan : 35
Renée Zellweger : 50
Cynthia Erivo : 32
Scarlett Johansson : 35
Saoirse Ronan : 25
Charlize Theron : 44
Olivia Colman : 44
Yalitza Aparicio : 25
Glenn Close : 71
Lady Gaga : 32
Melissa McCarthy : 48
Frances McDormand : 60
Sally Hawkins : 41
Margot Robbie : 27
Saoirse Ronan : 23
Meryl Streep : 68
Emma Stone : 28
Isabelle Huppert : 63
Ruth Negga : 35
Natalie Portman : 35
Meryl Streep : 67
Brie Larson : 26
Cate Blanchett : 46
Jennifer Lawrence : 25
Charlotte Rampling : 69
Saoirse Ronan : 21
Julianne Moore : 54
Felicity Jones : 31
Marion Cotillard : 39
Reese Witherspoon : 38
Rosamund Pike : 35
Cate Blanchett : 44
Amy Adams : 39
Sandra Bullock : 49
Judi Dench : 79
Meryl Streep : 64
Jessica Chastain : 35
Jennifer Lawrence : 22
Emmanuelle Riva : 85
Quvenzhané Wallis : 9
Naomi Watts : 44
Meryl Streep : 62
Glenn Close : 64
Viola Davis : 46
Rooney Mara : 26
Michelle Williams : 31
Natalie Portman : 29
An

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actress["Age"] = actress.apply(lambda r: findActress(r["Title"], r["Year"]), axis=1)


In [14]:
bestSupportingActorData = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actor")

def searchSupportingActor(url):
    try:
        data = loadPage(url)

        spanElement = data.find("span", {"class" : "bday"})

        if spanElement is None:
            spanElement = data.find_all("p")

        if "Ralph_Richardson" in url:
            birthYear = int(spanElement[2].text[40:44])

        elif "John_Gielgud" in url:
            birthYear = int(spanElement[5].text[29:33])

        elif "Stanley_Holloway" in url:
            birthYear = int(spanElement[1].text[41:45])
            
        elif "Nick_Nolte" in url:
            spanElement = data.find_all("p")[1].text
            birthYear = int(spanElement[38:42])
            
        else:
            birthYear = int(spanElement.text[:4])
        
        return birthYear
    
    except ValueError as e:
        return "None"


def findSupportingActor(name, year):
    name = name.encode("iso-8859-1").decode("utf-8")

    # Find the "Winners and nominees" table
    tables = bestSupportingActorData.find_all("table", {"class" : "wikitable"})

    for table in tables:
        tableRows = table.find_all("tr")

        for tableRow in tableRows:
            tableCells = tableRow.find_all("td")

            for tableCell in tableCells:
                aElement = tableCell.find("a", href=True)

                if aElement is not None and name in aElement.text:
                    birthYear = searchSupportingActor("https://en.wikipedia.org" + aElement.get("href"))

                    if birthYear != "None":
                        print(name, ":", year - birthYear)
                        return year - birthYear
                    
                    return birthYear

    print(name, ":", "Fail!")
    return 1

bestSupportingActor = df.loc[df["Award"] == "Supporting Actor"]
bestSupportingActor["Age"] = bestSupportingActor.apply(lambda r: findSupportingActor(r["Title"], r["Year"]), axis=1)

Daniel Kaluuya : 31
Sacha Baron Cohen : 49
Leslie Odom Jr. : 39
Paul Raci : 72
LaKeith Stanfield : 29
Brad Pitt : 56
Tom Hanks : 63
Anthony Hopkins : 82
Al Pacino : 79
Joe Pesci : 76
Mahershala Ali : 44
Adam Driver : 35
Sam Elliott : 74
Richard E. Grant : 61
Sam Rockwell : 50
Sam Rockwell : 49
Willem Dafoe : 62
Woody Harrelson : 56
Richard Jenkins : 70
Christopher Plummer : 88
Mahershala Ali : 42
Jeff Bridges : 67
Lucas Hedges : 20
Dev Patel : 26
Michael Shannon : 42
Mark Rylance : 55
Christian Bale : 41
Tom Hardy : 38
Mark Ruffalo : 48
Sylvester Stallone : 69
J. K. Simmons : 59
Edward Norton : 45
Ethan Hawke : 44
Mark Ruffalo : 47
Robert Duvall : 83
Jared Leto : 42
Barkhad Abdi : 28
Bradley Cooper : 38
Michael Fassbender : 36
Jonah Hill : 30
Alan Arkin : 78
Robert De Niro : 69
Philip Seymour Hoffman : 45
Tommy Lee Jones : 66
Christoph Waltz : 56
Christopher Plummer : 82
Kenneth Branagh : 51
Jonah Hill : 28
Nick Nolte : 70
Max von Sydow : 82
Christian Bale : 36
John Hawkes : 51
Jeremy 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bestSupportingActor["Age"] = bestSupportingActor.apply(lambda r: findSupportingActor(r["Title"], r["Year"]), axis=1)


In [13]:
bestSupportingActressData = loadPage("https://en.wikipedia.org/wiki/Academy_Award_for_Best_Supporting_Actress")

def searchSupportingActress(url):
    try:
        data = loadPage(url)
        spanElement = data.find("span", {"class" : "bday"})

        if spanElement is None:
            spanElement = data.find_all("p")

        if "Rachel_Griffiths" in url:
            birthYear = int(spanElement[1].text[34:38])
        
        elif "Katina_Paxinou" in url:
            birthYear = int(spanElement[0].text[51:55])

        else:
            birthYear = int(spanElement.text[:4])
        
        return birthYear
    except AttributeError as e:
        return "None"
    

def findSupportingActress(name, year):
    name = name.encode("iso-8859-1").decode("utf-8")

    # Find the "Winners and nominees" table
    tables = bestSupportingActressData.find_all("table", {"class" : "wikitable"})

    for table in tables:
        tableRows = table.find_all("tr")

        for tableRow in tableRows:
            tableCells = tableRow.find_all("td")

            for tableCell in tableCells:
                aElement = tableCell.find("a", href=True)

                if aElement is not None and name in aElement.text:
                    birthYear = searchSupportingActress("https://en.wikipedia.org" + aElement.get("href"))

                    if birthYear != "None":
                        print(name, ":", year - birthYear)
                        return year - birthYear
                    
                    return birthYear

    print(name, ":", "Fail!")
    return 1
bestSupportingActress = df.loc[df["Award"] == "Supporting Actress"]
bestSupportingActress["Age"] = bestSupportingActress.apply(lambda r: findSupportingActress(r["Title"], r["Year"]), axis=1)

Youn Yuh-jung : 73
Maria Bakalova : 24
Glenn Close : 73
Olivia Colman : 46
Amanda Seyfried : 35
Laura Dern : 52
Kathy Bates : 71
Scarlett Johansson : 35
Florence Pugh : 23
Margot Robbie : 29
Regina King : 47
Amy Adams : 44
Marina de Tavira : 44
Emma Stone : 30
Rachel Weisz : 48
Allison Janney : 58
Mary J. Blige : 46
Lesley Manville : 61
Laurie Metcalf : 62
Octavia Spencer : 47
Viola Davis : 51
Naomie Harris : 40
Nicole Kidman : 49
Octavia Spencer : 46
Michelle Williams : 36
Alicia Vikander : 27
Jennifer Jason Leigh : 53
Rooney Mara : 30
Rachel McAdams : 37
Kate Winslet : 40
Patricia Arquette : 46
Emma Stone : 26
Keira Knightley : 29
Laura Dern : 47
Meryl Streep : 65
Lupita Nyong'o : 30
Sally Hawkins : 37
Jennifer Lawrence : 23
Julia Roberts : 46
June Squibb : 84
Amy Adams : 38
Sally Field : 66
Anne Hathaway : 30
Helen Hunt : 49
Jacki Weaver : 65
Octavia Spencer : 41
Bérénice Bejo : 35
Jessica Chastain : 34
Melissa McCarthy : 41
Janet McTeer : 50
Melissa Leo : 50
Amy Adams : 36
Helena B

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bestSupportingActress["Age"] = bestSupportingActress.apply(lambda r: findSupportingActress(r["Title"], r["Year"]), axis=1)


In [15]:
# Add in age column
df["Age"] = "None"

# Merge temporary datasets to actual dataset
df.loc[df["Award"] == "Actor"] = actors
df.loc[df["Award"] == "Actress"] = actress
df.loc[df["Award"] == "Supporting Actor"] = bestSupportingActor
df.loc[df["Award"] == "Supporting Actress"] = bestSupportingActress

pd.set_option("display.max_rows", None)

# Drop all ages with "None" as their age can't be determined
df = df.drop(df[df["Age"] == "None"].index)

df

Unnamed: 0,Title,Winner,Award,Year,Age
8,Anthony Hopkins,Y,Actor,2020,83
9,Riz Ahmed,N,Actor,2020,38
10,Chadwick Boseman,N,Actor,2020,44
11,Gary Oldman,N,Actor,2020,62
12,Steven Yeun,N,Actor,2020,37
13,Frances McDormand,Y,Actress,2020,63
14,Viola Davis,N,Actress,2020,55
15,Andra Day,N,Actress,2020,36
16,Vanessa Kirby,N,Actress,2020,32
17,Carey Mulligan,N,Actress,2020,35
