In [10]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

## [Best Picture](https://www.imdb.com/search/title/?count=100&groups=oscar_best_picture_winners&sort=year%2Cdesc&ref_=nv_ch_osc) - Oscar Winners Per Year

### Scraping

In [6]:
# IMDb search listing of Best Picture winners (count=100, sorted by year, descending).
url = "https://www.imdb.com/search/title/?count=100&groups=oscar_best_picture_winners&sort=year%2Cdesc&ref_=nv_ch_osc"
# A timeout keeps the cell from hanging forever on a stalled connection.
resp = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx answer instead of silently parsing an error page further down.
resp.raise_for_status()

In [12]:
# Name the parser explicitly: without it bs4 picks whichever parser happens to be
# installed (and emits a warning), so the parse tree could differ between machines.
soup = BeautifulSoup(resp.text, "html.parser")

In [17]:
# Collect every <h3 class="lister-item-header"> element of the result page.
headers = soup.find_all("h3", attrs={"class": "lister-item-header"})


In [47]:
# Parallel lists: one entry per result header, later assembled into a DataFrame.
titles, years, imdb_ids = [], [], []

for h in headers:
    title = h.a.text
    # Raw-string pattern: "\d" inside a plain string literal is an invalid escape
    # sequence that recent Python versions warn about.
    year = re.search(r"(\d{4})", h.find(class_="lister-item-year").text).group(1)
    # Keep the link in its own name so the page `url` defined earlier is not overwritten.
    href = h.a["href"]
    # Capture only the id itself ("tt" followed by digits) rather than greedily
    # taking everything up to the last slash of the link.
    imdb_id = re.search(r"title/(tt\d+)/", href).group(1)

    titles.append(title)
    imdb_ids.append(imdb_id)
    years.append(year)

In [53]:
# Assemble the three scraped lists into one table; the dict's key order sets the column order.
imdb_bestpicture = pd.DataFrame(
    {"titles": titles, "years": years, "imdb_ids": imdb_ids}
)


In [54]:
# Display the scraped winners table as the cell output.
imdb_bestpicture

Unnamed: 0,titles,years,imdb_ids
0,Parasite,2019,tt6751668
1,Green Book,2018,tt6966692
2,The Shape of Water,2017,tt5580390
3,Moonlight,2016,tt4975722
4,Spotlight,2015,tt1895587
...,...,...,...
88,Cimarron,1931,tt0021746
89,All Quiet on the Western Front,1930,tt0020629
90,The Broadway Melody,1929,tt0019729
91,Wings,1927,tt0018578


In [58]:
# Persist the raw scrape. The DataFrame index is written too, which is why the
# file comes back with an extra "Unnamed: 0" column when it is read again.
imdb_bestpicture.to_csv("data/raw/oscar_winners.csv")

### Getting Ratings

In [3]:
# Reload the raw scrape and discard the index column that was saved along with it.
df = pd.read_csv("data/raw/oscar_winners.csv").drop(columns="Unnamed: 0")
# Remove the "tt" marker from each IMDb id, leaving only the numeric part.
df["imdb_ids"] = df["imdb_ids"].apply(lambda imdb_id: imdb_id.replace("tt", ""))

In [4]:
# Display the cleaned winners table (ids now without the "tt" prefix).
df

Unnamed: 0,titles,years,imdb_ids
0,Parasite,2019,6751668
1,Green Book,2018,6966692
2,The Shape of Water,2017,5580390
3,Moonlight,2016,4975722
4,Spotlight,2015,1895587
...,...,...,...
88,Cimarron,1931,0021746
89,All Quiet on the Western Front,1930,0020629
90,The Broadway Melody,1929,0019729
91,Wings,1927,0018578


In [None]:
# Look up each winner on bechdeltest.com by IMDb id and store its rating in df["rating"].
for i in df.index:
    print(i)
    # Named `imdb_id` rather than `id`, which would shadow the built-in.
    imdb_id = df.loc[i, "imdb_ids"]
    url = f"http://bechdeltest.com/api/v1/getMovieByImdbId?imdbid={imdb_id}"
    # A timeout keeps one stalled request from blocking the whole loop forever.
    req = requests.get(url, timeout=30)
    # Decode the body once instead of re-parsing it for every field access.
    movie = req.json()

    # Normalise both years to int before comparing, as the title-based lookups in
    # this notebook do; a string year from the API would otherwise never equal the
    # integer year read from the CSV.
    if 'year' in movie and int(movie['year']) == int(df.loc[i, "years"]):
        df.loc[i, "rating"] = movie["rating"]
    else:
        # Either no 'year' key in the answer or the year disagrees with our table.
        print("Something went wrong", i)

In [41]:
def manuallySetRating(movieName, rating):
    """Set the rating of every row in the global `df` whose title equals `movieName`.

    Args:
        movieName: Exact title to look for in the "titles" column.
        rating: Value written to the "rating" column of the matching rows.

    Returns:
        None. `df` is modified in place.
    """
    # The winners table names its title column "titles" (plural); the previous
    # `df.title` lookup raised AttributeError on this frame.
    df.loc[df["titles"] == movieName, "rating"] = rating

In [None]:
# Hand-entered rating for "Parasite".
manuallySetRating("Parasite", 3.0)

In [45]:
# Show the rows whose "rating" is still missing.
df[df.rating.isnull()]

Unnamed: 0,titles,years,imdb_ids,rating
37,Gandhi,1982,83987,
49,Patton,1970,66206,
51,Oliver!,1968,63385,
67,The Greatest Show on Earth,1952,44672,
70,All the King's Men,1949,41113,
72,Gentleman's Agreement,1947,39416,
75,Going My Way,1944,36872,
81,You Can't Take It with You,1938,30993,


In [47]:
# Write the winners table, including the "rating" column, to disk (index included).
df.to_csv("data/ratings_best_picture_oscar_winners.csv")

## [Best Picture](https://www.widescreenings.com/list-best-picture-oscar.html) - Oscar Nominees Per Year

### Scraping

In [4]:
# Browser-like User-Agent header for the request to widescreenings.com.
# NOTE: this rebinds `headers`, which an earlier cell used for the scraped <h3> elements.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.102 Safari/537.36"
    )
}

In [5]:
# Page listing each year's Best Picture winner together with the films it beat.
url = "https://www.widescreenings.com/list-best-picture-oscar.html"
# A timeout keeps the cell from hanging forever on a stalled connection.
req = requests.get(url, headers=headers, timeout=30)
# Stop here on a 4xx/5xx answer instead of silently parsing an error page further down.
req.raise_for_status()

In [8]:
# Parse the downloaded nominees page.
# NOTE(review): no parser is named, so bs4 uses whichever one is installed. Pinning one
# would make the parse reproducible, but confirm that the paragraph index used by the
# parsing cell (findAll("p")[2]) still points at the same paragraph afterwards.
# NOTE(review): the hand-entered title "Les MisÃ©rables" further down suggests `req.text`
# may be decoded with the wrong charset -- confirm `req.encoding` before changing anything.
soup = BeautifulSoup(req.text)

In [148]:
# Empty accumulators for the nominee rows: one year and one film title per entry.
# NOTE: `years` was already used for the winners scrape above and is rebound here.
years, movies = [], []

In [None]:
# Start from empty lists so that re-running this cell does not append every row a second time.
years, movies = [], []

# Text of the third <p> on the page, with any literal <i>/</i> markers removed.
movies_page = re.sub(r"</?i+>", '', soup.findAll("p")[2].text)
# The first three lines of that paragraph are skipped (presumably a heading -- verify against the page).
for m in movies_page.split('\n')[3:]:
    # The year is expected at the start of the line. The previous pattern used an
    # optional group, which matched the empty string and silently produced None
    # whenever the line did not begin with four digits.
    year_match = re.match(r"\s*(\d{4})", m)
    winner_match = re.search(r": (.+) —", m)
    others_match = re.search(r"beat (.*) \(Presented", m)

    # Skip (and report) lines that do not follow the "<year>: <winner> — beat <others> (Presented"
    # layout instead of crashing on `.group` of a failed match.
    if not (year_match and winner_match and others_match):
        if m.strip():
            print("Skipping unparsable line:", m)
        continue

    year = year_match.group(1)
    # Nominees are comma separated; drop the whitespace that follows each comma.
    movieslist = [a.lstrip() for a in others_match.group(1).split(',')]
    movieslist.append(winner_match.group(1))

    # One (year, title) row per film, winner included.
    for mov in movieslist:
        years.append(year)
        movies.append(mov)


In [227]:
# One row per nominated film; the dict's key order sets the column order.
oscar = pd.DataFrame({"year": years, "title": movies})

In [158]:
# Persist the raw nominee list (the DataFrame index is written as an extra first column).
oscar.to_csv("data/raw/oscar_nominees.csv")

### Getting Ratings

In [None]:
# Reload the raw nominee list; from here on `df` refers to the nominees table
# (columns "year" and "title", plus the saved index column).
df = pd.read_csv("data/raw/oscar_nominees.csv")

In [None]:
# Row labels for which no rating could be found.
notfoundindex = []

# Title search endpoint. The title is passed via `params` so requests percent-encodes it;
# a raw "&" or "#" in a title would otherwise cut the query string short.
url = "http://bechdeltest.com/api/v1/getMoviesByTitle"

for i in df.index:

    title = df.loc[i, "title"].strip()
    new_title = title
    req = requests.get(url, params={"title": title}, timeout=30)
    # Decode each response once; `results` always belongs to the latest request.
    results = req.json()

    if len(results) == 0:
        # First fallback: drop a leading article, the numerals "III"/"II",
        # and everything after the first colon, then search again.
        new_title = re.sub(r"^\bThe\b ", '', title)
        new_title = re.sub(r"^\bA\b ", '', new_title)
        new_title = re.sub(r"^\bAn\b ", '', new_title)
        new_title = re.sub(r"III", '', new_title)
        new_title = re.sub(r"II", '', new_title)
        new_title = new_title.split(":")[0]

        req = requests.get(url, params={"title": new_title}, timeout=30)
        results = req.json()
        print("new title:", new_title, len(results))

    if len(results) == 0:
        # Second fallback: keep only the text before the first hyphen. The search
        # must be issued again -- previously the title was shortened but the stale,
        # empty response was reused, so this step could never produce a match.
        new_title = new_title.split("-")[0].strip()
        req = requests.get(url, params={"title": new_title}, timeout=30)
        results = req.json()
        print("new title:", new_title, len(results))

    rating = None

    # Accept the first hit whose release year equals the row's year.
    for r in results:
        if int(df.loc[i, "year"]) == int(r['year']):
            rating = r['rating']
            print(new_title, title, "Matched!")
            break

    if rating is None:
        notfoundindex.append(i)

    df.loc[i, "rating"] = rating


Manually add some of the ratings

In [242]:
def manuallySetRating(movieName, rating):
    """Write `rating` into the "rating" column of every row of the global `df`
    whose `title` is exactly `movieName`. Returns nothing; `df` is changed in place.
    """
    title_matches = df.title.eq(movieName)
    df.loc[title_matches, "rating"] = rating

In [251]:
# Hand-entered (title, rating) pairs, applied in the order listed.
for manual_title, manual_rating in [
    ("Parasite", 3.0),
    ("Once Upon a Time... in Hollywood", 3.0),
    ("Birdman or (The Unexpected Virtue of Ignorance)", 3.0),
    ("Les MisÃ©rables", 3.0),
    ("Winter’s Bone", 3.0),
    ("Men in Black 3", 1.0),
    ("Harry Potter and the Deathly Hallows - Part 2", 3.0),
    ("Harry Potter and the Deathly Hallows - Part 1", 1.0),
    ("The Twilight Saga: New Moon", 3.0),
    ("The Dark Knight", 3.0),
    ("300", 0.0),
    ("Star Wars Episode III: Revenge of the Sith", 1.0),
    ("Harry Potter and the Prisoner of Azkaban", 3.0),
]:
    manuallySetRating(manual_title, manual_rating)

In [253]:
# Hand-entered (title, rating) pairs, applied in the order listed.
for manual_title, manual_rating in [
    ("Star Wars Episode II: Attack of the Clones", 3.0),
    ("Harry Potter and the Sorcerer's Stone", 3.0),
    ("Ocean's Eleven", 1.0),
    ("Star Wars Episode I: The Phantom Menace", 3.0),
    ("There's Something About Mary", 3.0),
    ("A Bug's Life", 3.0),

    ("My Best Friend's Wedding", 2.0),
    ("The Rock", 1.0),
    ("Dumb and Dumber", 2.0),
    ("Die Hard with a Vengeance", 1.0),
    ("Schindler's List", 3.0),
    ("Lethal Weapon 3", 1.0),
    ("Bram Stoker's Dracula", 2.0),
    ("Wayne's World", 1.0),

    ("Three Men and a Baby", 1.0),
    ("The Secret of My Success", 1.0),
    ("Crocodile Dundee", 3.0),
    ("Star Wars Episode VI: Return of the Jedi", 1.0),
    ("Footloose", 3.0),
    ("E.T. the Extra-Terrestrial", 3.0),
    ("Star Trek II: The Wrath of Khan", 3.0),
    ("Porky's", 2.0),
    ("Coal Miner's Daughter", 3.0),
    ("Star Wars Episode V: The Empire Strikes Back", 0.0),
]:
    manuallySetRating(manual_title, manual_rating)

In [263]:
# Hand-entered (title, rating) pairs, applied in the order listed.
for manual_title, manual_rating in [
    ("The King’s Speech", 3.0),

    ("The Hurt Locker", 1.0),
    ("Good Night and Good Luck", 1.0),
    ("Crouching Tiger Hidden Dragon", 3.0),

    ("Life is Beautiful", 0.0),
    ("The Piano", 3.0),
    ("A Room with a View", 3.0),
    ("A Soldier’s Story", 0.0),
]:
    manuallySetRating(manual_title, manual_rating)

In [None]:
# Write the nominees table, including the "rating" column, to disk (index included).
df.to_csv("data/ratings_best_picture_oscar_nominees.csv")

## [Kaggle Top 10 Highest Grossing Films](https://www.kaggle.com/bidyutchanda/top-10-highest-grossing-films-19752018#)

In [8]:
# Load the highest-grossing films and keep just the title and year columns;
# .copy() makes `df` an independent frame rather than a slice of the full table.
df = pd.read_csv("data/raw/blockbusters.csv")[["title", "year"]].copy()

In [9]:
# Preview the first rows of the trimmed blockbusters table.
df.head()

Unnamed: 0,title,year
0,Black Panther,2018
1,Avengers: Infinity War,2018
2,Incredibles 2,2018
3,Jurassic World: Fallen Kingdom,2018
4,Deadpool 2,2018


In [None]:
# Row labels for which no rating could be found.
notfoundindex = []

# Title search endpoint. The title is passed via `params` so requests percent-encodes it;
# a raw "&" or "#" in a title would otherwise cut the query string short.
url = "http://bechdeltest.com/api/v1/getMoviesByTitle"

for i in df.index:

    title = df.loc[i, "title"].strip()
    new_title = title
    req = requests.get(url, params={"title": title}, timeout=30)
    # Decode each response once; `results` always belongs to the latest request.
    results = req.json()

    if len(results) == 0:
        # First fallback: drop a leading article, the numerals "III"/"II",
        # and everything after the first colon, then search again.
        new_title = re.sub(r"^\bThe\b ", '', title)
        new_title = re.sub(r"^\bA\b ", '', new_title)
        new_title = re.sub(r"^\bAn\b ", '', new_title)
        new_title = re.sub(r"III", '', new_title)
        new_title = re.sub(r"II", '', new_title)
        new_title = new_title.split(":")[0]

        req = requests.get(url, params={"title": new_title}, timeout=30)
        results = req.json()
        print("new title:", new_title, len(results))

    if len(results) == 0:
        # Second fallback: keep only the text before the first hyphen. The search
        # must be issued again -- previously the title was shortened but the stale,
        # empty response was reused, so this step could never produce a match.
        new_title = new_title.split("-")[0].strip()
        req = requests.get(url, params={"title": new_title}, timeout=30)
        results = req.json()
        print("new title:", new_title, len(results))

    rating = None

    # Accept the first hit whose release year equals the row's year.
    for r in results:
        if int(df.loc[i, "year"]) == int(r['year']):
            rating = r['rating']
            print(new_title, title, "Matched!")
            break

    if rating is None:
        notfoundindex.append(i)

    df.loc[i, "rating"] = rating
    


In [272]:
def manuallySetRating(movieName, rating):
    """Overwrite the "rating" of each row in the global `df` whose `title`
    equals `movieName` exactly. Nothing is returned; `df` is edited in place.
    """
    is_requested_title = df.title.eq(movieName)
    df.loc[is_requested_title, "rating"] = rating

In [285]:
# Hand-entered (title, rating) pairs, applied in the order listed.
for manual_title, manual_rating in [
    ("Men in Black 3", 1.0),
    ("Harry Potter and the Deathly Hallows - Part 2", 3.0),
    ("Harry Potter and the Deathly Hallows - Part 1", 1.0),
    ("The Twilight Saga: New Moon", 3.0),
    ("The Dark Knight", 3.0),

    ("300", 0.0),
    ("Star Wars Episode III: Revenge of the Sith", 1.0),
    ("Harry Potter and the Prisoner of Azkaban", 3.0),
]:
    manuallySetRating(manual_title, manual_rating)

In [311]:
# Hand-entered (title, rating) pairs, applied in the order listed.
for manual_title, manual_rating in [
    ("The King’s Speech", 3.0),

    ("The Hurt Locker", 1.0),
    ("Good Night and Good Luck", 1.0),
    ("Crouching Tiger Hidden Dragon", 3.0),

    ("Life is Beautiful", 0.0),
    ("The Piano", 3.0),
    ("A Room with a View", 3.0),
    ("A Soldier’s Story", 0.0),
]:
    manuallySetRating(manual_title, manual_rating)

In [317]:
# Write the blockbusters table, including the "rating" column, to disk (index included).
df.to_csv("data/ratings_highest_grossing.csv")

## [IMDB Emmys](https://www.imdb.com/event/ev0000223/2019/1/?ref_=ev_eh)