# Data Scraping

## Import libraries

In [1]:
import pandas as pd

import re
import os
import time

import requests
from bs4 import BeautifulSoup

from functools import partial
from typing import Union

## Helper functions

In [2]:
def get_page_content(url: str, timeout: float = 30.0) -> bytes:
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the scraping loop forever.

    Returns:
        The undecoded response body (``response.content``, i.e. bytes).

    Raises:
        Exception: If the server answers with a status code other than 200.
            Errors raised by ``requests.get`` itself are not caught here and
            propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(
            f'Response code is not equal to 200 (got {response.status_code} for {url})'
        )
    return response.content

In [3]:
def parse_year(content: str) -> str:
    """Return the first run of four consecutive digits in *content*.

    An empty string is returned when *content* holds no such run.
    """
    found = re.search('[0-9]{4}', content)
    if found is None:
        return ''
    return found.group(0)

In [4]:
def parse_number(content: str) -> Union[int, None]:
    """Return the first run of digits in *content* as an integer.

    Args:
        content: Text to search.

    Returns:
        The first digit run converted to ``int``, or ``None`` when *content*
        contains no digits.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'\d+', content)
    return int(match.group(0)) if match else None

## Get movie awards (Oscar, BAFTA, Golden Globe)

### Helpers to parse Wikipedia pages

In [5]:
def get_real_year(movie_year: int, row_content: str) -> int:
    """Pick the year that applies to a table row.

    A four-digit number found in *row_content* takes precedence; otherwise
    *movie_year* (converted to text) is used as the fallback. The result is
    0 when the chosen value is not made up of digits only.
    """
    candidate = parse_year(row_content) or str(movie_year)
    if candidate.isdigit():
        return int(candidate)
    return 0

In [6]:
def is_winner(row: BeautifulSoup) -> bool:
    """Tell whether a table row is highlighted with one of the known colours.

    The row's ``style`` attribute is inspected when it is present and
    non-empty; otherwise the row's full string form is searched. Matching is
    case-insensitive.
    """
    inline_style = row.get('style')
    haystack = (inline_style if inline_style else str(row)).lower()
    for colour in ('b0c4de', 'faeb86', 'd3d3d3', 'ccc'):
        if colour in haystack:
            return True
    return False

In [48]:
def parse_award_row(row: BeautifulSoup, movie_year: int,
                    nominee_idx: int, movie_idx: int) -> dict[str, Union[str, int, bool]]:
    """Turn one table row of an award page into a nominee record.

    Args:
        row: Table row element whose ``td`` cells are read.
        movie_year: Year stored in the record.
        nominee_idx: Position of the nominee cell among the ``td`` cells,
            counted after a leading all-digit (year) cell has been removed.
        movie_idx: Position of the movie cell, counted the same way.

    Returns:
        A dict with the keys ``year``, ``winner``, ``nominee``, ``movie`` and
        ``wiki_url`` (the href of the first link in the movie cell, or ``''``),
        or an empty dict when the row does not have enough cells.
    """
    columns = row.find_all('td')
    last_idx = max(movie_idx, nominee_idx)

    # Every addressed cell must exist, and the row needs at least two cells.
    if len(columns) < max(last_idx + 1, 2):
        return {}

    f_column = columns[0].text.replace('\n', '').replace('[', '').replace(']', '')
    if f_column.isdigit():  # year value
        columns.pop(0)
        # Dropping the year cell shifts the remaining cells left, so the row
        # may now be too short for the requested indices.
        if len(columns) <= last_idx:
            return {}

    movie_dict = {'year': movie_year, 'winner': is_winner(row)}
    nominee = columns[nominee_idx]
    movie = columns[movie_idx]
    movie_link = movie.find('a')

    movie_dict['nominee'] = nominee.text.replace('\n', '').strip()
    movie_dict['movie'] = movie.text.replace('&', 'and').replace('\n', '').strip()
    movie_dict['wiki_url'] = movie_link['href'] if movie_link else ''

    return movie_dict

In [49]:
def parse_award_page(options: dict[str, Union[str, int]], min_year: int = 1980,
                     max_year: int = 2021) -> list[dict[str, Union[str, int, bool]]]:
    """Scrape one Wikipedia award page into a list of nominee records.

    Args:
        options: Query description with the keys ``wiki_url``, ``award``,
            ``category``, ``nominee_idx`` and ``movie_idx``.
        min_year: Earliest year (inclusive) to keep.
        max_year: Latest year (inclusive) to keep.

    Returns:
        One dict per parsed row, in page order. Each dict is the result of
        ``parse_award_row`` extended with ``award`` and ``category``.
    """
    wiki_content = get_page_content(options['wiki_url'])
    # NOTE(review): no parser is named, so BeautifulSoup chooses one from what
    # is installed; consider pinning one for reproducible results.
    wiki_soup = BeautifulSoup(wiki_content)
    wiki_tables = wiki_soup.find_all('table', class_='wikitable')

    movies_list = []
    movie_award = options['award']
    movie_category = options['category']

    parse_row = partial(
        parse_award_row,
        nominee_idx=options['nominee_idx'],
        movie_idx=options['movie_idx'],
    )

    for table in wiki_tables:
        table_rows = table.find_all('tr')[1:]  # skip headers
        # Seed the year from the start of the table text; each row either
        # supplies its own year or inherits the most recent one.
        movie_year = parse_year(table.text[0:25])

        for row in table_rows:
            movie_year = get_real_year(movie_year, row.text[0:25])

            # Skip rows with no usable year (0) or a year outside the range.
            if not movie_year or not min_year <= movie_year <= max_year:
                continue

            if (movie := parse_row(row, movie_year)):
                movie['award'] = movie_award
                movie['category'] = movie_category
                movies_list.append(movie)

    return movies_list

### Define categories

In [50]:
# Award categories covered by this notebook; the spellings match the
# `category` values passed to get_query for the search queries.
categories = ['picture', 'director', 'actor', 'actress']
print(f'There are {len(categories)} categories in total')

There are 4 categories in total


### Define search queries

In [51]:
def get_query(query: str, award: str, category: str, movie_idx: int, nominee_idx: int) -> dict[str, Union[str, int]]:
    """Build the options dict that describes one Wikipedia award page.

    *query* is appended to the English Wikipedia base URL to form
    ``wiki_url``; both indices are converted with ``int``.
    """
    base_url = 'https://en.wikipedia.org/wiki/'
    options = {}
    options['wiki_url'] = base_url + query
    options['award'] = award
    options['category'] = category
    options['movie_idx'] = int(movie_idx)
    options['nominee_idx'] = int(nominee_idx)
    return options

In [52]:
# One options dict per Wikipedia award page. Each tuple lists, in order:
# page title, award, category, movie_idx, nominee_idx.
queries = [
    get_query(page, award_name, category_name, movie_col, nominee_col)
    for page, award_name, category_name, movie_col, nominee_col in (
        # actor
        ('Academy_Award_for_Best_Actor', 'oscar', 'actor', 2, 0),
        ('BAFTA_Award_for_Best_Actor_in_a_Leading_Role', 'bafta', 'actor', 1, 0),
        ('Golden_Globe_Award_for_Best_Actor_–_Motion_Picture_Drama', 'globe', 'actor', 2, 0),
        ('Golden_Globe_Award_for_Best_Actor_–_Motion_Picture_Musical_or_Comedy', 'globe', 'actor', 2, 0),
        # actress
        ('Academy_Award_for_Best_Actress', 'oscar', 'actress', 2, 0),
        ('BAFTA_Award_for_Best_Actress_in_a_Leading_Role', 'bafta', 'actress', 1, 0),
        ('Golden_Globe_Award_for_Best_Actress_–_Motion_Picture_Drama', 'globe', 'actress', 2, 0),
        ('Golden_Globe_Award_for_Best_Actress_–_Motion_Picture_Musical_or_Comedy', 'globe', 'actress', 2, 0),
        # director
        ('Academy_Award_for_Best_Director', 'oscar', 'director', 1, 0),
        ('BAFTA_Award_for_Best_Direction', 'bafta', 'director', 1, 0),
        ('Golden_Globe_Award_for_Best_Director', 'globe', 'director', 1, 0),
        # picture
        ('Academy_Award_for_Best_Picture', 'oscar', 'picture', 0, 0),
        ('BAFTA_Award_for_Best_Film', 'bafta', 'picture', 1, 1),
        ('Golden_Globe_Award_for_Best_Motion_Picture_–_Drama', 'globe', 'picture', 0, 0),
        ('Golden_Globe_Award_for_Best_Motion_Picture_–_Musical_or_Comedy', 'globe', 'picture', 0, 0),
    )
]

### Scrape data from Wikipedia

In [53]:
# Fetch and parse every configured award page, pooling all rows in one list.
awards = []

for query in queries:
    awards += parse_award_page(query)
    # Pause for a second before requesting the next page.
    time.sleep(1)

In [43]:
# Total number of award rows collected across all queries.
len(awards)

3051

In [54]:
# Show the first scraped record to check its keys and values.
awards[0]

{'year': 1980,
 'winner': True,
 'nominee': 'Robert De Niro',
 'movie': 'Raging Bull',
 'wiki_url': '/wiki/Raging_Bull',
 'award': 'oscar',
 'category': 'actor'}

### Create dataframe

In [55]:
# Each scraped record (a dict) becomes one row; its keys become the columns.
award_df = pd.DataFrame(awards)

In [56]:
# Preview the first rows of the awards table.
award_df.head()

Unnamed: 0,year,winner,nominee,movie,wiki_url,award,category
0,1980,True,Robert De Niro,Raging Bull,/wiki/Raging_Bull,oscar,actor
1,1980,False,Robert Duvall,The Great Santini,/wiki/The_Great_Santini,oscar,actor
2,1980,False,John Hurt,The Elephant Man,/wiki/The_Elephant_Man_(film),oscar,actor
3,1980,False,Jack Lemmon,Tribute,/wiki/Tribute_(1980_film),oscar,actor
4,1980,False,Peter O'Toole,The Stunt Man,/wiki/The_Stunt_Man,oscar,actor


### Save awards to .csv file

In [57]:
# Write the awards table to disk without the DataFrame index column.
# NOTE(review): '../data' is not created here — presumably it already exists.
award_df.to_csv('../data/awards.csv', index=False)

### Save unique Oscar-nominated movies to .csv file

In [18]:
# Keep one (movie, year) pair per Oscar row. Rows and columns are selected in
# a single .loc call and duplicates are dropped on the result, so no
# intermediate slice of award_df is modified in place.
oscar_movies = award_df.loc[award_df['award'] == 'oscar', ['movie', 'year']].drop_duplicates()
print(f'There are {len(oscar_movies)} unique movies in total')

There are 499 unique movies in total


In [19]:
# Write the unique (movie, year) pairs to disk without the index column.
oscar_movies.to_csv('../data/oscar_movies.csv', index=False)

## Scrape general data about movies

### Read unique Oscar movies

In [20]:
# Load the unique Oscar (movie, year) pairs from the CSV file written earlier
# in this notebook.
movies = pd.read_csv('../data/oscar_movies.csv')

### Get general movie information from the OMDb API

In [21]:
def get_omdb_json(movie: str, year: int) -> dict[str, Union[str, int, list]]:
    """Look a movie up in the OMDb API by title, preferring a match for *year*.

    The API key is read from the ``OMDB_API_KEY`` environment variable.

    Args:
        movie: Movie title to search for.
        year: Release year used to narrow the first lookup.

    Returns:
        The decoded JSON answer of the title + year lookup, or, when that
        answer has ``Response == 'False'``, the answer of a second lookup by
        title only (which may itself report a failure).
    """
    omdb_url = 'http://www.omdbapi.com/'
    title_params = {'apikey': os.environ.get('OMDB_API_KEY'), 't': movie}

    # The query goes through `params` so requests URL-encodes the title
    # (spaces, '&', quotes, ...). `year` is the function argument, so the
    # lookup does not depend on any variable of the calling cell.
    omdb_json = requests.get(
        omdb_url, params={**title_params, 'y': year}, timeout=30
    ).json()
    if omdb_json['Response'] != 'False':
        return omdb_json

    # No hit for this title/year pair: retry with the title alone.
    return requests.get(omdb_url, params=title_params, timeout=30).json()

In [36]:
def parse_awards(content: str) -> dict[str, int]:
    """Extract the second and third standalone integers from *content*.

    Only whitespace-separated tokens consisting purely of digits count. With
    at least three such tokens the second is returned as ``other_win`` and
    the third as ``other_nom``; otherwise the result is an empty dict.
    """
    counts = []
    for token in content.split():
        if token.isdigit():
            counts.append(int(token))

    if len(counts) < 3:
        return {}
    return {'other_win': counts[1], 'other_nom': counts[2]}

In [23]:
def parse_rt_score(ratings: list[dict[str, str]]) -> Union[str, None]:
    """Return the Rotten Tomatoes score from an OMDb ``Ratings`` list.

    Args:
        ratings: Rating entries, each a dict with ``Source`` and ``Value``.

    Returns:
        The Rotten Tomatoes value without its trailing ``%`` (e.g. ``'93'``),
        or ``None`` when the list has no usable Rotten Tomatoes entry.
    """
    # Select the entry by its source instead of by position: when a movie has
    # no Rotten Tomatoes rating, another source can sit at index 1.
    for rating in ratings:
        if rating.get('Source') == 'Rotten Tomatoes':
            return rating.get('Value', '').rstrip('%') or None
    return None

In [24]:
def get_omdb_movie(movie: str, year: int) -> dict[str, Union[str, int]]:
    """Fetch one movie from OMDb and flatten the answer into a record.

    The record always holds ``movie``, ``year`` and ``success``. When the
    lookup fails (``Response`` equals ``'False'``) nothing else is added.
    Otherwise the OMDb fields are copied under this notebook's column names,
    followed by whatever ``parse_awards`` extracts from the ``Awards`` text.
    """
    payload = get_omdb_json(movie, year)
    found = payload['Response'] != 'False'
    record = {'movie': movie, 'year': year, 'success': found}

    if not found:
        return record

    # (output column, OMDb field) pairs, listed in output order.
    fields_before_rt = (
        ('imdb_id', 'imdbID'),
        ('rated', 'Rated'),
        ('runtime', 'Runtime'),
        ('genres', 'Genre'),
        ('director', 'Director'),
        ('writer', 'Writer'),
        ('cast', 'Actors'),
        ('language', 'Language'),
        ('country', 'Country'),
        ('imdb_score', 'imdbRating'),
        ('imdb_votes', 'imdbVotes'),
    )
    fields_after_rt = (
        ('metascore', 'Metascore'),
        ('box_office', 'BoxOffice'),
    )

    for column, omdb_field in fields_before_rt:
        record[column] = payload.get(omdb_field)
    record['rt_score'] = parse_rt_score(payload.get('Ratings', []))
    for column, omdb_field in fields_after_rt:
        record[column] = payload.get(omdb_field)

    record.update(parse_awards(payload.get('Awards', '')))
    return record

In [35]:
# NOTE(review): the value echoed by this magic still contains the quote
# characters, which suggests they are stored as part of the key — confirm,
# and omit the quotes when setting a real key.
%set_env OMDB_API_KEY 'your_key'

env: OMDB_API_KEY='your_key'


In [26]:
# Query OMDb once per (movie, year) pair and collect the flattened records.
omdb_movies = []

for idx, row in movies.iterrows():
    omdb_movies.append(get_omdb_movie(row.movie, row.year))
    # Progress marker every 100 rows.
    if not idx % 100:
        print(idx)

0
100
200
300
400


In [27]:
# Show the first OMDb record to check its keys and values.
omdb_movies[0]

{'movie': 'Raging Bull',
 'year': 1980,
 'success': True,
 'imdb_id': 'tt0081398',
 'rated': 'R',
 'runtime': '129 min',
 'genres': 'Biography, Drama, Sport',
 'director': 'Martin Scorsese',
 'writer': 'Jake LaMotta (based on the book by), Joseph Carter (with), Peter Savage (with), Paul Schrader (screenplay), Mardik Martin (screenplay)',
 'cast': 'Robert De Niro, Cathy Moriarty, Joe Pesci, Frank Vincent',
 'language': 'English',
 'country': 'USA',
 'imdb_score': '8.2',
 'imdb_votes': '326,193',
 'rt_score': '93',
 'metascore': '89',
 'box_office': '$23,383,987',
 'other_win': 22,
 'othen_nom': 28}

In [28]:
# Number of OMDb records collected (one per row of `movies`).
len(omdb_movies)

499

### Create dataframe

In [29]:
# One row per OMDb record; the record keys become the columns.
omdb_df = pd.DataFrame(omdb_movies)
omdb_df.head()

Unnamed: 0,movie,year,success,imdb_id,rated,runtime,genres,director,writer,cast,language,country,imdb_score,imdb_votes,rt_score,metascore,box_office,other_win,othen_nom
0,Raging Bull,1980,True,tt0081398,R,129 min,"Biography, Drama, Sport",Martin Scorsese,"Jake LaMotta (based on the book by), Joseph Ca...","Robert De Niro, Cathy Moriarty, Joe Pesci, Fra...",English,USA,8.2,326193,93,89.0,"$23,383,987",22.0,28.0
1,The Great Santini,1980,True,tt0079239,PG,115 min,Drama,Lewis John Carlino,"Pat Conroy (novel), Lewis John Carlino (writte...","Robert Duvall, Blythe Danner, Michael O'Keefe,...",English,USA,7.2,5756,95,64.0,"$4,702,575",3.0,4.0
2,The Elephant Man,1980,True,tt0080678,PG,124 min,"Biography, Drama",David Lynch,"Christopher De Vore (screenplay), Eric Bergren...","Anthony Hopkins, John Hurt, Anne Bancroft, Joh...","English, French","USA, UK",8.1,224116,92,78.0,"$26,010,864",10.0,14.0
3,Tribute,1980,True,tt0081656,PG,121 min,"Comedy, Drama",Bob Clark,"Bernard Slade (play), Bernard Slade (screenplay)","Jack Lemmon, Robby Benson, Lee Remick, Colleen...",English,Canada,6.2,642,55,,"$9,000,000",2.0,13.0
4,The Stunt Man,1980,True,tt0081568,R,131 min,"Action, Comedy, Drama, Romance, Thriller",Richard Rush,"Lawrence B. Marcus (screenplay), Richard Rush ...","Peter O'Toole, Steve Railsback, Barbara Hershe...",English,USA,7.0,8848,90,77.0,"$7,063,886",4.0,8.0


### Check for empty values

In [32]:
# Movies the OMDb lookup did not find: rows whose `success` flag is False.
lookup_failed = ~omdb_df['success'].astype(bool)
omdb_empty_df = omdb_df.loc[lookup_failed, ['movie', 'year']]
print(f'There are {len(omdb_empty_df)} empty movies')

There are 7 empty movies


In [33]:
# List the movies for which the OMDb lookup failed.
omdb_empty_df

Unnamed: 0,movie,year
77,Il Postino: The Postman,1995
247,Mr. and Mrs. Bridge,1990
261,Tom and Viv,1994
330,Julie and Julia,2009
408,Three Colours: Red,1994
478,Precious: Based on the Novel 'Push' by Sapphire,2009
483,Extremely Loud and Incredibly Close,2011


### Save OMDB data to .csv file

In [34]:
# Write the OMDb table (including failed lookups) to disk without the index column.
omdb_df.to_csv('../data/movies.csv', index=False)