## Genre Genie - Multi-label Classification with NLP
### Part 1.2: Scraping IMDb

#### Tom Keith

---


IMDb has a great url structure for scraping. Using Star Wars for example: https://www.imdb.com/title/tt0076759/

The IMDb id (or `tconst` in the IMDb open datasets) is all that is needed to fetch the page.

This notebook is not meant to be run all at once. Rather, the main cell (the one that is not a function definition) is meant to be manually updated before each run. See the notes before that cell.

---

In [1]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup
    
from PIL import Image
from io import BytesIO

import re
import json

import time
import random
from datetime import datetime
from pprint import pprint

In [2]:
def to_numeric(string, num_type='int'):
    '''
    Strip the non-numeric characters from a string and convert what is left.

    INPUT - string: text containing a number, e.g. '1,082 user' or '8.1'
          - num_type: either 'int' or 'float'
    OUTPUT - int or float type (returns original string if neither specified)
    RAISES - ValueError if no digits are left after stripping
    '''
    if num_type == 'int':
        return int(re.sub(r'[^0-9]', '', string))

    if num_type == 'float':
        # Keep the first decimal point so that '8.1' becomes 8.1, not 81.0.
        cleaned = re.sub(r'[^0-9.]', '', string)
        whole, _, fraction = cleaned.partition('.')
        fraction = fraction.replace('.', '')
        return float(f'{whole}.{fraction}' if fraction else whole)

    return string


def savePoster(imdb_id, img_url):
    '''
    Function that fetches the poster image from the provided url
    and saves it in posters/ under the provided id (corresponding with IMDb).
    Won't replace (or even fetch) if file already exists.

    INPUT:  id from imdb, url where to find image
    OUTPUT: boolean flag if saved or not.
    RAISES: requests exceptions on a timeout or a non-2xx response
    '''
    import os

    # File extension is whatever follows the last '.' in the url
    ext = img_url.split('.')[-1]
    poster_path = f'posters/{imdb_id}.{ext}'

    # Check to see if I already have it
    if os.path.isfile(poster_path):
        return False

    # Fail fast on a stalled connection or an HTTP error, rather than
    # handing an error page to PIL.
    response = requests.get(img_url, timeout=30)
    response.raise_for_status()

    # Make sure the target folder exists before writing into it
    os.makedirs('posters', exist_ok=True)
    img = Image.open(BytesIO(response.content))
    img.save(poster_path)

    return True

def concatenate_list_data(my_list):
    '''
    Join the string form of every element of my_list into a single string.

    INPUT:  iterable of elements (each one is converted with str())
    OUTPUT: the concatenated string ('' for an empty input)
    '''
    return ''.join(map(str, my_list))

def time_since(start_time):
    '''
    Simple timer calculating time difference between
    start_time input parameter, and now

    INPUT: timestamp of starting time (as returned by time.time())
    OUTPUT: string ' 2m45s'
    '''
    # Round to whole seconds once, before splitting into minutes and seconds.
    # Rounding the seconds on their own could print e.g. ' 1m60s' for 119.7s.
    elapsed = int(round(time.time() - start_time))
    mins, secs = divmod(elapsed, 60)
    return f'{mins:2d}m{secs:2d}s'

In [3]:
def imdb_scrape(imdb_id, save_image=True, debug=False):
    '''
    Scrape one title from IMDb.

    Fetches the main title page and the /plotsummary page and, when save_image
    is set, the movie poster (saved in the posters/ folder by savePoster).

    This function is meant to be used in a loop: it prints one row of progress
    flags per title, so the print output may lack meaning without the column
    header printed by the calling cell.

    INPUT:  - imdb_id: ID of movie to scrape from IMDb e.g. "tt0076759"
            - save_image: boolean to save the movie poster or not (default True)
            - debug: boolean to pretty-print the scraped dictionary (default False)

    OUTPUT: Dictionary of various scraped information.

             {'tconst':imdb_id, 'title':'',     'release_year':'',     'release_date':'',
              'MPAA':'',        'genre':[],     'runtime':'',          'poster_url':'',
              'plot_short':'',  'plot_long':'', 'imdb_rating':'',      'num_imdb_votes':'',
              'plot_full':'',   'metacritic':'','num_user_reviews':'', 'num_critic_reviews':''
             }

            MPAA, metacritic, the review counts and plot_long keep their empty
            default when the page does not provide them.

    RAISES: Nothing is caught here. A failed request in savePoster or a page
            element that is missing (other than the optional ones above)
            propagates as an exception to the caller.
    '''
    # Target datapoints to scrape (with provided imdb_id)
    imdb_info_dict = {'tconst':imdb_id,'title':'',    'release_year':'',      'release_date':'',
                      'MPAA':'',       'genre':[],    'runtime':'',           'poster_url':'',
                      'plot_short':'', 'plot_long':'', 'imdb_rating':'',      'num_imdb_votes':'',
                      'plot_full':'',  'metacritic':'','num_user_reviews':'', 'num_critic_reviews':''
                     }

    imdb_base_url = 'https://www.imdb.com/title/'
    print(f'{imdb_id.ljust(10)} ', end='')

    # Main content - build URL, and soup content
    imdb_full_url = imdb_base_url + imdb_id
    r = requests.get(imdb_full_url, timeout=30).content
    soup = BeautifulSoup(r, 'html.parser')

    # For Summary only
    imdb_plot_url = imdb_full_url + '/plotsummary'
    r2 = requests.get(imdb_plot_url, timeout=30).content
    soup_summ = BeautifulSoup(r2, 'html.parser')
    print('[x]   ', end='')

    # Full plot summary: flatten newlines, put a space after every period,
    # then collapse every run of spaces down to a single space.
    plot_summary = soup_summ.find('ul', {'id': 'plot-synopsis-content'}).li.text
    plot_summary = plot_summary.replace('\n', ' ')
    plot_summary = plot_summary.replace('.', '. ')
    plot_summary = re.sub(r' {2,}', ' ', plot_summary).strip()

    # IMDb shows a placeholder message when a title has no synopsis
    if 'It looks like we' in plot_summary:
        imdb_info_dict['plot_full'] = ''
        print('[ ]   ', end='')
    else:
        imdb_info_dict['plot_full'] = plot_summary
        print('[x]   ', end='')

    # Code from js section has json variables
    json_dict = json.loads( str( soup.findAll('script', {'type':'application/ld+json'})[0].text ))

    # Info - Movie title, year, parental content rating, poster url
    imdb_info_dict['title'] = json_dict['name']
    if 'contentRating' in json_dict:
        imdb_info_dict['MPAA'] = json_dict['contentRating']
    imdb_info_dict['poster_url'] = json_dict['image']
    imdb_info_dict['release_year'] = int( soup.find('span', {'id':'titleYear'}).a.text )
    imdb_info_dict['runtime'] = to_numeric( soup.find('time')['datetime'] )

    # Release date (from top header) - text of the last link, without the trailing ' (...)'
    title_links = soup.find('div', {'class':'title_wrapper'}).findAll('a')
    imdb_info_dict['release_date'] = title_links[-1].text.split(' (')[0]

    # Genres
    # NOTE(review): stored exactly as it appears in the JSON; confirm it is
    # always a list (and not a bare string for single-genre titles).
    imdb_info_dict['genre'] = json_dict['genre']

    # Ratings - IMDb rating (and vote count)
    aggregate_rating = json_dict['aggregateRating']
    imdb_info_dict['imdb_rating'] = float( aggregate_rating['ratingValue'] )
    imdb_info_dict['num_imdb_votes'] = aggregate_rating['ratingCount']

    # Metacritic score, if there is one
    metacritic_div = soup.find('div', {'class':'metacriticScore'})
    if metacritic_div is not None:
        imdb_info_dict['metacritic'] = int( metacritic_div.span.text )

    # Reviews - Number of critic and public reviews (different than ratings/votes)
    num_review_list = soup.findAll('div',{'class':'titleReviewBarItem titleReviewbarItemBorder'})
    if num_review_list:
        reviews = num_review_list[0].findAll('a')
        if len(reviews) > 1:
            imdb_info_dict['num_critic_reviews'] = to_numeric( reviews[1].text )
        if len(reviews) > 0:
            imdb_info_dict['num_user_reviews'] = to_numeric( reviews[0].text )

    # Plots - long and short versions
    plot_short = soup.find('div',{'class':'summary_text'}).text.strip()
    if 'Add a Plot' in plot_short:
        plot_short = ''
    imdb_info_dict['plot_short'] = plot_short

    storyline_p = soup.find('div',{'id':'titleStoryLine'}).div.p
    if storyline_p is not None:
        imdb_info_dict['plot_long'] = storyline_p.span.text.strip()

    # Parse flag
    print('[x]   ', end='')

    # Poster flag - [x] if newly saved, [ ] if the file was already there
    if save_image:
        img_status = savePoster(imdb_id, imdb_info_dict['poster_url'])
        if img_status:
            print('[x]   ', end='')
        else:
            print('[ ]   ', end='')

    # Title column: "Title (year)" truncated/padded to 100 characters
    title_label = imdb_info_dict['title'] + ' (' + str(imdb_info_dict['release_year']) + ')'
    print(f'{title_label[:100]:100} ', end='')

    # Short random pause (0.01s - 0.10s) between titles
    time.sleep(random.randint(1,10) / 100)

    print('')
    if debug:
        pprint(imdb_info_dict)
    return imdb_info_dict

---

### Sample use of scraper (Jurassic Park)

In [5]:
# Scrape a single title without saving the poster, pretty-printing the result
try:
    imdb_scrape('tt0107290', save_image=False, debug=True)
except Exception as err:
    # Show what actually went wrong instead of a bare 'error'
    print(f'error: {err!r}')

tt0107290  [x]   [x]   [x]   Jurassic Park (1993)                                                                                 
{'MPAA': 'PG-13',
 'genre': ['Action', 'Adventure', 'Sci-Fi', 'Thriller'],
 'imdb_rating': 8.1,
 'metacritic': 68,
 'num_critic_reviews': 355,
 'num_imdb_votes': 819174,
 'num_user_reviews': 1082,
 'plot_full': 'The story begins on Isla Nublar, a small island 120 miles off '
              'the coast of Costa Rica. A large group of construction workers '
              'and animal handlers offload a large container, the creature '
              'within unseen. During the process, the animal attempts to '
              'escape, an act which leads to a mass panic, and the death of '
              'one of the workers. The story jumps forward to an Amber mine in '
              'the Dominican Republic, where we learn that miners extracting '
              'amber are involved with a genetic-engineering company called '
              'InGen. We also learn that the 

The above is just an example of the information scraped for 1 movie. Time to automate this!

---

Below are two cells. The first scrapes all the movies from the csv (using the IDs) in a *range* of years, while the second scrapes all movies from one provided year.

After each year, the scraped content (a list of dictionaries) is converted to a DataFrame and then saved as a .tsv (one for each year, e.g. `imdb_scrape_2001.tsv`). We should end up with 100 .tsv files.

In [98]:
import csv
import sys

# failed_list is deliberately not reset here so that failures accumulate over
# several runs of this cell. Create it only if no earlier cell has done so;
# otherwise the first failure would raise a NameError inside the handler.
try:
    failed_list
except NameError:
    failed_list = []

# Years to scrape - edit before each run (a range, or an explicit list of years).
# movie_df must already be loaded (columns 'year' and 'tconst' are used below).
years_to_scrape_list = range(1920, 1930)
for yr in years_to_scrape_list:
    movies_in_year = movie_df[(movie_df['year'] == yr)]

    print('--------------------------------------------------------------------------')
    print(f'Scraping movies: {len(movies_in_year)}   Year: {yr}')
    print('')
    print('Count   tconst     Get   Summ  Parse Img   Title')
    start_time = time.time()
    annual_movie_list = []

    fails = 0
    for i, tconst in enumerate(movies_in_year['tconst'].values):
        print(f'{i+1:5d}   ', end='')
        try:
            scraped_movie_info = imdb_scrape(tconst)
            annual_movie_list.append(scraped_movie_info)
        except Exception:
            # Best effort: note the id and carry on with the next title
            print(f'--------- FAILED ----------  {tconst}  ---------- FAILED ----------  {tconst}  ---------- FAILED ----------')
            failed_list.append(tconst)
            fails += 1

    print(f'Movies scraped: {len(annual_movie_list)}   Fails: {fails}   ', end='')

    # One .tsv per year
    my_df = pd.DataFrame(annual_movie_list)
    my_df.to_csv(f'imdb_scrape_{yr}.tsv', sep='\t', quoting=csv.QUOTE_ALL)
    print('\n')
    print(f'Saved: imdb_scrape_{yr}.tsv     ', end='')
    print(f'Time taken: {time_since(start_time)}')
    print('')

--------------------------------------------------------------------------
Scraping movies: 11   Year: 1920

Count   tconst     Get   Summ  Parse Img   Title
    1   tt0010323  [x]   [x]   [x]   [ ]   Das Cabinet des Dr. Caligari (1920)                                                                  
    2   tt0011237  [x]   [ ]   [x]   [ ]   Der Golem, wie er in die Welt kam (1920)                                                             
    3   tt0011841  [x]   [ ]   [x]   [ ]   Way Down East (1920)                                                                                 
    4   tt0011130  [x]   [ ]   [x]   [ ]   Dr. Jekyll and Mr. Hyde (1920)                                                                       
    5   tt0011870  [x]   [ ]   [x]   [ ]   Within Our Gates (1920)                                                                              
    6   tt0011439  [x]   [ ]   [x]   [ ]   The Mark of Zorro (1920)                                                  

In [None]:
import csv
import os
import sys

movie_df = pd.read_csv('imdb_movie_list.csv')

# Ids of titles whose scrape raised an exception
failed_list = []

# Year to scrape - edit before each run
yr = 2009
movies_in_year = movie_df[(movie_df['year'] == yr)]

# Create the output folder up front, so a missing directory cannot make the
# save fail after the whole year has already been scraped.
os.makedirs('rawdata', exist_ok=True)

print('--------------------------------------------------------------------------')
print(f'Scraping movies: {len(movies_in_year)}   Year: {yr}')
print('')
print('Count   tconst     Get   Summ  Parse Img   Title')
start_time = time.time()
annual_movie_list = []

fails = 0
for i, tconst in enumerate(movies_in_year['tconst'].values):
    print(f'{i+1:5d}   ', end='')
    try:
        scraped_movie_info = imdb_scrape(tconst)
        annual_movie_list.append(scraped_movie_info)
    except Exception:
        # Best effort: note the id and carry on with the next title
        print(f'--------- FAILED ----------  {tconst}  ---------- FAILED ----------')
        failed_list.append(tconst)
        fails += 1

print(f'Movies scraped: {len(annual_movie_list)}   Fails: {fails}   ', end='')

# One .tsv per year, written to rawdata/
my_df = pd.DataFrame(annual_movie_list)
my_df.to_csv(f'rawdata/imdb_scrape_{yr}.tsv', sep='\t', quoting=csv.QUOTE_ALL)
print('\n')
print(f'Saved: imdb_scrape_{yr}.tsv     ', end='')
print(f'Time taken: {time_since(start_time)}')
print('')

--------------------------------------------------------------------------
Scraping movies: 964   Year: 2009

Count   tconst     Get   Summ  Parse Img   Title
    1   tt0361748  [x]   [x]   [x]   [x]   Inglourious Basterds (2009)                                                                          
    2   tt0499549  [x]   [x]   [x]   [x]   Avatar (2009)                                                                                        
    3   tt1049413  [x]   [x]   [x]   [x]   Up (2009)                                                                                            
    4   tt1119646  [x]   [x]   [x]   [x]   The Hangover (2009)                                                                                  
    5   tt1136608  [x]   [x]   [x]   [x]   District 9 (2009)                                                                                    
    6   tt0796366  [x]   [x]   [x]   [x]   Star Trek (2009)                                                         

We've kept track of failed scrapes if we want to go back at some point. I didn't end up using them.

In [101]:
# Number of ids appended to failed_list by the scraping loops
len(failed_list)

23

In [102]:
# The tconst ids whose scrape raised an exception
failed_list

['tt7535666',
 'tt8510324',
 'tt6793470',
 'tt3957098',
 'tt2741216',
 'tt2243471',
 'tt1821478',
 'tt1286129',
 'tt0484905',
 'tt3505016',
 'tt1171672',
 'tt1423961',
 'tt0902254',
 'tt0423087',
 'tt0907619',
 'tt0419086',
 'tt0241998',
 'tt0327419',
 'tt0290402',
 'tt0419765',
 'tt0285940',
 'tt2186669',
 'tt2210725']