# Web Scraping Movie Data from the IMDb Website



In [1]:
# Custom function to scrape the n top-grossing movies released in a given year.

def scrape_movies(n_movies, year):
    """Scrape the top-grossing feature films of ``year`` from IMDb's search pages.

    Results are requested 50 at a time from the IMDb advanced title search,
    sorted by US box-office gross in descending order, and every
    ``lister-item`` element found is converted into one row.

    Args:
        n_movies: Number of movies wanted. Enough pages are fetched to cover
            this many results and the output is trimmed to at most this many
            rows. The original author notes a practical maximum of 10,000
            because of the URL routing format.
        year: Release year used for the ``year=YYYY-01-01,YYYY-12-31`` filter
            of every page request.

    Returns:
        A pandas DataFrame with the columns ``movie_id``, ``title``, ``year``,
        ``content_rating``, ``runtime``, ``genres``, ``rating``, ``summary``,
        ``director``, ``main_actors``, ``votes`` and ``revenue``. All values
        are strings. An item whose fields cannot all be located contributes a
        row of empty strings. A non-positive ``n_movies`` gives an empty frame.

    Raises:
        AttributeError: If a fetched page has no element with id
            ``pagecontent`` (the lookup returns ``None``).
    """
    import math
    import pandas as pd

    columns = ['movie_id', 'title', 'year', 'content_rating', 'runtime',
               'genres', 'rating', 'summary', 'director', 'main_actors',
               'votes', 'revenue']
    results_per_page = 50

    # Nothing to fetch: return an empty frame instead of failing on an
    # undefined `columns` (zero pages) or looping forever (negative count).
    if n_movies <= 0:
        return pd.DataFrame([], columns=columns)

    # Only needed once there is something to download.
    import requests
    from bs4 import BeautifulSoup

    url_template = (
        'https://www.imdb.com/search/title/?title_type=feature'
        '&year={0}-01-01,{0}-12-31'
        '&sort=boxoffice_gross_us,desc'
        '&start={1}&ref_=adv_nxt'
    )

    # Round *up* so a partial last page is still requested; round() rounds
    # halves to even and would drop it (e.g. 25 -> 0 pages, 125 -> 2 pages).
    wanted = math.ceil(n_movies)
    n_pages = math.ceil(wanted / results_per_page)

    movie_data = []
    for page_index in range(n_pages):
        # IMDb's `start` parameter is 1-based: 1, 51, 101, ...
        start = page_index * results_per_page + 1
        # `year` is never reassigned, so every page queries the requested
        # year (previously the scraped year of the last row leaked in here).
        page = requests.get(url_template.format(year, start))
        soup = BeautifulSoup(page.content, 'html.parser')
        results = soup.find(id='pagecontent')
        movie_elements = results.find_all(class_='lister-item')

        # An empty page means the result list is exhausted; further requests
        # would only return more empty pages.
        if not movie_elements:
            break

        for movie_element in movie_elements:
            movie_data.append(_parse_movie_row(movie_element))

    # Trim the overshoot of the last page so at most `n_movies` rows remain.
    return pd.DataFrame(movie_data[:wanted], columns=columns)


def _parse_movie_row(movie_element):
    """Extract one output row (12 strings) from a ``lister-item`` element.

    Fields are located by CSS class or by position among the item's ``<a>``
    and ``<span>`` tags. If any lookup fails, the whole row is returned as
    empty strings, matching the original all-or-nothing behavior.
    """
    import re

    try:
        anchors = movie_element.find_all('a')

        # The second link of an item is the title link; its href carries the
        # numeric IMDb id (/title/tt<digits>/). Parsing the href keeps ids of
        # any length intact, unlike a fixed-width slice of the tag's markup.
        title_link = anchors[1]
        id_match = re.search(r'/title/tt(\d+)', title_link.get('href') or '')
        movie_id = id_match.group(1) if id_match else ''
        title = title_link.text

        # Keep the four characters before the closing parenthesis,
        # e.g. "(2017)" -> "2017".
        year_text = movie_element.find(
            'span', class_='lister-item-year text-muted unbold').text
        movie_year = year_text[-5:-1]

        content_rating = movie_element.find('span', class_='certificate').text
        runtime = movie_element.find('span', class_='runtime').text
        genres = movie_element.find('span', class_='genre').text.strip()
        rating = movie_element.find(
            'div', class_='inline-block ratings-imdb-rating').text.strip()

        # The second muted paragraph holds the plot summary.
        summary = movie_element.find_all('p', class_='text-muted')[1].text.strip()

        # Director and cast are taken by position among the item's links.
        director = anchors[13].text
        main_actors = ", ".join(actor.text for actor in anchors[14:18])

        spans = movie_element.find_all('span')
        votes = spans[33].text

        # The Metascore is not part of the output, but the original looked it
        # up here, so an item without one yields a blank row.
        # NOTE(review): kept because the positional span indexes used for
        # votes/revenue may depend on the Metascore markup being present --
        # confirm against the page layout before removing.
        movie_element.find('div', class_='inline-block ratings-metascore').text

        revenue = spans[36].text
    except (AttributeError, IndexError):
        # A missing tag (find() returned None) or too few links/spans:
        # emit a blank row, one empty string per output column.
        return [''] * 12

    return [movie_id, title, movie_year, content_rating, runtime, genres,
            rating, summary, director, main_actors, votes, revenue]

In [3]:
# Scrape the 150 highest-grossing feature films released in 2017.
movies_2017 = scrape_movies(150, 2017)
# Preview the first five rows (head() defaults to 5).
movies_2017.head()

Unnamed: 0,movie_id,title,year,content_rating,runtime,genres,rating,summary,director,main_actors,votes,revenue
0,2527336,Star Wars: Episode VIII - The Last Jedi,2017,PG-13,152 min,"Action, Adventure, Fantasy",7.0,Rey develops her newly discovered abilities wi...,Rian Johnson,"Daisy Ridley, John Boyega, Mark Hamill, Carrie...",555773,$620.18M
1,2771200,Beauty and the Beast,2017,PG,129 min,"Family, Fantasy, Musical",7.1,A selfish Prince is cursed to become a monster...,Bill Condon,"Emma Watson, Dan Stevens, Luke Evans, Josh Gad",269507,$504.01M
2,451279,Wonder Woman,2017,PG-13,141 min,"Action, Adventure, Fantasy",7.4,When a pilot crashes and tells of conflict in ...,Patty Jenkins,"Gal Gadot, Chris Pine, Robin Wright, Lucy Davis",543440,$412.56M
3,2283362,Jumanji: Welcome to the Jungle,2017,PG-13,119 min,"Action, Adventure, Comedy",6.9,Four teenagers are sucked into a magical video...,Jake Kasdan,"Dwayne Johnson, Karen Gillan, Kevin Hart, Jack...",314701,$404.52M
4,3896198,Guardians of the Galaxy Vol. 2,2017,PG-13,136 min,"Action, Adventure, Comedy",7.6,The Guardians struggle to keep together as a t...,James Gunn,"Chris Pratt, Zoe Saldana, Dave Bautista, Vin D...",560528,$389.81M


In [4]:
# Scrape the 100 highest-grossing feature films released in 2018.
movies_2018 = scrape_movies(100, 2018)
# Preview the first five rows (head() defaults to 5).
movies_2018.head()

Unnamed: 0,movie_id,title,year,content_rating,runtime,genres,rating,summary,director,main_actors,votes,revenue
0,1825683,Black Panther,2018,PG-13,134 min,"Action, Adventure, Sci-Fi",7.3,"T'Challa, heir to the hidden but advanced king...",Ryan Coogler,"Chadwick Boseman, Michael B. Jordan, Lupita Ny...",615854,$700.06M
1,4154756,Avengers: Infinity War,2018,PG-13,149 min,"Action, Adventure, Sci-Fi",8.4,The Avengers and their allies must be willing ...,Anthony Russo,"Joe Russo, Robert Downey Jr., Chris Hemsworth,...",816175,$678.82M
2,3606756,Incredibles 2,2018,PG,118 min,"Animation, Action, Adventure",7.6,The Incredibles family takes on a new mission ...,Brad Bird,"Craig T. Nelson, Holly Hunter, Sarah Vowell, H...",244720,$608.58M
3,4881806,Jurassic World: Fallen Kingdom,2018,PG-13,128 min,"Action, Adventure, Sci-Fi",6.2,When the island's dormant volcano begins roari...,J.A. Bayona,"Chris Pratt, Bryce Dallas Howard, Rafe Spall, ...",262282,$417.72M
4,1477834,Aquaman,2018,PG-13,143 min,"Action, Adventure, Fantasy",6.9,"Arthur Curry, the human-born heir to the under...",James Wan,"Jason Momoa, Amber Heard, Willem Dafoe, Patric...",361819,$335.06M
