In [1]:
# The following function scrapes, for every year in the given range, the first IMDB results page of feature films sorted by US box-office gross
def extract_data(year_from, year_to):
    """Scrape IMDB's advanced title search for every year in a range.

    One request is sent per year from ``year_from`` to ``year_to`` (both
    inclusive). Each request asks for feature films released in that year,
    sorted by US box-office gross in descending order, and only the first
    results page is read (presumably 50 titles per page - confirm against the
    live site).

    Parameters
    ----------
    year_from : int
        First release year to fetch.
    year_to : int
        Last release year to fetch (inclusive).

    Returns
    -------
    pandas.DataFrame
        One row per listed title with the raw, uncleaned columns ``Movie``,
        ``Release_year``, ``Genre``, ``Runtime``, ``Synopsis``, ``Director``
        and ``Gross``. ``Genre``, ``Runtime`` and ``Gross`` hold the HTML of
        the matching tag as a string (``'None'`` when the tag is absent);
        ``Release_year``, ``Synopsis`` and ``Director`` hold tag text, or an
        empty string when the tag is missing from a listing.

    Notes
    -----
    A non-200 response produces a warning; the page is still parsed, which
    normally just yields no rows for that year. The function sleeps 1-3
    seconds after every request and prints the running request rate.
    """
    # importing the required modules (kept local so the function is self-contained)
    import time
    from random import randint
    from warnings import warn

    import pandas as pd
    import requests
    from bs4 import BeautifulSoup
    # IPython.display is the public location; IPython.core.display is deprecated for this
    from IPython.display import clear_output

    column_order = ['Movie', 'Release_year', 'Genre', 'Runtime', 'Synopsis', 'Director', 'Gross']

    def _text_or_blank(tag):
        # a missing tag becomes '' so every row keeps the same set of fields
        return tag.text if tag is not None else ''

    # one search URL is built per release year in the requested range
    release_years = [str(year) for year in range(year_from, year_to + 1)]
    records = []
    start_time = time.time()
    req = 0
    for year in release_years:
        url = 'https://www.imdb.com/search/title/?title_type=feature&release_date='+year+'-01-01,'+year+'-12-31&sort=boxoffice_gross_us,desc'
        # counting requests so the warning and the progress line report the same number
        req += 1
        response = requests.get(url)
        # 200 is fine; anything else is reported but does not abort the whole run
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(req, response.status_code))
        html_soup = BeautifulSoup(response.text, 'html.parser')
        # every title on the results page lives in a div with this class
        mv_containers = html_soup.find_all('div', class_='lister-item mode-advanced')
        for container in mv_containers:
            muted_paragraphs = container.find_all('p', class_='text-muted')
            all_paragraphs = container.find_all('p')
            year_span = container.h3.find('span', class_='lister-item-year text-muted unbold')
            records.append({
                'Movie': container.h3.a.text,
                'Release_year': _text_or_blank(year_span),
                # the tag HTML is stored as a string and parsed later during cleaning
                'Genre': str(container.p.find('span', class_='genre')),
                'Runtime': str(container.p.find('span', class_='runtime')),
                # the second muted paragraph holds the synopsis, the third <p> the director/stars line
                'Synopsis': muted_paragraphs[1].text if len(muted_paragraphs) > 1 else '',
                'Director': all_paragraphs[2].get_text() if len(all_paragraphs) > 2 else '',
                'Gross': str(container.find('p', class_='sort-num_votes-visible')),
            })
        # pausing 1 to 3 seconds between requests so the server is not overloaded
        time.sleep(randint(1, 3))
        elapsed_time = time.time() - start_time
        clear_output(wait=True)
        print('Request: {}; Frequency: {} requests/s'.format(req, req / elapsed_time))
    # explicit columns keep the schema stable even when nothing was scraped
    return pd.DataFrame(records, columns=column_order)

# The following function cleans the scraped data and converts the columns to the right data types, returning a clean dataframe
def transform_data(df):
    """Clean the raw scrape produced by ``extract_data``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the string columns ``Movie``, ``Release_year``, ``Genre``,
        ``Runtime``, ``Synopsis``, ``Director`` and ``Gross``. The column
        order does not matter and the frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the raw columns ``Release_year``, ``Genre``,
        ``Runtime``, ``Director`` and ``Gross`` are replaced by:

        * ``Year_release``   - first run of digits in ``Release_year`` (string)
        * ``Movie_genre``    - text inside the genre span, whitespace-trimmed
          (empty string when the span is absent)
        * ``Run_time``       - first run of digits in ``Runtime`` as int, 0 if missing
        * ``Directors``      - text between the first ':' and the last '|' of
          the director line (NaN when there is no '|')
        * ``Stars``          - text after 'Stars:' (NaN when absent)
        * ``Gross_earnings`` - the gross ``data-value`` as int, 0 if missing

        ``Synopsis`` is kept with its newlines removed.
    """
    # raw columns are dropped by name, so a different input column order cannot remove the wrong ones
    raw_columns = ['Release_year', 'Genre', 'Runtime', 'Director', 'Gross']

    # working on a copy keeps the caller's frame intact and makes the function re-runnable
    clean = df.copy()

    # the year is stored within parentheses, e.g. '(2015)' or '(I) (2015)'
    clean['Year_release'] = clean['Release_year'].str.extract(r'(\d+)', expand=False)

    # Genre holds the HTML of the span; the text is captured without the padding around it
    clean['Movie_genre'] = (
        clean['Genre']
        .str.extract(r'(?s)<span class="genre">\s*(.*?)\s*</span>', expand=False)
        .fillna('')
    )

    clean['Run_time'] = (
        clean['Runtime']
        .str.extract(r'(\d+)', expand=False)
        .fillna('0')
        .astype(int)
    )

    clean['Synopsis'] = clean['Synopsis'].str.replace('\n', '', regex=False)

    # the director line looks like 'Director:<names>| Stars:<names>' once newlines are removed
    director_line = clean['Director'].str.replace('\n', '', regex=False)
    clean['Directors'] = director_line.str.extract(r'.*?:(.*)\|', expand=False).str.strip()
    clean['Stars'] = director_line.str.extract(r'Stars:(.*)', expand=False).str.strip()

    # the gross is read from the data-value attribute, e.g. '936,662,225'
    clean['Gross_earnings'] = (
        clean['Gross']
        .str.extract(r'.*?Gross:</span>\n<span data-value=\"(.*)\" name', expand=False)
        .str.replace(',', '', regex=False)
        .fillna('0')
        .astype(int)
    )

    # returning the clean dataframe without the unstructured source columns
    return clean.drop(columns=raw_columns)

In [47]:
# Scrape the 2015-2020 result pages and clean them. The calls use the functions
# defined in this notebook (extract_data / transform_data) so the cell also runs
# on a freshly restarted kernel.
df = transform_data(extract_data(2015, 2020))
df.head()

Request: 6; Frequency: 0.3570909295857339 requests/s


Unnamed: 0,Movie,Synopsis,Year_release,Movie_genre,Run_time,Directors,Stars,Gross_earnings
0,Star Wars - Episode VII: Das Erwachen der Macht,"Three decades after the Empire's defeat, a...",2015,"Action, Adventure, Sci-Fi",138,J.J. Abrams,"Daisy Ridley, John Boyega, Oscar Isaac, Domhna...",936662225
1,Jurassic World,"A new theme park, built on the original si...",2015,"Action, Adventure, Sci-Fi",124,Colin Trevorrow,"Chris Pratt, Bryce Dallas Howard, Ty Simpkins,...",652270625
2,Avengers: Age of Ultron,When Tony Stark and Bruce Banner try to ju...,2015,"Action, Adventure, Sci-Fi",141,Joss Whedon,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",459005868
3,Alles steht Kopf,After young Riley is uprooted from her Mid...,2015,"Animation, Adventure, Comedy",95,"Pete Docter, Ronnie Del Carmen","Amy Poehler, Bill Hader, Lewis Black, Mindy Ka...",356461711
4,Fast & Furious 7,Deckard Shaw seeks revenge against Dominic...,2015,"Action, Adventure, Crime",137,James Wan,"Vin Diesel, Paul Walker, Dwayne Johnson, Jason...",353007020
