In [1]:
# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from time import sleep
from random import randint

In [2]:
# One accumulator list per scraped field. The scraping loop appends exactly one
# entry per movie to each list, so re-run this cell to reset them before
# scraping again (otherwise rows would be duplicated).
titles, years, time, imdb_ratings, metascores, votes, us_gross = (
    [] for _ in range(7)
)

In [3]:
# function for extracting number in a string
def extract_number(inputString):
    """Join every decimal digit found in ``inputString`` and return it as an int.

    All other characters (commas, parentheses, unit suffixes, ...) are dropped,
    so ``"(1994)"`` gives ``1994`` and ``"2,345,678"`` gives ``2345678``.

    Args:
        inputString: text to pull the digits from.

    Returns:
        The digits read as a single integer, or ``None`` when the text holds
        no digits at all (instead of raising ``ValueError`` from ``int('')``).
    """
    # str.isdecimal, unlike str.isdigit, rejects characters such as '²' that
    # int() cannot parse.
    digits = ''.join(char for char in inputString if char.isdecimal())
    return int(digits) if digits else None

In [4]:
# function for extracting float number in a string
def extract_float(inputString):
    """Keep the decimal digits and dots of ``inputString`` and parse them as a float.

    Every other character (currency symbol, thousands separators, unit suffix,
    ...) is dropped, so ``"$534.86M"`` gives ``534.86``.

    Args:
        inputString: text to pull the number from.

    Returns:
        The parsed float, or ``None`` when the kept characters do not form a
        valid number (no digits, a lone ``'.'``, or more than one dot).
    """
    # str.isdecimal, unlike str.isdigit, rejects characters such as '²' that
    # float() cannot parse.
    kept = ''.join(
        char for char in inputString if char.isdecimal() or char == '.'
    )
    try:
        return float(kept)
    except ValueError:
        # '' / '.' / '1.2.3' are not numbers; report "no value" instead of crashing
        return None

In [5]:
# Site root (also used to resolve the relative "next page" links) and the
# first results page of the top-1000 listing.
base_url = "https://www.imdb.com"
url = base_url + "/search/title/?groups=top_1000&ref_=adv_prv"

In [6]:
# Walk the paginated listing: fetch a page, pull one value per field out of
# every movie container, then follow the "next" link until there is none.
# Values are appended to the accumulator lists (titles, years, ...), so reset
# those lists before re-running or rows will be duplicated.
headers = {"Accept-Language": "en-US, en;q=0.5"}
while url:
    print(f'Getting data from {url}')
    # A timeout keeps a stalled connection from hanging the notebook forever.
    results = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    results.raise_for_status()
    # beautiful soup
    soup = BeautifulSoup(results.text, "html.parser")
    movie_div = soup.find_all('div', class_='lister-item mode-advanced')
    # scraping data
    for container in movie_div:
        # Each element is looked up once; a missing element yields None so
        # every list still receives exactly one entry per movie.
        heading = container.h3
        first_paragraph = container.p

        title_link = heading.a if heading else None
        titles.append(title_link.text if title_link else None)

        year_span = heading.find('span', class_='lister-item-year') if heading else None
        years.append(extract_number(year_span.text) if year_span else None)

        runtime_span = first_paragraph.find('span', class_='runtime') if first_paragraph else None
        time.append(extract_number(runtime_span.text) if runtime_span else None)

        rating_tag = container.strong
        imdb_ratings.append(float(rating_tag.text) if rating_tag else None)

        metascore_span = container.find('span', class_='metascore')
        metascores.append(int(metascore_span.text) if metascore_span else None)

        # The name="nv" spans are told apart by their text: '$' marks the
        # gross, '#' marks a span to skip, anything else is the vote count.
        gross = None
        vote = None
        for elem in container.find_all('span', attrs={'name': 'nv'}):
            inp = elem.text
            if '$' in inp:
                gross = extract_float(inp)
            elif '#' not in inp:
                vote = extract_number(inp)
        votes.append(vote)
        us_gross.append(gross)

    next_url = soup.find(class_='lister-page-next next-page')
    url = f"{base_url}{next_url['href']}" if next_url else None
    if url:
        # Random pause between requests; skipped after the last page since
        # nothing follows.
        sleep(randint(1, 5))

Getting data from https://www.imdb.com/search/title/?groups=top_1000&ref_=adv_prv
Getting data from https://www.imdb.com/search/title/?groups=top_1000&start=51
Getting data from https://www.imdb.com/search/title/?groups=top_1000&start=101
Getting data from https://www.imdb.com/search/title/?groups=top_1000&start=151
Getting data from https://www.imdb.com/search/title/?groups=top_1000&start=201
Getting data from https://www.imdb.com/search/title/?groups=top_1000&start=251
Getting data from https://www.imdb.com/search/title/?groups=top_1000&start=301
Getting data from https://www.imdb.com/search/title/?groups=top_1000&start=351
Getting data from https://www.imdb.com/search/title/?groups=top_1000&start=401
Getting data from https://www.imdb.com/search/title/?groups=top_1000&start=451
Getting data from https://www.imdb.com/search/title/?groups=top_1000&start=501
Getting data from https://www.imdb.com/search/title/?groups=top_1000&start=551
Getting data from https://www.imdb.com/search/titl

In [7]:
# Assemble the scraped lists into one table, one column per field.
column_names = ['movie', 'year', 'timeMin', 'imdb', 'metascore', 'votes', 'us_grossMillions']
column_values = [titles, years, time, imdb_ratings, metascores, votes, us_gross]
movies = pd.DataFrame(dict(zip(column_names, column_values)))

In [8]:
# Write the table to movies.csv in the working directory, without the index column.
movies.to_csv(path_or_buf='movies.csv', index=False)