In [1]:
# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Download IMDb's "top 1000" title search page.
# The Accept-Language header asks for English so that movie titles come
# back in their English-translated form.
url = "https://www.imdb.com/search/title/?groups=top_1000&ref_=adv_prv"
headers = {"Accept-Language": "en-US, en;q=0.5"}
# requests waits forever by default; bound the wait so this cell cannot hang.
results = requests.get(url, headers=headers, timeout=30)
# Stop here on a 4xx/5xx response instead of silently parsing an error page
# in the following cells.
results.raise_for_status()

In [3]:
# Parse the body of the HTTP response into a navigable BeautifulSoup tree,
# using the "html.parser" backend.
soup = BeautifulSoup(markup=results.text, features="html.parser")

In [4]:
# Per-column accumulators for the scraped data. The scraping loop appends
# exactly one entry to each list per movie, so index i in every list refers
# to the same movie.
titles, years, time = [], [], []
imdb_ratings, metascores = [], []
votes, us_gross = [], []

In [5]:
# Every <div> whose class attribute is "lister-item mode-advanced";
# the scraping loop treats each one as a single movie entry.
movie_div = soup.find_all('div', attrs={'class': 'lister-item mode-advanced'})

In [6]:
# function for extracting numbers in a string
def extract_numbers(inputString):
    """Concatenate every decimal digit in ``inputString`` and return it as an int.

    Non-digit characters are skipped, so ``"(1994)"`` yields ``1994`` and
    ``"(I) (2019)"`` yields ``2019``.

    Parameters
    ----------
    inputString : str or None
        Text to scan for digits.

    Returns
    -------
    int or None
        The integer formed by the digits in order of appearance, or ``None``
        when ``inputString`` is ``None`` or contains no digits.
    """
    if inputString is None:
        return None
    # isdecimal() rather than isdigit(): characters such as a superscript two
    # count as "digits" for isdigit() but cannot be parsed by int().
    digits = ''.join(char for char in inputString if char.isdecimal())
    # int('') raises ValueError, so report "no number found" explicitly.
    return int(digits) if digits else None

In [7]:
# scraping data
# NOTE: each iteration appends to the module-level lists, so re-running this
# cell without first re-running the list-initialisation cell duplicates rows.
for container in movie_div:

    # Title: text of the link inside the entry's <h3>; empty text becomes None.
    name = container.h3.a.text if container.h3.a.text else None
    titles.append(name)

    # Release year: look the tag up once and only parse it when it exists.
    # A missing tag must not be handed to extract_numbers(), which expects text.
    year_tag = container.h3.find('span', class_='lister-item-year')
    year = extract_numbers(year_tag.text) if year_tag is not None else None
    years.append(year)

    # Runtime: kept as the raw text of the span. The tag itself may be absent,
    # so check it before touching .text.
    runtime_tag = container.p.find('span', class_='runtime')
    runtime = runtime_tag.text if runtime_tag is not None and runtime_tag.text else None
    time.append(runtime)

    # IMDb rating: text of the first <strong> in the entry, as a float.
    rating_tag = container.strong
    rating = float(rating_tag.text) if rating_tag is not None else None
    imdb_ratings.append(rating)

    # Metascore: kept as the raw text of the span; None when the entry has
    # no metascore span.
    meta_tag = container.find('span', class_='metascore')
    meta_score = meta_tag.text if meta_tag is not None else None
    metascores.append(meta_score)

    # Votes and gross share the same markup (<span name="nv">), so they are
    # told apart by content: '$' marks the gross, anything containing '#' is
    # ignored, and whatever remains is taken as the vote count.
    nv = container.find_all('span', attrs={'name': 'nv'})
    gross = None
    vote = None
    for elem in nv:
        inp = elem.text
        if '$' in inp:
            gross = inp
        elif '#' not in inp:
            vote = inp
    votes.append(vote)
    us_gross.append(gross)


In [8]:
# Assemble the parallel per-movie lists into one table, one row per movie.
# NOTE: timeMin, metascore, votes and us_grossMillions hold the raw scraped
# text; no numeric conversion is applied here.
movies = pd.DataFrame(
    dict(
        movie=titles,
        year=years,
        timeMin=time,
        imdb=imdb_ratings,
        metascore=metascores,
        votes=votes,
        us_grossMillions=us_gross,
    )
)

In [9]:
# Write the table to movies.csv in the current working directory,
# leaving out the DataFrame's index column.
movies.to_csv(path_or_buf='movies.csv', index=False)