# Crawler for the IMDb Top 250

This notebook retrieves the top 250 movies from [IMDB](https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc), loads them into a pandas dataframe, and saves the result as a CSV file.



In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np


In [73]:
# one empty list per scraped field; content_cleaner appends to these names
title, runtime, genre, rating, metascore = [], [], [], [], []
votes, gross, year, director, description = [], [], [], [], []

# map each column name to its list (the dict shares the list objects above,
# so values appended to the lists show up in the dict as well)
movies = dict(
    title=title,
    runtime=runtime,
    genre=genre,
    rating=rating,
    metascore=metascore,
    votes=votes,
    gross=gross,
    year=year,
    director=director,
    description=description,
)


In [68]:
def content_cleaner(soup):
    """Extract one record per movie from a parsed result page.

    Every ``div`` with class ``lister-item-content`` in ``soup`` is read and
    its values are appended to the module-level lists ``title``, ``runtime``,
    ``genre``, ``rating``, ``metascore``, ``votes``, ``year``,
    ``description``, ``director`` and ``gross``.

    All fields of a movie are read before anything is appended, so an
    exception raised while parsing a movie cannot leave the lists with
    different lengths.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one search-result page.

    Returns
    -------
    None
        The results are stored in the module-level lists.
    """
    # from soup get all elements with class lister-item-content
    movie_containers = soup.find_all('div', class_ = 'lister-item-content')

    for movie in movie_containers:
        movie_title = movie.h3.a.text
        # keep only the first space-separated token of the runtime text
        movie_runtime = movie.find('span', class_ = 'runtime').text.split(' ')[0]
        movie_genre = movie.find('span', class_ = 'genre').text.strip()
        movie_rating = movie.strong.text

        # the metascore is optional; ' ' is the placeholder for a missing one
        metascore_tag = movie.find('span', class_ = 'metascore')
        movie_metascore = metascore_tag.text if metascore_tag is not None else ' '

        movie_votes = movie.find('span', attrs = {'name':'nv'})['data-value']
        movie_year = movie.h3.find('span', class_ = 'lister-item-year text-muted unbold').text.strip('()')
        # the second text-muted paragraph holds the description
        movie_description = movie.find_all('p', class_ = 'text-muted')[1].text.strip()
        movie_director = movie.find_all('p', class_ = '')[0].a.text

        # the gross is the second 'nv' span of the votes paragraph; when the
        # span is absent fall back to '0' instead of raising IndexError
        # ('0' still converts cleanly when the gross column is cast to int)
        nv_spans = movie.find('p', class_ ='sort-num_votes-visible').find_all('span', attrs = {'name':'nv'})
        movie_gross = nv_spans[1].attrs['data-value'] if len(nv_spans) > 1 else '0'

        # every field parsed successfully: store the complete record
        title.append(movie_title)
        runtime.append(movie_runtime)
        genre.append(movie_genre)
        rating.append(movie_rating)
        metascore.append(movie_metascore)
        votes.append(movie_votes)
        year.append(movie_year)
        description.append(movie_description)
        director.append(movie_director)
        gross.append(movie_gross)

In [74]:
# make request to the url
# https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc&start=1&ref_=adv_nxt
# set parameters for the request
url = 'https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc'

params = {
    'start': 1,
    'ref_': 'adv_nxt'
}

# empty the column lists in place so that re-running this cell does not
# append the same movies a second time
for column in movies.values():
    column.clear()

# 'start' advances by 50 per request: 1, 51, 101, 151, 201
for i in range(5):
    params['start'] = 1+i*50
    # make request to the url with the parameters and get the response in english
    # the timeout keeps the cell from hanging forever on a stalled connection
    response = requests.get(
        url,
        params=params,
        headers={'Accept-Language': 'en-US, en;q=0.5'},
        timeout=30,
    )
    # stop on an HTTP error instead of parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    content_cleaner(soup)

In [76]:
# build the dataframe df with one column per key of the movies dictionary
df = pd.DataFrame(data=movies)
# preview the first five rows
df.head(n=5)

Unnamed: 0,title,runtime,genre,rating,metascore,votes,gross,year,director,description
0,The Shawshank Redemption,142,Drama,9.3,81,2663689,28341469,1994,Frank Darabont,Two imprisoned men bond over a number of years...
1,The Godfather,175,"Crime, Drama",9.2,100,1845919,134966411,1972,Francis Ford Coppola,The aging patriarch of an organized crime dyna...
2,The Dark Knight,152,"Action, Crime, Drama",9.0,84,2636663,534858444,2008,Christopher Nolan,When the menace known as the Joker wreaks havo...
3,The Lord of the Rings: The Return of the King,201,"Action, Adventure, Drama",9.0,94,1836493,377845905,2003,Peter Jackson,Gandalf and Aragorn lead the World of Men agai...
4,Schindler's List,195,"Biography, Drama, History",9.0,94,1348869,96898818,1993,Steven Spielberg,"In German-occupied Poland during World War II,..."


In [84]:
def _to_int_column(column):
    """Return ``column`` converted to integers.

    Values are rendered as text, thousands separators (',') and surrounding
    whitespace are removed, and anything that is still not a number (for
    example the ' ' placeholder of a missing metascore) becomes 0.

    Going through text first makes the conversion safe to re-run on a column
    that has already been converted to integers.
    """
    as_text = column.astype(str).str.replace(',', '', regex=False).str.strip()
    return pd.to_numeric(as_text, errors='coerce').fillna(0).astype(int)


# convert the numeric columns to the correct data type
for column_name in ('runtime', 'metascore', 'votes', 'gross'):
    df[column_name] = _to_int_column(df[column_name])

In [86]:
# show the data type of every column of df
df.dtypes

title          object
runtime         int64
genre          object
rating         object
metascore       int64
votes           int64
gross           int64
year           object
director       object
description    object
dtype: object

In [87]:
# display the dataframe
df

Unnamed: 0,title,runtime,genre,rating,metascore,votes,gross,year,director,description
0,The Shawshank Redemption,142,Drama,9.3,81,2663689,28341469,1994,Frank Darabont,Two imprisoned men bond over a number of years...
1,The Godfather,175,"Crime, Drama",9.2,100,1845919,134966411,1972,Francis Ford Coppola,The aging patriarch of an organized crime dyna...
2,The Dark Knight,152,"Action, Crime, Drama",9.0,84,2636663,534858444,2008,Christopher Nolan,When the menace known as the Joker wreaks havo...
3,The Lord of the Rings: The Return of the King,201,"Action, Adventure, Drama",9.0,94,1836493,377845905,2003,Peter Jackson,Gandalf and Aragorn lead the World of Men agai...
4,Schindler's List,195,"Biography, Drama, History",9.0,94,1348869,96898818,1993,Steven Spielberg,"In German-occupied Poland during World War II,..."
...,...,...,...,...,...,...,...,...,...,...
245,Mr. Smith Goes to Washington,129,"Comedy, Drama",8.1,73,116199,9600000,1939,Frank Capra,A naive youth leader is appointed to fill a va...
246,It Happened One Night,105,"Comedy, Romance",8.1,87,104581,4360000,1934,Frank Capra,A renegade reporter trailing a young runaway h...
247,The Incredibles,115,"Animation, Action, Adventure",8.0,90,735178,261441092,2004,Brad Bird,"While trying to lead a quiet suburban life, a ..."
248,Aladdin,90,"Animation, Adventure, Comedy",8.0,86,419455,217350219,1992,Ron Clements,A kindhearted street urchin and a power-hungry...


In [None]:

# write the dataframe to movies.csv without the index column
# (to_csv is a DataFrame method; the pandas module itself has no to_csv)
df.to_csv('movies.csv', index=False)