# Scrape IMDb information, together with TMDB/IMDb movie IDs, for movies released from 1991 to 2022 (sorted by number of votes)

In [None]:
from bs4 import BeautifulSoup as soup
import requests
import numpy as np
import pandas as pd
from decouple import config
import csv
from tqdm import tqdm

In [2]:
def get_imdb_id(movie_id):
    """Return the ``imdb_id`` field of TMDB's external-IDs record for a movie.

    Args:
        movie_id: TMDB movie identifier, interpolated into the request URL.

    Returns:
        The value stored under ``'imdb_id'`` in the JSON response.
    """
    # The API key is read from the environment/settings via python-decouple.
    url = 'https://api.themoviedb.org/3/movie/{}/external_ids?api_key={}'.format(
        movie_id, config('API_KEY'))
    payload = requests.get(url).json()
    return payload['imdb_id']

def get_movie_id(title,year):
    """Look up a movie on TMDB by title and year and return its IDs.

    The first search hit is taken as the match; its IMDb ID is then fetched
    with ``get_imdb_id``.

    Args:
        title: Movie title used as the search query.
        year: Release year used to narrow the search.

    Returns:
        ``(tmdb_id, imdb_id)`` as strings, or ``(np.nan, np.nan)`` when the
        search has no results or the lookup fails. Because the IMDb ID is
        passed through ``str()``, a missing ID comes back as ``'None'``.
    """
    # Let requests build the query string so that titles containing '&', '#',
    # '+' or other reserved characters are URL-encoded instead of corrupting
    # the request.
    response = requests.get(
        'https://api.themoviedb.org/3/search/movie',
        params={
            'api_key': config('API_KEY'),
            'language': 'en-US',
            'year': year,
            'query': title,
        })
    try:
        results = response.json()['results']
        if not results:
            return np.nan, np.nan
        tmdb_id = results[0]['id']
        imdb_id = get_imdb_id(tmdb_id)
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Best effort: a malformed/errored API response or a failed follow-up
        # request yields "no IDs". Unlike a bare ``except``, this does not
        # swallow KeyboardInterrupt/SystemExit during a long scrape.
        return np.nan, np.nan

    return str(tmdb_id), str(imdb_id)

In [3]:
def get_cast(movieCast):
    """Split a "Director(s): ... | Stars: ..." credit line into its two parts.

    Args:
        movieCast: Object exposing the credit line through a ``.text``
            attribute (the caller passes the listing's ``<p class="">`` tag).

    Returns:
        ``(directors, stars)``, two single-element lists:

        * ``directors[0]`` is the text after the first ``:`` of the first
          ``|``-separated segment, or ``np.nan`` when that segment is the
          ``Stars`` segment (no director listed).
        * ``stars[0]`` is the text after the ``Stars:`` label, or ``""``
          when the line contains only a director segment.
    """
    # Newlines are layout-only in the scraped text; '|' separates the parts.
    segments = [part.strip() for part in movieCast.text.replace("\n", "").split('|')]

    # Directors: the first segment, unless the line starts directly with Stars.
    label, _, names = segments[0].partition(":")
    direct = np.nan if label == "Stars" else names

    # Stars: with the segments glued together, the star names follow the
    # first ':' when the line starts with "Stars", otherwise the second ':'
    # (the first one belongs to the director label).
    label, _, rest = "".join(segments).partition(":")
    stars = rest if label == "Stars" else rest.partition(":")[2]

    return [direct], [stars]

In [1]:
def get_movies_details(year,pn):
    """Scrape one page of IMDb's feature-film search results for a year.

    Requests the IMDb title search for feature films of *year*, sorted by
    number of votes (descending), and builds one row per listed movie. Each
    movie is also looked up through ``get_movie_id`` to attach its IDs.

    Args:
        year: Release year to query; also stored in every row.
        pn: Value for the search URL's ``start`` parameter (result offset).

    Returns:
        A list of rows, each of the form
        ``[tmdb_id, imdb_id, year, name, rating, description, directors,
        cast, genres]``. Fields that are absent from the listing are
        ``np.nan``. Listings without a title link are skipped.
    """
    response=requests.get(
        "https://www.imdb.com/search/title/?title_type=feature&year={}&sort=num_votes,desc&start={}".format(year,pn))
    page=soup(response.content)
    movies_info=[]

    for movieFrame in page.find_all('div',attrs={"class":"lister-item mode-advanced"}):
        header=movieFrame.find("h3",class_="lister-item-header")
        movie_name=header.find("a") if header is not None else None
        if movie_name is None:
            # Without a title there is nothing to look up or record.
            continue

        title=movie_name.text.lstrip()
        # get_movie_id returns either two ID strings or (nan, nan).
        tmdb_id,imdb_id=get_movie_id(title,year)
        movies=[tmdb_id,imdb_id,year,title]

        imdb_rating=movieFrame.find('div',attrs={"class":"inline-block ratings-imdb-rating"})
        movies.append(imdb_rating.text.strip() if imdb_rating is not None else np.nan)

        # The description is the last "text-muted" paragraph; guard against
        # listings that have none instead of raising IndexError.
        muted=movieFrame.find_all("p", class_="text-muted")
        movies.append(muted[-1].text.strip() if muted else np.nan)

        movieCast=movieFrame.find("p",class_="")
        if movieCast is not None:
            directors,stars=get_cast(movieCast)
            movies.append("".join(map(str,directors)))
            movies.append("".join(map(str,stars)))
        else:
            # Two columns (directors, cast) must be filled so that the
            # following genres value stays in its own column.
            movies.extend([np.nan,np.nan])

        genre=movieFrame.find("span",class_="genre")
        movies.append(genre.text.strip() if genre is not None else np.nan)

        movies_info.append(movies)
    return movies_info


In [5]:
# Create (or overwrite) the output CSV and write its header row.
# The column order matches the rows produced by get_movies_details.
filename="scrapped_movies.csv"
cols=['tmdb_id','imdb_id','year','name','rating','description','directors','cast','genres']
# Use the same explicit encoding as the cells that append to this file, so
# the file does not depend on the platform's default encoding.
with open(filename, "w", encoding='utf-8') as filehandle:
    writer = csv.writer(filehandle, delimiter=',', lineterminator='\n')
    writer.writerow(cols)

In [6]:
def flatten(l):
    """Return a single list holding the items of every sub-iterable in *l*.

    Only one level of nesting is removed.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [7]:
# Append scraped rows for release years 1991-2000.
# For every year, six result pages are requested (start offsets 1, 51, ..., 251).
with open(filename, "a", encoding='utf-8') as filehandle:
    writer = csv.writer(filehandle, delimiter=',', lineterminator='\n')
    for release_year in tqdm(range(1991, 2001)):
        for start_offset in tqdm(range(1, 252, 50)):
            page_rows = get_movies_details(release_year, start_offset)
            writer.writerows(page_rows)

100%|██████████| 6/6 [10:28<00:00, 104.79s/it]
100%|██████████| 6/6 [09:09<00:00, 91.63s/it]/it]
100%|██████████| 6/6 [09:49<00:00, 98.32s/it]/it]
100%|██████████| 6/6 [09:15<00:00, 92.58s/it]/it]
100%|██████████| 6/6 [08:11<00:00, 81.94s/it]t]  
100%|██████████| 6/6 [09:06<00:00, 91.07s/it]t]
100%|██████████| 6/6 [09:57<00:00, 99.64s/it]t]
100%|██████████| 6/6 [09:14<00:00, 92.38s/it]/it]
100%|██████████| 6/6 [10:06<00:00, 101.11s/it]it]
100%|██████████| 6/6 [09:08<00:00, 91.35s/it]/it]
100%|██████████| 10/10 [1:34:29<00:00, 566.90s/it]


In [8]:
# Append scraped rows for release years 2001-2021.
# For every year, six result pages are requested (start offsets 1, 51, ..., 251).
with open(filename, "a", encoding='utf-8') as filehandle:
    writer = csv.writer(filehandle, delimiter=',', lineterminator='\n')
    for release_year in tqdm(range(2001, 2022)):
        for start_offset in tqdm(range(1, 252, 50)):
            page_rows = get_movies_details(release_year, start_offset)
            writer.writerows(page_rows)

100%|██████████| 6/6 [10:12<00:00, 102.01s/it]
100%|██████████| 6/6 [09:27<00:00, 94.61s/it]/it]
100%|██████████| 6/6 [09:12<00:00, 92.01s/it]/it]
100%|██████████| 6/6 [09:04<00:00, 90.73s/it]/it]
100%|██████████| 6/6 [09:07<00:00, 91.24s/it]/it]
100%|██████████| 6/6 [09:06<00:00, 91.16s/it]/it]
100%|██████████| 6/6 [09:01<00:00, 90.27s/it]/it]
100%|██████████| 6/6 [08:27<00:00, 84.53s/it]4s/it]
100%|██████████| 6/6 [08:33<00:00, 85.63s/it]2s/it]
100%|██████████| 6/6 [08:46<00:00, 87.72s/it]6s/it]
100%|██████████| 6/6 [11:30<00:00, 115.06s/it]8s/it]
100%|██████████| 6/6 [10:38<00:00, 106.49s/it]5s/it]
100%|██████████| 6/6 [09:49<00:00, 98.26s/it] 6s/it]
100%|██████████| 6/6 [08:52<00:00, 88.83s/it]30s/it]
100%|██████████| 6/6 [11:11<00:00, 111.99s/it]8s/it]
100%|██████████| 6/6 [09:40<00:00, 96.81s/it]78s/it]
100%|██████████| 6/6 [10:01<00:00, 100.32s/it]/it]  
100%|██████████| 6/6 [08:52<00:00, 88.83s/it]s/it]
100%|██████████| 6/6 [09:03<00:00, 90.62s/it]s/it]
100%|██████████| 6/6 [09

In [9]:
# Append scraped rows for 2022 only: four result pages (start offsets 1, 51, 101, 151).
with open(filename, "a", encoding='utf-8') as filehandle:
    writer = csv.writer(filehandle, delimiter=',', lineterminator='\n')
    for start_offset in tqdm(range(1, 152, 50)):
        page_rows = get_movies_details(2022, start_offset)
        writer.writerows(page_rows)

100%|██████████| 4/4 [05:44<00:00, 86.15s/it]
