# IMDB Top 50 Movies and Shows in Each Genre

- Desc : This notebook scrapes the top 50 movies from each genre on IMDB and extracts each movie's title, rating, year of release and certificate rating.
- Tools: Python, requests, Beautiful Soup, Pandas

## The steps followed:

- Scrape genres from : https://www.imdb.com/feature/genre
- we'll get 15 genres. For each Genre we'll get genre title, genre page URL.
- From each genre page, we'll get the top 50 movies in that genre from that particular genre page
- For each genre, we'll grab the movie title, rating, year of release and certificate rating given to that movie.
- The null values will be handled as blanks when scraping from the pages
- For each genre we'll create a CSV file in the following format:
  Title,Review,Year Of Release,Certificate Rating

# Scrape the list of genres from IMDB

- used requests to download the page
- used BS4 to parse and extract information from the page.
- convert to a Pandas dataframe to contain all the genres and their URLs

In [1]:
import requests
from bs4 import BeautifulSoup

def get_genres():
    """Download the IMDB genre overview page and return it parsed.

    Returns:
        BeautifulSoup: The parsed HTML document of the genre page.

    Raises:
        Exception: If the server answers with a status code other than 200.
    """
    genre_url = 'https://www.imdb.com/feature/genre'
    req = requests.get(genre_url)
    # Report the URL that was actually requested; the previous message
    # referenced an undefined name and failed with NameError instead.
    if req.status_code != 200:
        raise Exception('Failed to load page {}'.format(genre_url))
    return BeautifulSoup(req.text, 'html.parser')

In [2]:
def Genres_topics(doc):
    """Return the ``title`` attribute of every ``<img>`` tag in *doc*.

    Args:
        doc: Parsed page exposing ``find_all``.

    Returns:
        list: One title per image, in document order.
    """
    return [image['title'] for image in doc.find_all('img')]

In [3]:
def Genres_URLs(doc):
    """Return the ``href`` of the parent element of every ``<img>`` tag in *doc*.

    Args:
        doc: Parsed page exposing ``find_all``.

    Returns:
        list: One link per image, in document order.
    """
    return [image.parent['href'] for image in doc.find_all('img')]

In [4]:
import pandas as pd

def scrape_genres():
    """Scrape the IMDB genre overview page into a DataFrame.

    Returns:
        pandas.DataFrame: Columns ``Genres`` (from ``Genres_topics``) and
        ``URLs`` (from ``Genres_URLs``), one row per genre image on the page.

    Raises:
        Exception: If the page does not load with status code 200.
    """
    topics_url = 'https://www.imdb.com/feature/genre'
    response = requests.get(topics_url)
    if response.status_code != 200:
        # Use the local ``topics_url``; the former ``topic_url`` was undefined
        # and turned every failed download into a NameError.
        raise Exception('Failed to load page {}'.format(topics_url))
    doc = BeautifulSoup(response.text, 'html.parser')
    return pd.DataFrame({
        'Genres': Genres_topics(doc),
        'URLs': Genres_URLs(doc),
    })

## The Dataframe containing Genres and their URLs

In [5]:
# Run the genre scrape; as the last expression of the cell, the resulting
# DataFrame is rendered as the cell output.
scrape_genres()

Unnamed: 0,Genres,URLs
0,Comedy,https://www.imdb.com/search/title?genres=comed...
1,Sci-Fi,https://www.imdb.com/search/title?genres=sci-f...
2,Horror,https://www.imdb.com/search/title?genres=horro...
3,Romance,https://www.imdb.com/search/title?genres=roman...
4,Action,https://www.imdb.com/search/title?genres=actio...
5,Thriller,https://www.imdb.com/search/title?genres=thril...
6,Drama,https://www.imdb.com/search/title?genres=drama...
7,Mystery,https://www.imdb.com/search/title?genres=myste...
8,Crime,https://www.imdb.com/search/title?genres=crime...
9,Animation,https://www.imdb.com/search/title?genres=anima...


## Function to extract Movie info from Genre URL

In [6]:
import os

#used BS4 to parse and extract information from the page of each genre

def get_genre_page(gen_url):
    """Download one genre page and return it parsed.

    Args:
        gen_url: URL of the genre page to download.

    Returns:
        BeautifulSoup: The parsed HTML document.

    Raises:
        Exception: If the response status code is not 200.
    """
    res = requests.get(gen_url)
    if res.status_code != 200:
        # Name the URL that failed; the former ``topic_url`` was undefined
        # here and raised NameError instead of this exception.
        raise Exception('Failed to load page {}'.format(gen_url))
    return BeautifulSoup(res.text, 'html.parser')

#getting Movie and Show Title

def movie_name(gen_doc, limit=50):
    """Collect the titles from the first *limit* ``<h3>`` headings of a page.

    The title is the stripped text of the first ``<a>`` inside each heading.
    A heading without any anchor yields a single blank (``' '``), so the
    result stays positionally aligned with the headings.

    Args:
        gen_doc: Parsed genre page exposing ``find_all``.
        limit: Maximum number of headings to read (default 50).

    Returns:
        list[str]: At most *limit* titles; fewer when the page has fewer
        headings (previously this raised IndexError).
    """
    titles = []
    for heading in gen_doc.find_all('h3')[:limit]:
        anchors = heading.find_all('a')
        titles.append(anchors[0].text.strip() if anchors else ' ')
    return titles

# getting Movie and Show Review

def review(gen_doc):
    """Collect the first ``<strong>`` text of every ``lister-item-content`` div.

    Args:
        gen_doc: Parsed genre page exposing ``find_all``.

    Returns:
        list[str]: One entry per matching div; a single blank (``' '``) where
        the div contains no ``<strong>`` tag.
    """
    scores = []
    for block in gen_doc.find_all('div', {'class': 'lister-item-content'}):
        strong_tags = block.find_all('strong')
        scores.append(strong_tags[0].text if strong_tags else ' ')
    return scores

#getting year of release for the movie and Show

def yearOfRelease(gen_doc, limit=50):
    """Collect the release-year text from the first *limit* ``<h3>`` headings.

    The year is taken from the second ``<span>`` of each heading, stripped of
    surrounding whitespace, leading ``(`` and trailing ``)``. A heading with
    fewer than two spans yields a single blank (``' '``), so the result stays
    positionally aligned with the headings.

    Args:
        gen_doc: Parsed genre page exposing ``find_all``.
        limit: Maximum number of headings to read (default 50).

    Returns:
        list[str]: At most *limit* entries; fewer when the page has fewer
        headings (previously this raised IndexError).
    """
    years = []
    for heading in gen_doc.find_all('h3')[:limit]:
        spans = heading.find_all('span')
        if len(spans) < 2:
            years.append(' ')
            continue
        year_text = spans[1].text.strip()
        years.append(year_text.lstrip("(").rstrip(")"))
    return years

#getting certificate rating for the movie and Show

def rating(gen_doc):
    """Collect the certificate text from every other ``text-muted`` paragraph.

    Only the paragraphs at even positions (0, 2, 4, ...) are inspected.

    Args:
        gen_doc: Parsed genre page exposing ``find_all``.

    Returns:
        list[str]: Text of the first ``certificate`` span of each inspected
        paragraph, or a single blank (``' '``) where there is none.
    """
    certificates = []
    for paragraph in gen_doc.find_all('p', {'class': 'text-muted'})[::2]:
        spans = paragraph.find_all('span', {'class': 'certificate'})
        certificates.append(spans[0].text if spans else ' ')
    return certificates

#Function to get all the show or movie info and convert into a pandas dataframe

def get_genres_movies(url):
    """Build a DataFrame of movie details from one parsed genre page.

    Args:
        url: Despite its name, the already parsed genre page (what
            ``get_genre_page`` returns), not a URL string. It is passed
            straight to the extraction helpers.

    Returns:
        pandas.DataFrame: Columns ``Title``, ``Review``, ``Year Of Release``
        and ``Certificate Rating``, with at most 50 rows.
    """
    gen_doc = url
    columns = {
        'Title': movie_name(gen_doc),
        'Review': review(gen_doc),
        'Year Of Release': yearOfRelease(gen_doc),
        'Certificate Rating': rating(gen_doc),
    }
    # Keep at most 50 rows, but never index past the shortest column: a page
    # listing fewer entries used to fail with IndexError.
    row_count = min(50, *(len(values) for values in columns.values()))
    return pd.DataFrame(
        {name: values[:row_count] for name, values in columns.items()}
    )

#Function to convert each genre related movie or show to a csv file

def scrape_topic(gen_url,path):
    """Scrape one genre page into a CSV file unless that file already exists.

    Args:
        gen_url: URL of the genre page to scrape.
        path: Destination CSV path.

    Returns:
        The scraped DataFrame when the file was written, or ``None`` when
        *path* already existed and the scrape was skipped.
    """
    if not os.path.exists(path):
        page = get_genre_page(gen_url)
        movies = get_genres_movies(page)
        movies.to_csv(path, index=None)
        return movies
    print("The file {} already exists. Skipping...".format(path))
    return None

In [7]:
#storing all the csv files into a folder named Data

def get_movies():
    """Scrape every genre and save each genre's movies as ``Data/<genre>.csv``.

    Creates the ``Data`` directory if needed and prints a progress line per
    genre.
    """
    print('Scraping list of Genres')
    genre_table = scrape_genres()
    os.makedirs('Data', exist_ok=True)
    for genre, genre_url in zip(genre_table['Genres'], genre_table['URLs']):
        print('Scraping top movies for "{}"'.format(genre))
        scrape_topic(genre_url, 'Data/{}.csv'.format(genre))
    

## Scraping all Movies and its info from Genres

In [8]:
# Scrape every genre and write one CSV per genre into the Data folder.
get_movies()

Scraping list of Genres
Scraping top movies for "Comedy"
Scraping top movies for "Sci-Fi"
Scraping top movies for "Horror"
Scraping top movies for "Romance"
Scraping top movies for "Action"
Scraping top movies for "Thriller"
Scraping top movies for "Drama"
Scraping top movies for "Mystery"
Scraping top movies for "Crime"
Scraping top movies for "Animation"
Scraping top movies for "Adventure"
Scraping top movies for "Fantasy"
Scraping top movies for "Comedy-Romance"
Scraping top movies for "Action-Comedy"
Scraping top movies for "Superhero"
