## Project-1: Using web scraping to build a database of movie related information from: The Movie Database (TMDB) movie data**

1. Establish a connection to the webpage -"https://www.themoviedb.org/movie"


In [1]:
import requests

base_url = "https://www.themoviedb.org"
movie_url =  f'{base_url}/movie'
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}

def get_data_for_url(url, headers):
    return requests.get(url, headers=headers)   


# Prepare the url to fetch the data
movie_response = get_data_for_url(movie_url, headers)

In [2]:
def verify_response(response):
    assert response.status_code == 200

# Verify the request
verify_response(movie_response)

In [3]:
# Printing the variable and store it to content
content = movie_response.content
print(content)

b'<!DOCTYPE html>\n<html lang="en" class="no-js">\n  <head>\n    <title>Popular Movies &#8212; The Movie Database (TMDB)</title>\n    <meta http-equiv="cleartype" content="on">\n    <meta charset="utf-8">\n    <meta name="keywords" content="Movies, TV Shows, Streaming, Reviews, API, Actors, Actresses, Photos, User Ratings, Synopsis, Trailers, Teasers, Credits, Cast">\n    <meta name="mobile-web-app-capable" content="yes">\n    <meta name="apple-mobile-web-app-capable" content="yes">\n    <meta name="viewport" content="width=device-width,initial-scale=1">\n      <meta name="description" content="The Movie Database (TMDB) is a popular, user editable database for movies and TV shows.">\n    <meta name="msapplication-TileImage" content="/assets/2/v4/icons/mstile-144x144-30e7905a8315a080978ad6aeb71c69222b72c2f75d26dab1224173a96fecc962.png">\n<meta name="msapplication-TileColor" content="#032541">\n<meta name="theme-color" content="#032541">\n<link rel="apple-touch-icon" sizes="180x180" href

In [4]:
# Type of data
print(type(content))

<class 'bytes'>


In [5]:
def get_first_n_chars(response, n):
    return response[0:n]

# print first 200 chars
print(get_first_n_chars(content, 200))

b'<!DOCTYPE html>\n<html lang="en" class="no-js">\n  <head>\n    <title>Popular Movies &#8212; The Movie Database (TMDB)</title>\n    <meta http-equiv="cleartype" content="on">\n    <meta charset="utf-8">\n  '


## Parse the content of HTML response using the BeautifulSoup library and execute the tasks specified in the guidelines mentioned below

In [6]:
from bs4 import BeautifulSoup

## a. create beautiful soup object from the response content

def get_beautiful_soup_object(content):
    try:
        return BeautifulSoup(content, 'html.parser')
    except:
        raise Exception("Error in getting the beautiful soup object")
        

get_beautiful_soup_object(content)

<!DOCTYPE html>

<html class="no-js" lang="en">
<head>
<title>Popular Movies — The Movie Database (TMDB)</title>
<meta content="on" http-equiv="cleartype"/>
<meta charset="utf-8"/>
<meta content="Movies, TV Shows, Streaming, Reviews, API, Actors, Actresses, Photos, User Ratings, Synopsis, Trailers, Teasers, Credits, Cast" name="keywords"/>
<meta content="yes" name="mobile-web-app-capable"/>
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="The Movie Database (TMDB) is a popular, user editable database for movies and TV shows." name="description"/>
<meta content="/assets/2/v4/icons/mstile-144x144-30e7905a8315a080978ad6aeb71c69222b72c2f75d26dab1224173a96fecc962.png" name="msapplication-TileImage"/>
<meta content="#032541" name="msapplication-TileColor"/>
<meta content="#032541" name="theme-color"/>
<link href="/assets/2/apple-touch-icon-57ed4b3b0450fd5e9a0c20f34e814b82adaa1085c79bdde2f00ca8787b63d

In [7]:
def extract_page_title(movie_soup):
    return movie_soup.find('title').text
    
soup = get_beautiful_soup_object(content)

print(extract_page_title(soup))

Popular Movies — The Movie Database (TMDB)


In [8]:
def get_beautiful_soup_object(url):
    try:
        headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}
        response = requests.get(url, headers=headers)
        return BeautifulSoup(response.content, 'html.parser')
    except Exception as e:
        # Handle exceptions
        raise ValueError(f"An unexpected error occurred: {e}")
        

import unittest

class TestGetSoupFromUrl(unittest.TestCase):

    def test_working_url(self):
        url = "https://www.themoviedb.org/movies"
        soup = get_beautiful_soup_object(url)
        self.assertIsNotNone(soup)

    def test_404_response(self):
        invalid_url = "example_invlid_url"

        with self.assertRaises(ValueError) as context:
            get_beautiful_soup_object(invalid_url)

        self.assertIn("An unexpected error occurred:", str(context.exception))


# Run the tests
unittest.TextTestRunner().run(unittest.TestLoader().loadTestsFromTestCase(TestGetSoupFromUrl))


..
----------------------------------------------------------------------
Ran 2 tests in 0.363s

OK


<unittest.runner.TextTestResult run=2 errors=0 failures=0>

### Extract the content of the webpage - https://www.themoviedb.org/movie - that hosts a current dated listing of popular movies.

In [9]:
url = "https://www.themoviedb.org/movie"
soup = get_beautiful_soup_object(url)
print(soup)

<!DOCTYPE html>

<html class="no-js" lang="en">
<head>
<title>Popular Movies — The Movie Database (TMDB)</title>
<meta content="on" http-equiv="cleartype"/>
<meta charset="utf-8"/>
<meta content="Movies, TV Shows, Streaming, Reviews, API, Actors, Actresses, Photos, User Ratings, Synopsis, Trailers, Teasers, Credits, Cast" name="keywords"/>
<meta content="yes" name="mobile-web-app-capable"/>
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="The Movie Database (TMDB) is a popular, user editable database for movies and TV shows." name="description"/>
<meta content="/assets/2/v4/icons/mstile-144x144-30e7905a8315a080978ad6aeb71c69222b72c2f75d26dab1224173a96fecc962.png" name="msapplication-TileImage"/>
<meta content="#032541" name="msapplication-TileColor"/>
<meta content="#032541" name="theme-color"/>
<link href="/assets/2/apple-touch-icon-57ed4b3b0450fd5e9a0c20f34e814b82adaa1085c79bdde2f00ca8787b63d

##### First Movie HTML content

In [10]:
results = soup.find(id="page_1")
first_movie_html = results.find("div", class_="style_1")
print(first_movie_html.prettify())

<div class="card style_1">
 <div class="image">
  <div class="wrapper">
   <a class="image" href="/movie/901362" title="Trolls Band Together">
    <img alt="" class="poster" loading="lazy" src="/t/p/w220_and_h330_face/sEaLO9s7CIN3fjz8R3Qksum44en.jpg" srcset="/t/p/w220_and_h330_face/sEaLO9s7CIN3fjz8R3Qksum44en.jpg 1x, /t/p/w440_and_h660_face/sEaLO9s7CIN3fjz8R3Qksum44en.jpg 2x"/>
   </a>
  </div>
  <div class="options" data-id="901362" data-media-type="movie" data-object-id="619bea97c0ae360089136cff">
   <a class="no_click" href="#">
    <div class="glyphicons_v2 circle-more white">
    </div>
   </a>
  </div>
 </div>
 <div class="content">
  <div class="consensus tight">
   <div class="outer_ring">
    <div class="user_score_chart 619bea97c0ae360089136cff" data-bar-color="#21d07a" data-percent="71.27" data-track-color="#204529">
     <div class="percent">
      <span class="icon icon-r71">
      </span>
     </div>
    </div>
   </div>
  </div>
  <h2>
   <a href="/movie/901362" title="T

Display the name of the first movie

In [11]:
first_movie_html.find("h2").a.text

'Trolls Band Together'

Display the user rating of the first movie

In [12]:
# As it's not mentioned to round off the rating so I'm not doing it, simply displaying the rating field
first_movie_html.find("div", class_="outer_ring").div["data-percent"]

'71.27'

 extract the part of the url following the string

In [13]:
movie_url = first_movie_html.find("h2").a["href"]
movie_url[1:]

'movie/901362'

Write user defined functions for each subsection below

In [14]:
# Titles of all the movies on the page as a Python list
def all_movie_titles(soup):
    results = soup.find_all("div", class_="style_1")
    movie_titles = []
    for movie_html in results:
        try:
            movie_title = movie_html.find("h2").a.text
            movie_titles.append(movie_title)
        except:
            "ignore ;)"
    return movie_titles

In [15]:
def all_movie_ratings(soup):
    ratings = []
    results = soup.find_all("div", class_="outer_ring")
    
    for movie_html in results:
        try:
            movie_rating = movie_html.find("div", class_=["user_score_chart"])
            ratings.append(movie_rating['data-percent'])
        except:
            ratings.append("not rated")
        
    return ratings


all_movie_ratings(soup)

['71.27',
 '81.56',
 '71.49',
 '79.0',
 '78.57000000000001',
 '64.32000000000001',
 '71.46',
 '72.03',
 '76.0',
 '65.03999999999999',
 '62.5',
 '71.22',
 '74.18',
 '73.0',
 '72.15',
 '67.42',
 '69.41',
 '65.78999999999999',
 '68.03',
 '77.48']

In [16]:
# HTML content of all the individual pages of movies collected into a Python list
def get_html_for_movies(soup):
    movie_html_list = []
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}
    base_url = "https://www.themoviedb.org"
    movie_urls = results.find_all("div", class_="wrapper")

    for movie_url in movie_urls:
        url = movie_url.a["href"]
        movie_url =  f'{base_url}{url}'
        movie_html = get_data_for_url(movie_url, headers)
        movie_html_list.append(movie_html.text)
    
    return movie_html_list

all_movie_html = get_html_for_movies(soup)

In [17]:
def get_genre_for_movies(all_movies_html):
    all_genres = []
    for movie_html in all_movies_html:
        movie_genres = []
        soup = BeautifulSoup(movie_html, "html.parser")
        genres = soup.find('span', class_="genres")
        agen = genres.find_all("a")
        
        for ag in agen:
            movie = ag.text
            movie_genres.append(movie)
            
        movie = ",".join(movie_genres)
        all_genres.append(movie)
    return all_genres

In [18]:
def get_cast_for_movies(movies_html):
    casts = []
    movie_casts = []
    for movie_html in movies_html:
        movie_casts = []
        soup = BeautifulSoup(movie_html, "html.parser")
        cards = soup.findAll('li', class_="card")
        
        for card in cards:
            movie_casts.append(card.findChildren()[2].text)
        
        movie_casts_str = ",".join(movie_casts)
        casts.append(movie_casts_str)
    return casts

In [19]:
import pandas as pd

def create_dataframe(soup, all_movie_html):
    all_titles = all_movie_titles(soup)
    all_ratings = all_movie_ratings(soup)
    all_casts = get_cast_for_movies(all_movie_html)
    all_genre = get_genre_for_movies(all_movie_html)
    
    data = {'Titles': all_titles, 'Ratings': all_ratings, 'Cast': all_casts, 'Genre': all_genre}
    return pd.DataFrame(data)

create_dataframe(soup, all_movie_html)

Unnamed: 0,Titles,Ratings,Cast,Genre
0,Trolls Band Together,71.27,"Anna Kendrick,Justin Timberlake,Camila Cabello...","Animation,Family,Music,Fantasy,Comedy"
1,Oppenheimer,81.56,"Cillian Murphy,Emily Blunt,Matt Damon,Robert D...","Drama,History"
2,The Creator,71.49,"John David Washington,Madeleine Yuna Voyles,Ge...","Science Fiction,Action,Thriller"
3,Leo,79.0,"Adam Sandler,Bill Burr,Cecily Strong,Jason Ale...","Animation,Comedy,Family"
4,Five Nights at Freddy's,78.57000000000001,"Josh Hutcherson,Piper Rubio,Elizabeth Lail,Mat...","Horror,Mystery"
5,Expend4bles,64.32000000000001,"Sylvester Stallone,Jason Statham,50 Cent,Megan...","Action,Adventure,Thriller"
6,Jawan,71.46,"Shah Rukh Khan,Nayanthara,Vijay Sethupathi,Pri...","Action,Adventure,Thriller"
7,Fast X,72.03,"Vin Diesel,Michelle Rodriguez,Tyrese Gibson,Lu...","Action,Crime,Thriller"
8,Mission: Impossible - Dead Reckoning Part One,76.0,"Tom Cruise,Hayley Atwell,Ving Rhames,Simon Peg...","Action,Thriller"
9,Napoleon,65.03999999999999,"Joaquin Phoenix,Vanessa Kirby,Tahar Rahim,Ben ...","Drama,History,War"


In [20]:
import os

def get_df_from_pages(start, end):
    base_url = "https://www.themoviedb.org"
    movie_url =  f'{base_url}/movie'
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}
    data_frames = []
    path = './csv_dir'
    os.makedirs(path, mode = 0o777, exist_ok = True) 
    
    for i in range(start, end):
        url = f"{movie_url}?page={i}"
        response = get_data_for_url(url, headers)
        content = response.content
        soup = BeautifulSoup(content, "html.parser")
        all_movie_html = get_html_for_movies(soup)
        df = create_dataframe(soup, all_movie_html)
        print(df)
        df.to_csv(f"csv_dir/my_csv_{i}.csv", sep='\t')
        data_frames.append(df)
    
    return data_frames
    
data_frames = get_df_from_pages(1,6)

                                               Titles            Ratings  \
0                                Trolls Band Together              71.27   
1                                         Oppenheimer              81.56   
2                                         The Creator              71.49   
3                                                 Leo               79.0   
4                             Five Nights at Freddy's  78.57000000000001   
5                                         Expend4bles  64.32000000000001   
6                                               Jawan              71.46   
7                                              Fast X              72.03   
8       Mission: Impossible - Dead Reckoning Part One               76.0   
9                                            Napoleon  65.03999999999999   
10                                      The Mercenary               62.5   
11                                         Believer 2              71.22   
12          

                                         Titles             Ratings  \
0                              After Everything   70.28999999999999   
1                      The Mongolian Connection                65.0   
2         Indiana Jones and the Dial of Destiny               66.76   
3                                    The Jester                45.0   
4                                       Boudica   60.57000000000001   
5                                 The Locksmith               58.66   
6                      Dashing Through the Snow               71.28   
7                       The Monkey King: Reborn                71.0   
8   Teenage Mutant Ninja Turtles: Mutant Mayhem               73.04   
9                                       Radical                76.0   
10                                     The Lake               58.67   
11                           The Little Mermaid               64.84   
12                           Mad Max: Fury Road               75.85   
13    

In [30]:
# Append all dataframes
def combine_dfs(data_frames):
    return pd.concat(data_frames)
    
combine_dfs(data_frames)


Unnamed: 0,Titles,Ratings,Cast,Genre
0,Trolls Band Together,71.27,"Anna Kendrick,Justin Timberlake,Camila Cabello...","Animation,Family,Music,Fantasy,Comedy"
1,Oppenheimer,81.56,"Cillian Murphy,Emily Blunt,Matt Damon,Robert D...","Drama,History"
2,The Creator,71.49,"John David Washington,Madeleine Yuna Voyles,Ge...","Science Fiction,Action,Thriller"
3,Leo,79.0,"Adam Sandler,Bill Burr,Cecily Strong,Jason Ale...","Animation,Comedy,Family"
4,Five Nights at Freddy's,78.57000000000001,"Josh Hutcherson,Piper Rubio,Elizabeth Lail,Mat...","Horror,Mystery"
...,...,...,...,...
15,The Batman,77.00999999999999,"Jason Statham,Wu Jing,Shuya Sophia Cai,Page Ke...","Action,Science Fiction,Horror"
16,Genie,79.58,"Xolo Mariduena,Bruna Marquezine,Susan Sarandon...","Action,Science Fiction,Adventure"
17,Interstellar,84.21,"Brie Larson,Teyonah Parris,Iman Vellani,Zawe A...","Science Fiction,Adventure,Action"
18,When Evil Lurks,73.0,"Masako Nozawa,Hiromi Tsuru,Mayumi Tanaka,Kōhei...","Action,Animation"
