## Project-1: Using web scraping to build a database of movie related information from: The Movie Database (TMDB) movie data

#### import all the required libraries 

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import os

#### Establish a connection to the webpage - "https://www.themoviedb.org/movie"

#### 1a. formulate a get request to download the contents of the webpage

In [2]:
needed_headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}

r= requests.get(("https://www.themoviedb.org/movie"),headers = needed_headers)
r

<Response [200]>

#### 1b. Verify the status code of the request and confirm that the request was executed appropriately

In [3]:
r.status_code

200

#### 1c. Print the contents of the page obtained from the response and save it in a variable

In [52]:
content=r.content
print(content)

b'<!DOCTYPE html>\n<html lang="en" class="no-js">\n  <head>\n    <title>Popular Movies &#8212; The Movie Database (TMDB)</title>\n    <meta http-equiv="cleartype" content="on">\n    <meta charset="utf-8">\n    <meta name="keywords" content="Movies, TV Shows, Streaming, Reviews, API, Actors, Actresses, Photos, User Ratings, Synopsis, Trailers, Teasers, Credits, Cast">\n    <meta name="mobile-web-app-capable" content="yes">\n    <meta name="apple-mobile-web-app-capable" content="yes">\n    <meta name="viewport" content="width=device-width,initial-scale=1">\n      <meta name="description" content="The Movie Database (TMDB) is a popular, user editable database for movies and TV shows.">\n    <meta name="msapplication-TileImage" content="/assets/2/v4/icons/mstile-144x144-30e7905a8315a080978ad6aeb71c69222b72c2f75d26dab1224173a96fecc962.png">\n<meta name="msapplication-TileColor" content="#032541">\n<meta name="theme-color" content="#032541">\n<link rel="apple-touch-icon" sizes="180x180" href

#### 1d. Infer the type of the variable created in part 1c and display the first 200 characters of the content from the server’s response 

In [53]:
# print(type(content))
content=content.decode("utf-8") # convert the type of content from bytes to str


In [54]:
# text=contents.text
content[:200]

'<!DOCTYPE html>\n<html lang="en" class="no-js">\n  <head>\n    <title>Popular Movies &#8212; The Movie Database (TMDB)</title>\n    <meta http-equiv="cleartype" content="on">\n    <meta charset="utf-8">\n  '

#### 2a. Pass the contents of the webpage obtained from step 1c as an argument to create an instance of the BeautifulSoup class 

In [56]:
soup=bs(content,'html.parser')
soup

<!DOCTYPE html>

<html class="no-js" lang="en">
<head>
<title>Popular Movies — The Movie Database (TMDB)</title>
<meta content="on" http-equiv="cleartype"/>
<meta charset="utf-8"/>
<meta content="Movies, TV Shows, Streaming, Reviews, API, Actors, Actresses, Photos, User Ratings, Synopsis, Trailers, Teasers, Credits, Cast" name="keywords"/>
<meta content="yes" name="mobile-web-app-capable"/>
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="The Movie Database (TMDB) is a popular, user editable database for movies and TV shows." name="description"/>
<meta content="/assets/2/v4/icons/mstile-144x144-30e7905a8315a080978ad6aeb71c69222b72c2f75d26dab1224173a96fecc962.png" name="msapplication-TileImage"/>
<meta content="#032541" name="msapplication-TileColor"/>
<meta content="#032541" name="theme-color"/>
<link href="/assets/2/apple-touch-icon-57ed4b3b0450fd5e9a0c20f34e814b82adaa1085c79bdde2f00ca8787b63d

#### 2b.Extract the title of the parsed web page content using an appropriate method or attribute of the document object created in part 2a

In [57]:
soup.title

<title>Popular Movies — The Movie Database (TMDB)</title>

In [58]:
title=soup.title.text
print(title)

Popular Movies — The Movie Database (TMDB)


#### 2c. Write a user defined function to generalize the task presented in Q2a to any URL that retrieves the content of the webpage. Your function should take a URL string as an input and return a correctly formulated BeautifulSoup instance as the output. In your function definition, ensure that appropriate exceptions are raised to the user (through status codes) if they pass in malformed/incorrect URLs. Write two test cases for your function - one with a working URL and another with an URL that gets a 404 response.

In [63]:

def create_soup():
    URL=input()
    needed_headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}
    r= requests.get((URL),headers = needed_headers)
    if r.ok==True:
        content=r.content
        soup=bs(r.content,'html.parser')
        return soup
    else:
        print("enter valid URL")
        
# can raise exception by using status_codes which are more than 399


In [60]:
# test case 1. valid URL:https://www.w3schools.com/python/python_lists_comprehension.asp
create_soup()

https://www.themoviedb.org/movie


<!DOCTYPE html>

<html class="no-js" lang="en">
<head>
<title>Popular Movies — The Movie Database (TMDB)</title>
<meta content="on" http-equiv="cleartype"/>
<meta charset="utf-8"/>
<meta content="Movies, TV Shows, Streaming, Reviews, API, Actors, Actresses, Photos, User Ratings, Synopsis, Trailers, Teasers, Credits, Cast" name="keywords"/>
<meta content="yes" name="mobile-web-app-capable"/>
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="The Movie Database (TMDB) is a popular, user editable database for movies and TV shows." name="description"/>
<meta content="/assets/2/v4/icons/mstile-144x144-30e7905a8315a080978ad6aeb71c69222b72c2f75d26dab1224173a96fecc962.png" name="msapplication-TileImage"/>
<meta content="#032541" name="msapplication-TileColor"/>
<meta content="#032541" name="theme-color"/>
<link href="/assets/2/apple-touch-icon-57ed4b3b0450fd5e9a0c20f34e814b82adaa1085c79bdde2f00ca8787b63d

In [61]:
# test case 2. invalid URL:https://httpbin.org/status/404
create_soup()

https://httpbin.org/status/404
enter valid URL


#### 3a. Write a function call to the user defined function created in 2c with the url https://www.themoviedb.org/movie as an input and store the response in a variable

In [64]:
tmdb=create_soup()
tmdb

https://www.themoviedb.org/movie


<!DOCTYPE html>

<html class="no-js" lang="en">
<head>
<title>Popular Movies — The Movie Database (TMDB)</title>
<meta content="on" http-equiv="cleartype"/>
<meta charset="utf-8"/>
<meta content="Movies, TV Shows, Streaming, Reviews, API, Actors, Actresses, Photos, User Ratings, Synopsis, Trailers, Teasers, Credits, Cast" name="keywords"/>
<meta content="yes" name="mobile-web-app-capable"/>
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<meta content="The Movie Database (TMDB) is a popular, user editable database for movies and TV shows." name="description"/>
<meta content="/assets/2/v4/icons/mstile-144x144-30e7905a8315a080978ad6aeb71c69222b72c2f75d26dab1224173a96fecc962.png" name="msapplication-TileImage"/>
<meta content="#032541" name="msapplication-TileColor"/>
<meta content="#032541" name="theme-color"/>
<link href="/assets/2/apple-touch-icon-57ed4b3b0450fd5e9a0c20f34e814b82adaa1085c79bdde2f00ca8787b63d

#### 3b. Print the HTML content associated with the first movie displayed on the web page using appropriate HTML tags to access this listing on the object created in part 3a

In [65]:
movie1_contents = tmdb.find('div', class_ = 'card style_1')
movie1_contents

<div class="card style_1">
<div class="image">
<div class="wrapper">
<a class="image" href="/movie/667538" title="Transformers: Rise of the Beasts">
<img alt="" class="poster" loading="lazy" src="/t/p/w220_and_h330_face/gPbM0MK8CP8A174rmUwGsADNYKD.jpg" srcset="/t/p/w220_and_h330_face/gPbM0MK8CP8A174rmUwGsADNYKD.jpg 1x, /t/p/w440_and_h660_face/gPbM0MK8CP8A174rmUwGsADNYKD.jpg 2x"/>
</a>
</div>
<div class="options" data-id="667538" data-media-type="movie" data-object-id="5e2f8f9d98f1f10012fd62b1">
<a class="no_click" href="#"><div class="glyphicons_v2 circle-more white"></div></a>
</div>
</div>
<div class="content">
<div class="consensus tight">
<div class="outer_ring">
<div class="user_score_chart 5e2f8f9d98f1f10012fd62b1" data-bar-color="#21d07a" data-percent="74.0" data-track-color="#204529">
<div class="percent">
<span class="icon icon-r74"></span>
</div>
</div>
</div>
</div>
<h2><a href="/movie/667538" title="Transformers: Rise of the Beasts">Transformers: Rise of the Beasts</a></h2>

#### 3c. Display the name of the first movie using appropriate HTML tags to access this listing on the object created in part 3a

In [66]:
movie1_name=[item['title'] for item in tmdb.find_all('a',class_='image', attrs={'title' : True})]
movie1_name[0]

'Transformers: Rise of the Beasts'

#### 3d. Display the user rating of the first movie by using appropriate HTML tags to access this listing on the object created in part 3a

In [67]:
user_rating=[item['data-percent'] for item in tmdb.find_all('div', attrs={'data-percent' : True})] 
print(user_rating[0])

74.0


#### 3e. For the first movie, extract the part of the url following the string “https://www.themoviedb.org/” using the appropriate HTML tags to extract this portion on the object created in part 3a (do not use built-in string methods). (1 mark )For example, if the first movie on the web page had the URL https://www.themoviedb.org/movie/779782 “ your output should be movie/779782

In [68]:
movie1_code=tmdb.find('a',class_='image',href=re.compile(r'[/]["https://www.themoviedb.org/"]')).attrs['href']
movie1_code

'/movie/667538'

#### 4a. Titles of all the movies on the page as a Python list

In [73]:
def movie_titles(tmdb):
    movie_titles=[item['title'] for item in tmdb.find_all('a',class_='image', attrs={'title' : True})]
    return movie_titles
    
print(movie_titles(tmdb),end=" ")

    

['Transformers: Rise of the Beasts', 'Barbie', 'Guardians of the Galaxy Vol. 3', 'The Flash', 'Warhorse One', 'Fast X', 'Knights of the Zodiac', 'Sound of Freedom', 'Bird Box Barcelona', 'Spider-Man: Across the Spider-Verse', 'John Wick: Chapter 4', 'Elemental', 'The Super Mario Bros. Movie', 'The Out-Laws', 'San Andreas', 'Oppenheimer', 'Insidious: The Last Key', 'Mavka: The Forest Song', 'Sheroes', 'The Darkest Minds'] 

#### 4b. User ratings of all the movies on the page as a Python list

In [53]:
# Using list comprehenssion
# def user_ratings():
    
#     user_rating=[item['data-percent'] for item in tmdb.find_all('div', attrs={'data-percent' : True})] 
#     return user_rating

# user_ratings()

In [74]:
def user_ratings(tmdb):
    rating=[]
    u_rating=tmdb.find_all('div', attrs={'data-percent' : True})
    for val in u_rating:
        get_val=val['data-percent']
        get_val=int(float(get_val))
        if get_val in range(1,101) :
            rating.append(get_val)
        else:
            rating.append('not_rated')
    return rating

print(user_ratings(tmdb),end=" ")

[74, 79, 81, 69, 68, 73, 66, 81, 60, 85, 79, 76, 78, 63, 62, 83, 63, 75, 62, 70] 

In [23]:
# rat=[]
# art=tmdb.find_all('div', attrs={'data-percent' : True})
# # output = [art["data-percent"] for art in art]
# # output

# for val in art:
#     get_val=val['data-percent']
#     get_val=int(float(get_val))
#     if get_val in range(1,101) :
#         rat.append(get_val)
#     else:
#         rat.append('not_rated')
        
# rat
    
    
    

#### 4c. Write user defined function for HTML content of all the individual pages of movies collected into a Python list.

In [77]:
def articles_movie_codes(tmdb):

    movie_codes = [a['href'] for a in tmdb.find_all('a',class_='image', attrs={'href' : True})]
#     articles=[tmdb.find_all('div', class_ = 'card style_1')]
    return movie_codes
print(articles_movie_codes(tmdb),end=" ")

['/movie/667538', '/movie/346698', '/movie/447365', '/movie/298618', '/movie/1076487', '/movie/385687', '/movie/455476', '/movie/678512', '/movie/805320', '/movie/569094', '/movie/603692', '/movie/976573', '/movie/502356', '/movie/921636', '/movie/254128', '/movie/872585', '/movie/406563', '/movie/459003', '/movie/1130818', '/movie/445651'] 

#### 4d. Write user defined function for Genres of all the movies on the page as a Python list

In [78]:
# generalize the task presented in Q2a to any URL that retrieves the content of the webpage and to not take URL as input rather helps access URL from list of URLs
def create_soups(URL):
    
    needed_headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36"}
    r= requests.get((URL),headers = needed_headers)
    if r.ok==True:
        content=r.content
        soup=bs(r.content,'html.parser')
        return soup
    else:
        print("enter valid URL")

In [79]:
def movie_genres(tmdb):
    movie_codes=articles_movie_codes(tmdb)
    genres=[]
# for i in articles_movie_codes:
#     i="https://www.themoviedb.org"+i
#     print(i)
    link = ["https://www.themoviedb.org"+i for i in movie_codes]

    for i in link[:]:
        URL=i
        movie1=create_soups(URL)
        genz=movie1.find('span',class_='genres')
        genz=genz.find_all('a')
        genre=[]
        for item in genz:
            genr=item.text
            genre.append(genr)
        genres.append(genre)
#     print(genre)
    return genres
movie_genres(tmdb)
# print(len(movie_genres(articles_movie_codes)))

[['Action', 'Adventure', 'Science Fiction'],
 ['Comedy', 'Adventure', 'Fantasy'],
 ['Science Fiction', 'Adventure', 'Action'],
 ['Action', 'Adventure', 'Science Fiction'],
 ['Action', 'Drama', 'War'],
 ['Action', 'Crime', 'Thriller'],
 ['Fantasy', 'Action', 'Adventure'],
 ['Action', 'Drama'],
 ['Thriller', 'Drama', 'Science Fiction', 'Horror'],
 ['Action', 'Adventure', 'Animation', 'Science Fiction'],
 ['Action', 'Thriller', 'Crime'],
 ['Animation', 'Comedy', 'Family', 'Fantasy', 'Romance'],
 ['Animation', 'Family', 'Adventure', 'Fantasy', 'Comedy'],
 ['Romance', 'Action', 'Comedy'],
 ['Action', 'Drama', 'Thriller'],
 ['Drama', 'History'],
 ['Horror', 'Mystery', 'Thriller'],
 ['Animation', 'Adventure', 'Family', 'Fantasy'],
 ['Action', 'Comedy', 'Crime', 'Adventure'],
 ['Action', 'Adventure', 'Drama']]

In [108]:
# def movie_genres(articles_movie_codes):
#     genres=[]

#     link = ["https://www.themoviedb.org"+i for i in articles_movie_codes]

#     for i in link[:]:
#         URL=i
#         movie1=create_soups(URL)
#         genz=movie1.find('span',class_='genres')
#         genz=genz.find_all('a')
#         genre=[]
#         for item in genz:
#             genr=item.text
#             genre.append(genr)
#         genres.append(genre)

#     return genres
# movie_genres(articles_movie_codes)


In [84]:
# genres=[]
# # for i in articles_movie_codes:
# #     i="https://www.themoviedb.org"+i
# #     print(i)
# link = ["https://www.themoviedb.org"+i for i in articles_movie_codes]
# link
# for i in link[:]:
#     URL=i
#     movie1=create_soups(URL)
#     genz=movie1.find('span',class_='genres')
#     genz=genz.find_all('a')
#     genre=[]
#     for item in genz:
#         genr=item.text
#         genre.append(genr)
#     genres.append(genre)
# #     print(genre)
# print(genres)

['Fantasy', 'Action', 'Adventure']


In [35]:
# def movie_genres(articles_movie_codes):
#     genres=[]
# for i in articles_movie_codes:
#     i="https://www.themoviedb.org"+i
#     print(i)
#     link = ["https://www.themoviedb.org"+i for i in articles_movie_codes]
#     link
#     for i in link[:]:
#         URL=i
#         movie1=create_soups(URL)
#         genz=movie1.find('span',class_='genres')
#         genz=genz.find_all('a')
#         for item in genz:
#             genre=item.text
        
#             genres.append(genre)
#     return list(genres)
#     return list(set(genres))
# movie_genres(articles_movie_codes)
# print(len(movie_genres(articles_movie_codes)))


# def movie_genres():
#     genres=[]
#     gen=tmdb.find('ul',id='with_genres')
#     for li in gen.find_all('li'):
#         genre_name = li.find('a', class_ = "no_click").text
#         genres.append(genre_name)
#     return genres
# movie_genres()


['Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western']

#### 4e. Write user defined function for Cast of all the movies on the page as a Python list 

In [27]:
# for item in i.find_all('div',class_='info'):
#         for x in item.find

In [47]:
# cast_list=[]
# link = ["https://www.themoviedb.org"+i for i in articles_movie_codes]
# URL=link[0]
# print(URL)
# movie1=create_soups(URL)

# cc=movie1.find('ol',class_='people scroller')
# cc=cc.find_all('li',class_='filler view_more')
# for item in cc:
#     for a in item.find('a',attrs={'href':True}):
#         cast=item.find('a').get('href')
#         cast="https://www.themoviedb.org"+cast
#     cast_all=create_soups(cast)
# # print(cast_all)
# casts=cast_all.find('ol',class_='people credits')
# casts=casts.find_all('li')
# for i in casts:
#     for item in i.find_all('div',class_='info'):
#         for x in item.find_all('a'):
#             xyz=x.get_text()
#             cast_list.append(xyz)
# print(cast_list)
    
        


    
    





In [80]:
def cast_list(tmdb):
    movie_codes=articles_movie_codes(tmdb)
    cast_list=[]
    link = ["https://www.themoviedb.org"+i for i in movie_codes]
    for URL in link:
        movie1=create_soups(URL)
        cc=movie1.find('ol',class_='people scroller')
        cc=cc.find_all('li',class_='filler view_more')
        for item in cc:
            for a in item.find('a',attrs={'href':True}):
                cast=item.find('a').get('href')
                cast="https://www.themoviedb.org"+cast
            cast_all=create_soups(cast)
# print(cast_all)
        casts=cast_all.find('ol',class_='people credits')
        casts=casts.find_all('li')
        cast=[]
        for i in casts:
            for item in i.find_all('div',class_='info'):
                for x in item.find_all('a'):
                    xyz=x.get_text()
                    cast.append(xyz)
        cast_list.append(cast)
    return cast_list
cast_list(tmdb)

[['Anthony Ramos',
  'Dominique Fishback',
  'Peter Cullen',
  'Ron Perlman',
  'Peter Dinklage',
  'Michelle Yeoh',
  'Pete Davidson',
  'Liza Koshy',
  'Cristo Fernández',
  'Luna Lauren Velez',
  'Dean Scott Vazquez',
  'Tobe Nwigwe',
  'Sarah Stiles',
  'Leni Parker',
  'Frank Marrs',
  'Aidan Devine',
  'Kerwin Jackson',
  'Mike Chute',
  'Tyler Hall',
  'Sean Tucker',
  'Jay Farrar',
  'Lucas Huarancca',
  'Amiel Cayo',
  'Santusa Cutipa',
  'Yesenia Inquillay',
  "Sumac T'Ika",
  'Josue Sallo',
  'Mellissa Alvarez',
  'Gloria Cusi',
  'Michael Kelly',
  'Jason D. Avalos',
  'Lesley Stahl',
  'John DiMaggio',
  'David Sobolov',
  'Michaela Jaé Rodriguez',
  'Colman Domingo',
  'Tongayi Chirisa',
  'Luke Jones'],
 ['Margot Robbie',
  'Ryan Gosling',
  'America Ferrera',
  'Kate McKinnon',
  'Michael Cera',
  'Issa Rae',
  'Ariana Greenblatt',
  'Alexandra Shipp',
  'Emma Mackey',
  'Hari Nef',
  'Sharon Rooney',
  'Ana Kayne',
  'Ritu Arya',
  'Dua Lipa',
  'Nicola Coughlan',
  'E

#### 5a.Write an user defined function that returns a pandas data frame for Titles of the movies listed on the page

In [96]:
# def mtd(tmdb):
#     df_titles=movie_titles(tmdb)
#     return df_titles
    
# mt=mtd(tmdb)
# mt = pd.DataFrame(mt)
# print (mt)

                                                0
0                Transformers: Rise of the Beasts
1                  Guardians of the Galaxy Vol. 3
2                                       The Flash
3                                          Barbie
4                                          Fast X
5                           Knights of the Zodiac
6                                Sound of Freedom
7                                    Warhorse One
8             Spider-Man: Across the Spider-Verse
9                                    The Out-Laws
10                           John Wick: Chapter 4
11                                      Elemental
12                    The Super Mario Bros. Movie
13                                    San Andreas
14                                        Sheroes
15                              The Darkest Minds
16                        Insidious: The Last Key
17                             Bird Box Barcelona
18  Mission: Impossible - Dead Reckoning Part One


In [84]:
def movie_titles_df(tmdb):
    list_titles=movie_titles(tmdb)
    mt = pd.DataFrame(list_titles,columns=['Movie_title'])
    return mt
movie_titles_df(tmdb)


Unnamed: 0,Movie_title
0,Transformers: Rise of the Beasts
1,Barbie
2,Guardians of the Galaxy Vol. 3
3,The Flash
4,Warhorse One
5,Fast X
6,Knights of the Zodiac
7,Sound of Freedom
8,Bird Box Barcelona
9,Spider-Man: Across the Spider-Verse


#### 5b.Write an user defined function that returns a pandas data frame for user ratings of the movies listed on the page

In [85]:
def user_rating_df(tmdb):
    list_user_rating=user_ratings(tmdb)
    ur = pd.DataFrame(list_user_rating,columns=['user_rating'])
    return ur
    
user_rating_df(tmdb)
# print(type(urd(tmdb)))

Unnamed: 0,user_rating
0,74
1,79
2,81
3,69
4,68
5,73
6,66
7,81
8,60
9,85


#### 5c. Write an user defined function that returns a pandas data frame for Genres of the movies listed on the page

In [88]:
def genres_df(tmdb):
    list_genres=movie_genres(tmdb)
    gen_df = pd.DataFrame(list_genres)
    gen_df=gen_df.fillna('')
    # combine all genres for a single movie into a column separated by ,
    gen_df['Genres'] = gen_df.astype(str).apply(','.join, axis=1).str.rstrip(',')
    gen_df = gen_df.filter(['Genres'])
#     gen_df=gen_df['new']
    return gen_df
    
# gen_df=gend()
# gen_df = pd.DataFrame(gen_df)

# gen_df=gen_df.fillna('')
# gen_df['concat'] = pd.Series(gen_df.fillna('').values.tolist()).str.join(',')
# gen_df['new'] = gen_df.astype(str).apply(','.join, axis=1).str.rstrip(',')
# gen_df['team_and_name'] = gen_df[0]+' '+gen_df[1].astype(str)
# gen_df=gen_df.sum(axis=1).astype(str).str.rstrip(',')
# print (gen_df)
genres_df(tmdb)


Unnamed: 0,Genres
0,"Action,Adventure,Science Fiction"
1,"Comedy,Adventure,Fantasy"
2,"Science Fiction,Adventure,Action"
3,"Action,Adventure,Science Fiction"
4,"Action,Drama,War"
5,"Action,Crime,Thriller"
6,"Fantasy,Action,Adventure"
7,"Action,Drama"
8,"Thriller,Drama,Science Fiction,Horror"
9,"Action,Adventure,Animation,Science Fiction"


#### 5d.  Write an user defined function that returns a pandas data frame for casts of the movies listed on the page

In [89]:
def cast_df(tmdb):
    list_cast=cast_list(tmdb)
    cast_df = pd.DataFrame(list_cast)
    cast_df=cast_df.fillna('')
    cast_df['Cast'] = cast_df.astype(str).apply(','.join, axis=1).str.rstrip(',')
    cast_df = cast_df.filter(['Cast'])
    return cast_df
    
# cast_df=castd()
# cast_df = pd.DataFrame(cast_df)
# print (cast_df)
cast_df(tmdb)

Unnamed: 0,Cast
0,"Anthony Ramos,Dominique Fishback,Peter Cullen,..."
1,"Margot Robbie,Ryan Gosling,America Ferrera,Kat..."
2,"Chris Pratt,Zoe Saldaña,Dave Bautista,Karen Gi..."
3,"Ezra Miller,Sasha Calle,Michael Keaton,Michael..."
4,"Ezra Miller,Sasha Calle,Michael Keaton,Michael..."
5,"Vin Diesel,Michelle Rodriguez,Tyrese Gibson,Lu..."
6,"Mackenyu,Madison Iseman,Diego Tinoco,Mark Daca..."
7,"Jim Caviezel,Mira Sorvino,Bill Camp,Kurt Fulle..."
8,"Mario Casas,Georgina Campbell,Diego Calva,Nail..."
9,"Shameik Moore,Hailee Steinfeld,Brian Tyree Hen..."


#### 6a.  Write a function that scrapes data (mentioned in Q5) from page number 1, 2, 3, 4 and 5 on the URL https://www.themoviedb.org/movie and returns 5 data frames which can be exported to csv file by calling the functions defined in Q3a, Q4c and Q5

In [25]:
tmdb_pages=[]
page = 1
while page != 6:
    base_url="https://www.themoviedb.org/movie"    
    url = base_url + f"?page={page}"
    tmdb_pages.append(url)
    
    page = page + 1
print(tmdb_pages)

['https://www.themoviedb.org/movie?page=1', 'https://www.themoviedb.org/movie?page=2', 'https://www.themoviedb.org/movie?page=3', 'https://www.themoviedb.org/movie?page=4', 'https://www.themoviedb.org/movie?page=5']


In [26]:
# URL=tmdb_pages[1] #loop through tmdb_pages tomorrow
# page_content=create_soups(URL)
# #     sss.append(page_content)
# #     for s in sss:
# jjj=[]
# for content in page_content:
#     ttt=mtd(page_content)
#     uuu=urd(page_content)
#     www=gend(page_content)
#     vvv=pd.concat([ttt,uuu,www],axis='columns')
#     jjj.append(vvv)
# # vvv.to_csv('file1.csv')
# print(jjj[0])

                                                0   0  \
0                              The Little Mermaid  63   
1       Mortal Kombat Legends: Scorpion's Revenge  82   
2                                    Extraction 2  76   
3   Mission: Impossible - Dead Reckoning Part One  78   
4                         Insidious: The Red Door  60   
5                   War of the Worlds: The Attack  62   
6                                        My Fault  82   
7          The Conjuring: The Devil Made Me Do It  75   
8                        Avatar: The Way of Water  77   
9                               Little Bone Lodge  77   
10                         A Good Day to Die Hard  53   
11                                The Black Demon  63   
12    Mortal Kombat Legends: Battle of the Realms  78   
13                                  Shadow Master  68   
14                Mr. Car and the Knights Templar  60   
15                   Ruby Gillman, Teenage Kraken  75   
16                             

In [101]:
sss=[]
output_path='C:/Users/tanmay11'
for URL in tmdb_pages:
    page_content=create_soups(URL)
#     sss.append(page_content)
#     for s in sss:
    
    ttt=movie_titles_df(page_content)
    uuu=user_rating_df(page_content)
    www=genres_df(page_content)
    vvv=pd.concat([ttt,uuu,www],axis='columns')
    sss.append(vvv)
print(type(sss[0]))
    




# result = pd.concat(sss)
# result.head(50)
# print(type(sss[0]))
for index, dataset in enumerate(sss):
    filepath = os.path.join(output_path, 'dataset_'+str(index)+'.csv')
    dataset.to_csv(filepath)
    


    
    
    
    

<class 'pandas.core.frame.DataFrame'>


#### 6b.  Combine the data obtained from dataframes in Q6(a)

In [None]:
result = pd.concat(sss)
result.head(50)