In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
from urllib.parse import quote

In [3]:
toy_story_url = "https://en.wikipedia.org/wiki/Toy_Story_3"
oceans = "https://en.wikipedia.org/wiki/Oceans_(film)"
g_force = "https://en.wikipedia.org/wiki/G-Force_(film)"

In [4]:
walt_disney_movies_page = "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films"
wikipedia_base_url = "https://en.wikipedia.org/"

In [5]:
def make_soup(url):
    response = requests.get(url)
    if response.status_code !=200:
        raise Exception("failed to load the given url page.")
    soup = BeautifulSoup(response.text,'html.parser')
    return soup

In [6]:
def get_current_data(row_data):
    if row_data.find('li'):
        return [li.getText(" ").replace("\xa0", " ") for li in row_data.find_all('li')]
    else:
        return row_data.getText(" ").replace("\xa0", " ")

In [7]:
def get_info_box(soup):
    
    info_table= soup.find('table').find_all('tr')
    
    my_dict = {}
    for index, row in enumerate(info_table):
        if index ==0:
            my_dict['title']=row.find('th').getText(" ")
        elif index==1:
            continue
        else:
            current_label = row.find('th').getText(" ").replace("\xa0", " ")
            current_data = get_current_data(row.find('td'))
            my_dict[current_label] = current_data
    return my_dict


In [8]:
def make_movie_info_list(url):
    soup = make_soup(url)
    movies = soup.select(".wikitable.sortable i a")
    movies_info_list = []
    for index,movie in enumerate(movies):
        i = 1
        try:
            current_url = movie.get('href')
            current_url_encoded = quote(current_url, safe='/:')
            full_path = wikipedia_base_url + current_url_encoded
            movies_info_list.append(get_info_box(make_soup(full_path)))
        except Exception as e:
            print(movie.text)
            print(e)
        finally:
            print(f"The fin has run {i} times")
            i+=1
            

    return movies_info_list

In [9]:
movie_datas_info = make_movie_info_list(walt_disney_movies_page)

So Dear to My Heart
'NoneType' object has no attribute 'getText'
Davy Crockett and the River Pirates
'NoneType' object has no attribute 'getText'
Darby O'Gill and the Little People
failed to load the given url page.
The Sign of Zorro
'NoneType' object has no attribute 'find'
Nikki: Wild Dog of the North
'NoneType' object has no attribute 'getText'
Big Red
'NoneType' object has no attribute 'getText'
Miracle of the White Stallions
'NoneType' object has no attribute 'getText'
The Monkey's Uncle
failed to load the given url page.
Charlie, the Lonesome Cougar
'NoneType' object has no attribute 'getText'
Blackbeard's Ghost
failed to load the given url page.
The Horse in the Gray Flannel Suit
'NoneType' object has no attribute 'getText'
Smith!
'NoneType' object has no attribute 'getText'
Now You See Him, Now You Don't
failed to load the given url page.
The World's Greatest Athlete
failed to load the given url page.
The Island at the Top of the World
'NoneType' object has no attribute 'getTex

In [42]:
import json

def save_data(title,data):
    with open(title, "w", encoding = "utf-8") as outfile: 
        json.dump(data, outfile)

save_data("movie_info.json", movie_datas_info)

In [10]:
toy_story_url = "https://en.wikipedia.org/wiki/Toy_Story_3"
get_info_box(make_soup(toy_story_url))

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production company': 'Pixar Animation Studios',
 'Distributed by': 'Walt Disney Studios Motion Pictures [ a ]',
 'Release dates': ['June 12, 2010  ( 2010-06-12 )  ( Taormina Film Fest )',
  'June 18, 2010  ( 2010-06-18 )  (United States)'],
 'Running time': '103 minutes [ 1 ]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million [ 1 ]',
 'Box office': '$1.067 billion [ 1 ]'}

In [11]:
def make_movie_dataframe(url):
    soup = make_soup(url)
    movies = soup.select(".wikitable.sortable i a")
    movie_name = []
    movie_url = []
    for index,movie in enumerate(movies):
        current_url = movie.get('href')
        current_url_encoded = quote(current_url, safe='/:')
        full_path = wikipedia_base_url + current_url_encoded
        movie_url.append(full_path)
        movie_name.append(movie.text)
    my_dict = {
        "movie":movie_name,
        "movie_url":movie_url
    }

    return pd.DataFrame(my_dict)

In [30]:
df = pd.DataFrame(movie_datas_info)

In [32]:
df.isnull().sum()

title                     0
Directed by              11
Story by                305
Based on                203
Produced by              14
Music by                 15
Production company      223
Distributed by            5
Release dates           275
Running time             17
Country                  65
Language                 23
Budget                  173
Box office              110
Starring                 41
Cinematography          108
Release date            192
Written by              287
Edited by                42
Languages               443
Narrated by             415
Screenplay by           227
Countries               405
Color process           457
Production companies    245
Layouts by              460
Occupations             459
Years active            459
Known for               459
Born                    460
Died                    460
Alma mater              460
Board member of         460
Spouse                  460
Children                460
Awards              

In [34]:
df.columns

Index(['title', 'Directed by', 'Story by', 'Based on', 'Produced by',
       'Music by', 'Production company', 'Distributed by', 'Release dates',
       'Running time', 'Country', 'Language', 'Budget', 'Box office',
       'Starring', 'Cinematography', 'Release date', 'Written by', 'Edited by',
       'Languages', 'Narrated by', 'Screenplay by', 'Countries',
       'Color process', 'Production companies', 'Layouts by', 'Occupations',
       'Years active', 'Known for', 'Born', 'Died', 'Alma mater',
       'Board member of', 'Spouse', 'Children', 'Awards', 'Music', 'Lyrics',
       'Book', 'Basis', 'Productions', 'First appearance', 'Created by',
       'Voiced by'],
      dtype='object')