# Disney Dataset Creation (Web Scraping) using BeautifulSoup

### Scrape and clean a list of Disney wikipidea pages to create a dataset to further analyse

In [2]:
from bs4 import BeautifulSoup as bs
import requests

In [3]:
req = requests.get('https://en.wikipedia.org/wiki/Toy_Story_3')
# req = requests.get('https://en.wikipedia.org/wiki/Angels_in_the_Outfield_(1994_film)')

In [4]:
webpage = bs(req.content)

### Locate the infobox

In [5]:
info_box = webpage.find(class_='infobox vevent')
info_rows = info_box.find_all('tr')
# for row in info_rows:
#     print(row.prettify())


### Get info box for Toy Story 3

In [6]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all('li')]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

In [11]:
def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()

In [12]:
movie_info = {}

for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['Title'] = row.find('th').get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        key = row.find('th').get_text(" ", strip=True)
        value = get_content_value(row.find('td'))
        movie_info[key] = value

        
# print(movie_info)
for key, val in movie_info.items():
    print(f"{key}: {val}")
    print("\n")

Title: Toy Story 3


Directed by: Lee Unkrich


Screenplay by: Michael Arndt


Story by: ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich']


Produced by: Darla K. Anderson


Starring: ['Tom Hanks', 'Tim Allen', 'Joan Cusack', 'Don Rickles', 'Wallace Shawn', 'John Ratzenberger', 'Estelle Harris', 'Ned Beatty', 'Michael Keaton', 'Jodi Benson', 'John Morris']


Cinematography: ['Jeremy Lasky', 'Kim White']


Edited by: Ken Schretzmann


Music by: Randy Newman


Production companies: ['Walt Disney Pictures', 'Pixar Animation Studios']


Distributed by: ['Walt Disney Studios', 'Motion Pictures']


Release dates: ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )', 'June 18, 2010 ( 2010-06-18 ) (United States)']


Running time: 103 minutes [1]


Country: United States


Language: English


Budget: $200 million [1]


Box office: $1.067 billion [1]




### Get info box for all Disney films

In [13]:
new_request = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

soup = bs(new_request.content)

In [19]:
def get_info_box(url):
    
    req = requests.get(url)
    webpage = bs(req.content)
    info_box = webpage.find(class_='infobox vevent')
    info_rows = info_box.find_all('tr')
    
    clean_tags(webpage)
    
    movie_info = {}

    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['Title'] = row.find('th').get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                key = row.find('th').get_text(" ", strip=True)
                value = get_content_value(row.find('td'))
                movie_info[key] = value
            
    return movie_info

In [15]:
get_info_box('https://en.wikipedia.org/wiki/Toy_Story_3')

{'Title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release dates': ['June 12, 2010 ( Taormina Film Fest )',
  'June 18, 2010 (United States)'],
 'Running time': '103 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million',
 'Box office': '$1.067 billion'}

In [21]:
movies = soup.select(".wikitable.sortable i a")
base_link = 'https://en.wikipedia.org/'
all_info_boxes = []

for movie in movies:
    try:
        url = base_link + movie['href']
        all_info_boxes.append(get_info_box(url))
    except Exception as e:
        print(movie.get_text())
        print(e)

Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'
Elemental
'NoneType' object has no attribute 'find_all'
61
'NoneType' object has no attribute 'find_all'
All Night Long
'NoneType' object has no attribute 'find'
Big Thunder Mountain Railroad
'NoneType' object has no attribute 'find_all'
Keeper of the Lost Cities
'NoneType' object has no attribute 'find_all'
Muppet Man
'NoneType' object has no attribute 'find_all'
Shrunk
'NoneType' object has no attribute 'find'
Sister Act 3
'NoneType' object has no attribute 'find'
The Graveyard Book
'NoneType' object has no attribute 'find_all'
The Thief
'NoneType' object has no attribute 'find_all'
Tom Sawyer
'NoneType' object has no attribute 'find_all'
Tower of Terror
'NoneType' object has no attribute 'find_all'
Tron: Ares
'NoneType' object has no attribute 'find'
FC Barcelona
'NoneType' object

In [5]:
# Save and reload data

import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=2)
        
def load_data(title):
    with open(title, 'r', encoding='utf-8') as file:
        return json.load(file)

In [3]:
save_data("disney_data_cleaned.json", all_info_boxes)

NameError: name 'all_info_boxes' is not defined

In [4]:
# Clean our data

In [6]:
movie_info_list = load_data('disney_data.json')

### Subtasks
- Clean up references
- Convert running time into a number
- Convert date to datetime object
- Split up the long strings with comma
- Convert Budget and Box offices into numbers

In [9]:
# Remove [1], [2]...

# done by defining clean_tags

movie_info_list[:2]

[{'Title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': 'Snow White by The Brothers Grimm',
  'Produced by': 'Walt Disney',
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
   'Otis Harlan',
   'Scotty Mattraw',
   'Billy Gilbert',
   'Eddie Collins',
   'Moroni Olsen',
   'Stuart Buchanan'],
  'Music by': ['Frank Churchill', 'Paul Smith', 'Leigh Harline'],
  'Production company': 'Walt Disney Productions',
  'Distributed by': 'RKO Radio Pictures',
  'Release dates': ['December 21, 1937 ( 1937-12-21 ) ( Carthay Circle Theatre )',
   'February 4, 1938 ( 1938-02-04 ) (United States)'],
  'Running time': '83 minute