# Get Info Box From One Wikipedia Page

In [1]:
from bs4 import BeautifulSoup as bs
import requests

In [2]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# Convert to a beautiful soup object
soup = bs(r.content)

# Print out the HTML
contents = soup.prettify()
#print(contents)

In [3]:
movies = soup.select(".wikitable.sortable i")
movies[0:10]

[]

In [4]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()
        
def get_info_box(url):

    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)

    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
            
    return movie_info  

In [5]:
get_info_box("https://en.wikipedia.org/wiki/One_Little_Indian_(film)")

{'title': 'One Little Indian',
 'Directed by': 'Bernard McEveety',
 'Written by': 'Harry Spalding',
 'Produced by': 'Winston Hibler',
 'Starring': ['James Garner',
  'Vera Miles',
  'Pat Hingle',
  "Clay O'Brien",
  'John Doucette',
  'Morgan Woodward',
  'Andrew Prine'],
 'Cinematography': 'Charles F. Wheeler',
 'Edited by': 'Robert Stafford',
 'Music by': 'Jerry Goldsmith',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['June 20, 1973'],
 'Running time': '90 Minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$2 million'}

# Get Info From All Pages 

In [6]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

movie_info_list = []
for index, movie in enumerate(movies):
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
        
        movie_info_list.append(get_info_box(full_path))
        
    except Exception as e:
        print(movie.get_text())
        print(e)

0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
Mighty Ducks the Movie: The First Face-Off
'NoneType' object has no attribute 'find'
230
240
250
260
270
Spirited Away
'NoneType' object has no attribute 'find'
280
290
300
310
Howl's Moving Castle
'NoneType' object has no attribute 'find'
320
330
340
350
360
370
Ponyo
'NoneType' object has no attribute 'find'
380
Tales from Earthsea
'NoneType' object has no attribute 'find'
390
400
The Secret World of Arrietty
'NoneType' object has no attribute 'find'
410
420
430
440
450
460
470
480
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'
490
500
Wish
'NoneType' object has no attribute 'find_all'
Elio
'NoneType' object has no attribute 'find_all'
510
61
'NoneType' object has no attribute 'find_all'
All Night Long
'NoneType' object has no attribute 'find'
Big 

## Store Dict In JSON Format

In [2]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [3]:
import json

def load_data(title):
    with open(title, encoding="utf-8") as f:
        return json.load(f)

In [4]:
save_data("disney_data_cleaned.json", movie_info_list)

NameError: name 'movie_info_list' is not defined

### Clean Data

In [5]:
movie_info_list = load_data("disney_data_cleaned.json")

## Convert Running Time Into Integer

In [7]:
print([movie.get('Running time', 'N/A') for movie in movie_info_list])

['83 minutes', '88 minutes', '126 minutes', '74 minutes', '64 minutes', '70 minutes', '42 minutes', '65 min', '71 minutes', '75 minutes', '94 minutes', '73 minutes', '75 minutes', '82 minutes', '68 minutes', '74 minutes', '96 minutes', '75 minutes', '84 minutes', '77 minutes', '92 minutes', '69 minutes', '81 minutes', ['60 minutes (VHS and Wild Discovery version)', '71 minutes (original)'], '127 minutes', '93 minutes', '76 minutes', '75 minutes', '73 minutes', '85 minutes', '81 minutes', '70 minutes', '90 min.', '80 minutes', '75 minutes', '84 minutes', '83 minutes', '72 minutes', '97 minutes', '75 minutes', '104 minutes', '93 minutes', '105 minutes', '95 minutes', '97 minutes', '134 minutes', '69 minutes', '92 minutes', '126 minutes', '79 minutes', '97 minutes', '128 minutes', '73 minutes', '91 minutes', '105 minutes', '98 minutes', '130 minutes', '89 minutes', '93 minutes', '67 minutes', '98 minutes', '100 minutes', '118 minutes', '103 minutes', '110 minutes', '80 min.', '79 minutes'

In [14]:
# "85 minutes"
def minutes_to_integer(running_time):
    if running_time == "N/A":
        return None
    
    if isinstance(running_time, list):
        return int(running_time[0].split(" ")[0])
    else: # is a string
        return int(running_time.split(" ")[0])

for movie in movie_info_list:
    movie['Running time (int)'] = minutes_to_integer(movie.get('Running time', "N/A"))

In [8]:
print([movie.get('Running time (int)', 'N/A') for movie in movie_info_list])

['N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A'

In [9]:
print([movie.get('Budget', 'N/A') for movie in movie_info_list])

['$1.5 million', '$2.6 million', '$2.28 million', '$600,000', '$950,000', '$858,000', 'N/A', '$788,000', 'N/A', '$1.35 million', '$2.125 million', 'N/A', '$1.5 million', '$1.5 million', 'N/A', '$2.2 million', '$1.8 million', '$3 million', 'N/A', '$4 million', '$2 million', '$300,000', '$1.8 million', 'N/A', '$5 million', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$700,000', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$6 million', 'under $1 million or $1,250,000', 'N/A', '$2 million', 'N/A', 'N/A', '$2.5 million', 'N/A', 'N/A', '$4 million', '$3.6 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$4.4–6 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', '$6.3 m

## Format Budget And Box Office

In [10]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
value_re = rf"\${number}"

def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value*word_value

def parse_value_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value



'''
money_conversion("$12.2 million") --> 12200000 ## Word syntax
money_conversion("$790,000") --> 790000        ## Value syntax
'''
def money_conversion(money):
    if money == "N/A":
        return None

    if isinstance(money, list):
        money = money[0]
        
    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    elif value_syntax:
        return parse_value_syntax(value_syntax.group())

    else:
        return None

In [11]:
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', "N/A"))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', "N/A"))

In [12]:
money_conversion(str(movie_info_list[-40]["Budget"]))

200000000.0

## Convert Dates Into Datetimes

In [13]:

print([movie.get('Release date', 'N/A') for movie in movie_info_list])

['N/A', 'N/A', ['November 13, 1940'], ['June 27, 1941'], 'N/A', 'N/A', 'N/A', ['July 17, 1943'], 'N/A', 'N/A', 'N/A', ['September 27, 1947'], 'May 27, 1948', 'N/A', ['October 5, 1949'], 'N/A', 'N/A', 'N/A', 'N/A', ['February 5, 1953'], ['July 23, 1953 (United States)'], ['November 10, 1953'], 'N/A', ['August 17, 1954'], ['December 23, 1954'], 'May 25, 1955', ['June 22, 1955'], ['September 14, 1955'], 'December 22, 1955', 'June 8, 1956', ['July 18, 1956'], ['September 4, 1956'], ['December 20, 1956'], 'June 19, 1957', 'August 28, 1957', ['December 25, 1957'], ['July 8, 1958'], ['August 12, 1958'], ['December 25, 1958'], ['January 29, 1959'], ['March 19, 1959'], 'N/A', ['November 10, 1959'], 'January 21, 1960 ( Sarasota, FL )', ['February 24, 1960'], 'May 19, 1960', 'N/A', ['November 1, 1960'], ['December 21, 1960'], ['January 25, 1961'], 'March 16, 1961', ['June 21, 1961'], ['July 12, 1961'], ['July 17, 1961'], ['December 14, 1961'], 'April 5, 1962', ['May 17, 1962'], ['June 6, 1962'], 

In [14]:
# June 28, 1950
from datetime import datetime

dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
        
    if date == "N/A":
        return None
        
    date_str = clean_date(date)

    fmts = ["%B %d, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None

In [22]:
for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

In [15]:
movie_info_list[50]

{'title': 'The Absent-Minded Professor',
 'Directed by': 'Robert Stevenson',
 'Screenplay by': 'Bill Walsh',
 'Based on': ['"A Situation of Gravity"', 'by', 'Samuel W. Taylor'],
 'Produced by': ['Walt Disney'],
 'Starring': ['Fred MacMurray',
  'Nancy Olson',
  'Keenan Wynn',
  'Tommy Kirk',
  'Leon Ames',
  'Elliott Reid',
  'Edward Andrews',
  'Wally Brown',
  'Alan Carney',
  'Forrest Lewis',
  'James Westerfield',
  'Ed Wynn'],
 'Cinematography': 'Edward Colman',
 'Edited by': 'Cotton Warburton',
 'Music by': 'George Bruns',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': 'March 16, 1961',
 'Running time': '97 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$25.3 million',
 'Budget (float)': None,
 'Box office (float)': 25300000.0}

## Save Using Pickle

In [19]:
import pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

In [17]:
import pickle

def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [20]:
save_data_pickle("disney_movie_data_cleaned_more.pickle", movie_info_list)

In [21]:
a = load_data_pickle("disney_movie_data_cleaned_more.pickle")

In [112]:
movie_info_list = load_data_pickle('disney_movie_data_cleaned_more.pickle')

In [None]:
# http://www.omdbapi.com/?apikey=[yourkey]&

## Attach IMDB/ Rotten-Tomatoes/ Metascore scores

In [41]:
import requests
import urllib
import os

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {"apikey": os.environ['OMDB_API_KEY'], 't': title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    return None

get_omdb_info("into the woods")

{'Title': 'Into the Woods',
 'Year': '2014',
 'Rated': 'PG',
 'Released': '25 Dec 2014',
 'Runtime': '125 min',
 'Genre': 'Adventure, Comedy, Drama',
 'Director': 'Rob Marshall',
 'Writer': 'James Lapine',
 'Actors': 'Anna Kendrick, Meryl Streep, Chris Pine',
 'Plot': 'A witch tasks a childless baker and his wife with procuring magical items from classic fairy tales to reverse the curse put on their family tree.',
 'Language': 'English',
 'Country': 'United States',
 'Awards': 'Nominated for 3 Oscars. 10 wins & 74 nominations total',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BMTY4MzQ4OTY3NF5BMl5BanBnXkFtZTgwNjM5MDI3MjE@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '5.9/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '71%'},
  {'Source': 'Metacritic', 'Value': '69/100'}],
 'Metascore': '69',
 'imdbRating': '5.9',
 'imdbVotes': '143,418',
 'imdbID': 'tt2180411',
 'Type': 'movie',
 'DVD': '24 Mar 2015',
 'BoxOffice': '$128,002,372',
 'Production': 'N

In [26]:
for movie in movie_info_list:
    title = movie['title']
    omdb_info = get_omdb_info(title)
    movie['imdb'] = omdb_info.get('imdbRating', None)
    movie['metascore'] = omdb_info.get('Metascore', None)
    movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info)

In [56]:
movie_info_list[2]

{'title': 'Fantasia',
 'Directed by': ['Samuel Armstrong',
  'James Algar',
  'Bill Roberts',
  'Paul Satterfield',
  'Ben Sharpsteen',
  'David D. Hand',
  'Hamilton Luske',
  'Jim Handley',
  'Ford Beebe',
  'T. Hee',
  'Norman Ferguson',
  'Wilfred Jackson'],
 'Story by': ['Joe Grant', 'Dick Huemer'],
 'Produced by': ['Walt Disney', 'Ben Sharpsteen'],
 'Starring': ['Leopold Stokowski', 'Deems Taylor'],
 'Narrated by': 'Deems Taylor',
 'Cinematography': 'James Wong Howe',
 'Music by': 'See program',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release date': ['November 13, 1940'],
 'Running time': '126 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$2.28 million',
 'Box office': '$76.4–$83.3 million (United States and Canada)',
 'Budget (float)': 2280000.0,
 'Box office (float)': 83300000.0,
 'imdb': 7.7,
 'metascore': 96.0,
 'rotten_tomatoes': '95%'}

In [None]:
metascore = [m for m in movie_info_list if m['metascore'] == None]

len(metascore)

In [61]:
metascore = [m for m in movie_info_list if m['metascore'] == None]

len(metascore)

22

In [63]:
imdb = [m for m in movie_info_list if m['imdb'] == None]

len(imdb)

22

In [88]:
metascore = [m for m in movie_info_list if m['metascore'] == None]

len(metascore)

0

In [87]:
import numpy 
for n in movie_info_list:
    if n['metascore'] == None:
        movie_info_list.remove(n)

In [98]:
for n in movie_info_list:
    if n['metascore'] == "N/A":
        movie_info_list.remove(n)

In [99]:
metascore = [m for m in movie_info_list if m['metascore'] == 'N/A']

len(metascore)

0

In [93]:
rotten_tomatoes = [m for m in movie_info_list if m['rotten_tomatoes'] == None]

len(rotten_tomatoes)

0

In [100]:
for movie in movie_info_list:
    movie['imdb'] = float(movie['imdb']) 
    movie['metascore'] = float(movie['metascore'])
    

In [101]:
save_data_pickle('disney_movie_data_final.pickle', movie_info_list)

In [102]:
movie_info_list[50]

{'title': 'The Watcher in the Woods',
 'Directed by': ['John Hough', 'Vincent McEveety (uncredited)'],
 'Screenplay by': ['Brian Clemens', 'Harry Spalding', 'Rosemary Anne Sisson'],
 'Based on': ['A Watcher in the Woods', 'by', 'Florence Engel Randall'],
 'Produced by': 'Ron Miller',
 'Starring': ['Bette Davis',
  'Carroll Baker',
  'David McCallum',
  'Lynn-Holly Johnson',
  'Kyle Richards'],
 'Cinematography': 'Alan Hume',
 'Edited by': 'Geoffrey Foot',
 'Music by': 'Stanley Myers',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release dates': ['April 17, 1980 (New York City)', 'October 9, 1981 (U.S.)'],
 'Running time': '84 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$9 million',
 'Box office': '$5 million',
 'Budget (float)': 9000000.0,
 'Box office (float)': 5000000.0,
 'imdb': 6.1,
 'metascore': 52.0,
 'rotten_tomatoes': '50%'}

In [103]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [114]:
save_data("disney_data_final.json", movie_info_copy)

In [106]:
import pandas as pd

df = pd.DataFrame(movie_info_list)

In [107]:
df.head()

Unnamed: 0,title,Directed by,Story by,Based on,Produced by,Starring,Music by,Production company,Distributed by,Release dates,...,Release date,Languages,Screenplay by,Edited by,Written by,Color process,Countries,Production companies,Adaptation by,Layouts by
0,Pinocchio,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...","[Ted Sears, Otto Englander, Webb Smith, Willia...","[The Adventures of Pinocchio, by, Carlo Collodi]",Walt Disney,"[Cliff Edwards, Dickie Jones, Christian Rub, W...","[Leigh Harline, Paul J. Smith]",Walt Disney Productions,RKO Radio Pictures,"[February 7, 1940 ( Center Theatre ), February...",...,,,,,,,,,,
1,Fantasia,"[Samuel Armstrong, James Algar, Bill Roberts, ...","[Joe Grant, Dick Huemer]",,"[Walt Disney, Ben Sharpsteen]","[Leopold Stokowski, Deems Taylor]",See program,Walt Disney Productions,RKO Radio Pictures,,...,"[November 13, 1940]",,,,,,,,,
2,Dumbo,"[Ben Sharpsteen, Norman Ferguson, Wilfred Jack...","[Joe Grant, Dick Huemer]","[Dumbo, the Flying Elephant, by, Helen Aberson...",Walt Disney,"[Edward Brophy, Verna Felton, Cliff Edwards, H...","[Frank Churchill, Oliver Wallace]",Walt Disney Productions,RKO Radio Pictures,"[October 23, 1941 (New York City), October 31,...",...,,,,,,,,,,
3,Bambi,"[Supervising director, David Hand, Sequence di...","[Story direction, Perce Pearce, Story adaptati...","[Bambi, a Life in the Woods, by, Felix Salten]",Walt Disney,see below,"[Frank Churchill, Edward H. Plumb]",Walt Disney Productions,RKO Radio Pictures,"[August 9, 1942 ( London ), August 21, 1942 (U...",...,,,,,,,,,,
4,Saludos Amigos,"[Norman Ferguson, Wilfred Jackson, Jack Kinney...","[Homer Brightman, William Cottrell, Richard Hu...",,Walt Disney,"[Lee Blair, Mary Blair, Pinto Colvig, Walt Dis...","[Paul Smith, Edward H. Plumb]",Walt Disney Productions,RKO Radio Pictures,"[August 24, 1942 (Rio de Janeiro), February 6,...",...,,"[English, Portuguese, Spanish]",,,,,,,,


## Save As CSV File

In [108]:
df.to_csv("disney_movie_data_final.csv")