# Disney Films Web Scraping Project

### Sangwon Baek

### Task 1: Obtain info box for Toy Story 3

#### Import Libraries

In [1]:
from bs4 import BeautifulSoup as bs
import requests

#### Load the Webpage

In [2]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# convert to a beautiful soup object
soup = bs(r.content)

# print out the HTML
contents = soup.prettify()
#print(contents)

In [3]:
# Taking a look at the infobox html

info_box = soup.find(class_="infobox vevent")
info_rows = info_box.find_all("tr")

#for row in info_rows:
    #print(row.prettify())

In [4]:
movie_info = {}

# Function to extract data from info box to python dict 
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")
    
for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value
    
movie_info



{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release dates': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million [1]',
 'Box office': '$1.067 billion [1]'}

### Task 2: Obtain info box for all movies 

In [8]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

# convert to beautiful soup object
soup = bs(r.content)

# print out html
contents = soup.prettify()
#print(contents)

In [6]:
movies = soup.select(".wikitable.sortable i")
movies[0:10]


[<i><a href="/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)" title="Snow White and the Seven Dwarfs (1937 film)">Snow White and the Seven Dwarfs</a></i>,
 <i><a href="/wiki/Pinocchio_(1940_film)" title="Pinocchio (1940 film)">Pinocchio</a></i>,
 <i><a href="/wiki/Fantasia_(1940_film)" title="Fantasia (1940 film)">Fantasia</a></i>,
 <i><a href="/wiki/The_Reluctant_Dragon_(1941_film)" title="The Reluctant Dragon (1941 film)">The Reluctant Dragon</a></i>,
 <i><a href="/wiki/Dumbo" title="Dumbo">Dumbo</a></i>,
 <i><a href="/wiki/Bambi" title="Bambi">Bambi</a></i>,
 <i><a href="/wiki/Saludos_Amigos" title="Saludos Amigos">Saludos Amigos</a></i>,
 <i><a href="/wiki/Victory_Through_Air_Power_(film)" title="Victory Through Air Power (film)">Victory Through Air Power</a></i>,
 <i><a href="/wiki/The_Three_Caballeros" title="The Three Caballeros">The Three Caballeros</a></i>,
 <i><a href="/wiki/Make_Mine_Music" title="Make Mine Music">Make Mine Music</a></i>]

In [11]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]    
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()

def get_info_box(url):
    
    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)
    
    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
    
    return movie_info


In [13]:
get_info_box("https://en.wikipedia.org/wiki/Spirited_Away")

{'title': 'Spirited Away',
 'Japanese': '',
 'Hepburn': 'Sen to Chihiro no Kamikakushi',
 'Directed by': 'Hayao Miyazaki',
 'Written by': 'Hayao Miyazaki',
 'Produced by': 'Toshio Suzuki',
 'Starring': ['Rumi Hiiragi',
  'Miyu Irino',
  'Mari Natsuki',
  'Takashi Naito',
  'Yasuko Sawaguchi',
  'Tsunehiko Kamijō',
  'Takehiko Ono',
  'Bunta Sugawara'],
 'Cinematography': 'Atsushi Okui',
 'Edited by': 'Takeshi Seyama',
 'Music by': 'Joe Hisaishi',
 'Production company': 'Studio Ghibli',
 'Distributed by': 'Toho',
 'Release date': ['20 July 2001 (Japan)'],
 'Running time': '125 minutes',
 'Country': 'Japan',
 'Language': 'Japanese',
 'Budget': '$19.2 million',
 'Box office': '$395.8 million'}

In [14]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

movie_info_list = []
for index, movie in enumerate(movies):
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
        
        movie_info_list.append(get_info_box(full_path))
    
    except Exception as e:
        print(movie.get_text())
        print(e)
        


0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'
500
Wish
'NoneType' object has no attribute 'find_all'
Elio
'NoneType' object has no attribute 'find_all'
510
61
'NoneType' object has no attribute 'find_all'
All Night Long
'NoneType' object has no attribute 'find'
Big Thunder Mountain Railroad
'NoneType' object has no attribute 'find_all'
520
Keeper of the Lost Cities
'NoneType' object has no attribute 'find_all'
Muppet Man
'NoneType' object has no attribute 'find_all'
530
One Thousand and One Nights
'NoneType' object has no attribute 'find_all'
Shrunk
'NoneType' object has no attribute 'find'
Sister Act 3
'NoneType' object has no attribute 'find

In [15]:
len(movie_info_list)

529

#### Save/Reload Movie Data

In [65]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [2]:
import json

def load_data(title):
    with open(title, encoding="utf-8") as f:
        return json.load(f)


In [18]:
save_data("disney_data_cleaned.json", movie_info_list)

### Task 3: Clean the Data

#### subtasks:
1. ~~clean up references [1]~~
2. ~~convert running time into an integer~~
3. ~~convert dates into datetime object~~
4. ~~split up long strings~~
5. ~~convert budget and box office to numbers~~

In [3]:
movie_info_list = load_data("disney_data_cleaned.json")

In [5]:
#print([movie.get('Running time', 'N/A') for movie in movie_info_list])

In [4]:
# Convert running time to int

def minutes_to_integer(running_time):
    if running_time == "N/A":
        return None
    
    if isinstance(running_time, list): #is a list
        return int(running_time[0].split(" ")[0])
    else: #is a string
        return int(running_time.split(" ")[0])

for movie in movie_info_list:
    movie['Running time (int)'] = minutes_to_integer(movie.get('Running time', "N/A"))

In [39]:
print([movie.get('Running time (int)', 'N/A') for movie in movie_info_list])

[83, 88, 126, 74, 64, 70, 42, 65, 71, 75, 94, 73, 75, 82, 68, 74, 96, 75, 84, 77, 92, 69, 81, 60, 127, 93, 76, 75, 73, 85, 81, 70, 90, 80, 75, 84, 83, 72, 97, 75, 104, 93, 105, 95, 97, 134, 69, 92, 126, 79, 97, 128, 73, 91, 105, 98, 130, 89, 93, 67, 98, 100, 118, 103, 110, 80, 79, 91, 91, 97, 118, 139, 131, 92, 87, 116, 93, 110, 110, 131, 101, 108, 84, 78, 75, 164, 106, 110, 99, 113, 108, 102, 85, 91, 93, 100, 100, 79, 96, 113, 89, 117, 92, 88, 92, 87, 93, 93, 93, 90, 83, 96, 88, 89, 91, 93, 92, 97, 100, 100, 89, None, 91, 112, 115, 95, 91, 97, 104, 74, 48, 77, 104, 128, 101, 94, 104, 90, 100, 88, 93, 98, 112, 84, 97, 97, 114, 96, 97, 109, 83, 90, 107, 96, 103, 91, 95, 105, 113, 80, 101, 90, 74, 90, 89, 110, 74, 93, 84, 83, 74, 77, 107, 93, 88, 108, 84, 121, 89, 104, 90, 86, 84, 108, 107, 96, 98, 105, 108, 94, 106, 102, 69, 88, 102, 102, 97, 111, 100, 96, 96, 78, 81, 108, 89, 99, 89, 81, 92, 100, 89, 79, 91, 81, 101, 104, 103, 86, 106, 74, 93, 92, 98, 95, 93, 87, 93, 87, 128, 77, 86, 9

In [None]:
print([movie.get('Running time (int)', 'N/A') for movie in movie_info_list])

In [6]:
#print([movie.get('Budget', 'N/A') for movie in movie_info_list])

In [21]:
# Convert budget and box offices to numbers

import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
value_re = rf"\${number}"


def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]


def parse_word_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value * word_value


def parse_value_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value


'''
money_conversion("$12.2 million") --> 12200000 ## Word syntax
money_conversion("$790,000") --> 790000        ## Value syntax
'''


def money_conversion(money):
    if money == "N/A":
        return None
    
    if isinstance(money, list):
        money = money[0]

    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    elif value_syntax:
        return parse_value_syntax(value_syntax.group())
    
    else:
        return None

print(money_conversion("$3.5 to 4 Million"))

3500000.0


In [22]:
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', "N/A"))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', "N/A"))

In [24]:
movie_info_list[2]

{'title': 'Fantasia',
 'Directed by': ['Samuel Armstrong',
  'James Algar',
  'Bill Roberts',
  'Paul Satterfield',
  'Ben Sharpsteen',
  'David D. Hand',
  'Hamilton Luske',
  'Jim Handley',
  'Ford Beebe',
  'T. Hee',
  'Norman Ferguson',
  'Wilfred Jackson'],
 'Story by': ['Joe Grant', 'Dick Huemer'],
 'Produced by': ['Walt Disney', 'Ben Sharpsteen'],
 'Starring': ['Leopold Stokowski', 'Deems Taylor'],
 'Narrated by': 'Deems Taylor',
 'Cinematography': 'James Wong Howe',
 'Music by': 'See program',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release date': ['November 13, 1940'],
 'Running time': '126 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$2.28 million',
 'Box office': '$76.4–$83.3 million (United States and Canada)',
 'Running time (int)': 126,
 'Budget (float)': 2280000.0,
 'Box office (float)': 83300000.0}

In [None]:
# Changing 'Release dates' to 'Release date' so I can change both to 'Release date (datetime)'

for dic in movie_info_list:
    if 'Release dates' in dic:
        dic['Release date']=dic.pop('Release dates')


In [7]:
# Convert Dates into datetimes
#print([movie.get('Release date', 'N/A') for movie in movie_info_list])

In [31]:
# June 28, 1950
from datetime import datetime

dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
    
    if date == "N/A":
        return None
    
    date_str = clean_date(date)
    fmts = ["%B %d, %Y", "%d %B %Y"] 
    
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None
    

In [34]:
for dic in movie_info_list:
    if 'Release dates' in dic:
        dic['Release date']=dic.pop('Release dates')



{'title': 'Snow White and the Seven Dwarfs', 'Directed by': ['David Hand', 'William Cottrell', 'Wilfred Jackson', 'Larry Morey', 'Perce Pearce', 'Ben Sharpsteen'], 'Written by': ['Ted Sears', 'Richard Creedon', 'Otto Englander', 'Dick Rickard', 'Earl Hurd', 'Merrill De Maris', 'Dorothy Ann Blank', 'Webb Smith'], 'Based on': ['Snow White', 'by The', 'Brothers Grimm'], 'Produced by': 'Walt Disney', 'Starring': ['Adriana Caselotti', 'Lucille La Verne', 'Harry Stockwell', 'Roy Atwell', 'Pinto Colvig', 'Otis Harlan', 'Scotty Mattraw', 'Billy Gilbert', 'Eddie Collins', 'Moroni Olsen', 'Stuart Buchanan'], 'Music by': ['Frank Churchill', 'Paul Smith', 'Leigh Harline'], 'Production company': 'Walt Disney Productions', 'Distributed by': 'RKO Radio Pictures', 'Running time': '83 minutes', 'Country': 'United States', 'Language': 'English', 'Budget': '$1.49 million', 'Box office': '$418 million', 'Running time (int)': 83, 'Budget (float)': 1490000.0, 'Box office (float)': 418000000.0, 'Release date

In [35]:
for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

In [None]:
print([movie.get('Release date (datetime)', 'N/A') for movie in movie_info_list])

#### Save/Load data with Pickle

In [37]:
import pickle 

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

In [38]:
import pickle

def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [39]:
save_data_pickle("disney_data_cleaned_more", movie_info_list)

In [40]:
movie_info_list = load_data_pickle("disney_data_cleaned_more")

### Task #4: Attach IMDB/Rotten Tomatoes/Meta Scores using OMDb api

In [None]:
# http://www.omdbapi.com/?apikey=[yourkey]& 

In [50]:
import requests
import urllib

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {"apikey": "4fa208f6", 't': title} # can use system enviornment variables to hide apikey
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    return None


In [51]:
for movie in movie_info_list:
    title = movie['title']
    omdb_info = get_omdb_info(title)
    movie['imdb'] = omdb_info.get('imdbRating', None)
    movie['metascore'] = omdb_info.get('Metascore', None)
    movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info)
    

In [55]:
# Checking random movie to see if the review scores were added
movie_info_list[-6]

{'title': 'The Aristocats',
 'Directed by': 'Wolfgang Reitherman',
 'Story by': ['Ken Anderson',
  'Larry Clemmons',
  'Eric Cleworth',
  'Vance Gerry',
  'Julius Svendsen',
  'Frank Thomas',
  'Ralph Wright'],
 'Based on': ['Tom McGowan', 'Tom Rowe'],
 'Produced by': ['Winston Hibler', 'Wolfgang Reitherman'],
 'Starring': ['Phil Harris',
  'Eva Gabor',
  'Sterling Holloway',
  'Scatman Crothers',
  'Paul Winchell',
  'Lord Tim Hudson',
  'Thurl Ravenscroft',
  'Dean Clark',
  'Liz English',
  'Gary Dubin'],
 'Edited by': 'Tom Acosta',
 'Music by': 'George Bruns',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$4 million',
 'Box office': '$191 million',
 'Running time (int)': 79,
 'Budget (float)': 4000000.0,
 'Box office (float)': 191000000.0,
 'Release date (datetime)': datetime.datetime(1970, 12, 11, 0, 0),
 'Release date': ['December 11, 

In [56]:
save_data_pickle('disney_movie_data_final.pickle', movie_info_list)

### Task #5: Save data as JSON & CSV

#### Converty data to JSON (need to change datetime variable into string)

In [57]:
#Make a copy of movie_info_list to convert datetime to string 
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [59]:
movie_info_copy[24]

{'title': '20,000 Leagues Under the Sea',
 'Directed by': 'Richard Fleischer',
 'Screenplay by': 'Earl Felton',
 'Based on': ['Twenty Thousand Leagues Under the Seas', 'by', 'Jules Verne'],
 'Produced by': 'Walt Disney',
 'Starring': ['Kirk Douglas', 'James Mason', 'Paul Lukas', 'Peter Lorre'],
 'Cinematography': 'Franz Planer',
 'Edited by': 'Elmo Williams',
 'Music by': 'Paul Smith',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['December 23, 1954'],
 'Running time': '127 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$5 million',
 'Box office': '$28.2 million',
 'Running time (int)': 127,
 'Budget (float)': 5000000.0,
 'Box office (float)': 28200000.0,
 'Release date (datetime)': datetime.datetime(1954, 12, 23, 0, 0),
 'imdb': '7.2',
 'metascore': '83',
 'rotten_tomatoes': '90%'}

In [61]:
for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)'] = None

In [64]:
# Checking if datetime was converted to string
movie_info_copy[24]

{'title': '20,000 Leagues Under the Sea',
 'Directed by': 'Richard Fleischer',
 'Screenplay by': 'Earl Felton',
 'Based on': ['Twenty Thousand Leagues Under the Seas', 'by', 'Jules Verne'],
 'Produced by': 'Walt Disney',
 'Starring': ['Kirk Douglas', 'James Mason', 'Paul Lukas', 'Peter Lorre'],
 'Cinematography': 'Franz Planer',
 'Edited by': 'Elmo Williams',
 'Music by': 'Paul Smith',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['December 23, 1954'],
 'Running time': '127 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$5 million',
 'Box office': '$28.2 million',
 'Running time (int)': 127,
 'Budget (float)': 5000000.0,
 'Box office (float)': 28200000.0,
 'Release date (datetime)': 'December 23, 1954',
 'imdb': '7.2',
 'metascore': '83',
 'rotten_tomatoes': '90%'}

In [66]:
save_data("disney_movie_data_final.json", movie_info_copy)

#### Convert data to CSV

In [67]:
import pandas as pd

df = pd.DataFrame(movie_info_list)

In [68]:
df.head()

Unnamed: 0,title,Directed by,Written by,Based on,Produced by,Starring,Music by,Production company,Distributed by,Running time,...,Traditional,Simplified,Original title,Layouts by,Music,Lyrics,Book,Basis,Productions,Awards
0,Snow White and the Seven Dwarfs,"[David Hand, William Cottrell, Wilfred Jackson...","[Ted Sears, Richard Creedon, Otto Englander, D...","[Snow White, by The, Brothers Grimm]",Walt Disney,"[Adriana Caselotti, Lucille La Verne, Harry St...","[Frank Churchill, Paul Smith, Leigh Harline]",Walt Disney Productions,RKO Radio Pictures,83 minutes,...,,,,,,,,,,
1,Pinocchio,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...",,"[The Adventures of Pinocchio, by, Carlo Collodi]",Walt Disney,"[Cliff Edwards, Dickie Jones, Christian Rub, W...","[Leigh Harline, Paul J. Smith]",Walt Disney Productions,RKO Radio Pictures,88 minutes,...,,,,,,,,,,
2,Fantasia,"[Samuel Armstrong, James Algar, Bill Roberts, ...",,,"[Walt Disney, Ben Sharpsteen]","[Leopold Stokowski, Deems Taylor]",See program,Walt Disney Productions,RKO Radio Pictures,126 minutes,...,,,,,,,,,,
3,The Reluctant Dragon,"[Alfred Werker, (live action), Hamilton Luske,...","[Live-action:, Ted Sears, Al Perkins, Larry Cl...",,Walt Disney,"[Robert Benchley, Frances Gifford, Buddy Peppe...","[Frank Churchill, Larry Morey]",Walt Disney Productions,RKO Radio Pictures,74 minutes,...,,,,,,,,,,
4,Dumbo,"[Ben Sharpsteen, Norman Ferguson, Wilfred Jack...",,"[Dumbo, the Flying Elephant, by, Helen Aberson...",Walt Disney,"[Edward Brophy, Verna Felton, Cliff Edwards, H...","[Frank Churchill, Oliver Wallace]",Walt Disney Productions,RKO Radio Pictures,64 minutes,...,,,,,,,,,,


In [69]:
df.to_csv("disney_movie_data_final.csv")

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 529 entries, 0 to 528
Data columns (total 44 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   title                    529 non-null    object        
 1   Directed by              525 non-null    object        
 2   Written by               218 non-null    object        
 3   Based on                 290 non-null    object        
 4   Produced by              516 non-null    object        
 5   Starring                 491 non-null    object        
 6   Music by                 519 non-null    object        
 7   Production company       209 non-null    object        
 8   Distributed by           527 non-null    object        
 9   Running time             514 non-null    object        
 10  Country                  465 non-null    object        
 11  Language                 505 non-null    object        
 12  Budget                   327 non-nul

In [74]:
running_times = df.sort_values(['Running time (int)'], ascending=False)

running_times.head()

Unnamed: 0,title,Directed by,Written by,Based on,Produced by,Starring,Music by,Production company,Distributed by,Running time,...,Traditional,Simplified,Original title,Layouts by,Music,Lyrics,Book,Basis,Productions,Awards
526,Tinker Bell,"[Bradley Raymond ( 1 , 3 & 4 ), Klay Hall ( 2 ...",,,,"[Mae Whitman, Lucy Liu, Raven-Symoné, Megan Hi...",Joel McNeely,DisneyToon Studios,"[Walt Disney Studios, Home Entertainment]",[468 minutes],...,,,,,,,,,,
332,Pirates of the Caribbean: At World's End,Gore Verbinski,"[Ted Elliott, Terry Rossio]",[Characters by Ted Elliott Terry Rossio Stuart...,Jerry Bruckheimer,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",Hans Zimmer,,Buena Vista Pictures Distribution,169 minutes,...,,,,,,,,,,
85,The Happiest Millionaire,Norman Tokar,,"[My Philadelphia Father, by Cordelia Drexel Bi...","[Walt Disney, Bill Anderson]","[Fred MacMurray, Tommy Steele, Greer Garson, G...",Jack Elliott,Walt Disney Productions,Buena Vista Distribution,"[164 minutes, (, Los Angeles, premiere), 144 m...",...,,,,,,,,,,
445,Jagga Jasoos,Anurag Basu,"[Screenplay:, Anurag Basu, Dialogues in Rhyme:...",,"[Siddharth Roy Kapur, Anurag Basu, Ranbir Kapoor]","[Ranbir Kapoor, Katrina Kaif, Saswata Chatterj...",Pritam,,UTV Motion Pictures,162 minutes,...,,,,,,,,,,
438,Dangal,Nitesh Tiwari,"[Nitesh Tiwari, Piyush Gupta, Shreyas Jain, Ni...",,"[Aamir Khan, Kiran Rao, Siddharth Roy Kapur]","[Aamir Khan, Sakshi Tanwar, Fatima Sana Shaikh...",Pritam,,UTV Motion Pictures,161 minutes,...,,,,,,,,,,
