### Import Necessary Libraries

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import json
import requests
import urllib
import pickle
import pandas as pd

### Load the page given in the URL variable using Requests lib

In [2]:
# Download the Wikipedia list page and parse it once; later cells read `soup`.
url = 'https://en.wikipedia.org/wiki/List_of_Marvel_Cinematic_Universe_films'
r = requests.get(url, timeout=30)  # bounded wait instead of hanging forever
r.raise_for_status()  # fail loudly rather than parsing an HTTP error page
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "guessed parser" warning).
soup = bs(r.text, 'html.parser')

### Convert a URL into a BeautifulSoup object and remove unnecessary tags

In [3]:
def clean_tags(soup):
    """Remove every ``<sup>`` element from ``soup`` in place.

    The tags are destroyed with ``decompose()``, so their text no longer shows
    up in later ``get_text()`` calls (on Wikipedia these are presumably the
    footnote markers - verify if other superscripts matter).

    Returns None; ``soup`` itself is modified.
    """
    superscripts = soup.find_all(['sup'])
    for superscript in superscripts:
        superscript.decompose()

def url_bs_object(url):
    """Fetch a film page and return the ``<tr>`` rows of its infobox.

    Parameters
    ----------
    url : str
        Absolute URL of the page to download.

    Returns
    -------
    list
        The ``tr`` tags found inside the first ``table`` whose class attribute
        is ``'infobox vevent'``, after ``<sup>`` tags have been stripped.

    Raises
    ------
    requests.RequestException
        On network failure, timeout or a non-2xx HTTP status.
    ValueError
        If the page contains no matching infobox table.
    """
    r = requests.get(url, timeout=30)
    r.raise_for_status()  # do not try to parse an error page
    # Explicit parser: keeps results independent of optionally installed parsers.
    soup = bs(r.text, 'html.parser')
    clean_tags(soup)
    info_box = soup.find('table', class_='infobox vevent')
    if info_box is None:
        # Previously this surfaced as "'NoneType' object has no attribute 'find_all'".
        raise ValueError('No infobox table found at {}'.format(url))
    return info_box.find_all('tr')

In [4]:
def get_content_value(row):
    """Extract the value part of one infobox row.

    If the row contains an element with class ``plainlist``, a list with the
    text of every ``<li>`` in the row is returned; otherwise the text of the
    row's first ``<td>`` is returned as a single string. In both cases
    non-breaking spaces are replaced by ordinary spaces.
    """
    def _text_of(node):
        # Same extraction for list items and plain cells.
        return node.get_text(strip=True, separator=' ').replace(u'\xa0', u' ')

    if not row.find(class_='plainlist'):
        return _text_of(row.find('td'))
    return [_text_of(item) for item in row.find_all('li')]

### Dump the data scraped from the info box into a dictionary

In [5]:
def insert_content_dict(info_rows):
    """Turn the rows of a film infobox into a ``{label: value}`` dictionary.

    Parameters
    ----------
    info_rows : iterable
        Infobox ``<tr>`` tags, in page order. The first row must hold the
        film title in a ``<th>``.

    Returns
    -------
    dict
        ``'title'`` plus one entry per labelled row; keys are the ``<th>``
        texts and values come from ``get_content_value``.

    Notes
    -----
    Rows are skipped when they lack a ``<th>`` label or a ``<td>`` value
    (e.g. the poster row) instead of blindly skipping row index 1. This keeps
    the first real field on pages without a poster row and avoids an
    ``AttributeError`` on unlabelled rows further down.
    """
    movie_info = {}  # this dictionary will store all the info of a particular movie
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find('th').get_text(strip=True)
            continue
        header = row.find('th')
        if header is None or row.find('td') is None:
            continue  # image / spacer / heading-only row: nothing to record
        movie_info[header.get_text(strip=True, separator=' ')] = get_content_value(row)
    return movie_info

### Scrape the info box of every movie in 'List of Marvel Cinematic Universe films' and append it to the `movie_info_list` list

In [6]:
# Visit every film linked from the list table and collect its infobox data.
movie_info_list = []
base_url = 'https://en.wikipedia.org'
table = soup.find('table', class_='wikitable plainrowheaders')
if table is None:
    raise ValueError("No 'wikitable plainrowheaders' table found on the list page")
movie_rows = table.find_all('tr')
for row in movie_rows:
    # The film title is the italicised link; rows without one (e.g. header
    # rows) carry no film and are skipped explicitly rather than via a
    # swallowed exception.
    title_cell = row.find('i')
    link = title_cell.a if title_cell is not None else None
    if link is None or not link.get('href'):
        continue
    # Separate name so the list-page `url` defined earlier is not overwritten.
    movie_url = base_url + link['href']
    try:
        info_rows = url_bs_object(movie_url)
        movie_info = insert_content_dict(info_rows)
    except Exception as e:
        # Still best-effort (one bad page must not stop the crawl), but report
        # what was dropped and why instead of hiding it.
        print('Skipping {}: {!r}'.format(movie_url, e))
        continue
    movie_info_list.append(movie_info)

movie_info_list

[{'title': 'Iron Man',
  'Directed by': 'Jon Favreau',
  'Produced by': ['Avi Arad', 'Kevin Feige'],
  'Screenplay by': ['Mark Fergus Hawk Ostby', 'Art Marcum Matt Holloway'],
  'Based on': ['Stan Lee', 'Larry Lieber', 'Don Heck', 'Jack Kirby'],
  'Starring': ['Robert Downey Jr.',
   'Terrence Howard',
   'Jeff Bridges',
   'Shaun Toub',
   'Gwyneth Paltrow'],
  'Music by': 'Ramin Djawadi',
  'Cinematography': 'Matthew Libatique',
  'Edited by': 'Dan Lebental',
  'Production company': 'Marvel Studios',
  'Distributed by': 'Paramount Pictures',
  'Release date': ['April 14, 2008 ( 2008-04-14 ) (Sydney)',
   'May 2, 2008 ( 2008-05-02 ) (United States)'],
  'Running time': '126 minutes',
  'Country': 'United States',
  'Language': 'English',
  'Budget': '$140 million',
  'Box office': '$585.3 million'},
 {'title': 'The Incredible Hulk',
  'Directed by': 'Louis Leterrier',
  'Produced by': ['Avi Arad', 'Gale Anne Hurd', 'Kevin Feige'],
  'Written by': 'Zak Penn',
  'Based on': ['Stan Lee',

### Convert "Running time" into an integer and keep only the first "Release date" as text (it is parsed into a date object further below)

In [7]:
def _first_entry(value):
    """Return ``value[0]`` for a list, otherwise ``value`` unchanged."""
    return value[0] if isinstance(value, list) else value


# Infobox values are a list when the cell was a plain list and a string
# otherwise, so both shapes are normalised before slicing. Indexing a plain
# string with [0] would silently keep only its first character.
for movie in movie_info_list:
    running_time = movie['Running time']
    if not isinstance(running_time, int):  # already converted if the cell is re-run
        # '126 minutes' -> 126
        movie['Running time'] = int(_first_entry(running_time).split(' ')[0])
    # 'April 14, 2008 ( 2008-04-14 ) (Sydney)' -> 'April 14, 2008 '
    # The trailing space is kept on purpose: the strptime format used later
    # in the notebook ends with a space.
    movie['Release date'] = _first_entry(movie['Release date']).split('(')[0]

### Convert the Box Office into float numbers, all in the scale of MILLION

In [8]:
def boxofice_into_num(box_office):
    """Convert a box-office string such as ``'$585.3 million'`` to millions.

    Parameters
    ----------
    box_office : str
        Amount followed by a scale word, e.g. ``'$1.519 billion'``. Extra
        whitespace, thousands separators and trailing text after the scale
        word are tolerated.

    Returns
    -------
    float
        The amount in millions of dollars (billions are multiplied by 1000;
        any other scale word is taken as millions, as before).

    Raises
    ------
    ValueError
        If the amount or the scale word is missing, or the amount is not a
        number.
    """
    tokens = box_office.split()  # any run of whitespace separates tokens
    if len(tokens) < 2:
        raise ValueError(
            'Expected "<amount> <scale>", got {!r}'.format(box_office)
        )
    # Strip the currency sign explicitly instead of dropping the first
    # character blindly, which corrupted amounts written without a '$'.
    amount = float(tokens[0].lstrip('$').replace(',', ''))
    if tokens[1].startswith('billion'):
        return amount * 1000
    return amount

# Replace the textual 'Box office' field by a numeric column in USD millions.
for movie in movie_info_list:
    if 'Box office' not in movie:
        # Already converted (cell re-run) or the infobox had no such field.
        continue
    # Add the new key first and drop the old one afterwards, so a failed
    # conversion leaves the record untouched.
    movie['Box Office (USD Millions)'] = boxofice_into_num(movie['Box office'])
    del movie['Box office']

### Save/Reload Movie Data

In [9]:
def save_data(file, data):
    """Serialise ``data`` to ``file`` as JSON.

    The file is (over)written as UTF-8 text, indented by two spaces, with
    non-ASCII characters stored verbatim rather than as ``\\uXXXX`` escapes.
    """
    with open(file, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=2, ensure_ascii=False)

In [10]:
def load_data(file, encoding=None):
    """Read a JSON file and return the decoded object.

    Parameters
    ----------
    file : str
        Path of the JSON file.
    encoding : str, optional
        Text encoding used to open the file. ``None`` (the default) keeps the
        previous behaviour of using the platform default encoding.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file content.

    Notes
    -----
    NOTE(review): ``save_data`` always writes UTF-8, so on a platform whose
    default encoding differs, non-ASCII characters come back garbled unless
    ``encoding='utf-8'`` is passed. The default is left unchanged because
    ``budget_into_num`` matches such a garbled dash - confirm before switching.
    """
    with open(file, encoding=encoding) as f:
        return json.load(f)

In [11]:
# Checkpoint: write the records (running time, release date and box office already converted) to JSON.
save_data('marvel_movie_data.json',movie_info_list)

In [12]:
# Restore the records from the JSON checkpoint, replacing the in-memory list.
movie_info_list = load_data('marvel_movie_data.json')

### Convert Budget into float numbers all in scale of MILLION

In [13]:
def budget_into_num(budget):
    """Convert a budget string (or list of strings) to a float in millions.

    Parameters
    ----------
    budget : str or list of str
        e.g. ``'$140 million'`` or ``'$170-200 million'``. For a list only
        the first entry is used; for a range only the lower bound is used.

    Returns
    -------
    float
        The leading number of the budget text. The scale word is not
        inspected, so the value is assumed to be in millions already.

    Raises
    ------
    ValueError
        If the leading token is not a number.
    """
    if isinstance(budget, list):
        budget = budget[0]
    # Every dash form that can separate the two ends of a range. Handling
    # them up front replaces the former bare ``except`` retry, which also hid
    # unrelated errors.
    range_separators = (
        u'\u00e2\u20ac\u201c',  # UTF-8 en dash mis-decoded as cp1252 (the form matched before)
        u'\u2013',              # en dash
        u'\u2014',              # em dash
        u'-',                   # ASCII hyphen
    )
    for separator in range_separators:
        budget = budget.replace(separator, u' ')
    lower_bound = budget.split()[0]
    # Strip the currency sign explicitly rather than dropping one character blindly.
    return float(lower_bound.lstrip('$').replace(',', ''))

In [14]:
# Replace the textual 'Budget' field by a numeric column in USD millions.
for movie in movie_info_list:
    if 'Budget' not in movie:
        # Already converted (cell re-run) or the infobox had no such field.
        continue
    # Add the new key first and drop the old one afterwards, so a failed
    # conversion leaves the record untouched.
    movie['Budget (USD Millions)'] = budget_into_num(movie['Budget'])
    del movie['Budget']

### Convert Release date into a Date object

In [15]:
from datetime import datetime

# Parse release dates such as 'April 14, 2008' into datetime objects.
for movie in movie_info_list:
    release_date = movie['Release date']
    if isinstance(release_date, datetime):
        continue  # already parsed if this cell is re-run
    # Strip first so parsing does not depend on the exact trailing whitespace
    # left behind by the earlier split on '('.
    movie['Release date'] = datetime.strptime(release_date.strip(), '%B %d, %Y')

### Save the data again, this time in pickle format

In [16]:
def save_pickle(file, data=None):
    """Pickle ``data`` to ``file`` using the highest available protocol.

    Parameters
    ----------
    file : str
        Destination path; overwritten if it exists.
    data : object, optional
        Object to store. When omitted, the notebook-level ``movie_info_list``
        is written, which is what ``save_pickle(file)`` always did.
    """
    if data is None:
        data = movie_info_list  # backward-compatible default
    with open(file, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
def load_pickle(file):
    """Load and return the object pickled in ``file``.

    The path passed by the caller is now honoured; previously it was ignored
    and ``'marvel_movie_data_final.pickle'`` was always opened.

    NOTE(review): unpickling can execute arbitrary code - only load files
    produced by this notebook or another trusted source.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle)

In [18]:
# Checkpoint the records as a pickle; 'Release date' now holds datetime objects (the JSON export further below converts them to text first).
save_pickle('marvel_movie_data_final.pickle')

In [19]:
# Restore the records from the pickle checkpoint, replacing the in-memory list.
movie_info_list = load_pickle('marvel_movie_data_final.pickle')

### Attach IMDb, Metascore and Rotten Tomatoes scores to the dataset

In [20]:
def get_omdb_info(title, api_key=None):
    """Query the OMDb API by title and return the decoded JSON response.

    Parameters
    ----------
    title : str
        Movie title, sent as the ``t`` query parameter.
    api_key : str, optional
        OMDb API key. When omitted, the ``OMDB_API_KEY`` environment variable
        is used, falling back to the key that was previously hard-coded here.

    Returns
    -------
    dict
        The parsed JSON body of the response.
    """
    import os  # local import: `os` is not part of the notebook's import cell

    if api_key is None:
        # NOTE(review): the literal fallback only keeps existing runs working.
        # A key committed to a notebook should be rotated and supplied through
        # the environment instead.
        api_key = os.environ.get('OMDB_API_KEY', 'ef11ad99')
    # Let requests build and encode the query string (no reliance on
    # `urllib.parse` being importable via a bare `import urllib`), use HTTPS
    # so the key is not sent in clear text, and bound the wait.
    response = requests.get(
        'https://www.omdbapi.com/',
        params={'apikey': api_key, 't': title},
        timeout=30,
    )
    return response.json()

def get_rotten_tomato(omdb_info):
    """Return the Rotten Tomatoes score contained in an OMDb response.

    Scans the ``'Ratings'`` list of ``omdb_info`` and returns the ``'Value'``
    of the first entry whose ``'Source'`` is ``'Rotten Tomatoes'``. Returns
    None when the list is missing, empty or has no such entry.
    """
    return next(
        (
            entry['Value']
            for entry in omdb_info.get('Ratings', [])
            if entry['Source'] == 'Rotten Tomatoes'
        ),
        None,
    )

In [21]:
# Enrich every record with the review scores reported by OMDb for its title.
# Fields absent from the response are stored as None.
for movie in movie_info_list:
    ratings_payload = get_omdb_info(movie['title'])
    movie['imdbRating'] = ratings_payload.get('imdbRating')
    movie['Metascore'] = ratings_payload.get('Metascore')
    movie['Rotten Tomatoes'] = get_rotten_tomato(ratings_payload)

In [22]:
# Overwrite the pickle checkpoint now that the rating fields have been attached.
save_pickle('marvel_movie_data_final.pickle')

In [23]:
# Read the enriched records back from the pickle checkpoint.
movie_info_list = load_pickle('marvel_movie_data_final.pickle')

In [24]:
# Display the full list of enriched records as the cell output.
movie_info_list

[{'title': 'Iron Man',
  'Directed by': 'Jon Favreau',
  'Produced by': ['Avi Arad', 'Kevin Feige'],
  'Screenplay by': ['Mark Fergus Hawk Ostby', 'Art Marcum Matt Holloway'],
  'Based on': ['Stan Lee', 'Larry Lieber', 'Don Heck', 'Jack Kirby'],
  'Starring': ['Robert Downey Jr.',
   'Terrence Howard',
   'Jeff Bridges',
   'Shaun Toub',
   'Gwyneth Paltrow'],
  'Music by': 'Ramin Djawadi',
  'Cinematography': 'Matthew Libatique',
  'Edited by': 'Dan Lebental',
  'Production company': 'Marvel Studios',
  'Distributed by': 'Paramount Pictures',
  'Release date': datetime.datetime(2008, 4, 14, 0, 0),
  'Running time': 126,
  'Country': 'United States',
  'Language': 'English',
  'Box Office (USD Millions)': 585.3,
  'Budget (USD Millions)': 140.0,
  'imdbRating': '7.9',
  'Metascore': '79',
  'Rotten Tomatoes': '94%'},
 {'title': 'The Incredible Hulk',
  'Directed by': 'Louis Leterrier',
  'Produced by': ['Avi Arad', 'Gale Anne Hurd', 'Kevin Feige'],
  'Written by': 'Zak Penn',
  'Based 

### Save Data as JSON & CSV

In [25]:
# Shallow-copy every record so the JSON export below can replace 'Release date' without altering movie_info_list.
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [26]:
# Turn each datetime release date in the copies back into text such as
# 'April 14, 2008' before the copies are written out as JSON.
for movie in movie_info_copy:
    parsed_date = movie['Release date']
    movie['Release date'] = parsed_date.strftime('%B %d, %Y')

In [27]:
# Export the copies (release dates already converted to text) as the final JSON file.
save_data('marvel_movie_data_final.json',movie_info_copy)

In [28]:
# Load the exported JSON back; 'Release date' is a plain string in these records.
movie_info_copy = load_data('marvel_movie_data_final.json')

In [29]:
# Build the table from movie_info_list (datetime release dates), not from the JSON copies; one row per movie.
df = pd.DataFrame(movie_info_list)

In [30]:
# Display the resulting DataFrame as the cell output.
df

Unnamed: 0,title,Directed by,Produced by,Screenplay by,Based on,Starring,Music by,Cinematography,Edited by,Production company,...,Country,Language,Box Office (USD Millions),Budget (USD Millions),imdbRating,Metascore,Rotten Tomatoes,Written by,Story by,Production companies
0,Iron Man,Jon Favreau,"[Avi Arad, Kevin Feige]","[Mark Fergus Hawk Ostby, Art Marcum Matt Hollo...","[Stan Lee, Larry Lieber, Don Heck, Jack Kirby]","[Robert Downey Jr., Terrence Howard, Jeff Brid...",Ramin Djawadi,Matthew Libatique,Dan Lebental,Marvel Studios,...,United States,English,585.3,140.0,7.9,79,94%,,,
1,The Incredible Hulk,Louis Leterrier,"[Avi Arad, Gale Anne Hurd, Kevin Feige]",,"[Stan Lee, Jack Kirby]","[Edward Norton, Liv Tyler, Tim Roth, Tim Blake...",Craig Armstrong,Peter Menzies Jr.,"[John Wright, Rick Shaine, Vincent Tabaillon]","[Marvel Studios, Valhalla Motion Pictures]",...,United States,English,264.8,150.0,6.7,61,67%,Zak Penn,,
2,Iron Man 2,Jon Favreau,Kevin Feige,,"[Stan Lee, Larry Lieber, Don Heck, Jack Kirby]","[Robert Downey Jr., Gwyneth Paltrow, Don Chead...",John Debney,Matthew Libatique,"[Dan Lebental, Richard Pearson]",Marvel Studios,...,United States,English,623.9,170.0,7.0,57,72%,Justin Theroux,,
3,Captain America: The First Avenger,Joe Johnston,Kevin Feige,Christopher Markus Stephen McFeely,"[Joe Simon, Jack Kirby]","[Chris Evans, Tommy Lee Jones, Hugo Weaving, H...",Alan Silvestri,Shelly Johnson,"[Jeffrey Ford, Robert Dalva]",Marvel Studios,...,United States,English,370.6,140.0,6.9,66,80%,,,
4,The Avengers,Joss Whedon,Kevin Feige,Joss Whedon,"[Stan Lee, Jack Kirby]","[Robert Downey Jr., Chris Evans, Mark Ruffalo,...",Alan Silvestri,Seamus McGarvey,"[Jeffrey Ford, Lisa Lassek]",Marvel Studios,...,United States,English,1519.0,220.0,8.0,69,92%,,"[Zak Penn, Joss Whedon]",
5,Iron Man 3,Shane Black,Kevin Feige,"[Shane Black, Drew Pearce]","[Stan Lee, Don Heck, Larry Lieber, Jack Kirby]","[Robert Downey Jr., Gwyneth Paltrow, Don Chead...",Brian Tyler,John Toll,"[Jeffrey Ford, Peter S. Elliot]",Marvel Studios,...,United States,English,1215.0,200.0,7.2,62,79%,,,
6,Thor: The Dark World,Alan Taylor,Kevin Feige,"[Christopher Yost, Christopher Markus Stephen ...","[Stan Lee, Larry Lieber, Jack Kirby]","[Chris Hemsworth, Natalie Portman, Tom Hiddles...",Brian Tyler,Kramer Morgenthau,"[Dan Lebental, Wyatt Smith]",Marvel Studios,...,United States,English,644.8,170.0,6.9,54,66%,,"[Don Payne, Robert Rodat]",
7,Captain America: The Winter Soldier,"[Anthony Russo, Joe Russo]",Kevin Feige,"[Christopher Markus, Stephen McFeely]","[Joe Simon, Jack Kirby]","[Chris Evans, Scarlett Johansson, Sebastian St...",Henry Jackman,Trent Opaloch,"[Jeffrey Ford, Matthew Schmidt]",Marvel Studios,...,United States,English,714.4,170.0,7.7,70,90%,,,
8,Avengers: Age of Ultron,Joss Whedon,Kevin Feige,,"[Stan Lee, Jack Kirby]","[Robert Downey Jr., Chris Hemsworth, Mark Ruff...","[Brian Tyler, Danny Elfman]",Ben Davis,"[Jeffrey Ford, Lisa Lassek]",Marvel Studios,...,United States,English,1403.0,444.0,7.3,66,76%,Joss Whedon,,
9,Ant-Man,Peyton Reed,Kevin Feige,"[Edgar Wright, Joe Cornish, Adam McKay, Paul R...","[Stan Lee, Larry Lieber, Jack Kirby]","[Paul Rudd, Evangeline Lilly, Corey Stoll, Bob...",Christophe Beck,Russell Carpenter,"[Dan Lebental, Colby Parker, Jr.]",Marvel Studios,...,United States,English,519.3,130.0,7.3,64,83%,,"[Edgar Wright, Joe Cornish]",


In [34]:
# Write the table to CSV.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column; pass index=False if that column is not wanted.
df.to_csv('marvel_movie_data_final.csv')