# Set-up

In [2]:
# load packages
import requests
from bs4 import BeautifulSoup

In [3]:
# Define the URL of the site
base_site = "https://editorial.rottentomatoes.com/guide/140-essential-action-movies-to-watch-now/2/"

In [5]:
# sending a request to the webpage
response = requests.get(base_site)
response.status_code

200

In [6]:
# get the HTML from the webpage
html = response.content

## Choosing a parser

### html.parser

In [7]:
# convert the HTML to a Beautiful Soup object
soup = BeautifulSoup(html, 'html.parser')

In [8]:
# Exporting the HTML to a file
with open('Rotten_tomatoes_page_2_HTML_Parser.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

In [9]:
# When inspecting the file we see that HTML element is closed at the begining -- it parsed incorrectly!
# Let's check another parser

### lxml

In [10]:
# convert the HTML to a BeatifulSoup object
soup = BeautifulSoup(html, 'lxml')

In [11]:
# Exporting the HTML to a file
with open('Rotten_tomatoes_page_2_LXML_Parser.html', 'wb') as file:
    file.write(soup.prettify('utf-8'))

In [12]:
# By first accounts of inspecting the file everything seems fine

### A word of caution

In [13]:
# Beautiful Soup ranks the lxml parser as the best one.

# If a parser is not explicitly stated in the Beautiful Soup constructor,
# the best one available on the current machine is chosen.

# This means that the same piece of code can give different results on different computers.

## Finding an element containing all the data

In [14]:
# Find all div tags on the webpage containing the information we want to scrape
divs = soup.find_all("div", {"class": "col-sm-18 col-full-xs countdown-item-content"})
divs

[<div class="col-sm-18 col-full-xs countdown-item-content">
 <div class="row countdown-item-title-bar">
 <div class="col-sm-20 col-full-xs" style="height: 100%;">
 <div class="article_movie_title" style="float: left;">
 <div><h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared/">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">60%</span></h2></div>
 </div>
 </div>
 <div class="col-sm-4 col-full-xs" style="height: 100%;">
 <div class="countdown-index">#140</div>
 </div>
 </div>
 <div class="row countdown-item-details">
 <div class="col-sm-24">
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>61.188% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="to

# Extracting the title, year and score of each movie

In [15]:
# The title, year and score of each movie are contained in the 'h2' tags

In [16]:
# for instance, let's explore the first div
divs[0].find("h2")

<h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared/">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">60%</span></h2>

In [17]:
# Extracting all 'h2' tags
headings = [div.find("h2") for div in divs]
headings

[<h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared/">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">60%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/equilibrium/">Equilibrium</a> <span class="subtle start-year">(2002)</span> <span class="icon tiny rotten" title="Rotten"></span> <span class="tMeterScore">40%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/hero/">Hero</a> <span class="subtle start-year">(2004)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">95%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/1017666-road_house/">Road House</a> <span class="subtle start-year">(1989)</span> <span class="icon tiny rotten" title="Rotten"></span> <span class="tMeterScore">39%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/unstoppable-2010/">Unstoppable</a> <span class="subtle st

In [18]:
# Inspecting the text inside the headings
[heading.text for heading in headings]

['Running Scared (1986)  60%',
 'Equilibrium (2002)  40%',
 'Hero (2004)  95%',
 'Road House (1989)  39%',
 'Unstoppable (2010)  86%',
 'Shaft (1971)  88%',
 'The Villainess (Ak-Nyeo) (2017)  84%',
 'Highlander (1986)  69%',
 'Die Hard 2 (1990)  68%',
 'National Treasure (2004)  46%',
 'The Protector (Tom yum goong) (Warrior King) (2005)  53%',
 'Revenge (2018)  92%',
 'El Mariachi (1993)  93%',
 'A Touch of Zen (1969)  96%',
 'Top Gun (1986)  54%',
 'Con Air (1997)  55%',
 'The Expendables 2 (2012)  68%',
 'The Mummy (1999)  60%',
 'Mr. & Mrs. Smith (2005)  60%',
 'Rush Hour (1998)  60%',
 'The Equalizer (2014)  59%',
 'Captain America: Civil War (2016)  91%',
 'Air Force One (1997)  76%',
 'Bloodsport (1988)  38%',
 'Blade (1998)  55%',
 'Bad Boys (1995)  43%',
 'Die Hard: With a Vengeance (1995)  52%',
 'The Running Man (1987)  65%',
 'Code of Silence (1985)  63%',
 "Shoot 'Em Up (2007)  67%",
 'Crank (2006)  61%',
 'Machete (2010)  72%',
 'Drive (2011)  92%',
 'Batman (1989)  72%',

In [19]:
# It does contain the info we want to extract
# However, we need to obtain the title, year and score separately
# Let's inspect one heading to see if there is a way to distinguish between them
headings[0]

<h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared/">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">60%</span></h2>

In [20]:
# We notice that:

# The movie title is in the 'a' tag
# The year is in a 'span' with class 'start-year'
# The score is in a 'span' with class 'tMeterScore'

## Title

In [21]:
# Let's check all heading links
[heading.find('a') for heading in headings]

[<a href="https://www.rottentomatoes.com/m/1018009-running_scared/">Running Scared</a>,
 <a href="https://www.rottentomatoes.com/m/equilibrium/">Equilibrium</a>,
 <a href="https://www.rottentomatoes.com/m/hero/">Hero</a>,
 <a href="https://www.rottentomatoes.com/m/1017666-road_house/">Road House</a>,
 <a href="https://www.rottentomatoes.com/m/unstoppable-2010/">Unstoppable</a>,
 <a href="https://www.rottentomatoes.com/m/1018699-shaft/">Shaft</a>,
 <a href="https://www.rottentomatoes.com/m/the_villainess/">The Villainess (Ak-Nyeo)</a>,
 <a href="https://www.rottentomatoes.com/m/highlander/">Highlander</a>,
 <a href="https://www.rottentomatoes.com/m/die_hard_2_1990/">Die Hard 2</a>,
 <a href="https://www.rottentomatoes.com/m/national_treasure/">National Treasure</a>,
 <a href="https://www.rottentomatoes.com/m/protector/">The Protector (Tom yum goong) (Warrior King)</a>,
 <a href="https://www.rottentomatoes.com/m/revenge_2018/">Revenge</a>,
 <a href="https://www.rottentomatoes.com/m/el_ma

In [22]:
# Obtaining the movie titles from the links
movie_names = [heading.find('a').string for heading in headings]
movie_names

['Running Scared',
 'Equilibrium',
 'Hero',
 'Road House',
 'Unstoppable',
 'Shaft',
 'The Villainess (Ak-Nyeo)',
 'Highlander',
 'Die Hard 2',
 'National Treasure',
 'The Protector (Tom yum goong) (Warrior King)',
 'Revenge',
 'El Mariachi',
 'A Touch of Zen',
 'Top Gun',
 'Con Air',
 'The Expendables 2',
 'The Mummy',
 'Mr. & Mrs. Smith',
 'Rush Hour',
 'The Equalizer',
 'Captain America: Civil War',
 'Air Force One',
 'Bloodsport',
 'Blade',
 'Bad Boys',
 'Die Hard: With a Vengeance',
 'The Running Man',
 'Code of Silence',
 "Shoot 'Em Up",
 'Crank',
 'Machete',
 'Drive',
 'Batman',
 'Under Siege',
 'Independence Day',
 'Bullitt',
 'Wanted',
 'Superman',
 'Ronin',
 'They Live',
 'Cliffhanger',
 "Marvel's The Avengers",
 'Hot Fuzz',
 'The Warriors',
 'Starship Troopers',
 'Elite Squad: The Enemy Within',
 'Point Break',
 'The Long Kiss Goodnight',
 'The Guest',
 'Taken',
 '300',
 'True Lies',
 'Demolition Man',
 'Hardcore Henry',
 'Police Story (Ging chaat goo si) (Police Force)',
 '

## Year

In [23]:
# Filtering only the spans containing the year
[heading.find("span", class_ = 'start-year') for heading in headings]

[<span class="subtle start-year">(1986)</span>,
 <span class="subtle start-year">(2002)</span>,
 <span class="subtle start-year">(2004)</span>,
 <span class="subtle start-year">(1989)</span>,
 <span class="subtle start-year">(2010)</span>,
 <span class="subtle start-year">(1971)</span>,
 <span class="subtle start-year">(2017)</span>,
 <span class="subtle start-year">(1986)</span>,
 <span class="subtle start-year">(1990)</span>,
 <span class="subtle start-year">(2004)</span>,
 <span class="subtle start-year">(2005)</span>,
 <span class="subtle start-year">(2018)</span>,
 <span class="subtle start-year">(1993)</span>,
 <span class="subtle start-year">(1969)</span>,
 <span class="subtle start-year">(1986)</span>,
 <span class="subtle start-year">(1997)</span>,
 <span class="subtle start-year">(2012)</span>,
 <span class="subtle start-year">(1999)</span>,
 <span class="subtle start-year">(2005)</span>,
 <span class="subtle start-year">(1998)</span>,
 <span class="subtle start-year">(2014)<

In [24]:
# Extracting the year string
years = [heading.find("span", class_ = 'start-year').string for heading in headings]
years

['(1986)',
 '(2002)',
 '(2004)',
 '(1989)',
 '(2010)',
 '(1971)',
 '(2017)',
 '(1986)',
 '(1990)',
 '(2004)',
 '(2005)',
 '(2018)',
 '(1993)',
 '(1969)',
 '(1986)',
 '(1997)',
 '(2012)',
 '(1999)',
 '(2005)',
 '(1998)',
 '(2014)',
 '(2016)',
 '(1997)',
 '(1988)',
 '(1998)',
 '(1995)',
 '(1995)',
 '(1987)',
 '(1985)',
 '(2007)',
 '(2006)',
 '(2010)',
 '(2011)',
 '(1989)',
 '(1992)',
 '(1996)',
 '(1968)',
 '(2008)',
 '(1978)',
 '(1998)',
 '(1988)',
 '(1993)',
 '(2012)',
 '(2007)',
 '(1979)',
 '(1997)',
 '(2011)',
 '(1991)',
 '(1996)',
 '(2014)',
 '(2009)',
 '(2007)',
 '(1994)',
 '(1993)',
 '(2016)',
 '(1985)',
 '(2001)',
 '(2015)',
 '(1997)',
 '(1986)',
 '(2017)',
 '(1995)',
 '(2006)',
 '(1984)',
 '(2005)',
 '(2004)',
 '(2001)',
 '(1981)',
 '(2000)',
 '(2004)',
 '(2011)',
 '(1992)',
 '(1989)',
 '(2005)',
 '(2010)',
 '(2008)',
 '(2018)',
 '(2017)',
 '(1964)',
 '(1976)',
 '(2017)',
 '(1972)',
 '(2014)',
 '(2005)',
 '(1971)',
 '(2015)',
 '(1990)',
 '(1996)',
 '(1971)',
 '(2014)',
 '(2003)',

In [25]:
years[0]

'(1986)'

### Removing the brackets

In [26]:
# One way to remove the brackets is to drop the first and last symbol of the string
years[0][1:-1]

'1986'

In [27]:
# However, this will break, if the format of the year is changed

In [28]:
# Alternatively, we can do it with the help of the strip() method (this is robust)

# It removes leading and trailing symbols from a string
# By default, it removes whitespace, but we can specify other symbols to strip

In [29]:
# Removing '('
years[0].strip('(')

'1986)'

In [30]:
# Removing ')'
years[0].strip(')')

'(1986'

In [31]:
# Combining both
years[0].strip('()')

'1986'

In [32]:
# Updating years with stripped values
years = [year.strip('()') for year in years]
years

['1986',
 '2002',
 '2004',
 '1989',
 '2010',
 '1971',
 '2017',
 '1986',
 '1990',
 '2004',
 '2005',
 '2018',
 '1993',
 '1969',
 '1986',
 '1997',
 '2012',
 '1999',
 '2005',
 '1998',
 '2014',
 '2016',
 '1997',
 '1988',
 '1998',
 '1995',
 '1995',
 '1987',
 '1985',
 '2007',
 '2006',
 '2010',
 '2011',
 '1989',
 '1992',
 '1996',
 '1968',
 '2008',
 '1978',
 '1998',
 '1988',
 '1993',
 '2012',
 '2007',
 '1979',
 '1997',
 '2011',
 '1991',
 '1996',
 '2014',
 '2009',
 '2007',
 '1994',
 '1993',
 '2016',
 '1985',
 '2001',
 '2015',
 '1997',
 '1986',
 '2017',
 '1995',
 '2006',
 '1984',
 '2005',
 '2004',
 '2001',
 '1981',
 '2000',
 '2004',
 '2011',
 '1992',
 '1989',
 '2005',
 '2010',
 '2008',
 '2018',
 '2017',
 '1964',
 '1976',
 '2017',
 '1972',
 '2014',
 '2005',
 '1971',
 '2015',
 '1990',
 '1996',
 '1971',
 '2014',
 '2003',
 '1993',
 '2018',
 '2010',
 '1995',
 '2002',
 '2019',
 '2012',
 '2002',
 '2010',
 '1997',
 '1985',
 '2008',
 '2011',
 '2011',
 '1987',
 '1996',
 '1987',
 '2017',
 '2006',
 '2017',
 

In [33]:
# Converting all the strings to integers
years = [int(year) for year in years]
years

[1986,
 2002,
 2004,
 1989,
 2010,
 1971,
 2017,
 1986,
 1990,
 2004,
 2005,
 2018,
 1993,
 1969,
 1986,
 1997,
 2012,
 1999,
 2005,
 1998,
 2014,
 2016,
 1997,
 1988,
 1998,
 1995,
 1995,
 1987,
 1985,
 2007,
 2006,
 2010,
 2011,
 1989,
 1992,
 1996,
 1968,
 2008,
 1978,
 1998,
 1988,
 1993,
 2012,
 2007,
 1979,
 1997,
 2011,
 1991,
 1996,
 2014,
 2009,
 2007,
 1994,
 1993,
 2016,
 1985,
 2001,
 2015,
 1997,
 1986,
 2017,
 1995,
 2006,
 1984,
 2005,
 2004,
 2001,
 1981,
 2000,
 2004,
 2011,
 1992,
 1989,
 2005,
 2010,
 2008,
 2018,
 2017,
 1964,
 1976,
 2017,
 1972,
 2014,
 2005,
 1971,
 2015,
 1990,
 1996,
 1971,
 2014,
 2003,
 1993,
 2018,
 2010,
 1995,
 2002,
 2019,
 2012,
 2002,
 2010,
 1997,
 1985,
 2008,
 2011,
 2011,
 1987,
 1996,
 1987,
 2017,
 2006,
 2017,
 1994,
 1989,
 2014,
 1973,
 1985,
 1982,
 2015,
 1984,
 2000,
 2003,
 1994,
 1994,
 1994,
 2014,
 2001,
 1987,
 2007,
 1990,
 1982,
 1995,
 2012,
 2018,
 1981,
 1986,
 1992,
 1999,
 1991,
 1988,
 2015]

## Extracting the scores

In [34]:
#Filtering only the spans containing the score
[heading.find_all('span', class_ = 'tMeterScore') for heading in headings]

[[<span class="tMeterScore">60%</span>],
 [<span class="tMeterScore">40%</span>],
 [<span class="tMeterScore">95%</span>],
 [<span class="tMeterScore">39%</span>],
 [<span class="tMeterScore">86%</span>],
 [<span class="tMeterScore">88%</span>],
 [<span class="tMeterScore">84%</span>],
 [<span class="tMeterScore">69%</span>],
 [<span class="tMeterScore">68%</span>],
 [<span class="tMeterScore">46%</span>],
 [<span class="tMeterScore">53%</span>],
 [<span class="tMeterScore">92%</span>],
 [<span class="tMeterScore">93%</span>],
 [<span class="tMeterScore">96%</span>],
 [<span class="tMeterScore">54%</span>],
 [<span class="tMeterScore">55%</span>],
 [<span class="tMeterScore">68%</span>],
 [<span class="tMeterScore">60%</span>],
 [<span class="tMeterScore">60%</span>],
 [<span class="tMeterScore">60%</span>],
 [<span class="tMeterScore">59%</span>],
 [<span class="tMeterScore">91%</span>],
 [<span class="tMeterScore">76%</span>],
 [<span class="tMeterScore">38%</span>],
 [<span class="t

In [35]:
# Extracting the score string
scores = [heading.find("span", class_ = 'tMeterScore').string for heading in headings]
scores

['60%',
 '40%',
 '95%',
 '39%',
 '86%',
 '88%',
 '84%',
 '69%',
 '68%',
 '46%',
 '53%',
 '92%',
 '93%',
 '96%',
 '54%',
 '55%',
 '68%',
 '60%',
 '60%',
 '60%',
 '59%',
 '91%',
 '76%',
 '38%',
 '55%',
 '43%',
 '52%',
 '65%',
 '63%',
 '67%',
 '61%',
 '72%',
 '92%',
 '72%',
 '79%',
 '65%',
 '97%',
 '71%',
 '94%',
 '68%',
 '86%',
 '68%',
 '92%',
 '91%',
 '89%',
 '63%',
 '93%',
 '69%',
 '69%',
 '91%',
 '58%',
 '60%',
 '70%',
 '62%',
 '51%',
 '92%',
 '73%',
 '74%',
 '71%',
 '77%',
 '79%',
 '80%',
 '80%',
 '82%',
 '85%',
 '86%',
 '91%',
 '86%',
 '87%',
 '93%',
 '95%',
 '88%',
 '88%',
 '90%',
 '93%',
 '94%',
 '90%',
 '93%',
 '98%',
 '98%',
 '93%',
 '92%',
 '90%',
 '82%',
 '98%',
 '81%',
 '88%',
 '96%',
 '89%',
 '90%',
 '85%',
 '96%',
 '97%',
 '87%',
 '77%',
 '90%',
 '94%',
 '79%',
 '83%',
 '85%',
 '92%',
 '91%',
 '94%',
 '93%',
 '77%',
 '82%',
 '66%',
 '89%',
 '89%',
 '95%',
 '93%',
 '100%',
 '98%',
 '80%',
 '94%',
 '70%',
 '87%',
 '93%',
 '100%',
 '76%',
 '85%',
 '73%',
 '94%',
 '83%',
 '86%'

In [36]:
# Removing the '%' sign
scores = [s.strip('%') for s in scores]
scores

['60',
 '40',
 '95',
 '39',
 '86',
 '88',
 '84',
 '69',
 '68',
 '46',
 '53',
 '92',
 '93',
 '96',
 '54',
 '55',
 '68',
 '60',
 '60',
 '60',
 '59',
 '91',
 '76',
 '38',
 '55',
 '43',
 '52',
 '65',
 '63',
 '67',
 '61',
 '72',
 '92',
 '72',
 '79',
 '65',
 '97',
 '71',
 '94',
 '68',
 '86',
 '68',
 '92',
 '91',
 '89',
 '63',
 '93',
 '69',
 '69',
 '91',
 '58',
 '60',
 '70',
 '62',
 '51',
 '92',
 '73',
 '74',
 '71',
 '77',
 '79',
 '80',
 '80',
 '82',
 '85',
 '86',
 '91',
 '86',
 '87',
 '93',
 '95',
 '88',
 '88',
 '90',
 '93',
 '94',
 '90',
 '93',
 '98',
 '98',
 '93',
 '92',
 '90',
 '82',
 '98',
 '81',
 '88',
 '96',
 '89',
 '90',
 '85',
 '96',
 '97',
 '87',
 '77',
 '90',
 '94',
 '79',
 '83',
 '85',
 '92',
 '91',
 '94',
 '93',
 '77',
 '82',
 '66',
 '89',
 '89',
 '95',
 '93',
 '100',
 '98',
 '80',
 '94',
 '70',
 '87',
 '93',
 '100',
 '76',
 '85',
 '73',
 '94',
 '83',
 '86',
 '97',
 '81',
 '92',
 '82',
 '95',
 '86',
 '86',
 '97',
 '95',
 '97',
 '94',
 '87',
 '93',
 '93',
 '97']

In [37]:
# Converting each score to an integer
scores = [int(s) for s in scores]
scores

[60,
 40,
 95,
 39,
 86,
 88,
 84,
 69,
 68,
 46,
 53,
 92,
 93,
 96,
 54,
 55,
 68,
 60,
 60,
 60,
 59,
 91,
 76,
 38,
 55,
 43,
 52,
 65,
 63,
 67,
 61,
 72,
 92,
 72,
 79,
 65,
 97,
 71,
 94,
 68,
 86,
 68,
 92,
 91,
 89,
 63,
 93,
 69,
 69,
 91,
 58,
 60,
 70,
 62,
 51,
 92,
 73,
 74,
 71,
 77,
 79,
 80,
 80,
 82,
 85,
 86,
 91,
 86,
 87,
 93,
 95,
 88,
 88,
 90,
 93,
 94,
 90,
 93,
 98,
 98,
 93,
 92,
 90,
 82,
 98,
 81,
 88,
 96,
 89,
 90,
 85,
 96,
 97,
 87,
 77,
 90,
 94,
 79,
 83,
 85,
 92,
 91,
 94,
 93,
 77,
 82,
 66,
 89,
 89,
 95,
 93,
 100,
 98,
 80,
 94,
 70,
 87,
 93,
 100,
 76,
 85,
 73,
 94,
 83,
 86,
 97,
 81,
 92,
 82,
 95,
 86,
 86,
 97,
 95,
 97,
 94,
 87,
 93,
 93,
 97]

# Extracting the rest of the information

## Adjusted score

In [38]:
# The adjusted scores can be found in a div with class 'info countdown-adjusted-score'
adj_scores = [div.find("div", {"class": "info countdown-adjusted-score"}) for div in divs]
adj_scores

[<div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>61.188% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="tooltip" rel="tooltip" title=""></span></div>,
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>41.991% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="tooltip" rel="tooltip" title=""></span></div>,
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>100.759% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The A

In [39]:
# Inspecting an element
adj_scores[0]

<div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>61.188% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="tooltip" rel="tooltip" title=""></span></div>

In [40]:
# By inspection we see that the string we are looking for is the second child of the 'div' tag
adj_scores[0].contents[1]  # Note the extra whitespace at the end

'61.188% '

In [41]:
# Extracting the string (without '%' sign and extra space)
adj_scores_clean = [score.contents[1].strip('% ') for score in adj_scores]
adj_scores_clean

['61.188',
 '41.991',
 '100.759',
 '41.997',
 '91.465',
 '92.029',
 '86.904',
 '71.939',
 '72.289',
 '50.845',
 '55.434',
 '97.024',
 '94.586',
 '98.505',
 '59.128',
 '58.687',
 '72.295',
 '63.98',
 '67.128',
 '63.724',
 '66.554',
 '107.315',
 '78.966',
 '39.152',
 '60.6',
 '46.863',
 '55.785',
 '67.838',
 '63.699',
 '72.11',
 '63.633',
 '77.431',
 '100.113',
 '77.722',
 '80.431',
 '68.807',
 '100.519',
 '78.617',
 '101.273',
 '71.206',
 '90.65',
 '71.896',
 '105.42',
 '97.704',
 '91.978',
 '66.543',
 '93.571',
 '73.219',
 '71.656',
 '94.925',
 '63.259',
 '67.805',
 '72.844',
 '64.603',
 '56.141',
 '93.325',
 '76.646',
 '83.678',
 '74.687',
 '80.815',
 '93.89',
 '83.245',
 '83.445',
 '85.637',
 '88.121',
 '92.641',
 '93.648',
 '90.945',
 '87.977',
 '100.881',
 '98.504',
 '61.119',
 '93.268',
 '96.174',
 '44.07',
 '104.686',
 '90.365',
 '110.599',
 '104.24',
 '102.298',
 '112.436',
 '92.807',
 '101.745',
 '86.609',
 '104.292',
 '91.225',
 '90.526',
 '98.249',
 '93.658',
 '102.566',
 '92

In [42]:
# Converting the strings to numbers
final_adj = [float(score) for score in adj_scores_clean] # Note that this time the scores are float, not int!
final_adj

[61.188,
 41.991,
 100.759,
 41.997,
 91.465,
 92.029,
 86.904,
 71.939,
 72.289,
 50.845,
 55.434,
 97.024,
 94.586,
 98.505,
 59.128,
 58.687,
 72.295,
 63.98,
 67.128,
 63.724,
 66.554,
 107.315,
 78.966,
 39.152,
 60.6,
 46.863,
 55.785,
 67.838,
 63.699,
 72.11,
 63.633,
 77.431,
 100.113,
 77.722,
 80.431,
 68.807,
 100.519,
 78.617,
 101.273,
 71.206,
 90.65,
 71.896,
 105.42,
 97.704,
 91.978,
 66.543,
 93.571,
 73.219,
 71.656,
 94.925,
 63.259,
 67.805,
 72.844,
 64.603,
 56.141,
 93.325,
 76.646,
 83.678,
 74.687,
 80.815,
 93.89,
 83.245,
 83.445,
 85.637,
 88.121,
 92.641,
 93.648,
 90.945,
 87.977,
 100.881,
 98.504,
 61.119,
 93.268,
 96.174,
 44.07,
 104.686,
 90.365,
 110.599,
 104.24,
 102.298,
 112.436,
 92.807,
 101.745,
 86.609,
 104.292,
 91.225,
 90.526,
 98.249,
 93.658,
 102.566,
 92.173,
 101.928,
 119.294,
 97.914,
 81.943,
 97.34,
 117.383,
 84.558,
 88.28,
 77.681,
 97.202,
 92.951,
 107.208,
 100.554,
 82.948,
 86.278,
 69.333,
 94.237,
 100.308,
 103.895,

In [43]:
# The synopsis is located inside a 'div' tag with the class 'info synopsis'
synopsis = [div.find('div', class_='synopsis') for div in divs]
synopsis

[<div class="info synopsis"><span class="descriptor">Synopsis:</span> Distinguished by a sharp, witty dialogue between its two cop protagonists, Ray and Danny (Gregory Hines and Billy Crystal), this...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/1018009-running_scared/" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> In the nation of Libria, there is always peace among men. The rules of the Librian system are simple. If...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/equilibrium/" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> Hero is two-time Academy Award nominee Zhang Yimou's directorial attempt at exploring the concept of a Chinese hero. During the...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/hero/" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</spa

In [44]:
# Inspecting the element
synopsis[0]

<div class="info synopsis"><span class="descriptor">Synopsis:</span> Distinguished by a sharp, witty dialogue between its two cop protagonists, Ray and Danny (Gregory Hines and Billy Crystal), this...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/1018009-running_scared/" target="_top"> [More]</a></div>

In [45]:
# The text is the second child
synopsis[0].contents[1]

' Distinguished by a sharp, witty dialogue between its two cop protagonists, Ray and Danny (Gregory Hines and Billy Crystal), this...'

In [46]:
# Extracting the text
synopsis_text = [syn.contents[1] for syn in synopsis]
synopsis_text

[' Distinguished by a sharp, witty dialogue between its two cop protagonists, Ray and Danny (Gregory Hines and Billy Crystal), this...',
 ' In the nation of Libria, there is always peace among men. The rules of the Librian system are simple. If...',
 " Hero is two-time Academy Award nominee Zhang Yimou's directorial attempt at exploring the concept of a Chinese hero. During the...",
 ' Dalton (Swayze) is a true gentleman with a degree in philosophy from NYU. He also has a flip side -...',
 ' In this action thriller from director Tony Scott, rookie train operator Will (Chris Pine) and grizzled veteran engineer Frank (Denzel...',
 ' Shaft, a highly successful film, spawned an industry of sequels and imitations. The daughter (Sherri Brewer) of Bumpy Jones (Moses...',
 ' Since she was a little girl, Sook-hee was raised to be a deadly assassin. She gladly accepts the chance to...',
 ' Among humans for centuries, an immortal specie existed. Connor MacLeod is a member of this specie. Unaware 

## Critics Consensus

In [47]:
# The critics consensus is located inside a 'div' tag with the class 'info critics-consensus'
# This can be found inside the original 'div's we scraped
divs

[<div class="col-sm-18 col-full-xs countdown-item-content">
 <div class="row countdown-item-title-bar">
 <div class="col-sm-20 col-full-xs" style="height: 100%;">
 <div class="article_movie_title" style="float: left;">
 <div><h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared/">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">60%</span></h2></div>
 </div>
 </div>
 <div class="col-sm-4 col-full-xs" style="height: 100%;">
 <div class="countdown-index">#140</div>
 </div>
 </div>
 <div class="row countdown-item-details">
 <div class="col-sm-24">
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>61.188% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="to

In [48]:
# Getting the 'div' tags containing the critics consensus
consensus = [div.find("div", {"class": "info critics-consensus"}) for div in divs]
consensus

[<div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> Equilibrium is a reheated mishmash of other sci-fi movies.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> With death-defying action sequences and epic historic sweep, Hero offers everything a martial arts fan could ask for.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> Whether Road House is simply bad or so bad it's good depends largely on the audience's fondness for Swayze -- and tolerance for violently cheesy action.</div>,
 <div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> As fast, loud, and relentless as the

In [49]:
# Inspecting the text inside these tags
[con.text for con in consensus]

['Critics Consensus: Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.',
 'Critics Consensus: Equilibrium is a reheated mishmash of other sci-fi movies.',
 'Critics Consensus: With death-defying action sequences and epic historic sweep, Hero offers everything a martial arts fan could ask for.',
 "Critics Consensus: Whether Road House is simply bad or so bad it's good depends largely on the audience's fondness for Swayze -- and tolerance for violently cheesy action.",
 "Critics Consensus: As fast, loud, and relentless as the train at the center of the story, Unstoppable is perfect popcorn entertainment -- and director Tony Scott's best movie in years.",
 'Critics Consensus: This is the man that would risk his neck for his brother, man. Can you dig it?',
 'Critics Consensus: The Villainess offers enough pure kinetic thrills to satisfy genre enthusiasts -- and carve out a bl

In [50]:
# Every consensus starts with the string 'Critics Consensus: '
# There are a couple of ways to remove it from the final text

### Way #1: Text processing

In [51]:
# The simplest (but not necessarily the best) way of achieving it is by taking the substring after the common phrase

In [52]:
# Defining the phrase to be removed (note the space at the end)
common_phrase = 'Critics Consensus: '

In [53]:
# Finding how long is the common phrase
len(common_phrase)

19

In [54]:
consensus[0].text

'Critics Consensus: Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.'

In [55]:
# Taking only the part of the text after the common phrase
consensus[0].text[19:]

'Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.'

In [56]:
# Define a variable to store the length
common_len = len(common_phrase)

In [57]:
# Cleaning the list of the common phrase
consensus_text = [con.text[common_len:] for con in consensus]
consensus_text

['Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.',
 'Equilibrium is a reheated mishmash of other sci-fi movies.',
 'With death-defying action sequences and epic historic sweep, Hero offers everything a martial arts fan could ask for.',
 "Whether Road House is simply bad or so bad it's good depends largely on the audience's fondness for Swayze -- and tolerance for violently cheesy action.",
 "As fast, loud, and relentless as the train at the center of the story, Unstoppable is perfect popcorn entertainment -- and director Tony Scott's best movie in years.",
 'This is the man that would risk his neck for his brother, man. Can you dig it?',
 'The Villainess offers enough pure kinetic thrills to satisfy genre enthusiasts -- and carve out a bloody niche for itself in modern Korean action cinema.',
 "People hate Highlander because it's cheesy, bombastic, and absurd. And peop

In [58]:
# We can add if-else logic to only truncate the string in case it starts with the common phrase
consensus_text = [con.text[common_len:] if con.text.startswith(common_phrase) else con.text for con in consensus ]
consensus_text

['Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.',
 'Equilibrium is a reheated mishmash of other sci-fi movies.',
 'With death-defying action sequences and epic historic sweep, Hero offers everything a martial arts fan could ask for.',
 "Whether Road House is simply bad or so bad it's good depends largely on the audience's fondness for Swayze -- and tolerance for violently cheesy action.",
 "As fast, loud, and relentless as the train at the center of the story, Unstoppable is perfect popcorn entertainment -- and director Tony Scott's best movie in years.",
 'This is the man that would risk his neck for his brother, man. Can you dig it?',
 'The Villainess offers enough pure kinetic thrills to satisfy genre enthusiasts -- and carve out a bloody niche for itself in modern Korean action cinema.',
 "People hate Highlander because it's cheesy, bombastic, and absurd. And peop

### Way #2: Inspecting the HTML

In [60]:
consensus[0]

<div class="info critics-consensus"><span class="descriptor">Critics Consensus:</span> Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.</div>

In [61]:
# When inspecting the HTML we see that the common phrase ("Critics Consensus: ")
# is located inside a span element
# The string we want to obtain follows that

In [62]:
# We can use .contents to obtain a list of all children of the tag
consensus[0].contents

[<span class="descriptor">Critics Consensus:</span>,
 ' Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.']

In [63]:
# The second element of that list is the text we want
consensus[0].contents[1]

' Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.'

In [64]:
# We can remove the extra whitespace (space at the beginning) with the .strip() method
consensus[0].contents[1].strip()

'Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.'

In [65]:
# Processing all texts
consensus_text = [con.contents[1].strip() for con in consensus]
consensus_text

['Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining.',
 'Equilibrium is a reheated mishmash of other sci-fi movies.',
 'With death-defying action sequences and epic historic sweep, Hero offers everything a martial arts fan could ask for.',
 "Whether Road House is simply bad or so bad it's good depends largely on the audience's fondness for Swayze -- and tolerance for violently cheesy action.",
 "As fast, loud, and relentless as the train at the center of the story, Unstoppable is perfect popcorn entertainment -- and director Tony Scott's best movie in years.",
 'This is the man that would risk his neck for his brother, man. Can you dig it?',
 'The Villainess offers enough pure kinetic thrills to satisfy genre enthusiasts -- and carve out a bloody niche for itself in modern Korean action cinema.',
 "People hate Highlander because it's cheesy, bombastic, and absurd. And peop

In [66]:
# In my opinion, this method is closer to the BeautifulSoup approach

## Directors

In [68]:
# Extracting all director divs
directors = [div.find("div", class_ = 'director') for div in divs]
directors

[<div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="/celebrity/peter_hyams/">Peter Hyams</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="/celebrity/kurt_wimmer/">Kurt Wimmer</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="/celebrity/zhang_yimou/">Zhang Yimou</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="/celebrity/rowdy_herrington/">Rowdy Herrington</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="/celebrity/tony_scott/">Tony Scott</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="/celebrity/gordon_parks/">Gordon Parks</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="/celebrity/jung_byoung_gil/">Jung Byung-gil</a></di

In [69]:
# Inspecting a div
directors[0]

<div class="info director">
<span class="descriptor">Directed By:</span> <a class="" href="/celebrity/peter_hyams/">Peter Hyams</a></div>

In [70]:
# The director's name can be found as the string of a link

# Obtaining all director links
[director.find("a") for director in directors]

[<a class="" href="/celebrity/peter_hyams/">Peter Hyams</a>,
 <a class="" href="/celebrity/kurt_wimmer/">Kurt Wimmer</a>,
 <a class="" href="/celebrity/zhang_yimou/">Zhang Yimou</a>,
 <a class="" href="/celebrity/rowdy_herrington/">Rowdy Herrington</a>,
 <a class="" href="/celebrity/tony_scott/">Tony Scott</a>,
 <a class="" href="/celebrity/gordon_parks/">Gordon Parks</a>,
 <a class="" href="/celebrity/jung_byoung_gil/">Jung Byung-gil</a>,
 <a class="" href="/celebrity/russell_mulcahy/">Russell Mulcahy</a>,
 <a class="" href="/celebrity/renny_harlin/">Renny Harlin</a>,
 <a class="" href="/celebrity/jon_turteltaub/">Jon Turteltaub</a>,
 <a class="" href="/celebrity/prachya_pinkaew/">Prachya Pinkaew</a>,
 <a class="" href="/celebrity/coralie_fargeat/">Coralie Fargeat</a>,
 <a class="" href="/celebrity/robert_rodriguez/">Robert Rodriguez</a>,
 <a class="" href="/celebrity/king_hu/">King Hu</a>,
 <a class="" href="/celebrity/tony_scott/">Tony Scott</a>,
 <a class="" href="/celebrity/simon_

In [72]:
# Notice that few links are None.
# This means we can't simply use .string,
# because None has no string attribute

In [73]:
# Running the line below will raise an error if uncommented

#[director.find("a").string for director in directors]

In [74]:
# We can use if-else to deal with the None value

final_directors = [None if director.find("a") is None else director.find("a").string for director in directors]
final_directors

['Peter Hyams',
 'Kurt Wimmer',
 'Zhang Yimou',
 'Rowdy Herrington',
 'Tony Scott',
 'Gordon Parks',
 'Jung Byung-gil',
 'Russell Mulcahy',
 'Renny Harlin',
 'Jon Turteltaub',
 'Prachya Pinkaew',
 'Coralie Fargeat',
 'Robert Rodriguez',
 'King Hu',
 'Tony Scott',
 'Simon West',
 'Simon West',
 'Stephen Sommers',
 'Doug Liman',
 'Brett Ratner',
 'Antoine Fuqua',
 'Anthony Russo',
 'Wolfgang Petersen',
 'Newt Arnold',
 'Stephen Norrington',
 'Michael Bay',
 'John McTiernan',
 'Paul Michael Glaser',
 'Andrew Davis',
 'Michael Davis',
 'Mark Neveldine',
 'Ethan Maniquis',
 'Nicolas Winding Refn',
 'Tim Burton',
 'Andrew Davis',
 'Roland Emmerich',
 'Peter Yates',
 'Timur Bekmambetov',
 'Richard Donner',
 'John Frankenheimer',
 'John Carpenter',
 None,
 None,
 'Edgar Wright',
 'Walter Hill',
 'Paul Verhoeven',
 'José Padilha',
 'Kathryn Bigelow',
 'Renny Harlin',
 'Adam Wingard',
 'Pierre Morel',
 'Zack Snyder',
 'James Cameron',
 'Marco Brambilla',
 'Ilya Naishuller',
 'Jackie Chan',
 'Chr

## Cast info

In [76]:
cast_info = [div.find("div", class_ = 'cast') for div in divs]
cast_info

[<div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="/celebrity/gregory_hines/">Gregory Hines</a>, <a class="" href="/celebrity/billy_crystal/">Billy Crystal</a>, <a class="" href="/celebrity/jimmy_smits/">Jimmy Smits</a>, <a class="" href="/celebrity/steven_bauer/">Steven Bauer</a></div>,
 <div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="/celebrity/christian_bale/">Christian Bale</a>, <a class="" href="/celebrity/emily_watson/">Emily Watson</a>, <a class="" href="/celebrity/taye_diggs/">Taye Diggs</a>, <a class="" href="/celebrity/angus_macfadyen/">Angus Macfadyen</a></div>,
 <div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="/celebrity/jet_li/">Jet Li</a>, <a class="" href="/celebrity/leung_chiu_wai/">Tony Leung Chiu Wai</a>, <a class="" href="/celebrity/maggie_cheung_manyuk/">Maggie Cheung</a>, <a class="" href="/celebrity/daoming_chen/">Daoming Chen</a></div>,
 <div class="info ca

In [77]:
cast_info[0]

<div class="info cast">
<span class="descriptor">Starring:</span> <a class="" href="/celebrity/gregory_hines/">Gregory Hines</a>, <a class="" href="/celebrity/billy_crystal/">Billy Crystal</a>, <a class="" href="/celebrity/jimmy_smits/">Jimmy Smits</a>, <a class="" href="/celebrity/steven_bauer/">Steven Bauer</a></div>

In [78]:
# Each cast member's name is the string of a link
# There are multiple cast members for a movie

In [79]:
# Let's first practice with a single movie

# Obtain all the links to different cast members
cast_links = cast_info[0].find_all('a')
cast_links

[<a class="" href="/celebrity/gregory_hines/">Gregory Hines</a>,
 <a class="" href="/celebrity/billy_crystal/">Billy Crystal</a>,
 <a class="" href="/celebrity/jimmy_smits/">Jimmy Smits</a>,
 <a class="" href="/celebrity/steven_bauer/">Steven Bauer</a>]

In [80]:
# Extract the names from the links
cast_names = [link.string for link in cast_links]
cast_names

['Gregory Hines', 'Billy Crystal', 'Jimmy Smits', 'Steven Bauer']

In [81]:
# OPTIONALLY: We can stitch all names together as one string

# This can be done using the join method
# To use join, pick a string to use as a separator (in our case a comma, followed with a space) and
# pass the list of strings you want to merge to the join method

cast = ", ".join(cast_names)
cast

'Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer'

In [82]:
# Now we need to do the above operations for every movie

# We can either use a for loop (clearer), or
# use a nested list compehension (more concise)

### Using a for loop

In [83]:
# Initialize the list of all cast memners
cast = []

# Just put all previous operations inside a for loop
for c in cast_info:
    cast_links = c.find_all('a')
    cast_names = [link.string for link in cast_links]
    
    cast.append(", ".join(cast_names)) # Joining is optional

cast

['Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer',
 'Christian Bale, Emily Watson, Taye Diggs, Angus Macfadyen',
 'Jet Li, Tony Leung Chiu Wai, Maggie Cheung, Daoming Chen',
 'Patrick Swayze, Kelly Lynch, Sam Elliott, Ben Gazzara',
 'Denzel Washington, Chris Pine, Rosario Dawson, Ethan Suplee',
 'Richard Roundtree, Moses Gunn, Gwen Mitchell, Christopher St. John',
 'Ok-bin Kim, Kim Seo-hyung, Shin Ha-kyun, Bang Sung-jun',
 'Christopher Lambert, Sean Connery, Roxanne Hart, Clancy Brown',
 'Bruce Willis, Bonnie Bedelia, William Atherton, Reginald VelJohnson',
 'Nicolas Cage, Diane Kruger, Justin Bartha, Sean Bean',
 'Tony Jaa, Petchtai Wongkamlao, Bongkoj Khongmalai, Bongkoo Kongmalai',
 'Matilda Anna Ingrid Lutz, Kevin Janssens, Vincent Colombe, Guillaume Bouchède',
 'Carlos Gallardo, Consuelo Gómez, Reinol Martinez, Peter Marquardt',
 'Feng Hsu, Chun Shih, Pai Ying, Tien Peng',
 'Tom Cruise, Kelly McGillis, Anthony Edwards, Val Kilmer',
 'Nicolas Cage, John Cusack, John Malkov

### Nested list comprehension

In [85]:
# As you can see this can be done in just one line using nested list comprehension
# However, the code is harder to understand

cast = [", ".join([link.string for link in c.find_all("a")]) for c in cast_info]
cast

['Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer',
 'Christian Bale, Emily Watson, Taye Diggs, Angus Macfadyen',
 'Jet Li, Tony Leung Chiu Wai, Maggie Cheung, Daoming Chen',
 'Patrick Swayze, Kelly Lynch, Sam Elliott, Ben Gazzara',
 'Denzel Washington, Chris Pine, Rosario Dawson, Ethan Suplee',
 'Richard Roundtree, Moses Gunn, Gwen Mitchell, Christopher St. John',
 'Ok-bin Kim, Kim Seo-hyung, Shin Ha-kyun, Bang Sung-jun',
 'Christopher Lambert, Sean Connery, Roxanne Hart, Clancy Brown',
 'Bruce Willis, Bonnie Bedelia, William Atherton, Reginald VelJohnson',
 'Nicolas Cage, Diane Kruger, Justin Bartha, Sean Bean',
 'Tony Jaa, Petchtai Wongkamlao, Bongkoj Khongmalai, Bongkoo Kongmalai',
 'Matilda Anna Ingrid Lutz, Kevin Janssens, Vincent Colombe, Guillaume Bouchède',
 'Carlos Gallardo, Consuelo Gómez, Reinol Martinez, Peter Marquardt',
 'Feng Hsu, Chun Shih, Pai Ying, Tien Peng',
 'Tom Cruise, Kelly McGillis, Anthony Edwards, Val Kilmer',
 'Nicolas Cage, John Cusack, John Malkov

# Representing the data in structured form

In [86]:
#We will take advantage of Pandas and its dataframe for data storage

In [87]:
import pandas as pd

## Creating a Data Frame

In [88]:
# A dataframe is a tabular data type, frequently used in data science

movies_info = pd.DataFrame()
movies_info  # The dataframe is still empty, we need to fill it with the info we gathered

## Populating the dataframe

In [90]:
# Populating the dataframe

movies_info["Movie Title"] = movie_names
movies_info["Year"] = years
movies_info["Score"] = scores
movies_info["Adjusted Score"] = final_adj  
movies_info["Director"] = final_directors
movies_info["Synopsis"] = synopsis_text    
movies_info["Cast"] = cast
movies_info["Consensus"] = consensus_text

# Let's see how it looks
movies_info

Unnamed: 0,Movie Title,Year,Score,Adjusted Score,Director,Synopsis,Cast,Consensus
0,Running Scared,1986,60,61.188,Peter Hyams,"Distinguished by a sharp, witty dialogue betw...","Gregory Hines, Billy Crystal, Jimmy Smits, Ste...",Running Scared struggles to strike a consisten...
1,Equilibrium,2002,40,41.991,Kurt Wimmer,"In the nation of Libria, there is always peac...","Christian Bale, Emily Watson, Taye Diggs, Angu...",Equilibrium is a reheated mishmash of other sc...
2,Hero,2004,95,100.759,Zhang Yimou,Hero is two-time Academy Award nominee Zhang ...,"Jet Li, Tony Leung Chiu Wai, Maggie Cheung, Da...",With death-defying action sequences and epic h...
3,Road House,1989,39,41.997,Rowdy Herrington,Dalton (Swayze) is a true gentleman with a de...,"Patrick Swayze, Kelly Lynch, Sam Elliott, Ben ...",Whether Road House is simply bad or so bad it'...
4,Unstoppable,2010,86,91.465,Tony Scott,In this action thriller from director Tony Sc...,"Denzel Washington, Chris Pine, Rosario Dawson,...","As fast, loud, and relentless as the train at ..."
...,...,...,...,...,...,...,...,...
135,Lat sau san taam (Hard-Boiled),1992,94,96.039,John Woo,"Yun-Fat portrays a maverick, clarinet-playing...","Yun-Fat Chow, Tony Leung Chiu Wai, Anthony Won...",Boasting impactful action as well as surprisin...
136,The Matrix,1999,87,93.701,Lilly Wachowski,"What if virtual reality wasn't just for fun, ...","Keanu Reeves, Laurence Fishburne, Carrie-Anne ...","Thanks to the Wachowskis' imaginative vision, ..."
137,Terminator 2: Judgment Day,1991,93,99.166,James Cameron,A sequel to the sci-fi action thriller that m...,"Arnold Schwarzenegger, Linda Hamilton, Edward ...",T2 features thrilling action sequences and eye...
138,Die Hard,1988,93,98.816,John McTiernan,"It's Christmas time in L.A., and there's an e...","Bruce Willis, Alan Rickman, Reginald VelJohnso...",Its many imitators (and sequels) have never co...


In [91]:
# By default pandas abbreviates any text beyond a certain length (as seen in the Cast and Consensus columns)

# We can change that by setting the maximum column width to -1,
# which means the column would be as wide as to display the whole text
pd.set_option('display.max_colwidth', -1)
movies_info

Unnamed: 0,Movie Title,Year,Score,Adjusted Score,Director,Synopsis,Cast,Consensus
0,Running Scared,1986,60,61.188,Peter Hyams,"Distinguished by a sharp, witty dialogue between its two cop protagonists, Ray and Danny (Gregory Hines and Billy Crystal), this...","Gregory Hines, Billy Crystal, Jimmy Smits, Steven Bauer","Running Scared struggles to strike a consistent balance between violent action and humor, but the chemistry between its well-matched leads keeps things entertaining."
1,Equilibrium,2002,40,41.991,Kurt Wimmer,"In the nation of Libria, there is always peace among men. The rules of the Librian system are simple. If...","Christian Bale, Emily Watson, Taye Diggs, Angus Macfadyen",Equilibrium is a reheated mishmash of other sci-fi movies.
2,Hero,2004,95,100.759,Zhang Yimou,Hero is two-time Academy Award nominee Zhang Yimou's directorial attempt at exploring the concept of a Chinese hero. During the...,"Jet Li, Tony Leung Chiu Wai, Maggie Cheung, Daoming Chen","With death-defying action sequences and epic historic sweep, Hero offers everything a martial arts fan could ask for."
3,Road House,1989,39,41.997,Rowdy Herrington,Dalton (Swayze) is a true gentleman with a degree in philosophy from NYU. He also has a flip side -...,"Patrick Swayze, Kelly Lynch, Sam Elliott, Ben Gazzara",Whether Road House is simply bad or so bad it's good depends largely on the audience's fondness for Swayze -- and tolerance for violently cheesy action.
4,Unstoppable,2010,86,91.465,Tony Scott,"In this action thriller from director Tony Scott, rookie train operator Will (Chris Pine) and grizzled veteran engineer Frank (Denzel...","Denzel Washington, Chris Pine, Rosario Dawson, Ethan Suplee","As fast, loud, and relentless as the train at the center of the story, Unstoppable is perfect popcorn entertainment -- and director Tony Scott's best movie in years."
...,...,...,...,...,...,...,...,...
135,Lat sau san taam (Hard-Boiled),1992,94,96.039,John Woo,"Yun-Fat portrays a maverick, clarinet-playing cop nicknamed ""Tequila"" whose partner is killed in the dizzying chaos of a restaurant gunfight...","Yun-Fat Chow, Tony Leung Chiu Wai, Anthony Wong, Teresa Mo","Boasting impactful action as well as surprising emotional resonance, Hard Boiled is a powerful thriller that hits hard in more ways than one."
136,The Matrix,1999,87,93.701,Lilly Wachowski,"What if virtual reality wasn't just for fun, but was being used to imprison you? That's the dilemma that faces...","Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss, Joe Pantoliano","Thanks to the Wachowskis' imaginative vision, The Matrix is a smartly crafted combination of spectacular action and groundbreaking special effects."
137,Terminator 2: Judgment Day,1991,93,99.166,James Cameron,"A sequel to the sci-fi action thriller that made him and star Arnold Schwarzenegger A-list Hollywood names, writer/director James Cameron...","Arnold Schwarzenegger, Linda Hamilton, Edward Furlong, Robert Patrick","T2 features thrilling action sequences and eye-popping visual effects, but what takes this sci-fi/ action landmark to the next level is the depth of the human (and cyborg) characters."
138,Die Hard,1988,93,98.816,John McTiernan,"It's Christmas time in L.A., and there's an employee party in progress on the 30th floor of the Nakatomi Corporation...","Bruce Willis, Alan Rickman, Reginald VelJohnson, Bonnie Bedelia",Its many imitators (and sequels) have never come close to matching the taut thrills of the definitive holiday action classic.


## Exporting the data to CSV (comma-separated values) and excel files

In [92]:
# Write data to excel file
movies_info.to_excel("movies_info.xlsx", index = False, header = True)

In [93]:
# or write data to CSV file
movies_info.to_csv("movies_info.csv", index = False, header = True)

In [94]:
# or write data to CSV file
movies_info.to_csv("movies_info.csv", index = False, header = True)