# scraping Netflix series data from Rotten Tomatoes

## set up

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
base_site = "https://editorial.rottentomatoes.com/guide/best-netflix-shows-and-movies-to-binge-watch-now/"

In [5]:
response = requests.get(base_site)
response

<Response [200]>

In [6]:
response.status_code

200

In [7]:
html = response.content

In [8]:
type(html)

bytes

## making a soup

In [9]:
soup = BeautifulSoup(html,'lxml')

In [8]:
# saving this soup object to a html file for any refrence.

with open('Rotten_tomatoes_netflix.html','wb') as file:
    
    file.write(soup.prettify('utf-8'))

## Finding element containing all data

we need movie names, release year and its ratings on rotten tomatoes. so we inspected the html code in chrome and found the relevent tags in which we can find related information.

In [10]:
divs = soup.find_all('div',{'class': 'col-sm-18 col-full-xs countdown-item-content'})
divs

[<div class="col-sm-18 col-full-xs countdown-item-content">
 <div class="row countdown-item-title-bar">
 <div class="col-sm-20 col-full-xs" style="height: 100%;">
 <div class="article_movie_title" style="float: left;">
 <h2><a href="//www.rottentomatoes.com/tv/on_the_verge">On the Verge</a> <span class="subtle start-year">(2021)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">60%</span></h2>
 </div>
 </div>
 <div class="col-sm-4 col-full-xs" style="height: 100%;">
 <div class="countdown-index">#214</div>
 </div>
 </div>
 <div class="row countdown-item-details">
 <div class="col-sm-24">
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> Four women -- a chef, a single mother, an heiress, and a job seeker -- dig into love, work and...<a class="" data-pageheader="" href="//www.rottentomatoes.com/tv/on_the_verge" target="_top"> [More]</a></div>
 <div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="//www.

In [11]:
len(divs)

214

In [12]:
divs[0]

<div class="col-sm-18 col-full-xs countdown-item-content">
<div class="row countdown-item-title-bar">
<div class="col-sm-20 col-full-xs" style="height: 100%;">
<div class="article_movie_title" style="float: left;">
<h2><a href="//www.rottentomatoes.com/tv/on_the_verge">On the Verge</a> <span class="subtle start-year">(2021)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">60%</span></h2>
</div>
</div>
<div class="col-sm-4 col-full-xs" style="height: 100%;">
<div class="countdown-index">#214</div>
</div>
</div>
<div class="row countdown-item-details">
<div class="col-sm-24">
<div class="info synopsis"><span class="descriptor">Synopsis:</span> Four women -- a chef, a single mother, an heiress, and a job seeker -- dig into love, work and...<a class="" data-pageheader="" href="//www.rottentomatoes.com/tv/on_the_verge" target="_top"> [More]</a></div>
<div class="info cast">
<span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.c

### our content is under h2 tag. so we will now get all h2 tag.

In [13]:
# .find() will not work on list object. so take single element form list an apply on it.

divs[0].find('h2')

<h2><a href="//www.rottentomatoes.com/tv/on_the_verge">On the Verge</a> <span class="subtle start-year">(2021)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">60%</span></h2>

In [14]:
h2_tags = [div.find('h2') for div in divs ]
h2_tags

[<h2><a href="//www.rottentomatoes.com/tv/on_the_verge">On the Verge</a> <span class="subtle start-year">(2021)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">60%</span></h2>,
 <h2><a href="//www.rottentomatoes.com/tv/anne">Anne With an E</a> <span class="subtle start-year">(2017)</span> <span class="icon tiny rotten" title="Rotten"></span> <span class="tMeterScore">53%</span></h2>,
 <h2><a href="//www.rottentomatoes.com/tv/derek">Derek</a> <span class="subtle start-year">(2013)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">60%</span></h2>,
 <h2><a href="//www.rottentomatoes.com/tv/ratched">Ratched</a> <span class="subtle start-year">(2020)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">61%</span></h2>,
 <h2><a href="//www.rottentomatoes.com/tv/behind_her_eyes">Behind Her Eyes</a> <span class="subtle start-year">(2021)</span> <span class="icon tiny fresh" title="Fresh"></s

In [15]:
# assessing h2_tags for text inside

[h.text for h in h2_tags]

['On the Verge (2021)  60%',
 'Anne With an E (2017)  53%',
 'Derek (2013)  60%',
 'Ratched (2020)  61%',
 'Behind Her Eyes (2021)  62%',
 'Bloodline (2015)  62%',
 'White Lines (2020)  64%',
 "Marvel's The Punisher (2017)  64%",
 'The Duchess (2020)  65%',
 'Lilyhammer (2012)  65%',
 'Halston (2021)  65%',
 'Disenchantment (2018)  65%',
 'Marco Polo (2014)  66%',
 'Grand Army (2020)  67%',
 'Killer Inside: The Mind of Aaron Hernandez (2020)  67%',
 'Cursed (2020)  67%',
 'Fear City: New York vs. the Mafia (2020)  68%',
 'Self Made: Inspired by the Life of Madam C.J. Walker (2020)  68%',
 'Love Is Blind (2020)  68%',
 'Ginny & Georgia (2021)  68%',
 'Warrior Nun (2020)  68%',
 'The Eddy (2020)  68%',
 'After Life (2019)  64%',
 'Pacific Rim: The Black (2021)  69%',
 'The Liberator (2020)  69%',
 'Dark Tourist (2018)  70%',
 'History of Swear Words (2021)  70%',
 'Marcella (2016)  70%',
 'The Serpent (2021)  70%',
 'Daybreak (2019)  70%',
 'Bonding (2019)  71%',
 'Requiem (2018)  71%',


In [16]:
h2_tags[0].find('a').string

'On the Verge'

## extracting movie names

In [17]:
movie_names = [m.find('a').string for m in h2_tags]
movie_names

['On the Verge',
 'Anne With an E',
 'Derek',
 'Ratched',
 'Behind Her Eyes',
 'Bloodline',
 'White Lines',
 "Marvel's The Punisher",
 'The Duchess',
 'Lilyhammer',
 'Halston',
 'Disenchantment',
 'Marco Polo',
 'Grand Army',
 'Killer Inside: The Mind of Aaron Hernandez',
 'Cursed',
 'Fear City: New York vs. the Mafia',
 'Self Made: Inspired by the Life of Madam C.J. Walker',
 'Love Is Blind',
 'Ginny & Georgia',
 'Warrior Nun',
 'The Eddy',
 'After Life',
 'Pacific Rim: The Black',
 'The Liberator',
 'Dark Tourist',
 'History of Swear Words',
 'Marcella',
 'The Serpent',
 'Daybreak',
 'Bonding',
 'Requiem',
 'Troy: Fall of a City',
 'Safe',
 'Wanderlust',
 'Dracula',
 'Designated Survivor',
 'Japan Sinks: 2020',
 'Everything Sucks!',
 'Night Stalker: The Hunt for a Serial Killer',
 'Watership Down: Miniseries',
 'Locke & Key',
 'Arrested Development',
 "DOTA: Dragon's Blood",
 'Altered Carbon',
 'High Score',
 'Sacred Games',
 'Unsolved Mysteries',
 'House of Cards',
 'Outer Banks',
 

## extracting year

In [18]:
h2_tags[0].find('span',{'class':'subtle start-year'})

<span class="subtle start-year">(2021)</span>

In [19]:
years = [s.find('span',{'class':'subtle start-year'}).string for s in h2_tags]
years

['(2021)',
 '(2017)',
 '(2013)',
 '(2020)',
 '(2021)',
 '(2015)',
 '(2020)',
 '(2017)',
 '(2020)',
 '(2012)',
 '(2021)',
 '(2018)',
 '(2014)',
 '(2020)',
 '(2020)',
 '(2020)',
 '(2020)',
 '(2020)',
 '(2020)',
 '(2021)',
 '(2020)',
 '(2020)',
 '(2019)',
 '(2021)',
 '(2020)',
 '(2018)',
 '(2021)',
 '(2016)',
 '(2021)',
 '(2019)',
 '(2019)',
 '(2018)',
 '(2018)',
 '(2018)',
 '(2018)',
 '(2020)',
 '(2016)',
 '(2020)',
 '(2018)',
 '(2021)',
 '(2018)',
 '(2020)',
 '(2003)',
 '(2021)',
 '(2018)',
 '(2020)',
 '(2018)',
 '(2020)',
 '(2013)',
 '(2020)',
 '(2021)',
 '(2018)',
 '(2017)',
 '(2017)',
 '(2016)',
 '(2018)',
 '(2018)',
 '(2021)',
 '(2021)',
 '(2017)',
 '(2018)',
 '(2019)',
 '(2021)',
 '(2018)',
 '(2021)',
 '(2019)',
 '(2019)',
 '(2020)',
 '(2016)',
 '(2019)',
 '(2018)',
 '(2021)',
 '(2020)',
 '(2021)',
 '(2019)',
 '(2020)',
 '(2019)',
 '(2017)',
 '(2019)',
 '(2015)',
 '(2020)',
 '(2015)',
 '(2018)',
 '(2016)',
 '(2018)',
 '(2017)',
 '(2011)',
 '(2015)',
 '(2019)',
 '(2018)',
 '(2019)',

In [20]:
years = [y.strip('()') for y in years]
years

['2021',
 '2017',
 '2013',
 '2020',
 '2021',
 '2015',
 '2020',
 '2017',
 '2020',
 '2012',
 '2021',
 '2018',
 '2014',
 '2020',
 '2020',
 '2020',
 '2020',
 '2020',
 '2020',
 '2021',
 '2020',
 '2020',
 '2019',
 '2021',
 '2020',
 '2018',
 '2021',
 '2016',
 '2021',
 '2019',
 '2019',
 '2018',
 '2018',
 '2018',
 '2018',
 '2020',
 '2016',
 '2020',
 '2018',
 '2021',
 '2018',
 '2020',
 '2003',
 '2021',
 '2018',
 '2020',
 '2018',
 '2020',
 '2013',
 '2020',
 '2021',
 '2018',
 '2017',
 '2017',
 '2016',
 '2018',
 '2018',
 '2021',
 '2021',
 '2017',
 '2018',
 '2019',
 '2021',
 '2018',
 '2021',
 '2019',
 '2019',
 '2020',
 '2016',
 '2019',
 '2018',
 '2021',
 '2020',
 '2021',
 '2019',
 '2020',
 '2019',
 '2017',
 '2019',
 '2015',
 '2020',
 '2015',
 '2018',
 '2016',
 '2018',
 '2017',
 '2011',
 '2015',
 '2019',
 '2018',
 '2019',
 '2021',
 '2020',
 '2021',
 '2015',
 '2017',
 '2016',
 '2020',
 '2016',
 '2018',
 '2022',
 '2012',
 '2019',
 '2021',
 '2016',
 '2020',
 '2016',
 '2021',
 '2019',
 '2015',
 '2019',
 

In [21]:
type(years[0])

str

In [22]:
years = [int(y) for y in years]
years

[2021,
 2017,
 2013,
 2020,
 2021,
 2015,
 2020,
 2017,
 2020,
 2012,
 2021,
 2018,
 2014,
 2020,
 2020,
 2020,
 2020,
 2020,
 2020,
 2021,
 2020,
 2020,
 2019,
 2021,
 2020,
 2018,
 2021,
 2016,
 2021,
 2019,
 2019,
 2018,
 2018,
 2018,
 2018,
 2020,
 2016,
 2020,
 2018,
 2021,
 2018,
 2020,
 2003,
 2021,
 2018,
 2020,
 2018,
 2020,
 2013,
 2020,
 2021,
 2018,
 2017,
 2017,
 2016,
 2018,
 2018,
 2021,
 2021,
 2017,
 2018,
 2019,
 2021,
 2018,
 2021,
 2019,
 2019,
 2020,
 2016,
 2019,
 2018,
 2021,
 2020,
 2021,
 2019,
 2020,
 2019,
 2017,
 2019,
 2015,
 2020,
 2015,
 2018,
 2016,
 2018,
 2017,
 2011,
 2015,
 2019,
 2018,
 2019,
 2021,
 2020,
 2021,
 2015,
 2017,
 2016,
 2020,
 2016,
 2018,
 2022,
 2012,
 2019,
 2021,
 2016,
 2020,
 2016,
 2021,
 2019,
 2015,
 2019,
 2015,
 2017,
 2021,
 2016,
 2017,
 2018,
 2013,
 2016,
 2021,
 2019,
 2020,
 2021,
 2020,
 2015,
 2018,
 2021,
 2021,
 2019,
 2020,
 2021,
 2019,
 2021,
 2013,
 2015,
 2017,
 2020,
 2021,
 2019,
 2018,
 2018,
 2018,
 2018,

## extracting ratings

In [44]:
h2_tags[0].find('span',{'class':'tMeterScore'})

<span class="tMeterScore">60%</span>

In [45]:
scores = [s.find('span',{'class':'tMeterScore'}).string for s in h2_tags]
scores

['60%',
 '53%',
 '60%',
 '61%',
 '62%',
 '62%',
 '64%',
 '64%',
 '65%',
 '65%',
 '65%',
 '65%',
 '66%',
 '67%',
 '67%',
 '67%',
 '68%',
 '68%',
 '68%',
 '68%',
 '68%',
 '68%',
 '64%',
 '69%',
 '69%',
 '70%',
 '70%',
 '70%',
 '70%',
 '70%',
 '71%',
 '71%',
 '71%',
 '71%',
 '71%',
 '71%',
 '71%',
 '72%',
 '72%',
 '73%',
 '74%',
 '74%',
 '74%',
 None,
 '75%',
 '76%',
 '76%',
 '77%',
 '77%',
 '78%',
 '78%',
 '78%',
 '78%',
 '78%',
 '79%',
 '79%',
 '79%',
 '80%',
 '80%',
 '80%',
 '80%',
 '80%',
 '81%',
 '81%',
 '81%',
 '81%',
 '81%',
 '81%',
 '81%',
 '81%',
 '81%',
 '82%',
 '82%',
 '77%',
 '83%',
 '83%',
 None,
 '83%',
 '83%',
 '83%',
 '84%',
 '80%',
 '84%',
 '84%',
 '84%',
 '86%',
 '84%',
 '85%',
 '86%',
 '86%',
 '86%',
 '86%',
 '86%',
 '86%',
 '86%',
 '87%',
 '87%',
 None,
 '87%',
 '88%',
 '84%',
 '88%',
 '88%',
 '88%',
 '88%',
 '88%',
 '88%',
 '89%',
 '89%',
 '89%',
 '89%',
 '89%',
 '89%',
 '90%',
 '90%',
 '90%',
 '90%',
 '90%',
 '90%',
 '91%',
 '91%',
 '91%',
 '91%',
 '91%',
 '91%',
 '9

In [46]:
scores[0][:-1]

'60'

In [47]:
type(scores[0])

bs4.element.NavigableString

In [48]:
 

scores = [s.strip('%') for s in scores]
scores

AttributeError: 'NoneType' object has no attribute 'strip'

In [49]:
# if at some places string is missing then it is replaced with 'none' so to counter this kind of error we will use
# if and else condition inside a list comprehension.

scores = [None if s is None else s.strip('%') for s in scores]
scores

['60',
 '53',
 '60',
 '61',
 '62',
 '62',
 '64',
 '64',
 '65',
 '65',
 '65',
 '65',
 '66',
 '67',
 '67',
 '67',
 '68',
 '68',
 '68',
 '68',
 '68',
 '68',
 '64',
 '69',
 '69',
 '70',
 '70',
 '70',
 '70',
 '70',
 '71',
 '71',
 '71',
 '71',
 '71',
 '71',
 '71',
 '72',
 '72',
 '73',
 '74',
 '74',
 '74',
 None,
 '75',
 '76',
 '76',
 '77',
 '77',
 '78',
 '78',
 '78',
 '78',
 '78',
 '79',
 '79',
 '79',
 '80',
 '80',
 '80',
 '80',
 '80',
 '81',
 '81',
 '81',
 '81',
 '81',
 '81',
 '81',
 '81',
 '81',
 '82',
 '82',
 '77',
 '83',
 '83',
 None,
 '83',
 '83',
 '83',
 '84',
 '80',
 '84',
 '84',
 '84',
 '86',
 '84',
 '85',
 '86',
 '86',
 '86',
 '86',
 '86',
 '86',
 '86',
 '87',
 '87',
 None,
 '87',
 '88',
 '84',
 '88',
 '88',
 '88',
 '88',
 '88',
 '88',
 '89',
 '89',
 '89',
 '89',
 '89',
 '89',
 '90',
 '90',
 '90',
 '90',
 '90',
 '90',
 '91',
 '91',
 '91',
 '91',
 '91',
 '91',
 '91',
 '92',
 '92',
 '92',
 '92',
 '92',
 '95',
 '92',
 '92',
 '92',
 '92',
 '93',
 '93',
 '93',
 '93',
 '93',
 '93',
 None,

In [50]:
int(scores[0])

60

In [51]:
scores = [None if s is None else int(s) for s in scores]
scores

[60,
 53,
 60,
 61,
 62,
 62,
 64,
 64,
 65,
 65,
 65,
 65,
 66,
 67,
 67,
 67,
 68,
 68,
 68,
 68,
 68,
 68,
 64,
 69,
 69,
 70,
 70,
 70,
 70,
 70,
 71,
 71,
 71,
 71,
 71,
 71,
 71,
 72,
 72,
 73,
 74,
 74,
 74,
 None,
 75,
 76,
 76,
 77,
 77,
 78,
 78,
 78,
 78,
 78,
 79,
 79,
 79,
 80,
 80,
 80,
 80,
 80,
 81,
 81,
 81,
 81,
 81,
 81,
 81,
 81,
 81,
 82,
 82,
 77,
 83,
 83,
 None,
 83,
 83,
 83,
 84,
 80,
 84,
 84,
 84,
 86,
 84,
 85,
 86,
 86,
 86,
 86,
 86,
 86,
 86,
 87,
 87,
 None,
 87,
 88,
 84,
 88,
 88,
 88,
 88,
 88,
 88,
 89,
 89,
 89,
 89,
 89,
 89,
 90,
 90,
 90,
 90,
 90,
 90,
 91,
 91,
 91,
 91,
 91,
 91,
 91,
 92,
 92,
 92,
 92,
 92,
 95,
 92,
 92,
 92,
 92,
 93,
 93,
 93,
 93,
 93,
 93,
 None,
 93,
 93,
 93,
 93,
 94,
 94,
 94,
 94,
 94,
 94,
 94,
 94,
 94,
 93,
 95,
 None,
 95,
 95,
 97,
 94,
 95,
 95,
 86,
 95,
 96,
 96,
 96,
 96,
 96,
 96,
 97,
 None,
 97,
 97,
 97,
 97,
 97,
 97,
 97,
 97,
 97,
 98,
 None,
 98,
 98,
 98,
 98,
 98,
 98,
 99,
 99,
 99,
 99,
 100,


## Extracting rest of the information

### extracting Synopsis:

In [52]:
divs[0]

<div class="col-sm-18 col-full-xs countdown-item-content">
<div class="row countdown-item-title-bar">
<div class="col-sm-20 col-full-xs" style="height: 100%;">
<div class="article_movie_title" style="float: left;">
<h2><a href="//www.rottentomatoes.com/tv/on_the_verge">On the Verge</a> <span class="subtle start-year">(2021)</span> <span class="icon tiny fresh" title="Fresh"></span> <span class="tMeterScore">60%</span></h2>
</div>
</div>
<div class="col-sm-4 col-full-xs" style="height: 100%;">
<div class="countdown-index">#214</div>
</div>
</div>
<div class="row countdown-item-details">
<div class="col-sm-24">
<div class="info synopsis"><span class="descriptor">Synopsis:</span> Four women -- a chef, a single mother, an heiress, and a job seeker -- dig into love, work and...<a class="" data-pageheader="" href="//www.rottentomatoes.com/tv/on_the_verge" target="_top"> [More]</a></div>
<div class="info cast">
<span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.c

In [61]:
divs[0].find('div',{'class':'info synopsis'}).text

'Synopsis: Four women -- a chef, a single mother, an heiress, and a job seeker -- dig into love, work and... [More]'

In [104]:
synopsis = [syn.find('div',{'class':'info synopsis'}) for syn in divs]
synopsis

[<div class="info synopsis"><span class="descriptor">Synopsis:</span> Four women -- a chef, a single mother, an heiress, and a job seeker -- dig into love, work and...<a class="" data-pageheader="" href="//www.rottentomatoes.com/tv/on_the_verge" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> This reimagining of the classic book and film is a coming-of-age story about a young orphan who is seeking love,...<a class="" data-pageheader="" href="//www.rottentomatoes.com/tv/anne" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> A documentary-style comedy-drama follows a group of people living on the fringe of society. At the group's core is Derek...<a class="" data-pageheader="" href="//www.rottentomatoes.com/tv/derek" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> A young nurse at a mental institution becomes jaded and bitter b

In [105]:
type(synopsis[0])

bs4.element.Tag

In [106]:
synopsis[0].get_text()

'Synopsis: Four women -- a chef, a single mother, an heiress, and a job seeker -- dig into love, work and... [More]'

In [107]:
synopsis[0].contents

[<span class="descriptor">Synopsis:</span>,
 ' Four women -- a chef, a single mother, an heiress, and a job seeker -- dig into love, work and...',
 <a class="" data-pageheader="" href="//www.rottentomatoes.com/tv/on_the_verge" target="_top"> [More]</a>]

In [111]:
synopsis[0].contents[1].strip()

'Four women -- a chef, a single mother, an heiress, and a job seeker -- dig into love, work and...'

In [112]:
synopsis_text = [ None if s is None else s.contents[1].strip() for s in synopsis]
synopsis_text

['Four women -- a chef, a single mother, an heiress, and a job seeker -- dig into love, work and...',
 'This reimagining of the classic book and film is a coming-of-age story about a young orphan who is seeking love,...',
 "A documentary-style comedy-drama follows a group of people living on the fringe of society. At the group's core is Derek...",
 'A young nurse at a mental institution becomes jaded and bitter before turning into a full-fledged monster to her patients....',
 'A single mother enters a world of twisted mind games when she begins an affair with her psychiatrist boss while...',
 'From the creators of "Damages," "Bloodline" is a dramatic thriller that explores the demons lurking beneath the surface of a...',
 "Zoe Walker leaves her quiet life behind to investigate her brother's disappearance in Ibiza, where she heads down a decadent...",
 'After exacting revenge on the people responsible for the deaths of his wife and children, Frank Castle uncovers a conspiracy...',
 'In 

## extracting cast

In [118]:
divs[0].find('div',{'class':'info cast'}).find_all('a')

[<a class="" href="//www.rottentomatoes.com/celebrity/julie_delpy">Julie Delpy</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/elisabeth_shue">Elisabeth Shue</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/sarah_jones_5">Sarah Jones</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/alexia_landeau">Alexia Landeau</a>]

In [119]:
cast_tags = [a.find('div',{'class':'info cast'}) for a in divs]
cast_tags

[<div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.com/celebrity/julie_delpy">Julie Delpy</a>, <a class="" href="//www.rottentomatoes.com/celebrity/elisabeth_shue">Elisabeth Shue</a>, <a class="" href="//www.rottentomatoes.com/celebrity/sarah_jones_5">Sarah Jones</a>, <a class="" href="//www.rottentomatoes.com/celebrity/alexia_landeau">Alexia Landeau</a></div>,
 <div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.com/celebrity/amybeth_mcnulty">Amybeth McNulty</a>, <a class="" href="//www.rottentomatoes.com/celebrity/rh_thomson">R.H. Thomson</a>, <a class="" href="//www.rottentomatoes.com/celebrity/geraldine_james">Geraldine James</a>, <a class="" href="//www.rottentomatoes.com/celebrity/lucas_jade_zumann">Lucas Jade Zumann</a></div>,
 <div class="info cast">
 <span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.com/celebrity/ricky_gervais">Ricky Gerv

In [120]:
cast_tags[0]

<div class="info cast">
<span class="descriptor">Starring:</span> <a class="" href="//www.rottentomatoes.com/celebrity/julie_delpy">Julie Delpy</a>, <a class="" href="//www.rottentomatoes.com/celebrity/elisabeth_shue">Elisabeth Shue</a>, <a class="" href="//www.rottentomatoes.com/celebrity/sarah_jones_5">Sarah Jones</a>, <a class="" href="//www.rottentomatoes.com/celebrity/alexia_landeau">Alexia Landeau</a></div>

In [127]:
cast_tags[0].find_all('a')

[<a class="" href="//www.rottentomatoes.com/celebrity/julie_delpy">Julie Delpy</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/elisabeth_shue">Elisabeth Shue</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/sarah_jones_5">Sarah Jones</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/alexia_landeau">Alexia Landeau</a>]

In [129]:
[a.find_all('a') for a in cast_tags]

[[<a class="" href="//www.rottentomatoes.com/celebrity/julie_delpy">Julie Delpy</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/elisabeth_shue">Elisabeth Shue</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/sarah_jones_5">Sarah Jones</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/alexia_landeau">Alexia Landeau</a>],
 [<a class="" href="//www.rottentomatoes.com/celebrity/amybeth_mcnulty">Amybeth McNulty</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/rh_thomson">R.H. Thomson</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/geraldine_james">Geraldine James</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/lucas_jade_zumann">Lucas Jade Zumann</a>],
 [<a class="" href="//www.rottentomatoes.com/celebrity/ricky_gervais">Ricky Gervais</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/karl_pilkington">Karl Pilkington</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/kerry_godliman">Kerry Godliman</a>,
  <

In [136]:
cast_a_tags = []

for a in cast_tags:
    anchor = a.find_all('a')
    cast_a_tags.append(anchor)

In [137]:
len(cast_a_tags)

214

In [144]:
cast_a_tags[0]

[<a class="" href="//www.rottentomatoes.com/celebrity/julie_delpy">Julie Delpy</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/elisabeth_shue">Elisabeth Shue</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/sarah_jones_5">Sarah Jones</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/alexia_landeau">Alexia Landeau</a>]

In [150]:
cast = []

for a in cast_a_tags:
    cast_mov =[]
    for t in a:
        ab = t.text
        cast_mov.append(ab)
    result = ", ".join(cast_mov)
    cast.append(result)

In [151]:
cast

['Julie Delpy, Elisabeth Shue, Sarah Jones, Alexia Landeau',
 'Amybeth McNulty, R.H. Thomson, Geraldine James, Lucas Jade Zumann',
 'Ricky Gervais, Karl Pilkington, Kerry Godliman, David Earl',
 'Sarah Paulson, Sharon Stone, Cynthia Nixon, Finn Wittrock',
 'Simona Brown, Robert Aramayo, Tyler Howitt, Eve Hewson',
 'Kyle Chandler, Sissy Spacek, Linda Cardellini, Norbert Leo Butz',
 'Laura Haddock, Daniel Mays, Guillermo Lasheras, Pedro Casablanc',
 'Jon Bernthal, Ben Barnes, Amber Rose Revah, Jason R. Moore',
 'Katherine Ryan, Steen Raskopoulos, Kate Byrne, Rory Keenan',
 'Steven Van Zandt, Trond Fausa Aurvåg, Marian Saastad Ottesen, Steinar Sagen',
 'Ewan McGregor, Bill Pullman, Rebecca Dayan, David Pittu',
 'Abbi Jacobson, Nat Faxon, Eric André, John DiMaggio',
 'Lorenzo Richelmy, Michelle Yeoh, Benedict Wong, Joan Chen',
 "Odessa A'Zion, Odley Jean, Maliq Johnson, Amalia Yoo",
 '',
 'Katherine Langford, Devon Terrell, Gustaf Skarsgård, Peter Mullan',
 '',
 'Octavia Spencer, Tiffany H

In [152]:
len(cast)

214

## extracting directors info 

In [157]:
divs[1].find('div',{'class':'info director'})

<div class="info director">
<span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/miranda_de_pencier">Miranda de Pencier</a>, <a class="" href="//www.rottentomatoes.com/celebrity/elizabeth_bradley">Elizabeth Bradley</a>, <a class="" href="//www.rottentomatoes.com/celebrity/alison_owen">Alison Owen</a>, <a class="" href="//www.rottentomatoes.com/celebrity/debra_hayward">Debra Hayward</a></div>

In [159]:
director_info = [d.find('div',{'class':'info director'}) for d in divs]
director_info

[<div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/julie_delpy">Julie Delpy</a>, <a class="" href="//www.rottentomatoes.com/celebrity/elisabeth_shue">Elisabeth Shue</a>, <a class="" href="//www.rottentomatoes.com/celebrity/michael_gentile">Michael Gentile</a>, <a class="" href="//www.rottentomatoes.com/celebrity/lauraine_heftler">Lauraine Heftler</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/celebrity/miranda_de_pencier">Miranda de Pencier</a>, <a class="" href="//www.rottentomatoes.com/celebrity/elizabeth_bradley">Elizabeth Bradley</a>, <a class="" href="//www.rottentomatoes.com/celebrity/alison_owen">Alison Owen</a>, <a class="" href="//www.rottentomatoes.com/celebrity/debra_hayward">Debra Hayward</a></div>,
 <div class="info director">
 <span class="descriptor">Directed By:</span> <a class="" href="//www.rottentomatoes.com/ce

In [161]:
director_info[0].find_all('a')

[<a class="" href="//www.rottentomatoes.com/celebrity/julie_delpy">Julie Delpy</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/elisabeth_shue">Elisabeth Shue</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/michael_gentile">Michael Gentile</a>,
 <a class="" href="//www.rottentomatoes.com/celebrity/lauraine_heftler">Lauraine Heftler</a>]

In [163]:
director_atag = [None if d is None else d.find_all('a') for d in director_info]
director_atag

[[<a class="" href="//www.rottentomatoes.com/celebrity/julie_delpy">Julie Delpy</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/elisabeth_shue">Elisabeth Shue</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/michael_gentile">Michael Gentile</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/lauraine_heftler">Lauraine Heftler</a>],
 [<a class="" href="//www.rottentomatoes.com/celebrity/miranda_de_pencier">Miranda de Pencier</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/elizabeth_bradley">Elizabeth Bradley</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/alison_owen">Alison Owen</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/debra_hayward">Debra Hayward</a>],
 [<a class="" href="//www.rottentomatoes.com/celebrity/ricky_gervais">Ricky Gervais</a>],
 [<a class="" href="//www.rottentomatoes.com/celebrity/ryan_murphy">Ryan Murphy</a>,
  <a class="" href="//www.rottentomatoes.com/celebrity/michael_douglas">Michael Douglas</

In [166]:
director_atag[0][0].text

'Julie Delpy'

In [176]:
director = []

for l in director_atag:
    d_names = []
    
    if l is None: # accounting for none values
        director.append("") # adding space to maintain total length of list
        continue
    else:
        for a in l:
            name = a.text
            d_names.append(name)
    result = ", ".join(d_names)
    director.append(result)

In [177]:
director

['Julie Delpy, Elisabeth Shue, Michael Gentile, Lauraine Heftler',
 'Miranda de Pencier, Elizabeth Bradley, Alison Owen, Debra Hayward',
 'Ricky Gervais',
 'Ryan Murphy, Michael Douglas, Aleen Keshishian, Margaret Riley',
 'Erik Richter Strand',
 'Todd A. Kessler, Daniel Zelman, Glenn Kessler',
 'Nick Hamm, Luis Prieto',
 'Steve Lightfoot',
 'Katherine Ryan, Dave Becky, Josh Lieberman, Murray Ferguson',
 'Geir Henning Hopland, Simen Alsvik, Lisa Marie Gamlem, Øystein Karlsen',
 'Ryan Murphy, Alexis Martin Woodall, Ian Brennan, Daniel Minahan',
 'Matt Groening, Josh Weinstein',
 'John Fusco, Dan Minahan, Patrick MacManus, Harvey Weinstein',
 'Josh Donen, Beau Willimon, Jordan Tappis',
 'Angus Wall, Christina Douglas, Dan Wetzel, Kevin Armstrong',
 'Frank Miller, Thomas Wheeler, Leila Gerstein, Zetna Fuentes',
 'Dimitri Doganis, Adam Hawkins, Jon Liebman, Bart Layton',
 'Janine Sherman Barrois, Elle Johnson, Maverick Carter, LeBron James',
 'Chris Coelen, Sam Dean, Eric Detwiler, Ally Si

In [178]:
len(director)

214

# Representing data in structured form

In [179]:
# for this we will use pandas dataframe.

In [180]:
import pandas as pd

### creating a dataframe

In [181]:
netflix_series = pd.DataFrame()

### populating dataframe

In [182]:
netflix_series['series_names'] = movie_names

In [183]:
netflix_series['release_year'] = years

In [184]:
netflix_series['ratings'] = scores

In [185]:
netflix_series['synopsis'] = synopsis_text

In [186]:
netflix_series['cast'] = cast

In [187]:
netflix_series['director'] = director

In [190]:
netflix_series.head()

Unnamed: 0,series_names,release_year,ratings,synopsis,cast,director
0,On the Verge,2021,60.0,"Four women -- a chef, a single mother, an heiress, and a job seeker -- dig into love, work and...","Julie Delpy, Elisabeth Shue, Sarah Jones, Alexia Landeau","Julie Delpy, Elisabeth Shue, Michael Gentile, Lauraine Heftler"
1,Anne With an E,2017,53.0,"This reimagining of the classic book and film is a coming-of-age story about a young orphan who is seeking love,...","Amybeth McNulty, R.H. Thomson, Geraldine James, Lucas Jade Zumann","Miranda de Pencier, Elizabeth Bradley, Alison Owen, Debra Hayward"
2,Derek,2013,60.0,A documentary-style comedy-drama follows a group of people living on the fringe of society. At the group's core is Derek...,"Ricky Gervais, Karl Pilkington, Kerry Godliman, David Earl",Ricky Gervais
3,Ratched,2020,61.0,A young nurse at a mental institution becomes jaded and bitter before turning into a full-fledged monster to her patients....,"Sarah Paulson, Sharon Stone, Cynthia Nixon, Finn Wittrock","Ryan Murphy, Michael Douglas, Aleen Keshishian, Margaret Riley"
4,Behind Her Eyes,2021,62.0,A single mother enters a world of twisted mind games when she begins an affair with her psychiatrist boss while...,"Simona Brown, Robert Aramayo, Tyler Howitt, Eve Hewson",Erik Richter Strand


In [189]:
# this code is to display full column text if not displayed properly
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


In [191]:
netflix_series.head()

Unnamed: 0,series_names,release_year,ratings,synopsis,cast,director
0,On the Verge,2021,60.0,"Four women -- a chef, a single mother, an heiress, and a job seeker -- dig into love, work and...","Julie Delpy, Elisabeth Shue, Sarah Jones, Alexia Landeau","Julie Delpy, Elisabeth Shue, Michael Gentile, Lauraine Heftler"
1,Anne With an E,2017,53.0,"This reimagining of the classic book and film is a coming-of-age story about a young orphan who is seeking love,...","Amybeth McNulty, R.H. Thomson, Geraldine James, Lucas Jade Zumann","Miranda de Pencier, Elizabeth Bradley, Alison Owen, Debra Hayward"
2,Derek,2013,60.0,A documentary-style comedy-drama follows a group of people living on the fringe of society. At the group's core is Derek...,"Ricky Gervais, Karl Pilkington, Kerry Godliman, David Earl",Ricky Gervais
3,Ratched,2020,61.0,A young nurse at a mental institution becomes jaded and bitter before turning into a full-fledged monster to her patients....,"Sarah Paulson, Sharon Stone, Cynthia Nixon, Finn Wittrock","Ryan Murphy, Michael Douglas, Aleen Keshishian, Margaret Riley"
4,Behind Her Eyes,2021,62.0,A single mother enters a world of twisted mind games when she begins an affair with her psychiatrist boss while...,"Simona Brown, Robert Aramayo, Tyler Howitt, Eve Hewson",Erik Richter Strand


### exporting this data to a csv file

In [192]:
netflix_series.to_csv('netflix_series.csv', index= False)