In [1]:
from io import StringIO # in-memory text buffer handed to pd.read_html

import pandas as pd # library for data analysis
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents

In [2]:
# Download the Wikipedia list of Netflix original films for 2015–2017 as HTML.
wikiurl="https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(2015%E2%80%932017)"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response=requests.get(wikiurl, timeout=30)
# Fail here on a 4xx/5xx reply instead of silently parsing an error page in later cells.
response.raise_for_status()

In [3]:
# Turn the downloaded page text into a BeautifulSoup tree (built-in html.parser backend).
soup = BeautifulSoup(response.text, 'html.parser')
# Gather every <table> element in the document that carries the "wikitable" CSS class.
netflix_table = soup.find_all('table', class_="wikitable")

In [4]:
# Dump the raw result of find_all (the matched <table> elements) to inspect the scraped HTML.
print(netflix_table)

[<table class="wikitable sortable" style="width:100%">
<tbody><tr>
<th style="width:20%;">Title
</th>
<th style="width:12%;">Genre
</th>
<th style="width:7%;">Premiere
</th>
<th style="width:7%;">Runtime
</th>
<th style="width:5%;">Language
</th></tr>
<tr>
<td><i><a href="/wiki/Beasts_of_No_Nation_(film)" title="Beasts of No Nation (film)">Beasts of No Nation</a></i>
</td>
<td><a href="/wiki/War_film" title="War film">War drama</a>
</td>
<td><span data-sort-value="000000002015-10-16-0000" style="white-space:nowrap">October 16, 2015</span>
</td>
<td><span data-sort-value="137 !">2 hours, 17 min.</span>
</td>
<td>English
</td></tr>
<tr>
<td><i><a href="/wiki/The_Ridiculous_6" title="The Ridiculous 6">The Ridiculous 6</a></i>
</td>
<td><a href="/wiki/Western_(genre)" title="Western (genre)">Western</a>
</td>
<td><span data-sort-value="000000002015-12-11-0000" style="white-space:nowrap">December 11, 2015</span>
</td>
<td><span data-sort-value="120 !">2 hours</span>
</td>
<td>English
</td><

In [5]:
# Parse the scraped tables into a list of DataFrames with read_html().
# pandas 2.1+ deprecates passing literal HTML text to read_html, so the markup
# is wrapped in a file-like StringIO buffer, which older pandas versions accept too.
df=pd.read_html(StringIO(str(netflix_table)))
df #returns a list

[                                            Title                       Genre  \
 0                             Beasts of No Nation                   War drama   
 1                                The Ridiculous 6                     Western   
 2                           Pee-wee's Big Holiday                   Adventure   
 3                          Special Correspondents                      Satire   
 4                                     The Do-Over               Action comedy   
 5                      The Fundamentals of Caring                Comedy-drama   
 6                                   Brahman Naman                      Comedy   
 7                                         Rebirth                    Thriller   
 8                                        Tallulah                Comedy-drama   
 9                                            XOXO                       Drama   
 10                                            ARQ  Science fiction / Thriller   
 11             

In [6]:
# Check the type of the first element of the list returned by pd.read_html.
type(df[0]) # list entries are DataFrames; original note says the list holds three — TODO confirm with len(df)

pandas.core.frame.DataFrame

In [7]:
# Preview the first rows of the first parsed table.
df[0].head()

Unnamed: 0,Title,Genre,Premiere,Runtime,Language
0,Beasts of No Nation,War drama,"October 16, 2015","2 hours, 17 min.",English
1,The Ridiculous 6,Western,"December 11, 2015",2 hours,English
2,Pee-wee's Big Holiday,Adventure,"March 18, 2016","1 hour, 30 min.",English
3,Special Correspondents,Satire,"April 29, 2016","1 hour, 41 min.",English
4,The Do-Over,Action comedy,"May 27, 2016","1 hour, 48 min.",English


In [8]:
# Tag every row of the second table as a documentary by inserting a constant
# 'Genre' column at position 1, so its columns line up with df[0] for concat.
# DataFrame.insert raises ValueError when the column already exists, and df is
# built in a different cell, so the guard makes re-running this cell safe.
if 'Genre' not in df[1].columns:
    df[1].insert(1,'Genre','Documentary')

In [9]:
# Preview the second table, which now includes the constant Genre column.
df[1].head()

Unnamed: 0,Title,Genre,Premiere,Runtime,Language
0,My Own Man,Documentary,"March 6, 2015","1 hour, 21 min.",English
1,The Other One: The Long Strange Trip of Bob Weir,Documentary,"May 22, 2015","1 hour, 23 min.",English
2,Hot Girls Wanted,Documentary,"May 29, 2015","1 hour, 24 min.",English
3,"What Happened, Miss Simone?",Documentary,"June 26, 2015","1 hour, 24 min.",English
4,Tig,Documentary,"July 17, 2015","1 hour, 20 min.",English


In [10]:
# Stack the first two parsed tables row-wise (original row labels are kept).
net_15_17 = pd.concat((df[0], df[1]))

In [11]:
# Display the combined 2015–2017 frame; the concat did not renumber rows, so index labels restart for the second table.
net_15_17

Unnamed: 0,Title,Genre,Premiere,Runtime,Language
0,Beasts of No Nation,War drama,"October 16, 2015","2 hours, 17 min.",English
1,The Ridiculous 6,Western,"December 11, 2015",2 hours,English
2,Pee-wee's Big Holiday,Adventure,"March 18, 2016","1 hour, 30 min.",English
3,Special Correspondents,Satire,"April 29, 2016","1 hour, 41 min.",English
4,The Do-Over,Action comedy,"May 27, 2016","1 hour, 48 min.",English
...,...,...,...,...,...
36,Joan Didion: The Center Will Not Hold,Documentary,"October 27, 2017","1 hour, 38 min.",English
37,Jim & Andy: The Great Beyond – Featuring a Ver...,Documentary,"November 17, 2017","1 hour, 34 min.",English
38,Saving Capitalism,Documentary,"November 21, 2017","1 hour, 13 min.",English
39,Cuba and the Cameraman,Documentary,"November 24, 2017","1 hour, 54 min.",English


# year 2018 data

In [12]:
# get the response in the form of html
wikiurl="https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(2018)"
response=requests.get(wikiurl, timeout=30) # timeout: do not hang forever on a stalled connection
response.raise_for_status() # stop on an HTTP error instead of parsing an error page

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(response.text, 'html.parser')
netflix_table=soup.find_all('table',{'class':"wikitable"}) #find_all scans the entire document to look for the tag <table>

# Read the HTML tables into a list of DataFrames. The markup is wrapped in
# StringIO because pandas 2.1+ deprecates passing literal HTML text to read_html.
df2=pd.read_html(StringIO(str(netflix_table)))

# Tag every row of the second table as a documentary (constant Genre column at position 1)
df2[1].insert(1,'Genre','Documentary')

# Stack the two tables row-wise; row labels are not renumbered here
net_18 = pd.concat([df2[0],df2[1]],axis=0)

In [13]:
# Display the combined 2018 frame.
net_18

Unnamed: 0,Title,Genre,Premiere,Runtime,Language
0,The Polka King,Comedy-drama,"January 12, 2018","1 hour, 34 min.",English
1,Step Sisters,Comedy,"January 19, 2018","1 hour, 48 min.",English
2,The Open House,Horror thriller,"January 19, 2018","1 hour, 34 min.",English
3,A Futile and Stupid Gesture,Biographical / Comedy,"January 26, 2018","1 hour, 41 min.",English
4,The Cloverfield Paradox,Science fiction,"February 4, 2018","1 hour, 42 min.",English
...,...,...,...,...,...
20,ReMastered: Tricky Dick & the Man in Black,Documentary,"November 2, 2018",58 min.,English
21,The American Meme,Documentary,"December 7, 2018","1 hour, 38 min.",English
22,ReMastered: Who Killed Jam Master Jay?,Documentary,"December 7, 2018",58 min.,English
23,"Out of Many, One",Documentary,"December 12, 2018",34 min.,English


# year 2019 data

In [14]:
# get the response in the form of html
wikiurl="https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(2019)"
response=requests.get(wikiurl, timeout=30) # timeout: do not hang forever on a stalled connection
response.raise_for_status() # stop on an HTTP error instead of parsing an error page

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(response.text, 'html.parser')
netflix_table=soup.find_all('table',{'class':"wikitable"}) #find_all scans the entire document to look for the tag <table>

# Read the HTML tables into a list of DataFrames. The markup is wrapped in
# StringIO because pandas 2.1+ deprecates passing literal HTML text to read_html.
df3=pd.read_html(StringIO(str(netflix_table)))

# Tag every row of the second table as a documentary (constant Genre column at position 1)
df3[1].insert(1,'Genre','Documentary')

# Stack the two tables row-wise; row labels are not renumbered here
net_19 = pd.concat([df3[0],df3[1]],axis=0)

In [15]:
# Display the combined 2019 frame.
net_19

Unnamed: 0,Title,Genre,Premiere,Runtime,Language
0,Lionheart,Comedy,"January 4, 2019","1 hour, 34 min.",English
1,The Last Laugh,Comedy-drama,"January 11, 2019","1 hour, 38 min.",English
2,IO,Science fiction / Drama,"January 18, 2019","1 hour, 36 min.",English
3,Soni,Crime drama,"January 18, 2019","1 hour, 37 min.",Hindi
4,Polar,Action,"January 25, 2019","1 hour, 58 min.",English
...,...,...,...,...,...
35,Fire in Paradise,Documentary,"November 1, 2019",39 min.,English
36,"Bikram: Yogi, Guru, Predator",Documentary,"November 20, 2019","1 hour, 26 min.",English
37,"Lorena, Light-Footed Woman",Documentary,"November 20, 2019",28 min.,Spanish
38,After the Raid,Documentary,"December 19, 2019",25 min.,Spanish


# year 2020 data

In [16]:
# get the response in the form of html
wikiurl="https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(2020)"
response=requests.get(wikiurl, timeout=30) # timeout: do not hang forever on a stalled connection
response.raise_for_status() # stop on an HTTP error instead of parsing an error page

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(response.text, 'html.parser')
netflix_table=soup.find_all('table',{'class':"wikitable"}) #find_all scans the entire document to look for the tag <table>

# Read the HTML tables into a list of DataFrames. The markup is wrapped in
# StringIO because pandas 2.1+ deprecates passing literal HTML text to read_html.
df4=pd.read_html(StringIO(str(netflix_table)))

# Tag every row of the second table as a documentary (constant Genre column at position 1)
df4[1].insert(1,'Genre','Documentary')

# Stack the two tables row-wise; row labels are not renumbered here
net_20 = pd.concat([df4[0],df4[1]],axis=0)

In [17]:
# Display the combined 2020 frame.
net_20

Unnamed: 0,Title,Genre,Premiere,Runtime,Language
0,Ghost Stories,Horror anthology,"January 1, 2020","2 hours, 24 min.",Hindi
1,A Fall from Grace,Thriller,"January 17, 2020",2 hours,English
2,Airplane Mode,Comedy,"January 23, 2020","1 hour, 36 min.",Portuguese
3,Horse Girl,Drama,"February 7, 2020","1 hour, 44 min.",English
4,To All the Boys: P.S. I Still Love You,Romantic comedy,"February 12, 2020","1 hour, 42 min.",English
...,...,...,...,...,...
35,Secrets of the Saqqara Tomb,Documentary,"October 28, 2020","1 hour, 54 min.",English
36,Shawn Mendes: In Wonder,Documentary,"November 23, 2020","1 hour, 23 min.",English
37,Dance Dreams: Hot Chocolate Nutcracker,Documentary,"November 27, 2020","1 hour, 20 min..",English
38,Emicida: AmarElo – It's All For Yesterday,Documentary,"December 8, 2020","1 hour, 29 min.",Portuguese


# year 2021 data

In [18]:
# get the response in the form of html
wikiurl="https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(since_2021)"
response=requests.get(wikiurl, timeout=30) # timeout: do not hang forever on a stalled connection
response.raise_for_status() # stop on an HTTP error instead of parsing an error page

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(response.text, 'html.parser')
netflix_table=soup.find_all('table',{'class':"wikitable"}) #find_all scans the entire document to look for the tag <table>

# Read the HTML tables into a list of DataFrames. The markup is wrapped in
# StringIO because pandas 2.1+ deprecates passing literal HTML text to read_html.
df5=pd.read_html(StringIO(str(netflix_table)))

# Tag every row of the second table as a documentary (constant Genre column at position 1)
df5[1].insert(1,'Genre','Documentary')

In [19]:
# Preview the tail of the first 135 rows of df5[0] to check the cutoff used to
# leave out unreleased titles. This cell only displays a slice; nothing is deleted.
# NOTE(review): 135 is a hard-coded row position tied to the page as scraped — confirm on re-run.
df5[0].iloc[0:135,].tail()

Unnamed: 0,Title,Genre,Premiere,Runtime,Language
130,Just Short of Perfect,Romantic comedy,"November 18, 2021",1 h 34 min,Portuguese
131,The Princess Switch 3: Romancing the Star,Romantic comedy,"November 18, 2021",1 h 46 min,English
132,Dhamaka,Action thriller,"November 19, 2021",1 h 44 min,Hindi
133,Love Me Instead,Drama,"November 19, 2021",2 h 4 min,Turkish
134,"Tick, Tick... Boom!",Musical,"November 19, 2021",1 h 55 min,English


In [20]:
# Preview the tail of the first 41 rows of df5[1] (display only).
# NOTE(review): 41 is a hard-coded row position tied to the page as scraped — confirm on re-run.
df5[1].iloc[0:41,].tail()

Unnamed: 0,Title,Genre,Premiere,Runtime,Language
36,Found,Documentary,"October 20, 2021",1 h 38 min,English
37,Flip a Coin – One OK Rock Documentary,Documentary,"October 21, 2021",1 h 45 min,Japanese
38,Lords of Scam,Documentary,"November 3, 2021",1 h 45 min,French
39,A Cop Movie,Documentary,"November 5, 2021",1 h 47 min,Spanish
40,Procession,Documentary,"November 19, 2021",1 h 58 min,English


In [21]:
# Keep only the leading rows of each 2021 table, then stack them row-wise.
net_21 = pd.concat(
    [
        df5[0].head(135),  # first 135 rows of the first table
        df5[1].head(41),   # first 41 rows of the second table
    ]
)

In [22]:
# Display the combined 2021 frame.
net_21

Unnamed: 0,Title,Genre,Premiere,Runtime,Language
0,What Happened to Mr. Cha?,Comedy,"January 1, 2021",1 h 42 min,Korean
1,Pieces of a Woman,Drama,"January 8, 2021",2 h 6 min,English
2,Stuck Apart,Drama,"January 8, 2021",1 h 36 min,Turkish
3,Double Dad,Comedy-drama,"January 15, 2021",1 h 45 min,Portuguese
4,Outside the Wire,Action / science fiction,"January 15, 2021",1 h 55 min,English
...,...,...,...,...,...
36,Found,Documentary,"October 20, 2021",1 h 38 min,English
37,Flip a Coin – One OK Rock Documentary,Documentary,"October 21, 2021",1 h 45 min,Japanese
38,Lords of Scam,Documentary,"November 3, 2021",1 h 45 min,French
39,A Cop Movie,Documentary,"November 5, 2021",1 h 47 min,Spanish


# Combine all tables

In [23]:
# Stack the per-period frames, oldest first, into one table (row labels are kept as-is).
net_full = pd.concat(
    (net_15_17, net_18, net_19, net_20, net_21)
)

In [24]:
# reset_index returns a new frame rather than modifying net_full, so the result
# must be assigned back; otherwise net_full keeps the repeated per-table labels.
net_full = net_full.reset_index(drop=True)
net_full

Unnamed: 0,Title,Genre,Premiere,Runtime,Language
0,Beasts of No Nation,War drama,"October 16, 2015","2 hours, 17 min.",English
1,The Ridiculous 6,Western,"December 11, 2015",2 hours,English
2,Pee-wee's Big Holiday,Adventure,"March 18, 2016","1 hour, 30 min.",English
3,Special Correspondents,Satire,"April 29, 2016","1 hour, 41 min.",English
4,The Do-Over,Action comedy,"May 27, 2016","1 hour, 48 min.",English
...,...,...,...,...,...
642,Found,Documentary,"October 20, 2021",1 h 38 min,English
643,Flip a Coin – One OK Rock Documentary,Documentary,"October 21, 2021",1 h 45 min,Japanese
644,Lords of Scam,Documentary,"November 3, 2021",1 h 45 min,French
645,A Cop Movie,Documentary,"November 5, 2021",1 h 47 min,Spanish


In [27]:
# Renumber rows 0..N-1 at write time so the CSV's index column is unique even
# when net_full still carries the repeated labels left by concatenation.
# NOTE(review): absolute, machine-specific output path — consider a path relative to the notebook.
net_full.reset_index(drop=True).to_csv('C:/Users/Will Jiang/Desktop/Emory Desktop/Big Data/Final Project/data/netflix_original_movies.csv')