# Netflix original films webscraping
### A webscraping script that reads in Wikipedia pages for lists of Netflix original films, turns them into pandas DataFrames, and then saves out as csv files.

In [1]:
import pandas as pd # for data analysis
import requests # need this to talk to a website
from bs4 import BeautifulSoup # to parse HTML documents
import numpy as np

## Let's generate a pandas DataFrame from tables in a Wikipedia page -- 2015-2017 for example

In [2]:
# check website response
wikiurl="https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(2015-2017)"
table_class="wikitable sortable jquery-tablesorter"
response=requests.get(wikiurl)
print(response.status_code) # 200 means good to go

200


In [3]:
# parse html into a beautifulsoup object
soup = BeautifulSoup(response.text, 'html.parser') # get the content
table=soup.find_all('table',{'class':"wikitable"}) # find all <table> with "wikitable" class
print(len(table)) # 3 tables in this wikipedia page
print(table[0])

3
<table class="wikitable sortable" style="width:100%">
<tbody><tr>
<th style="width:35%;">Title
</th>
<th scope="col" style="width:12%;">Release date
</th>
<th>Genre
</th>
<th>Runtime
</th>
<th>Language
</th></tr>
<tr>
<td><i><a href="/wiki/Beasts_of_No_Nation_(film)" title="Beasts of No Nation (film)">Beasts of No Nation</a></i>
</td>
<td>October 16, 2015
</td>
<td><a href="/wiki/War_film" title="War film">War drama</a>
</td>
<td><span data-sort-value="137 !">2 h 17 min</span>
</td>
<td>English
</td></tr>
<tr>
<td><i><a href="/wiki/The_Ridiculous_6" title="The Ridiculous 6">The Ridiculous 6</a></i>
</td>
<td>December 11, 2015
</td>
<td><a href="/wiki/Western_(genre)" title="Western (genre)">Western</a> <a href="/wiki/Comedy_film" title="Comedy film">comedy</a>
</td>
<td><span data-sort-value="120 !">2 h</span>
</td>
<td>English
</td></tr>
<tr>
<td><i><a href="/wiki/Pee-wee%27s_Big_Holiday" title="Pee-wee's Big Holiday">Pee-wee's Big Holiday</a></i>
</td>
<td>March 18, 2016
</td>
<td>

In [4]:
# find film types and add to individual table before concatenating
heading=soup.find_all('h2')

# these are common heading across wikipedia pages, so let's drop them
headings_to_drop = ['Contents','References','External links']

# loop over all headings and grab film type names
film_type = []
for h in range(len(heading)):
    name_str = str(heading[h])
    start_idx = name_str.find(">")+1 # where the name string starts
    end_idx = name_str.find("</") # where the name string ends
    heading_str = name_str[start_idx:end_idx]
    heading_str == headings_to_drop
    if not heading_str in headings_to_drop:
        film_type.append(heading_str)

# make sure that the number of film types match the number of tables we grabbed earlier
if len(table) != len(film_type):
    print("Warning: the number of film types does not match the number of tables")

print(film_type)


['Feature films', 'Documentaries', 'Specials']


In [5]:
# now concatenate all three tables while adding the film type as a new column at the same time
dfs = []
for d in range(len(table)):
    df=pd.read_html(str(table[d]))
    # convert list to dataframe
    df=pd.DataFrame(df[0])
    df['Film type'] = film_type[d]
    dfs.append(df)
final_df = pd.concat(dfs, ignore_index=True)
print(final_df)
# final_df.describe()

                                                 Title       Release date  \
0                                  Beasts of No Nation   October 16, 2015   
1                                     The Ridiculous 6  December 11, 2015   
2                                Pee-wee's Big Holiday     March 18, 2016   
3                               Special Correspondents     April 29, 2016   
4                                          The Do-Over       May 27, 2016   
..                                                 ...                ...   
101             Justin Timberlake + The Tennessee Kids   October 12, 2016   
102  13th: A Conversation with Oprah Winfrey & Ava ...   January 26, 2017   
103  Michael Bolton's Big, Sexy Valentine's Day Spe...   February 7, 2017   
104                                        Rodney King     April 28, 2017   
105         Barbra: The Music, The Mem'ries, The Magic  November 22, 2017   

                     Genre     Runtime Language      Film type  
0         

  df=pd.read_html(str(table[d]))
  df=pd.read_html(str(table[d]))
  df=pd.read_html(str(table[d]))


## Do the same for all other pages -- writing a function that we can pass in the url to make this easier

In [6]:
def wikitable2df(url,add_heading=False,heading_name=None):
    '''
    reads in tables in a wikipedia page and converts into a pandas DataFrame.
    '''
    from io import StringIO # for pd.read_html
    
    # check website response
    table_class="wikitable sortable jquery-tablesorter"
    response=requests.get(url)
    if response.status_code != 200:
        raise ValueError("No-go on the status_code: May not be allowed to web-scrape from this website")

    # parse html into a beautifulsoup object
    soup = BeautifulSoup(response.text, 'html.parser') # get the content
    table=soup.find_all('table',{'class':"wikitable"}) # find all <table> with "wikitable" class

    # if add_heading = True, grab heading values and add an additional column filled with that value
    if add_heading:
        # find headings and add to individual table before concatenating
        heading=soup.find_all('h2')
        
        # these are some common headings that can exist in any wikipedia pages, so let's drop them
        headings_to_drop = ['Contents','References','External links','Notes']
        
        # loop over all headings and grab heading strings
        new_col = []
        for h in range(len(heading)):
            name_str = str(heading[h])
            start_idx = name_str.find(">")+1 # where the name string starts
            end_idx = name_str.find("</") # where the name string ends
            heading_str = name_str[start_idx:end_idx]
            if not heading_str in headings_to_drop:
                new_col.append(heading_str)
        
        # make sure that the number of headings match the number of tables we grabbed earlier
        if len(table) != len(new_col):
            print(url, new_col)
            raise ValueError("The number of headings does not match the number of tables")
        
    # now concatenate all tables while adding the film type as a new column at the same time
    dfs = []
    for d in range(len(table)):
        df=pd.read_html(StringIO(str(table[d])))
        # convert list to dataframe
        df=pd.DataFrame(df[0])
        df[str(heading_name)] = new_col[d]
        dfs.append(df)
    final_df = pd.concat(dfs, ignore_index=True)
    return final_df

Now we can run this function over all urls and then concatenate the returned DataFrames.

In [7]:
urls = ["https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(2015-2017)",
        "https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(2018)",
        "https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(2019)",
        "https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(2020)",
        "https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(2021)",
        "https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(2022)",
        "https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(2023)",
        "https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(2024)"]
        # "https://en.wikipedia.org/wiki/List_of_Netflix_original_films_(since_2025)"] excluding 2025- page for now
        
dfs = []
for url in urls:
    df = wikitable2df(url,add_heading=True,heading_name='Film type')
    dfs.append(df)
final_df = pd.concat(dfs, ignore_index=True)
final_df

Unnamed: 0,Title,Release date,Genre,Runtime,Language,Film type
0,Beasts of No Nation,"October 16, 2015",War drama,2 h 17 min,English,Feature films
1,The Ridiculous 6,"December 11, 2015",Western comedy,2 h,English,Feature films
2,Pee-wee's Big Holiday,"March 18, 2016",Adventure comedy,1 h 30 min,English,Feature films
3,Special Correspondents,"April 29, 2016",Satire,1 h 41 min,English,Feature films
4,The Do-Over,"May 27, 2016",Action comedy,1 h 48 min,English,Feature films
...,...,...,...,...,...,...
1408,White Christmas Fireplace,"December 13, 2024",Fireplace,1 h,No dialogue,Specials
1409,Christmas Gameday: Chiefs vs. Steelers,"December 25, 2024",Sports event,5 h 26 min,English,Specials
1410,Christmas Gameday: Ravens vs. Texans,"December 25, 2024",Sports event,3 h 56 min,English,Specials
1411,Avicii – My Last Show,"December 31, 2024",DJ mixset,29 min,English,Specials


Let's clean the data a little bit before saving

In [8]:
final_df.info()
final_df_replaced = final_df.replace('', np.nan)  # Assuming empty cells are represented as empty strings

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1413 entries, 0 to 1412
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         1413 non-null   object
 1   Release date  1413 non-null   object
 2   Genre         1065 non-null   object
 3   Runtime       1413 non-null   object
 4   Language      1413 non-null   object
 5   Film type     1413 non-null   object
dtypes: object(6)
memory usage: 66.4+ KB


In [11]:
final_df_replaced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1413 entries, 0 to 1412
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         1413 non-null   object
 1   Release date  1413 non-null   object
 2   Genre         1065 non-null   object
 3   Runtime       1413 non-null   object
 4   Language      1413 non-null   object
 5   Film type     1413 non-null   object
dtypes: object(6)
memory usage: 66.4+ KB


In [14]:
final_df_replaced[final_df_replaced['Title']=='My Own Man']

Unnamed: 0,Title,Release date,Genre,Runtime,Language,Film type
59,My Own Man,"March 6, 2015",,1 h 21 min,English,Documentaries


Let's save out as a csv file.

In [10]:
final_df_replaced.to_csv('netflix_og_films.csv',encoding='utf-8-sig',index=False) # using utf-8-sig for special characters in the title