In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load data
titles = pd.read_csv('./data_unzipped/netflix_titles.csv')
subscribers = pd.read_csv('./netflix_sub_count.csv')

# Convert date_added to datetime
titles['date_added'] = pd.to_datetime(titles['date_added'])

# Create separate dataframes for movies and tv shows
movies = titles[titles['type'] == 'Movie'].copy()
tv_shows = titles[titles['type'] == 'TV Show'].copy()

# Fix rows where duration was input to rating
movies.loc[5541, 'duration'] = movies.loc[5541, 'rating']
movies.loc[5794, 'duration'] = movies.loc[5794, 'rating']
movies.loc[5813, 'duration'] = movies.loc[5813, 'rating']

# Fix NA ratings by googling the title
movies.loc[5541, 'rating'] = 'TV-MA'
movies.loc[5794, 'rating'] = 'TV-MA'
movies.loc[5813, 'rating'] = 'TV-MA'
movies.loc[7537, 'rating'] = 'PG-13'
movies.loc[5989, 'rating'] = 'TV-PG'
tv_shows.loc[7312, 'rating'] = 'TV-G'
tv_shows.loc[6827, 'rating'] = 'TV-14'

# Create a new column in each dataframe to better house duration
tv_shows['num_seasons'] = tv_shows['duration'].str.extract('(\d+)').astype(int)
movies['num_minutes'] = movies['duration'].str.extract('(\d+)').astype(int)

# Random data fixes
movies.loc[4653, 'country'] = 'United States'

In [3]:
# Create lists of unqiue values from director, cast, country, and listed_in, as well as their frequency
movie_directors = movies['director'].str.split(', ').explode().value_counts().reset_index()
movie_directors.columns = ['director', 'count']

movie_cast = movies['cast'].str.split(', ').explode().value_counts().reset_index()
movie_cast.columns = ['cast', 'count']

movie_countries = movies['country'].str.split(', ').explode().value_counts().reset_index()
movie_countries.columns = ['country', 'count']

movie_genres = movies['listed_in'].str.split(', ').explode().value_counts().reset_index()
movie_genres.columns = ['genre', 'count']

In [4]:
# Now do the same for tv shows
tv_directors = tv_shows['director'].str.split(', ').explode().value_counts().reset_index()
tv_directors.columns = ['director', 'count']

tv_cast = tv_shows['cast'].str.split(', ').explode().value_counts().reset_index()
tv_cast.columns = ['cast', 'count']

tv_countries = tv_shows['country'].str.split(', ').explode().value_counts().reset_index()
tv_countries.columns = ['country', 'count']

tv_genres = tv_shows['listed_in'].str.split(', ').explode().value_counts().reset_index()
tv_genres.columns = ['genre', 'count']

In [5]:
def director_criteria(directors, all_directors, threshold):
    if pd.isna(directors):
        return 0
    for director in directors.split(', '):
        if all_directors[all_directors['director'] == director]['count'].values[0] > threshold:
            return 1
    return 0

def cast_criteria(casts, all_casts, threshold):
    if pd.isna(casts):
        return 0
    for cast in casts.split(', '):
        if all_casts[all_casts['cast'] == cast]['count'].values[0] > threshold:
            return 1
    return 0

movies['director_2'] = movies['director'].apply(lambda x: director_criteria(x, movie_directors, 1))
movies['director_5'] = movies['director'].apply(lambda x: director_criteria(x, movie_directors, 4))

movies['cast_2'] = movies['cast'].apply(lambda x: cast_criteria(x, movie_cast, 1))
movies['cast_6'] = movies['cast'].apply(lambda x: cast_criteria(x, movie_cast, 5))
movies['cast_12'] = movies['cast'].apply(lambda x: cast_criteria(x, movie_cast, 11))
tv_shows['cast_2'] = tv_shows['cast'].apply(lambda x: cast_criteria(x, tv_cast, 1))
tv_shows['cast_5'] = tv_shows['cast'].apply(lambda x: cast_criteria(x, tv_cast, 4))

In [6]:
for genre in movie_genres['genre']:
    movies[genre] = movies['listed_in'].apply(lambda x: int(genre in x.split(', ')))

for genre in tv_genres['genre']:
    tv_shows[genre] = tv_shows['listed_in'].apply(lambda x: int(genre in x.split(', ')))

In [7]:
def country_criteria(country, countries):
    if pd.isna(countries):
        return 0
    if country in countries.split(', '):
        return 1
    return 0

countries = [
    'United States',
    'India',
    'United Kingdom',
    'Canada',
    'France',
    'Germany',
    'Spain',
    'Japan',
    'China',
    'Mexico',
    'South Korea',
    'Taiwan',
    'Australia'
    ]

for country in countries:
    movies[country] = movies['country'].apply(lambda x: country_criteria(country, x))
    tv_shows[country] = tv_shows['country'].apply(lambda x: country_criteria(country, x))

In [8]:
movies['Quarter'] = movies['date_added'].dt.year.astype(str) + 'Q' + movies['date_added'].dt.quarter.astype(str)
tv_shows['Quarter'] = tv_shows['date_added'].dt.year.astype(str) + 'Q' + tv_shows['date_added'].dt.quarter.astype(str)

In [9]:
# create new movies df with only the quarter column and binary columns
movies_q = movies[['Quarter', 'director_2', 'director_5', 'cast_2', 'cast_6', 'cast_12', 'num_minutes'] + list(movie_genres['genre']) + countries].copy()
shows_q = tv_shows[['Quarter', 'cast_2', 'cast_5', 'num_seasons'] + list(tv_genres['genre']) + countries].copy()
# summarise by quarter
movies_q.groupby('Quarter').sum().join(shows_q.groupby('Quarter').sum(), lsuffix='_movies', rsuffix='_shows')

Unnamed: 0_level_0,director_2,director_5,cast_2_movies,cast_6,cast_12,num_minutes,International Movies,Dramas,Comedies,Documentaries,...,Canada_shows,France_shows,Germany_shows,Spain_shows,Japan_shows,China_shows,Mexico_shows,South Korea_shows,Taiwan_shows,Australia_shows
Quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008Q1,0,0,0,0,0,81,0,1,0,0,...,,,,,,,,,,
2009Q2,0,0,1,0,0,104,1,1,0,0,...,,,,,,,,,,
2009Q4,1,0,1,1,0,29,0,0,0,0,...,,,,,,,,,,
2010Q4,0,0,0,0,0,84,0,0,0,0,...,,,,,,,,,,
2011Q2,0,0,1,1,0,103,1,1,0,0,...,,,,,,,,,,
2011Q3,0,0,1,1,0,75,0,1,0,0,...,,,,,,,,,,
2011Q4,0,0,11,3,0,1037,0,11,0,0,...,,,,,,,,,,
2012Q1,0,0,1,1,0,76,0,0,0,1,...,,,,,,,,,,
2012Q4,1,0,2,2,2,110,0,0,2,0,...,,,,,,,,,,
2013Q3,1,1,1,1,0,75,0,0,0,0,...,,,,,,,,,,


In [10]:
# how many unique ratings
tv_shows['rating'].nunique()

9