In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw title catalogue and the subscriber counts.
titles = pd.read_csv('../data/data_unzipped/netflix_titles.csv')
subscribers = pd.read_csv('../data/netflix_sub_count.csv')

# Convert date_added to datetime. Surrounding whitespace is stripped first:
# pandas >= 2.0 infers one format from the first value and raises on entries
# that do not match it exactly (e.g. a padded " August 4, 2017"). Stripping is
# a no-op when no value is padded; missing values stay missing (NaT).
titles['date_added'] = pd.to_datetime(titles['date_added'].str.strip())

# Create separate dataframes for movies and tv shows (copies, so the fixes
# below do not write back into `titles`).
movies = titles[titles['type'] == 'Movie'].copy()
tv_shows = titles[titles['type'] == 'TV Show'].copy()

# Fix rows where duration was input to rating: move the value over to
# `duration` before `rating` is overwritten below.
# NOTE: the row labels are hard-coded and only valid for this exact CSV.
movies.loc[5541, 'duration'] = movies.loc[5541, 'rating']
movies.loc[5794, 'duration'] = movies.loc[5794, 'rating']
movies.loc[5813, 'duration'] = movies.loc[5813, 'rating']

# Fix NA ratings by googling the title (values looked up manually).
movies.loc[5541, 'rating'] = 'TV-MA'
movies.loc[5794, 'rating'] = 'TV-MA'
movies.loc[5813, 'rating'] = 'TV-MA'
movies.loc[7537, 'rating'] = 'PG-13'
movies.loc[5989, 'rating'] = 'TV-PG'
tv_shows.loc[7312, 'rating'] = 'TV-G'
tv_shows.loc[6827, 'rating'] = 'TV-14'

# Create a new integer column in each dataframe holding the first run of
# digits found in `duration`.
# - raw string: '\d' in a normal string literal is an invalid escape sequence
# - expand=False: return a Series (not a one-column DataFrame) for assignment
# astype(int) raises if any duration has no digits or is missing, which
# surfaces unexpected rows instead of silently producing NaN.
tv_shows['num_seasons'] = tv_shows['duration'].str.extract(r'(\d+)', expand=False).astype(int)
movies['num_minutes'] = movies['duration'].str.extract(r'(\d+)', expand=False).astype(int)

In [None]:
# Frequency tables for movies: split each ', '-separated cell into individual
# values, count how often every unique value occurs (missing values are not
# counted), and return a two-column frame sorted by descending count.
movie_directors = (
    movies['director']
    .str.split(', ')
    .explode()
    .value_counts()
    .rename_axis('director')
    .reset_index(name='count')
)

movie_cast = (
    movies['cast']
    .str.split(', ')
    .explode()
    .value_counts()
    .rename_axis('cast')
    .reset_index(name='count')
)

movie_countries = (
    movies['country']
    .str.split(', ')
    .explode()
    .value_counts()
    .rename_axis('country')
    .reset_index(name='count')
)

# `listed_in` holds the genre labels, hence the renamed column.
movie_genres = (
    movies['listed_in']
    .str.split(', ')
    .explode()
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='count')
)

In [None]:
# Frequency tables for TV shows: split each ', '-separated cell into
# individual values, count how often every unique value occurs (missing values
# are not counted), and return a two-column frame sorted by descending count.
tv_directors = (
    tv_shows['director']
    .str.split(', ')
    .explode()
    .value_counts()
    .rename_axis('director')
    .reset_index(name='count')
)

tv_cast = (
    tv_shows['cast']
    .str.split(', ')
    .explode()
    .value_counts()
    .rename_axis('cast')
    .reset_index(name='count')
)

tv_countries = (
    tv_shows['country']
    .str.split(', ')
    .explode()
    .value_counts()
    .rename_axis('country')
    .reset_index(name='count')
)

# `listed_in` holds the genre labels, hence the renamed column.
tv_genres = (
    tv_shows['listed_in']
    .str.split(', ')
    .explode()
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='count')
)

In [None]:
def _any_name_exceeds(names, counts, column, threshold):
    """Return 1 if any listed name has a count strictly above ``threshold``.

    Parameters
    ----------
    names : str or missing
        Names joined by ', '. A missing value (``pd.isna``) yields 0.
    counts : pandas.DataFrame
        Frequency table with a name column ``column`` and a ``'count'`` column.
    column : str
        Name of the column in ``counts`` that holds the names.
    threshold : number
        Exclusive lower bound a count must exceed.

    Returns
    -------
    int
        1 if at least one listed name has ``count > threshold``, else 0.
    """
    if pd.isna(names):
        return 0
    wanted = names.split(', ')
    # One vectorised membership test per call instead of one full scan per
    # name. Names that are absent from the table match no rows, so they count
    # as "not exceeding" instead of raising IndexError.
    matched_counts = counts.loc[counts[column].isin(wanted), 'count']
    return int((matched_counts > threshold).any())


def director_criteria(directors, all_directors, threshold):
    """Flag a title whose director list contains a frequently occurring director.

    Parameters
    ----------
    directors : str or missing
        Director names joined by ', '.
    all_directors : pandas.DataFrame
        Frequency table with columns ``'director'`` and ``'count'``.
    threshold : number
        Exclusive lower bound on the count.

    Returns
    -------
    int
        1 if any listed director has ``count > threshold``; 0 otherwise,
        including when ``directors`` is missing or no name is in the table.
    """
    return _any_name_exceeds(directors, all_directors, 'director', threshold)


def cast_criteria(casts, all_casts, threshold):
    """Flag a title whose cast list contains a frequently occurring cast member.

    Parameters
    ----------
    casts : str or missing
        Cast member names joined by ', '.
    all_casts : pandas.DataFrame
        Frequency table with columns ``'cast'`` and ``'count'``.
    threshold : number
        Exclusive lower bound on the count.

    Returns
    -------
    int
        1 if any listed cast member has ``count > threshold``; 0 otherwise,
        including when ``casts`` is missing or no name is in the table.
    """
    return _any_name_exceeds(casts, all_casts, 'cast', threshold)

# Binary flag columns: 1 when at least one listed name occurs more than the
# given number of times in the matching frequency table. The comparison is
# strict, so a threshold of k - 1 backs the column suffixed `_k`.
movies['director_2'] = movies['director'].apply(director_criteria, args=(movie_directors, 1))
movies['director_5'] = movies['director'].apply(director_criteria, args=(movie_directors, 4))

movies['cast_2'] = movies['cast'].apply(cast_criteria, args=(movie_cast, 1))
movies['cast_6'] = movies['cast'].apply(cast_criteria, args=(movie_cast, 5))
movies['cast_12'] = movies['cast'].apply(cast_criteria, args=(movie_cast, 11))

# TV shows are scored against their own cast frequency table.
tv_shows['cast_2'] = tv_shows['cast'].apply(cast_criteria, args=(tv_cast, 1))
tv_shows['cast_5'] = tv_shows['cast'].apply(cast_criteria, args=(tv_cast, 4))

In [None]:
# Spot-check: display the frequency-table row(s) for one specific director.
movie_directors.loc[movie_directors['director'] == 'JJC Skillz']