In [1]:
import pandas as pd
import numpy as np
from pandasql import sqldf

## Importing Data

In [2]:
# Add column with service name to each DataFrame
amazon = pd.read_csv("data/amazon_prime_titles.csv").assign(service="amazon")
disney = pd.read_csv("data/disney_plus_titles.csv").assign(service="disney")
hulu = pd.read_csv("data/hulu_titles.csv").assign(service="hulu")
netflix = pd.read_csv("data/netflix_titles.csv").assign(service="netflix")
raw_dfs = [amazon,disney,hulu,netflix]

## Initial Restriction and Combining

In [3]:
# Combine all DataFrames
services = raw_dfs[0].copy()
for df in raw_dfs[1:]:
    services = sqldf("SELECT * FROM services UNION SELECT * FROM df")

In [4]:
services.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,service
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",netflix
1,s1,Movie,Duck the Halls: A Mickey Mouse Christmas Special,"Alonso Ramirez Ramos, Dave Wasson","Chris Diamantopoulos, Tony Anselmo, Tress MacN...",,"November 26, 2021",2016,TV-G,23 min,"Animation, Family",Join Mickey and the gang as they duck the halls!,disney
2,s1,Movie,Ricky Velez: Here's Everything,,,,"October 24, 2021",2021,TV-MA,,"Comedy, Stand Up",​Comedian Ricky Velez bares it all with his ho...,hulu
3,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...,amazon
4,s10,Movie,A Muppets Christmas: Letters To Santa,Kirk R. Thatcher,"Steve Whitmire, Dave Goelz, Bill Barretta, Eri...",United States,"November 19, 2021",2008,G,45 min,"Comedy, Family, Musical",Celebrate the holiday season with all your fav...,disney


In [5]:
# Separate DataFrames based on content type 
movies = sqldf("SELECT * FROM services WHERE type == \"Movie\"")
tv = sqldf("SELECT * FROM services WHERE type == \"TV Show\"")

## Basic Stats Computation

In [6]:
# Find the mean release year of content as well as the total number of movies on each service
basic_movie_stats = sqldf("SELECT AVG(release_year) as mean_year, COUNT(*) as n_titles, service FROM movies GROUP BY service")
basic_movie_stats

Unnamed: 0,mean_year,n_titles,service
0,2006.869977,7814,amazon
1,1999.230989,1052,disney
2,2012.607143,1484,hulu
3,2013.121514,6131,netflix


In [7]:
# Find the mean release year of content as well as the total number of movies on each service
basic_tv_stats = sqldf("SELECT AVG(release_year) as mean_year, COUNT(*) as n_titles, service FROM tv GROUP BY service")
basic_tv_stats

Unnamed: 0,mean_year,n_titles,service
0,2014.545307,1854,amazon
1,2013.296482,398,disney
2,2012.530522,1589,hulu
3,2016.605755,2676,netflix


## Genre Breakdown

In [23]:
# Get a list of all genres that appear in movies
movie_genres = []
for index, row in movies.iterrows():
    for genre in row["listed_in"].split(", "):
        if(genre == "Arts"):
            movie_genres.append("Arts, Entertainment, and Culture")
        elif(genre == "Entertainment" or genre == "and Culture"):
            pass
        elif(genre not in movie_genres):
            movie_genres.append(genre)
            
tv_genres = []
for index, row in tv.iterrows():
    for genre in row["listed_in"].split(", "):
        if(genre not in tv_genres):
            tv_genres.append(genre)

In [69]:
# Rewriting similar genres so that formats are consistent across services
def get_genre(genre, content_type):
    # Dictionary for replacing inconsistent movie genres
    movie_rewrites = {"Arts": ["Arts, Entertainment, and Culture"], "Entertainment": [None], "and Culture": [None], 
                     "Action & Adventure": ["Action", "Adventure"], "Action-Adventure": ["Action", "Adventure"], 
                     "Anime Features": ["Anime"], "Classic Movies": ["Classics"], "Comedies": ["Comedy"], "Documentaries": ["Documentary"],
                     "Dramas": ["Drama"], "Faith & Spirituality": ["Faith and Spirituality"], "Children & Family Movies": ["Family"], 
                     "Fitness": ["Health and Wellness"], "Historical": ["History"], "Horror Movies": ["Horror"], 
                     "International Movies": ["International"], "LGBTQ": ["LGBTQ+"], "LGBTQ Movies": ["LGBTQ+"],
                     "Lifestyle & Culture": ["Lifestyle"], "Music & Musicals": ["Music"], 
                     "Musical": ["Music"], "Concert Film": ["Music Videos and Concerts"], "Romantic Comedy": ["Romance", "Comedy"],
                     "Romantic Movies": ["Romance"], "Sci-Fi & Fantasy": ["Science Fiction", "Fantasy"], "Sitcom": ["Comedy"], 
                     "Sports Movies": ["Sports"], "Stand-Up Comedy": ["Stand Up"], "Talk Show and Variety": ["Variety"], 
                     "Thrillers": ["Thriller"], "Travel": ["Lifestyle"], "Cartoons": ["Animation", "Cartoons"]}

    # Dictionary for replacing inconsistent TV genres
    tv_rewrites = {"Arts": ["Arts, Entertainment, and Culture"], "Entertainment": [None], "and Culture": [None],
                  "Action-Adventure": ["Action", "Adventure"], "Anime Series": ["Anime"], "British TV Shows": ["International", "British"],
                  "Classic & Cult TV": ["Classics"], "Crime TV Shows": ["Crime"], "Documentaries": ["Documentary"], 
                  "Docuseries": ["Documentary"], "Game Show / Competition": ["Game Show"], "Game Shows": ["Game Show"], 
                  "Fitness": ["Health & Wellness"], "Historical": ["History"], "International TV Shows": ["International"],
                  "Kids' TV": ["Kids"], "Korean TV Shows": ["International", "Korean"], "LGBTQ": ["LGBTQ+"], 
                  "Lifestyle & Culture": ["Lifestyle"], "Musical": ["Music"], "Reality TV": ["Reality"], "Romantic TV Shows": ["Romance"],
                  "Science & Nature TV": ["Science and Nature"], "Animals & Nature": ["Science and Nature"], "Series": [None],
                  "Sitcom": ["Sitcom", "Comedy"], "Sketch Comedy": ["Sketch Comedy", "Comedy"], "Soap Opera / Melodrama": ["Soap Opera"],
                  "Spanish-Language TV Shows": ["International", "Spanish"], "Stand-Up Comedy & Talk Shows": ["Stand Up", "Talk Show and Variety"],
                  "Latino": ["International", "Latino"], "TV Action & Adventure": ["Action", "Adventure"], "TV Comedies": ["Comedy"],
                  "TV Dramas": ["Drama"], "TV Horror": ["Horror"], "TV Mysteries": ["Mystery"], 
                  "TV Sci-Fi & Fantasy": ["Science Fiction", "Fantasy"], "TV Shows": [None], "TV Thrillers": ["Thriller"], 
                  "Stand Up": ["Talk Show and Variety"], "Talk Show": ["Talk Show and Variety"], "Teen TV Shows": ["Teen"],
                  "Variety": ["Talk Show and Variety"], "Travel": ["Lifestyle"], "Cartoons": ["Animation", "Cartoons"]}

    if(content_type == "Movie" and genre in movie_rewrites):
        return movie_rewrites.get(genre)
    elif(content_type == "TV Show" and genre in tv_rewrites):
        return tv_rewrites.get(genre)
    elif("&" in genre):
        return [genre.replace("&", "and")]
    else:
        return [genre]


In [70]:
# Append each instance of a genre to the service_genres DataFrame
service_genres = pd.DataFrame(columns = ["service", "type", "genre"])
for index, row in services.iterrows():
    for genre in row["listed_in"].split(", "):
        for i in get_genre(genre, row["type"]):
            count = 0
            for genre in row["listed_in"].split(", "):
                if(i != None and i in genre):
                    count += 1
            if(count < 2 and i != None):
                service_genres = service_genres.append({"service": row["service"], "type": row["type"], "genre": i}, ignore_index=True)

In [71]:
service_genres

Unnamed: 0,service,type,genre
0,netflix,Movie,Documentary
1,disney,Movie,Animation
2,disney,Movie,Family
3,hulu,Movie,Comedy
4,hulu,Movie,Stand Up
...,...,...,...
49703,netflix,Movie,International
49704,amazon,Movie,Comedy
49705,hulu,TV Show,Black Stories
49706,hulu,TV Show,Drama


In [72]:
# Count the instances of a genre for each service and type
service_genres_counted = sqldf("SELECT service, type, genre, COUNT(*) as count FROM service_genres GROUP BY service, type, genre")

In [73]:
# Separate movie genres from TV genres
service_genres_movies = sqldf("SELECT * FROM service_genres_counted WHERE type == \"Movie\"")
service_genres_movies

Unnamed: 0,service,type,genre,count
0,amazon,Movie,Action,1502
1,amazon,Movie,Adventure,230
2,amazon,Movie,Animation,279
3,amazon,Movie,Anime,29
4,amazon,Movie,Arthouse,140
...,...,...,...,...
113,netflix,Movie,Romance,616
114,netflix,Movie,Science Fiction,243
115,netflix,Movie,Sports,219
116,netflix,Movie,Stand Up,343


In [74]:
# Separate TV genres from movie genres
service_genres_tv = sqldf("SELECT * FROM service_genres_counted WHERE type == \"TV Show\"")
service_genres_tv

Unnamed: 0,service,type,genre,count
0,amazon,TV Show,Action,155
1,amazon,TV Show,Adventure,29
2,amazon,TV Show,Animation,268
3,amazon,TV Show,Anime,51
4,amazon,TV Show,Arthouse,1
...,...,...,...,...
116,netflix,TV Show,Spanish,174
117,netflix,TV Show,Stand Up,56
118,netflix,TV Show,Talk Show and Variety,56
119,netflix,TV Show,Teen,69


In [75]:
# Find genres that only appear on one service
single_service_genres = sqldf("SELECT * FROM (SELECT genre, COUNT(DISTINCT(service)) as num_services FROM service_genres GROUP BY genre) WHERE num_services == 1")
single_service_genres

Unnamed: 0,genre,num_services
0,Adult Animation,1
1,Animals and Nature,1
2,Anthology,1
3,Arthouse,1
4,"Arts, Entertainment, and Culture",1
5,Biographical,1
6,Black Stories,1
7,British,1
8,Buddy,1
9,Cartoons,1
