In [1]:
import pandas as pd
import numpy as np
from pandasql import sqldf

## Importing Data

In [2]:
# Read each service's cleaned catalogue and tag every row with the service it came from
amazon = (
    pd.read_csv("data/amazon_clean.csv")
    .assign(service="amazon")
)
disney = (
    pd.read_csv("data/disney_clean.csv")
    .assign(service="disney")
)
hulu = (
    pd.read_csv("data/hulu_clean.csv")
    .assign(service="hulu")
)
netflix = (
    pd.read_csv("data/netflix_clean.csv")
    .assign(service="netflix")
)

In [3]:
# Per-service frames, in the order they are combined below
raw_dfs = [amazon,disney,hulu,netflix]

## Initial Restriction and Combining of Data

In [4]:
# Combine all DataFrames.
# One chained UNION query is issued instead of one sqldf call per frame, so the
# accumulated result is not written back into the SQL engine on every iteration.
# UNION (unlike UNION ALL) drops rows that are exact duplicates of each other.
union_tables = {f"raw_{position}": frame for position, frame in enumerate(raw_dfs)}
union_query = " UNION ".join(f"SELECT * FROM {table_name}" for table_name in union_tables)
# The table mapping is passed explicitly so the query does not depend on which
# notebook globals happen to exist.
services = sqldf(union_query, union_tables)

In [5]:
# Preview the first rows of the combined frame
services.head()

Unnamed: 0,type,title,country,date_added,release_year,rating,duration,listed_in,description,service
0,Movie,"""Mixed Up""",,,2020,R,106,"Drama,Romance","""Mixed Up"" examines casual factors that make u...",amazon
1,Movie,#Alive,South Korea,"September 8, 2020",2020,R,99,"Horror,International,Thriller","As a grisly virus rampages a city, a lone man ...",netflix
2,Movie,#AnneFrank - Parallel Stories,Italy,"July 1, 2020",2019,PG-13,95,"Documentary,International","Through her diary, Anne Frank's story is retol...",netflix
3,Movie,#FriendButMarried,Indonesia,"May 21, 2020",2018,G,102,"Drama,International,Romance","Pining for his high school crush for years, a ...",netflix
4,Movie,#FriendButMarried 2,Indonesia,"June 28, 2020",2020,G,104,"Drama,International,Romance",As Ayu and Ditto finally transition from best ...,netflix


In [6]:
# Split the combined catalogue by content type: one frame for movies, one for TV shows
movie_filter_query = 'SELECT * FROM services WHERE type == "Movie"'
tv_filter_query = 'SELECT * FROM services WHERE type == "TV Show"'
movies = sqldf(movie_filter_query)
tv = sqldf(tv_filter_query)

## Release Years

This will be used to recommend services based on whether users prefer newer or older content.

In [7]:
# Per service: mean release year of its movies and how many movie titles it carries
movie_stats_query = (
    "SELECT AVG(release_year) as mean_year, COUNT(*) as n_titles, service "
    "FROM movies GROUP BY service"
)
basic_movie_stats = sqldf(movie_stats_query)
basic_movie_stats

Unnamed: 0,mean_year,n_titles,service
0,2006.875577,7804,amazon
1,1999.169685,1049,disney
2,2011.942174,1297,hulu
3,2013.108824,6074,netflix


In [8]:
# Per service: mean release year of its TV shows and how many TV titles it carries
tv_stats_query = (
    "SELECT AVG(release_year) as mean_year, COUNT(*) as n_titles, service "
    "FROM tv GROUP BY service"
)
basic_tv_stats = sqldf(tv_stats_query)
basic_tv_stats

Unnamed: 0,mean_year,n_titles,service
0,2014.385921,1591,amazon
1,2013.253165,395,disney
2,2012.530522,1589,hulu
3,2016.709398,2660,netflix


## Genre Breakdown

This will be used to recommend services based on what genres people most enjoy.

In [9]:
def _unique_genres(listed_in_values):
    """Return the distinct genres found in ``listed_in_values``, in first-seen order.

    Each value is expected to be a comma-separated genre string. Values that are
    not strings (for example missing entries) are skipped explicitly rather than
    being hidden behind a blanket ``except``. Genres are compared exactly as they
    appear after splitting on ","; no whitespace stripping is applied.
    """
    # dict keys give constant-time membership checks while keeping insertion order
    seen = {}
    for listed_in in listed_in_values:
        if not isinstance(listed_in, str):
            continue
        for genre in listed_in.split(","):
            seen.setdefault(genre, None)
    return list(seen)


# Get a list of all genres that appear in movies
movie_genres = _unique_genres(movies["listed_in"])

# Get a list of all genres that appear in tv shows
tv_genres = _unique_genres(tv["listed_in"])

In [10]:
# Build one (service, type, genre) row for every genre listed on every title.
# The records are collected in a list and the DataFrame is constructed once:
# growing a frame row by row is quadratic, and DataFrame.append no longer exists
# in pandas 2.x.
genre_records = [
    {"service": service_name, "type": content_type, "genre": genre}
    for service_name, content_type, listed_in in zip(
        services["service"], services["type"], services["listed_in"]
    )
    # titles without a genre string (missing values) contribute no rows
    if isinstance(listed_in, str)
    for genre in listed_in.split(",")
]
# Passing the columns explicitly keeps the expected schema even when there are no records
service_genres = pd.DataFrame(genre_records, columns=["service", "type", "genre"])

In [12]:
# Number of titles per (service, type, genre) combination
genre_count_query = (
    "SELECT service, type, genre, COUNT(*) as count "
    "FROM service_genres GROUP BY service, type, genre"
)
service_genres_counted = sqldf(genre_count_query)

In [13]:
# Keep only the genre counts that belong to movies
movie_genre_counts_query = 'SELECT * FROM service_genres_counted WHERE type == "Movie"'
service_genres_movies = sqldf(movie_genre_counts_query)

In [14]:
# Keep only the genre counts that belong to TV shows
tv_genre_counts_query = 'SELECT * FROM service_genres_counted WHERE type == "TV Show"'
service_genres_tv = sqldf(tv_genre_counts_query)

In [15]:
# Genres offered by exactly one service, computed separately for movies and TV shows.
# The inner query counts the distinct services per genre; the outer one keeps count == 1.
single_service_movie_query = (
    "SELECT * FROM "
    "(SELECT genre, COUNT(DISTINCT(service)) as num_services FROM service_genres_movies GROUP BY genre) "
    "WHERE num_services == 1"
)
single_service_tv_query = (
    "SELECT * FROM "
    "(SELECT genre, COUNT(DISTINCT(service)) as num_services FROM service_genres_tv GROUP BY genre) "
    "WHERE num_services == 1"
)
single_service_movie_genres = sqldf(single_service_movie_query)
single_service_tv_genres = sqldf(single_service_tv_query)

In [25]:
# Order the movie genre counts by service, most frequent genre first.
# Note: this rebinds service_genres_movies to its sorted version.
sorted_movie_genres_query = "SELECT * FROM service_genres_movies ORDER BY service, count DESC"
service_genres_movies = sqldf(sorted_movie_genres_query)

In [17]:
# Get the top 5 genres on each service
limit = 5

# Sort once and take the first `limit` rows of every service. This replaces one SQL
# query per service followed by repeated UNION round trips, and it no longer depends
# on the row order of a sub-select surviving into the outer LIMIT.
top_by_genre = (
    service_genres_movies
    .sort_values(["service", "count"], ascending=[True, False])
    .groupby("service", sort=False)
    .head(limit)
    .reset_index(drop=True)
)
top_by_genre

Unnamed: 0,service,type,genre,count
0,amazon,Movie,Drama,3204
1,amazon,Movie,Comedy,1815
2,amazon,Movie,Action,1498
3,amazon,Movie,Thriller,1381
4,amazon,Movie,Horror,845
5,disney,Movie,Family,533
6,disney,Movie,Comedy,407
7,disney,Movie,Animation,381
8,disney,Movie,Action,314
9,disney,Movie,Adventure,314


In [18]:
# For every movie genre available on more than one service, find the service that
# carries the most titles in that genre.
# NOTE(review): `service` is a bare column next to MAX(count); this relies on the SQL
# engine returning it from the row that holds the maximum (SQLite documents this
# behaviour) -- confirm if the backend ever changes.
top_service_per_genre_query = (
    "SELECT genre, service as top_service, MAX(count) as count "
    "FROM service_genres_movies "
    "WHERE genre NOT IN (SELECT genre FROM single_service_movie_genres) "
    "GROUP BY genre"
)
movies_top_each_genre = sqldf(top_service_per_genre_query)
movies_top_each_genre

Unnamed: 0,genre,top_service,count
0,Action,amazon,1498
1,Adventure,netflix,859
2,Animation,disney,381
3,Anime,netflix,71
4,Classics,netflix,116
5,Comedy,amazon,1815
6,Crime,hulu,69
7,Documentary,netflix,869
8,Drama,amazon,3204
9,Faith and Spirituality,netflix,65


## Ratings

G, PG, PG-13, R, etc. 

This will be used to recommend services based on whether family friendly content is needed.

In [22]:
# Number of titles per (service, rating) pair.
# COUNT(rating) skips NULLs, so a group of titles without a rating reports a count of 0.
# NOTE(review): `type` is selected but is not part of the GROUP BY, so its value per
# group is whatever row the engine picks and movies and TV shows are counted together
# -- confirm this is intended.
rating_count_query = (
    "SELECT service, type, COUNT(rating) as count, rating "
    "FROM services GROUP BY service, rating"
)
ratings_by_service = sqldf(rating_count_query)

In [24]:
# Get the 5 most common ratings on each service
limit = 5

# Sort once and take the first `limit` rows of every service. This replaces one SQL
# query per service followed by repeated UNION round trips, and it no longer depends
# on the row order of a sub-select surviving into the outer LIMIT.
top_by_rating = (
    ratings_by_service
    .sort_values(["service", "count"], ascending=[True, False])
    .groupby("service", sort=False)
    .head(limit)
    .loc[:, ["service", "count", "rating"]]
    .reset_index(drop=True)
)
top_by_rating

Unnamed: 0,service,count,rating
0,amazon,2641,PG-13
1,amazon,2588,R
2,amazon,1861,G
3,amazon,1232,NC-17
4,amazon,425,PG
5,disney,677,PG
6,disney,619,G
7,disney,145,PG-13
8,disney,0,
9,hulu,827,PG-13


## Merging with TMDb API

In [26]:
# Import and initial API setup
import os

from tmdbv3api import TMDb, Search, Movie

tmdb = TMDb()

# The API key is a credential, so it is read from the environment instead of being
# stored in the notebook. Export TMDB_API_KEY before starting the kernel.
# NOTE(review): a key was previously committed in this cell and should be treated as
# exposed -- revoke it and issue a new one.
tmdb_api_key = os.environ.get("TMDB_API_KEY")
if not tmdb_api_key:
    raise RuntimeError("Set the TMDB_API_KEY environment variable to your TMDb API key")
tmdb.api_key = tmdb_api_key

In [36]:
def _result_field(result, name):
    """Read ``name`` from a search result exposed as a mapping or as an attribute bag."""
    try:
        return result[name]
    except (AttributeError, IndexError, KeyError, TypeError):
        # Not subscriptable, or the key is absent: fall back to attribute access.
        return getattr(result, name, None)


def get_TMDb_id(title, release_year):
    """Look up the TMDb id of a movie by title and release year.

    Searches TMDb for ``title`` and returns the id of the first result whose
    release date starts with ``release_year``. Only the first two result pages
    are inspected.

    Parameters
    ----------
    title : str
        Movie title used as the search query.
    release_year : int or str
        Expected release year; compared as a string against the part of the
        result's release date before the first "-".

    Returns
    -------
    The ``id`` field of the first matching result, or ``None`` when no result
    on the inspected pages matches.
    """
    search = Search()
    wanted_year = str(release_year)

    for page in range(1, 3):
        results = search.movies({"query": title, "page": page})
        for result in results:
            release_date = _result_field(result, "release_date")
            # Results without a usable release date cannot be matched on year.
            if not isinstance(release_date, str):
                continue
            if release_date.split("-")[0] == wanted_year:
                return _result_field(result, "id")
    return None

27205

In [37]:
# Look up the TMDb id of every movie from its title and release year
ids = [
    get_TMDb_id(movie_title, movie_year)
    for movie_title, movie_year in zip(movies["title"], movies["release_year"])
]
# Store the ids (None where no match was found) as a new column
movies["tmdb_id"] = ids
movies

AttributeError: 'AsObj' object has no attribute 'release_date'