In [1]:
import os

import numpy as np
import pandas as pd
from IPython.display import clear_output
from pandasql import sqldf

## Importing Data

In [2]:
# Load the cleaned catalogue of each streaming service into its own DataFrame.
# NOTE(review): no column is added here; the `service` column seen in the combined
# output is presumably already present in the *_clean.csv files — confirm upstream.
amazon = pd.read_csv("data/amazon_clean.csv")
disney = pd.read_csv("data/disney_clean.csv")
hbo = pd.read_csv("data/hbo_clean.csv")
hulu = pd.read_csv("data/hulu_clean.csv")
netflix = pd.read_csv("data/netflix_clean.csv")

In [3]:
# Per-service frames in a fixed order; the first one seeds the combined table below.
raw_dfs = [amazon,disney,hbo,hulu,netflix]

## Initial Restriction and Combining of Data

In [4]:
# Combine all DataFrames
# Start from a copy of the first service and fold every remaining frame in with a
# SQL UNION. UNION (unlike UNION ALL) also drops rows that are exact duplicates.
# The table names `services` and `df` inside the query refer to the Python
# variables of the same name, so neither may be renamed without editing the SQL.
services = raw_dfs[0].copy()
for df in raw_dfs[1:]:
    services = sqldf("SELECT * FROM services UNION SELECT * FROM df")

In [5]:
# Preview ten random rows of the combined catalogue. The seed is fixed so that
# Restart & Run All shows the same rows every time.
services.sample(10, random_state=42)

Unnamed: 0,title,release_year,type,rating,service
5877,Disney Gravity Falls,2011,TV Show,PG,disney
22452,The Wishing Tree,2017,Movie,PG-13,netflix
15429,Pinkfong! Baby Shark Special,2017,TV Show,G,amazon
5088,Curious George 3: Back to the Jungle,2015,Movie,G,hulu
8844,He Named Me Malala,2015,Movie,PG-13,netflix
1500,American Experience: The Island Murder,2018,Movie,PG-13,netflix
1786,Antar: Son of Shadad,2017,Movie,R,netflix
14757,One Night,2021,Movie,R,amazon
14268,"Nico, 1988",2017,Movie,R,hulu
1524,American Masters: Inventing David Geffen,2012,Movie,R,netflix


In [6]:
# Split the combined catalogue by content type: one frame of movies, one of TV shows.
# Single-quoted Python strings are used so the SQL string literals need no escaping.
movies = sqldf('SELECT * FROM services WHERE type == "Movie"')
tv = sqldf('SELECT * FROM services WHERE type == "TV Show"')

## Merging with TMDb API

In [7]:
# Import and initial API setup
from tmdbv3api import TMDb, Search, Movie

tmdb = TMDb()

# The API key is a credential and must not live in the notebook source.
# It is read from the TMDB_API_KEY environment variable instead.
# NOTE(review): the key that was previously committed here should be revoked.
tmdb_api_key = os.environ.get("TMDB_API_KEY")
if tmdb_api_key:
    tmdb.api_key = tmdb_api_key
else:
    # Not fatal: the cached merge results further down can be loaded without any API call.
    print("TMDB_API_KEY is not set; TMDb lookups will fail until it is exported. Cached data can still be loaded.")

In [8]:
# Search the TMDb API for a given film
def get_TMDb_id(title, release_year, max_pages=3):
    """Look up a movie's TMDb id by title, disambiguating by release year.

    Parameters
    ----------
    title : str
        Text sent to the TMDb movie search as the query.
    release_year : int or str
        Year compared, as a string, with the year part of each result's
        ``release_date`` (the text before the first ``-``).
    max_pages : int, optional
        Number of result pages to scan, starting at page 1 (default 3).

    Returns
    -------
    The ``id`` of the first result whose release year matches, or ``None``
    when no scanned result matches.
    """
    search = Search()
    wanted_year = str(release_year)
    # Pages are 1-based, so the stop value is max_pages + 1. The previous
    # range(1, 3) only ever looked at pages 1 and 2.
    for page in range(1, max_pages + 1):
        results = search.movies({"query": title, "page": page})
        for result in results:
            try:
                # If the release years are the same, it is likely that the instances are the same
                if result["release_date"].split("-")[0] == wanted_year:
                    return result["id"]
            except (KeyError, AttributeError, TypeError):
                # Result has no usable release_date (missing or not a string): skip it.
                continue
    return None

In [9]:
# Get other information for a movie based on TMDb id
def get_TMDb_info(tmdb_id=None, requested_info=None):
    """Fetch selected detail fields for one movie from TMDb.

    Parameters
    ----------
    tmdb_id : int, float or None
        TMDb movie id. ``None`` and NaN both mean "no id known".
    requested_info : list of str, optional
        Names of the detail fields to return, in order. ``"genres"`` is
        flattened into a comma-separated string of genre names; every other
        field is returned as-is. Defaults to no fields.

    Returns
    -------
    list
        One value per requested field. When no id is available the list
        holds ``None`` for every requested field.
    """
    # Avoid a mutable default argument; None stands for "no fields requested".
    if requested_info is None:
        requested_info = []

    # If there is no provided id, return None for all requested data.
    # IDs reloaded from the saved CSV use NaN rather than None for "missing".
    if tmdb_id is None or pd.isna(tmdb_id):
        return [None] * len(requested_info)

    # IDs read back from CSV arrive as floats (e.g. 634649.0); pass the API a plain
    # integer so the request is presumably not built from the text "634649.0".
    if isinstance(tmdb_id, float) and tmdb_id.is_integer():
        tmdb_id = int(tmdb_id)

    # Get the details of the specified title
    movie = Movie().details(tmdb_id)

    collected_info = []
    for field in requested_info:
        if field == "genres":
            # Genres must be combined into string
            collected_info.append(",".join(genre["name"] for genre in movie[field]))
        else:
            collected_info.append(movie[field])
    return collected_info

In [10]:
# Add columns to movies from TMDb API
#
# Paths through this cell:
#   * no readable cache file           -> look up every title and fetch its details
#   * cache found, re-run, "n" restart -> keep cached IDs (rows without one are dropped)
#                                         and only re-fetch the detail fields
#   * cache found, re-run, full restart-> look up every title again
#   * cache found, no re-run           -> optionally replace `movies` with the cache
# NOTE(review): declining both the re-run and the load leaves `movies` without a
# `tmdb_id` column, which the later dropna(subset=["tmdb_id"]) cell requires.
ids = []
run_api_merge = True

try:
    movies_imported = pd.read_csv("data/modified/movies_api_merged.csv")
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    # Cache is missing or unreadable: fall through to a full API run.
    movies_imported = None

if movies_imported is not None:
    # Allow for re-running. Would want this if more data is requested.
    ans = input("File found. Do you want to re-run this code? (y/n): ")
    if ans == "y":
        # Allow for full restarting - wipes previously calculated IDs
        ans = input("Do you want to fully restart? Warning: This will take a long time to compute (y/n): ")
        if ans == "n":
            # Keep only rows that already have an ID and reuse those IDs
            # so the title search does not have to be repeated.
            movies = movies_imported.dropna(subset=["tmdb_id"]).reset_index(drop=True)
            ids = list(movies["tmdb_id"])
    else:
        run_api_merge = False
        ans = input("Do you wish to load saved data? (y/n): ")
        if ans == "y":
            movies = movies_imported.copy()

if run_api_merge:
    # Check whether IDs already exist
    ids_loaded = bool(ids)

    # Data requested from API
    other_requests = ["genres", "imdb_id", "popularity", "vote_average", "vote_count", "poster_path", "budget", "revenue", "runtime"]

    # Dictionary of lists that contain requested information
    other = {request: [] for request in other_requests}

    n_movies = movies.shape[0]
    for position, (_, row) in enumerate(movies.iterrows()):
        # Output progress
        if position % 10 == 0:
            clear_output()
            print(position / n_movies)

        # Reuse the stored ID when available, otherwise search the API for one
        if ids_loaded:
            tmdb_id = ids[position]
        else:
            tmdb_id = get_TMDb_id(row["title"], row["release_year"])
            ids.append(tmdb_id)

        # pd.isna covers both None and NaN. The earlier `!= np.nan` comparison was
        # always true, because NaN never compares equal to anything.
        if pd.isna(tmdb_id):
            other_info = [None] * len(other_requests)
        else:
            other_info = get_TMDb_info(tmdb_id, other_requests)

        # Add new information to dictionary of info
        for request, value in zip(other_requests, other_info):
            other[request].append(value)

    # Set new IDs if necessary
    if not ids_loaded:
        movies["tmdb_id"] = ids

    # Add columns for new information
    for request, values in other.items():
        movies[request] = values

    # Save data as .csv
    movies.to_csv("data/modified/movies_api_merged.csv", index=False)
clear_output()

In [11]:
# Summary statistics of the numeric columns. Comparing the non-null count of
# tmdb_id with that of release_year shows how many titles received a TMDb match.
movies.describe()

Unnamed: 0,release_year,tmdb_id,popularity,vote_average,vote_count,budget,revenue,runtime
count,18153.0,13797.0,13797.0,13797.0,13797.0,13797.0,13797.0,13752.0
mean,2007.932628,333322.253896,13.973216,5.721381,605.890121,8628376.0,26951300.0,93.003563
std,18.321856,266816.307728,67.021047,2.118846,1926.234438,28049930.0,115550500.0,34.627442
min,1915.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2006.0,50720.0,2.075,5.3,5.0,0.0,0.0,82.0
50%,2015.0,340030.0,5.548,6.2,33.0,0.0,0.0,95.0
75%,2019.0,537795.0,13.661,7.0,254.0,50000.0,7.0,110.0
max,2021.0,942567.0,6865.258,10.0,30989.0,380000000.0,2847246000.0,467.0


In [12]:
# Keep only titles that were matched to a TMDb id, then renumber the rows from 0.
movies = (
    movies
    .dropna(subset=["tmdb_id"])
    .reset_index(drop=True)
)

In [13]:
# Preview the newest, most popular matched titles.
# NOTE(review): in the recorded output two different catalogue titles ("Legend" and
# "Shang-Chi and The Legend of The Ten Rings") carry the same tmdb_id 566525, so the
# title + release-year matching can attach the wrong film to a catalogue entry.
sqldf("SELECT * FROM movies ORDER BY release_year DESC, popularity DESC, vote_average DESC").head()

Unnamed: 0,title,release_year,type,rating,service,tmdb_id,genres,imdb_id,popularity,vote_average,vote_count,poster_path,budget,revenue,runtime
0,#Home,2021,Movie,PG-13,amazon,634649.0,"Action,Adventure,Science Fiction",tt10872600,6865.258,8.3,8238.0,/1g0dhYtq4irTY1GPXvft6k4YLjm.jpg,200000000.0,1809941000.0,148.0
1,Red,2021,Movie,PG-13,netflix,512195.0,"Action,Comedy,Crime,Thriller",tt7991608,1823.157,6.8,3077.0,/wdE6ewaKZHr62bLqCn7A2DiGShm.jpg,160000000.0,178143.0,117.0
2,One,2021,Movie,PG-13,amazon,811592.0,Action,tt14199590,942.826,6.7,304.0,/3OXiTjU30gWtqxmx4BU9RVp2OTv.jpg,0.0,0.0,97.0
3,Legend,2021,Movie,PG-13,amazon,566525.0,"Action,Adventure,Fantasy",tt9376612,907.507,7.8,5713.0,/1BIoJGKbXjdFDAqUEiA2VHqkK1Z.jpg,150000000.0,432243300.0,132.0
4,Shang-Chi and The Legend of The Ten Rings,2021,Movie,PG-13,disney,566525.0,"Action,Adventure,Fantasy",tt9376612,907.507,7.8,5713.0,/1BIoJGKbXjdFDAqUEiA2VHqkK1Z.jpg,150000000.0,432243300.0,132.0


## Merging with IMDb Data

In [14]:
# Load the IMDb ratings dump: a gzip-compressed, tab-separated file.
titles_ratings = pd.read_csv("data/title.ratings.tsv.gz", sep="\t", compression="gzip")

In [15]:
# Inspect the ratings table (columns shown: tconst, averageRating, numVotes).
titles_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1665
1,tt0000002,6.1,206
2,tt0000003,6.5,1382
3,tt0000004,6.2,123
4,tt0000005,6.2,2168
...,...,...,...
1097082,tt9916580,7.2,5
1097083,tt9916690,6.6,5
1097084,tt9916720,6.0,67
1097085,tt9916766,6.9,15


In [16]:
# Attach IMDb ratings by matching TMDb's imdb_id to IMDb's tconst.
# INNER JOIN: movies without a matching ratings row are dropped from `movies`.
movies = sqldf("SELECT * FROM movies INNER JOIN titles_ratings ON movies.imdb_id = titles_ratings.tconst")

In [17]:
# Give the score and vote-count columns names that say which site they came from.
movies = movies.rename(
    columns={
        "vote_average": "tmdb_score",
        "vote_count": "tmdb_count",
        "averageRating": "imdb_score",
        "numVotes": "imdb_count",
    }
)

In [18]:
# Inspect the joined frame: TMDb columns plus tconst, imdb_score and imdb_count.
movies

Unnamed: 0,title,release_year,type,rating,service,tmdb_id,genres,imdb_id,popularity,tmdb_score,tmdb_count,poster_path,budget,revenue,runtime,tconst,imdb_score,imdb_count
0,#Alive,2020,Movie,R,netflix,492414.0,"Horror,Thriller",tt7611352,14.867,6.3,55.0,/3KqM6l45QQVDYQgYWnQnK1jDDHa.jpg,0.0,0.0,91.0,tt7611352,5.4,450
1,#AnneFrank - Parallel Stories,2019,Movie,PG-13,netflix,610643.0,"Documentary,Drama,History",tt9850370,16.035,7.0,35.0,/hkC4yNDFmW1yQuQhtZydMeRuaAb.jpg,0.0,0.0,92.0,tt9850370,6.4,996
2,#FriendButMarried,2018,Movie,G,netflix,503352.0,"Comedy,Drama,Romance",tt8076266,2.357,6.8,9.0,/splV83B3CqMCbHUunoyaUoRUM60.jpg,0.0,0.0,102.0,tt8076266,6.9,524
3,#FriendButMarried 2,2020,Movie,G,netflix,655293.0,"Drama,Comedy,Romance",tt11640412,2.089,7.5,2.0,/pCcvdac4PbU7U1Dgce4j9VWZ72c.jpg,0.0,0.0,104.0,tt11640412,6.6,168
4,#Lucky Number,2015,Movie,,amazon,359732.0,Comedy,tt2538204,4.036,4.8,4.0,/wA4Ig8y8ggVVzbgZHlyOm56AxVD.jpg,1000000.0,0.0,80.0,tt2538204,5.0,444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12217,​Goli Soda 2,2018,Movie,PG-13,netflix,507574.0,"Drama,Action",tt8011288,1.693,5.9,5.0,/AaXQosgSTtNbVFh0T039iiysOxr.jpg,0.0,0.0,130.0,tt8011288,7.3,654
12218,​Maj Rati ​​Keteki,2017,Movie,PG-13,netflix,564862.0,Drama,tt8914956,1.161,7.0,2.0,/x8RtNnvFpB2dZWo4GAlOSh7n0s5.jpg,0.0,0.0,116.0,tt8914956,7.1,29
12219,​Mayurakshi,2017,Movie,PG-13,netflix,500787.0,Drama,tt7570242,1.960,6.8,2.0,/IR6LQgrXzCIhPTn4rKw1cViY0P.jpg,0.0,0.0,102.0,tt7570242,7.1,347
12220,​​Kuch Bheege Alfaaz,2018,Movie,PG-13,netflix,527851.0,"Romance,Drama",tt7617988,1.325,7.5,6.0,/25cv2bdc4IIIcMWdFiHC0uEy0eI.jpg,0.0,0.0,116.0,tt7617988,7.5,714


In [19]:
# Calculate mean score and number of votes for each row.
# Both are plain averages of the IMDb and TMDb values, computed column-wise
# instead of looping over rows with iterrows().
movies["mean_score"] = (movies["imdb_score"] + movies["tmdb_score"]) / 2
movies["mean_num_votes"] = (movies["imdb_count"] + movies["tmdb_count"]) / 2

In [20]:
# Persist the movies frame (TMDb + IMDb columns and the two mean columns) without the index.
movies.to_csv("data/modified/movies_api_imdb_merged.csv", index=False)

## Basic Averaging

Looking at averages of various columns.

In [21]:
# Per-service averages of release year, mean score, mean vote count and popularity,
# together with the number of titles each service contributes.
# The query is split over adjacent string literals, which Python joins into one string.
movie_years = sqldf(
    "SELECT AVG(release_year) as mean_year, "
    "AVG(mean_score) as mean_score, "
    "AVG(mean_num_votes) as mean_num_votes, "
    "AVG(popularity) as mean_popularity, "
    "COUNT(*) as n_titles, "
    "service "
    "FROM movies "
    "GROUP BY service"
)
movie_years

Unnamed: 0,mean_year,mean_score,mean_num_votes,mean_popularity,n_titles,service
0,2000.970443,5.613391,5121.701193,6.573927,3857,amazon
1,1996.056713,6.54184,39236.132523,27.324029,864,disney
2,1994.505089,6.662532,32024.087786,16.756098,1572,hbo
3,2010.871795,6.190878,23619.059665,18.109446,1014,hulu
4,2012.279349,6.240732,16878.365615,13.120368,4915,netflix


## Genre Breakdown

This will be used to recommend services based on what genres people most enjoy.

In [22]:
# Build service_genres: one row per (movie, genre) pair - Movies only.
# A previously saved copy is reused unless the user asks for a rebuild.
rebuild_service_genres = True
try:
    service_genres = pd.read_csv("data/modified/service_genres.csv")
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    # No usable saved copy: build it from `movies` below.
    pass
else:
    ans = input("File found. Do you want to re-run this code? (y/n): ")
    rebuild_service_genres = (ans == "y")

if rebuild_service_genres:
    # Collect plain dicts and build the DataFrame once. Growing a DataFrame with
    # DataFrame.append inside the loop is quadratic, and the method no longer
    # exists in pandas 2.
    genre_rows = []
    for _, row in movies.iterrows():
        genres = row["genres"]
        # Titles without genre information hold None/NaN here instead of a string.
        if not isinstance(genres, str):
            continue
        # For each genre in the comma-separated genres column, record the service,
        # content type, genre, score and popularity.
        for genre in genres.split(","):
            if genre != "":
                genre_rows.append({"service": row["service"], "type": row["type"], "genre": genre, "score": row["mean_score"], "popularity": row["popularity"]})
    # Passing `columns` keeps the expected header even when no rows were collected.
    service_genres = pd.DataFrame(genre_rows, columns=["service", "type", "genre", "score", "popularity"])
    service_genres.to_csv("data/modified/service_genres.csv", index=False)
clear_output()

In [23]:
# For every (service, type, genre) combination: number of titles, average score
# and average popularity.
service_genres_counted = sqldf(
    "SELECT service, type, genre, "
    "COUNT(*) as count, "
    "AVG(score) as mean_score, "
    "AVG(popularity) as mean_popularity "
    "FROM service_genres "
    "GROUP BY service, type, genre"
)

In [24]:
# Keep only the movie rows of the counted genres (TV rows, if any, are left out).
service_genres_movies = sqldf('SELECT * FROM service_genres_counted WHERE type == "Movie"')

In [25]:
# Express each genre count as a share of its service's total.
# total_on_service is the sum of all genre counts for that service. A movie listed
# under several genres is counted once per genre, so this is a share of genre tags
# rather than of distinct movies.
# groupby/transform computes the per-service sum once and aligns it to every row,
# replacing one SQL query per row.
service_totals = service_genres_movies.groupby("service")["count"].transform("sum")

service_genres_movies["total_on_service"] = service_totals
service_genres_movies["percentage_of_total"] = service_genres_movies["count"] / service_totals

In [26]:
# Sort each service's genres by count (services ascending, most frequent genre first)
service_genres_movies = sqldf("SELECT * FROM service_genres_movies ORDER BY service, count DESC")

In [27]:
# Save counted genres (movie rows, with totals and shares) as .csv, without the index
service_genres_movies.to_csv("data/modified/service_genres_counted.csv", index=False)

### More Genre Exploration

Taking a look at distributions of genres and what services have the highest proportion of each genre. 

In [28]:
# Get the top `limit` genres on each service
limit = 5
frames = []
# For each service, get `limit` most frequent genres
# NOTE(review): the outer SELECT relies on the ordering of the inner sub-query
# being kept when LIMIT is applied — confirm this holds for the SQL backend in use.
for service in np.unique(service_genres_movies["service"]):
    frames.append(sqldf(f"SELECT * FROM (SELECT * FROM service_genres_movies ORDER BY service, count DESC) WHERE service == \"{service}\" LIMIT {limit}"))

# Combine all of the previously created DataFrames
# The SQL refers to the Python variables `top_by_genre` and `frame` by name.
top_by_genre = frames[0].copy()
for frame in frames[1:]:
    top_by_genre = sqldf("SELECT * FROM top_by_genre UNION SELECT * FROM frame")
    
# Re-order DataFrame
top_by_genre = sqldf("SELECT * FROM top_by_genre ORDER BY service, count DESC")
top_by_genre

Unnamed: 0,service,type,genre,count,mean_score,mean_popularity,total_on_service,percentage_of_total
0,amazon,Movie,Drama,1613,5.871513,6.633058,7661,0.210547
1,amazon,Movie,Comedy,1104,5.780525,6.59755,7661,0.144107
2,amazon,Movie,Thriller,737,5.503392,9.609533,7661,0.096202
3,amazon,Movie,Action,660,5.401364,10.450024,7661,0.086151
4,amazon,Movie,Romance,604,5.825993,6.80097,7661,0.078841
5,disney,Movie,Family,566,6.413516,28.356125,2490,0.227309
6,disney,Movie,Comedy,370,6.321216,27.241246,2490,0.148594
7,disney,Movie,Animation,328,6.803506,33.469277,2490,0.131727
8,disney,Movie,Adventure,285,6.566842,49.024428,2490,0.114458
9,disney,Movie,Fantasy,166,6.580422,46.51756,2490,0.066667


In [29]:
# For each genre, pick the service on which that genre has the largest share.
# NOTE(review): service, count, mean_score and mean_popularity are not aggregated;
# this presumably relies on the SQL backend taking them from the row that holds
# the MAX — confirm.
movies_top_each_genre = sqldf(
    "SELECT genre, service as top_service, count, "
    "MAX(percentage_of_total) as percentage_of_total, "
    "mean_score, mean_popularity "
    "FROM service_genres_movies "
    "GROUP BY genre"
)
movies_top_each_genre

Unnamed: 0,genre,top_service,count,percentage_of_total,mean_score,mean_popularity
0,Action,amazon,660,0.086151,5.401364,10.450024
1,Adventure,disney,285,0.114458,6.566842,49.024428
2,Animation,disney,328,0.131727,6.803506,33.469277
3,Comedy,netflix,1708,0.173419,6.12623,11.438206
4,Crime,amazon,398,0.051951,5.728518,8.223005
5,Documentary,hbo,307,0.088805,6.779805,3.859573
6,Drama,amazon,1613,0.210547,5.871513,6.633058
7,Family,disney,566,0.227309,6.413516,28.356125
8,Fantasy,disney,166,0.066667,6.580422,46.51756
9,History,hbo,85,0.024588,7.184706,12.979929


## Ratings

G, PG, PG-13, R, etc. 

This will be used to recommend services based on whether family-friendly content is needed.

In [30]:
# Per service and rating: number of rated titles, average score and average popularity.
ratings_by_service = sqldf(
    "SELECT service, type, COUNT(rating) as count, rating, "
    "AVG(mean_score) as mean_score, "
    "AVG(popularity) as mean_popularity "
    "FROM movies "
    "GROUP BY service, rating"
)
# COUNT(rating) ignores NULLs, so the group of titles without a rating has
# count 0 and is removed here.
ratings_by_service = sqldf("SELECT * FROM ratings_by_service WHERE count != 0")

In [31]:
# Express each rating count as a share of its service's total.
# total_on_service is the sum of the rating counts for that service.
# groupby/transform computes the per-service sum once and aligns it to every row,
# replacing one SQL query per row.
service_totals = ratings_by_service.groupby("service")["count"].transform("sum")

ratings_by_service["total_on_service"] = service_totals
ratings_by_service["percentage_of_total"] = ratings_by_service["count"] / service_totals

In [32]:
# Save counted ratings (with totals and shares) as .csv, without the index
ratings_by_service.to_csv("data/modified/ratings_counted.csv", index=False)

### More Rating Exploration

Taking a look at distributions of ratings and what services have the highest proportion of each rating. 

In [33]:
# Get the top `limit` ratings on each service
limit = 5
frames = []
# For each service, get `limit` most frequent ratings
# NOTE(review): the outer SELECT relies on the ordering of the inner sub-query
# being kept when LIMIT is applied — confirm this holds for the SQL backend in use.
for service in np.unique(ratings_by_service["service"]):
    frames.append(sqldf(f"SELECT * FROM (SELECT * FROM ratings_by_service ORDER BY service, count DESC) WHERE service == \"{service}\" LIMIT {limit}"))
    
# Combine all of the previously created DataFrames
# The SQL refers to the Python variables `top_by_rating` and `frame` by name.
top_by_rating = frames[0].copy()
for frame in frames[1:]:
    top_by_rating = sqldf("SELECT * FROM top_by_rating UNION SELECT * FROM frame")
    
# Re-order DataFrame (and keep only the columns of interest)
top_by_rating = sqldf("SELECT service, count, percentage_of_total, rating, mean_score, mean_popularity FROM top_by_rating ORDER BY service, count DESC")
top_by_rating

Unnamed: 0,service,count,percentage_of_total,rating,mean_score,mean_popularity
0,amazon,1272,0.346783,PG-13,5.627948,6.19987
1,amazon,1223,0.333424,R,5.550736,8.684261
2,amazon,512,0.139586,NC-17,5.432715,4.409516
3,amazon,384,0.104689,G,5.701693,4.419117
4,amazon,194,0.05289,PG,6.025515,9.224577
5,disney,419,0.484954,G,6.538544,21.090036
6,disney,370,0.428241,PG,6.433919,24.707716
7,disney,75,0.086806,PG-13,7.092667,75.058413
8,hbo,430,0.36379,R,6.643488,18.392977
9,hbo,366,0.309645,PG-13,6.599863,23.085251


In [34]:
# For each rating, pick the service on which that rating has the largest share.
# NOTE(review): service, count, mean_score and mean_popularity are not aggregated;
# this presumably relies on the SQL backend taking them from the row that holds
# the MAX — confirm.
movies_top_each_rating = sqldf(
    "SELECT rating, service as top_service, count, "
    "MAX(percentage_of_total) as percentage_of_total, "
    "mean_score, mean_popularity "
    "FROM ratings_by_service "
    "GROUP BY rating"
)
movies_top_each_rating

Unnamed: 0,rating,top_service,count,percentage_of_total,mean_score,mean_popularity
0,G,disney,419,0.484954,6.538544,21.090036
1,NC-17,amazon,512,0.139586,5.432715,4.409516
2,NR,amazon,83,0.022628,5.446386,4.152241
3,PG,disney,370,0.428241,6.433919,24.707716
4,PG-13,amazon,1272,0.346783,5.627948,6.19987
5,R,hulu,355,0.508596,6.245915,19.175552
