In [1]:
import pandas as pd
import numpy as np
from pandasql import sqldf
from IPython.display import clear_output

## Importing Data

In [2]:
def _read_service_csv(csv_path, service_name):
    """Read one cleaned catalogue CSV and tag every row with its service name."""
    catalogue = pd.read_csv(csv_path)
    return catalogue.assign(service=service_name)


# One DataFrame per streaming service, each carrying a constant `service` column
amazon = _read_service_csv("data/amazon_clean.csv", "amazon")
disney = _read_service_csv("data/disney_clean.csv", "disney")
hulu = _read_service_csv("data/hulu_clean.csv", "hulu")
netflix = _read_service_csv("data/netflix_clean.csv", "netflix")

In [3]:
# Per-service frames in a fixed order; the combining step below walks this list
raw_dfs = [amazon,disney,hulu,netflix]

## Initial Restriction and Combining of Data

In [4]:
# Combine all DataFrames
# Seed the result with a copy of the first frame, then fold the remaining frames in one at a time.
services = raw_dfs[0].copy()
for df in raw_dfs[1:]:
    # SQL UNION (unlike UNION ALL) also discards rows that are exact duplicates,
    # so the combined frame is de-duplicated as a side effect.
    services = sqldf("SELECT * FROM services UNION SELECT * FROM df")

In [5]:
services.head()

Unnamed: 0,type,title,country,date_added,release_year,rating,duration,listed_in,description,service
0,Movie,"""Mixed Up""",,,2020,R,106,"Drama,Romance","""Mixed Up"" examines casual factors that make u...",amazon
1,Movie,#Alive,South Korea,"September 8, 2020",2020,R,99,"Horror,International,Thriller","As a grisly virus rampages a city, a lone man ...",netflix
2,Movie,#AnneFrank - Parallel Stories,Italy,"July 1, 2020",2019,PG-13,95,"Documentary,International","Through her diary, Anne Frank's story is retol...",netflix
3,Movie,#FriendButMarried,Indonesia,"May 21, 2020",2018,G,102,"Drama,International,Romance","Pining for his high school crush for years, a ...",netflix
4,Movie,#FriendButMarried 2,Indonesia,"June 28, 2020",2020,G,104,"Drama,International,Romance",As Ayu and Ditto finally transition from best ...,netflix


In [6]:
# Separate DataFrames based on content type
# `movies` feeds the TMDb merge below; `tv` holds the remaining "TV Show" rows.
movies = sqldf("SELECT * FROM services WHERE type == \"Movie\"")
tv = sqldf("SELECT * FROM services WHERE type == \"TV Show\"")

## Merging with TMDb API

In [7]:
# Import and initial API setup
import os

from tmdbv3api import TMDb, Search, Movie

# Credentials must never be committed with the notebook: the key is read from
# the TMDB_API_KEY environment variable (export it before starting Jupyter).
# Any key that was previously hard-coded here should be treated as leaked and
# rotated in the TMDb account settings.
tmdb_api_key = os.environ.get("TMDB_API_KEY")
if not tmdb_api_key:
    raise RuntimeError("Set the TMDB_API_KEY environment variable to your TMDb API key.")

tmdb = TMDb()
tmdb.api_key = tmdb_api_key

In [8]:
# Search the TMDb API for a given film
def get_TMDb_id(title, release_year, max_pages=3):
    """Return the TMDb id of the first search hit released in ``release_year``.

    Parameters
    ----------
    title : str
        Title used as the search query.
    release_year : int or str
        Year the hit must have been released in; compared as a string with
        the part of the hit's ``release_date`` before the first "-".
    max_pages : int, default 3
        Number of result pages to inspect (pages 1..max_pages) before giving up.

    Returns
    -------
    The ``id`` of the first matching hit, or ``None`` if no inspected hit matches.
    """
    search = Search()
    wanted_year = str(release_year)
    # Pages are 1-based; the upper bound is inclusive so that `max_pages` pages are checked.
    for page in range(1, max_pages + 1):
        results = search.movies({"query": title, "page": page})
        for result in results:
            try:
                # If the release years are the same, it is likely that the instances are the same
                if result["release_date"].split("-")[0] == wanted_year:
                    return result["id"]
            except (KeyError, AttributeError, TypeError):
                # Hit without a usable release date or id: skip it, keep looking
                continue
    return None

In [9]:
# Get other information for a movie based on TMDb id
def get_TMDb_info(tmdb_id=None, requested_info=None):
    """Fetch selected detail fields for one TMDb movie.

    Parameters
    ----------
    tmdb_id : int, float or None
        TMDb movie id. ``None`` and NaN (which is what a missing id becomes
        after a CSV round-trip) both mean "no id".
    requested_info : sequence of str, optional
        Names of the detail fields to return, in order. ``"genres"`` is
        special-cased: the genre names are joined into one comma-separated string.

    Returns
    -------
    list
        One value per requested field, in the same order. If there is no id,
        a list of ``None`` of the same length is returned and the API is not called.
    """
    requested_info = [] if requested_info is None else list(requested_info)

    # NaN is the only value that is not equal to itself
    if tmdb_id is None or tmdb_id != tmdb_id:
        return [None] * len(requested_info)

    # Ids read back from a CSV arrive as floats (e.g. 634649.0); send a plain integer to the API
    if isinstance(tmdb_id, float):
        tmdb_id = int(tmdb_id)

    # Get the details of the specified title
    movie = Movie().details(tmdb_id)

    collected_info = []
    for field in requested_info:
        if field == "genres":
            # Genres must be combined into string
            collected_info.append(",".join(genre["name"] for genre in movie[field]))
        else:
            collected_info.append(movie[field])
    return collected_info

In [10]:
# Add columns to movies from TMDb API
#
# Flow:
#   1. If a previously merged CSV exists, load it and ask whether to re-query the API.
#   2. Otherwise (or on request) look up an id for every title and fetch the extra fields.
#   3. Save the merged frame so later runs can skip the slow API pass.
MERGED_MOVIES_PATH = "data/modified/movies_api_merged.csv"

# Data requested from API
other_requests = ["genres", "imdb_id", "popularity", "vote_average", "vote_count", "poster_path", "budget", "revenue"]

ids = []
# Reset the index for consistency (the loop below uses the index as a position into `ids`)
movies = movies.reset_index(drop=True)

try:
    cached_movies = pd.read_csv(MERGED_MOVIES_PATH)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable cache yet: the API has to be queried for every title.
    # Any other failure (e.g. a corrupt file) is left to surface instead of
    # silently triggering a long re-run.
    run_api = True
else:
    # Cached rows that never got a TMDb id are dropped
    movies = cached_movies.dropna(subset=["tmdb_id"]).reset_index(drop=True)
    # Allow for re-running. Would want this if more data is requested.
    ans = input("File found. Do you want to re-run this code? (y/n): ")
    run_api = ans == "y"
    if run_api:
        # Allow for full restarting - wipes previously calculated IDs
        # NOTE(review): a full restart still iterates over the cached rows (those that
        # already had an id), not over the original catalogue - confirm this is intended.
        ans = input("Do you want to fully restart? Warning: This will take a long time to compute (y/n): ")
        if ans == "n":
            # Store IDs to speed up computation if re-running
            ids = list(movies["tmdb_id"])

if run_api:
    # Reuse ids only when some were carried over from the cache
    ids_loaded = len(ids) > 0

    # One list of collected values per requested field
    other = {request: [] for request in other_requests}

    n_movies = movies.shape[0]
    for index, row in movies.iterrows():
        # Output progress
        if index % 10 == 0:
            clear_output()
            print(index / n_movies)

        if ids_loaded:
            current_id = ids[index]
        else:
            # No cached ids: search the API for this title
            current_id = get_TMDb_id(row["title"], row["release_year"])
            ids.append(current_id)

        # pd.notna covers both None and NaN (a plain `!= np.nan` test is always True)
        if pd.notna(current_id):
            other_info = get_TMDb_info(current_id, other_requests)
        else:
            other_info = [None] * len(other_requests)

        for request, value in zip(other_requests, other_info):
            other[request].append(value)

    # Set new IDs if necessary
    if not ids_loaded:
        movies["tmdb_id"] = ids

    # Add columns for new information
    for request, values in other.items():
        movies[request] = values

    # Save data as .csv
    movies.to_csv(MERGED_MOVIES_PATH, index=False)

File found. Do you want to re-run this code? (y/n): n


In [11]:
movies.describe()

Unnamed: 0,release_year,duration,tmdb_id,popularity,vote_average,vote_count,budget,revenue
count,12085.0,12085.0,12085.0,12085.0,12085.0,12085.0,12085.0,12085.0
mean,2008.102524,96.551345,354646.219528,13.695966,5.638684,543.73976,7814755.0,23792520.0
std,17.848363,28.667716,263629.891088,71.025953,2.144746,1818.741687,27008130.0,110149000.0
min,1920.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0
25%,2006.0,85.0,73443.0,1.973,5.3,4.0,0.0,0.0
50%,2015.0,96.0,374056.0,5.047,6.2,28.0,0.0,0.0
75%,2018.0,111.0,554265.0,13.002,6.9,201.0,0.0,0.0
max,2021.0,312.0,942567.0,6865.258,10.0,30989.0,380000000.0,2847246000.0


In [12]:
# Keep only the movies that were matched to a TMDb ID, renumbering the rows from 0
movies = movies.dropna(subset=["tmdb_id"]).reset_index(drop=True)

In [13]:
sqldf("SELECT * FROM movies ORDER BY release_year DESC, popularity DESC, vote_average DESC").head()

Unnamed: 0,type,title,country,date_added,release_year,rating,duration,listed_in,description,service,tmdb_id,genres,imdb_id,popularity,vote_average,vote_count,poster_path,budget,revenue
0,Movie,#Home,,,2021,PG-13,161,Drama,"Home is about the humble, technology-challenge...",amazon,634649.0,"Action,Adventure,Science Fiction",tt10872600,6865.258,8.3,8243,/1g0dhYtq4irTY1GPXvft6k4YLjm.jpg,200000000,1809940686
1,Movie,Red,India,23-Feb-21,2021,PG-13,143,"Drama,International",A murder investigation leads police to a photo...,netflix,512195.0,"Action,Comedy,Crime,Thriller",tt7991608,1823.157,6.8,3078,/wdE6ewaKZHr62bLqCn7A2DiGShm.jpg,160000000,178143
2,Movie,One,,,2021,PG-13,151,Drama,A righteous politician's unflinching loyalty t...,amazon,811592.0,Action,tt14199590,942.826,6.7,305,/3OXiTjU30gWtqxmx4BU9RVp2OTv.jpg,0,0
3,Movie,Legend,,,2021,PG-13,67,Documentary,Sam Cooke continues to influence generation af...,amazon,566525.0,"Action,Adventure,Fantasy",tt9376612,907.507,7.8,5716,/1BIoJGKbXjdFDAqUEiA2VHqkK1Z.jpg,150000000,432243292
4,Movie,Shang-Chi and The Legend of The Ten Rings,"United States,Australia",12-Nov-21,2021,PG-13,134,"Action,Adventure,Fantasy,Superhero","Shang-Chi must confront his father, leader of ...",disney,566525.0,"Action,Adventure,Fantasy",tt9376612,907.507,7.8,5716,/1BIoJGKbXjdFDAqUEiA2VHqkK1Z.jpg,150000000,432243292


## Release Years

This will be used to recommend services based on whether a user prefers newer or older content.

In [14]:
# Find the mean release year of content as well as the total number of movies on each service
# One output row per service; `n_titles` counts the rows of `movies` for that service.
movie_years = sqldf("SELECT AVG(release_year) as mean_year, COUNT(*) as n_titles, service FROM movies GROUP BY service")
movie_years

Unnamed: 0,mean_year,n_titles,service
0,2003.956973,4718,amazon
1,1998.458811,959,disney
2,2011.333647,1064,hulu
3,2012.849738,5344,netflix


## Genre Breakdown

This will be used to recommend services based on what genres people most enjoy.

In [15]:
# Build service_genres: one row per (movie, genre) pair - Movies only
SERVICE_GENRES_PATH = "data/modified/service_genres.csv"

try:
    service_genres = pd.read_csv(SERVICE_GENRES_PATH)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable cache yet: build the table from `movies`
    rebuild_genres = True
else:
    ans = input("File found. Do you want to re-run this code? (y/n): ")
    rebuild_genres = ans == "y"

if rebuild_genres:
    # Collect plain records first and build the DataFrame once; appending to a
    # DataFrame row by row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
    genre_records = []
    for service, content_type, genres in movies[["service", "type", "genres"]].itertuples(index=False, name=None):
        # Movies without genres hold NaN (not a string) in this column; skip them
        if not isinstance(genres, str):
            continue
        # For each genre in the genres column, add a row with the service, content type, and genre
        for genre in genres.split(","):
            if genre != "":
                genre_records.append({"service": service, "type": content_type, "genre": genre})

    service_genres = pd.DataFrame(genre_records, columns=["service", "type", "genre"])
    service_genres.to_csv(SERVICE_GENRES_PATH, index=False)

File found. Do you want to re-run this code? (y/n): n


In [16]:
# Count the instances of a genre for each service and type
# Output columns: service, type, genre, count (one row per distinct combination).
service_genres_counted = sqldf("SELECT service, type, genre, COUNT(*) as count FROM service_genres GROUP BY service, type, genre")

In [17]:
# Separate movie genres from TV genres
# Keeps only the rows whose type is "Movie".
service_genres_movies = sqldf("SELECT * FROM service_genres_counted WHERE type == \"Movie\"")

In [18]:
# Express each genre count as a fraction of all genre tags on the same service.
# The denominator is the sum of `count` over the service's rows, i.e. the number of
# genre tags rather than the number of movies (a movie can carry several genres).
# transform("sum") broadcasts each service's total back onto every one of its rows,
# replacing one SQL query per row.
total_count = service_genres_movies.groupby("service")["count"].transform("sum")
percentages = service_genres_movies["count"] / total_count

service_genres_movies["total_on_service"] = total_count
service_genres_movies["percentage_of_total"] = percentages

In [19]:
# Sort each service's genres by count
# Rows are grouped by service, most frequent genre first within each service.
service_genres_movies = sqldf("SELECT * FROM service_genres_movies ORDER BY service, count DESC")
# Save counted genres as .csv
service_genres_movies.to_csv("data/modified/service_genres_counted.csv", index=False)

### More Genre Exploration

In [20]:
# Get the top `limit` genres on each service
limit = 5

# Sort once (service ascending, count descending), then keep the first `limit`
# rows of every service. This replaces one SQL query per service plus a chain of
# UNIONs, and does not depend on a subquery's ORDER BY surviving into the outer query.
top_by_genre = (
    service_genres_movies
    .sort_values(["service", "count"], ascending=[True, False])
    .groupby("service", sort=False)
    .head(limit)
    .reset_index(drop=True)
)
top_by_genre

Unnamed: 0,service,type,genre,count,total_on_service,percentage_of_total
0,amazon,Movie,Drama,1857,8802,0.210975
1,amazon,Movie,Comedy,1240,8802,0.140877
2,amazon,Movie,Thriller,861,8802,0.097819
3,amazon,Movie,Action,736,8802,0.083617
4,amazon,Movie,Romance,652,8802,0.074074
5,disney,Movie,Family,588,2657,0.221302
6,disney,Movie,Comedy,391,2657,0.147158
7,disney,Movie,Animation,349,2657,0.131351
8,disney,Movie,Adventure,292,2657,0.109898
9,disney,Movie,Fantasy,177,2657,0.066616


In [21]:
# Get top service for each genre
# One row per genre, carrying the largest percentage_of_total found for it.
# NOTE(review): `service` and `count` are bare (non-aggregated, non-grouped) columns.
# This relies on SQLite taking bare columns from the row that holds the MAX() value -
# assumes pandasql is running on its default SQLite backend; other engines may reject
# the query or pick an arbitrary row.
movies_top_each_genre = sqldf("SELECT genre, service as top_service, count, MAX(percentage_of_total) as percentage_of_total FROM service_genres_movies GROUP BY genre")
movies_top_each_genre

Unnamed: 0,genre,top_service,count,percentage_of_total
0,Action,amazon,736,0.083617
1,Adventure,disney,292,0.109898
2,Animation,disney,349,0.131351
3,Comedy,netflix,1821,0.172101
4,Crime,amazon,451,0.051238
5,Documentary,netflix,747,0.070598
6,Drama,amazon,1857,0.210975
7,Family,disney,588,0.221302
8,Fantasy,disney,177,0.066616
9,History,hulu,44,0.018731


## Ratings

Content ratings: G, PG, PG-13, R, etc.

This will be used to recommend services based on whether family friendly content is needed.

In [22]:
# Count occurrences of each rating for each service
# COUNT(rating) ignores NULLs, so the group of movies with a missing rating gets count 0 ...
ratings_by_service = sqldf("SELECT service, type, COUNT(rating) as count, rating FROM movies GROUP BY service, rating")
# ... and is removed here, leaving only real ratings.
ratings_by_service = sqldf("SELECT * FROM ratings_by_service WHERE count != 0")

In [23]:
# Express each rating count as a fraction of the rated movies on the same service.
# The denominator is the sum of `count` over the service's rows.
# transform("sum") broadcasts each service's total back onto every one of its rows,
# replacing one SQL query per row.
total_count = ratings_by_service.groupby("service")["count"].transform("sum")
percentages = ratings_by_service["count"] / total_count

ratings_by_service["total_on_service"] = total_count
ratings_by_service["percentage_of_total"] = percentages

In [24]:
# Save counted ratings as .csv
ratings_by_service.to_csv("data/modified/ratings_counted.csv", index=False)

### More Rating Exploration

Taking a look at distributions of ratings and what services have the highest proportion of each rating. 

In [25]:
# Get the top `limit` ratings on each service
limit = 5

# Sort once (service ascending, count descending), keep the first `limit` rows of
# every service, then narrow to the columns of interest. This replaces one SQL
# query per service plus a chain of UNIONs, and does not depend on a subquery's
# ORDER BY surviving into the outer query.
top_by_rating = (
    ratings_by_service
    .sort_values(["service", "count"], ascending=[True, False])
    .groupby("service", sort=False)
    .head(limit)
    .loc[:, ["service", "count", "percentage_of_total", "rating"]]
    .reset_index(drop=True)
)
top_by_rating

Unnamed: 0,service,count,percentage_of_total,rating
0,amazon,1504,0.334371,R
1,amazon,1490,0.331258,PG-13
2,amazon,702,0.156069,NC-17
3,amazon,501,0.111383,G
4,amazon,196,0.043575,PG
5,disney,442,0.460897,G
6,disney,419,0.436913,PG
7,disney,98,0.10219,PG-13
8,hulu,374,0.518724,R
9,hulu,193,0.267684,PG-13


In [26]:
# Get top service for each rating
# One row per rating, carrying the largest percentage_of_total found for it.
# NOTE(review): `service` and `count` are bare (non-aggregated, non-grouped) columns.
# This relies on SQLite taking bare columns from the row that holds the MAX() value -
# assumes pandasql is running on its default SQLite backend; other engines may reject
# the query or pick an arbitrary row.
movies_top_each_rating = sqldf("SELECT rating, service as top_service, count, MAX(percentage_of_total) as percentage_of_total FROM ratings_by_service GROUP BY rating")
movies_top_each_rating

Unnamed: 0,rating,top_service,count,percentage_of_total
0,G,disney,442,0.460897
1,NC-17,amazon,702,0.156069
2,NR,amazon,105,0.023344
3,PG,disney,419,0.436913
4,PG-13,amazon,1490,0.331258
5,R,hulu,374,0.518724
