# Proof of Concept

This notebook exists to determine whether the overall goal of this project is feasible. It works through the following tasks:
* Asking the user for input to determine their preferences
* Generating a metric that represents a user
* Generating a metric that represents a service
* Comparing the user to each service and finding the best fit

In [2]:
import pandas as pd
import numpy as np
from pandasql import sqldf
import random

In [3]:
# Load the four CSV files under data/modified that the rest of the notebook reads.
# movies: one table of titles; its columns are listed in the next cell.
movies = pd.read_csv("data/modified/movies_api_imdb_merged.csv")
# genres / ratings / decades: per-service tables that each carry mean_score and
# percentage_of_total columns, which the scoring cells below multiply together.
genres = pd.read_csv("data/modified/service_genres_counted.csv")
ratings = pd.read_csv("data/modified/ratings_counted.csv")
decades = pd.read_csv("data/modified/decades.csv")

In [4]:
# Show the column names of the movies table
list(movies.columns)

['title',
 'release_year',
 'type',
 'rating',
 'service',
 'tmdb_id',
 'genres',
 'imdb_id',
 'popularity',
 'tmdb_score',
 'tmdb_count',
 'poster_path',
 'budget',
 'revenue',
 'runtime',
 'tconst',
 'imdb_score',
 'imdb_count',
 'mean_score',
 'mean_num_votes',
 'decade']

In [5]:
# Show the column names of the genres table
list(genres.columns)

['service',
 'type',
 'genre',
 'count',
 'mean_score',
 'mean_popularity',
 'total_on_service',
 'percentage_of_total']

In [6]:
# Show the column names of the ratings table
list(ratings.columns)

['service',
 'type',
 'count',
 'rating',
 'mean_score',
 'mean_popularity',
 'total_on_service',
 'percentage_of_total']

In [7]:
# Show the column names of the decades table
list(decades.columns)

['service',
 'decade',
 'count',
 'mean_score',
 'mean_popularity',
 'total_on_service',
 'percentage_of_total']

## Analysis

In [50]:
# Collects the per-service scores of every category, keyed by category name
# ("genre", "rating", "decades"); filled in by the sections below and read in Final Evaluation.
scores = {}

### Genres

Finding the "best" streaming service for a user based on genre preferences.

In [8]:
# Start every genre found in the genres table at 0 until the user's answers are applied
user_genres = dict.fromkeys(genres.genre.unique(), 0)

In [9]:
# Canned genre answers (1 = liked, 0 = not liked) so the notebook can run without prompting
example_genres = {
    'Drama': 1,
    'Comedy': 1,
    'Thriller': 1,
    'Action': 1,
    'Romance': 0,
    'Horror': 1,
    'Crime': 0,
    'Documentary': 0,
    'Family': 0,
    'Adventure': 1,
    'TV Movie': 0,
    'Mystery': 1,
    'Science Fiction': 1,
    'Western': 1,
    'Fantasy': 0,
    'Music': 0,
    'History': 0,
    'War': 1,
    'Animation': 1,
}

# To be asked about each genre interactively instead, uncomment the line below
#example_genres = None

In [10]:
# Ask the user whether they like each genre, or fall back to the example answers.
if example_genres is None:
    for genre in user_genres:
        ans = input(f"Do you like the {genre} genre? (y/n): ")
        # Value is 1 if the user likes the genre and 0 otherwise. It is assigned rather
        # than incremented so that re-running this cell can never push a value past 1;
        # surrounding whitespace and capitalisation in the answer are ignored.
        user_genres[genre] = 1 if ans.strip().lower() == "y" else 0
else:
    # Use the example answers, but keep a 0 entry for any dataset genre they omit so that
    # every genre still has a numeric preference when the services are scored.
    user_genres = {
        **example_genres,
        **{genre: 0 for genre in user_genres if genre not in example_genres},
    }

In [11]:
# Display the genre preferences that will be used for scoring
user_genres

{'Drama': 1,
 'Comedy': 1,
 'Thriller': 1,
 'Action': 1,
 'Romance': 0,
 'Horror': 1,
 'Crime': 0,
 'Documentary': 0,
 'Family': 0,
 'Adventure': 1,
 'TV Movie': 0,
 'Mystery': 1,
 'Science Fiction': 1,
 'Western': 1,
 'Fantasy': 0,
 'Music': 0,
 'History': 0,
 'War': 1,
 'Animation': 1}

In [12]:
# Build, for every service, the list of per-genre contributions to its genre score.
genre_service_summaries = {service: [] for service in genres.service.unique()}
for _, row in genres.iterrows():
    # A genre the user has no answer for counts as 0 (ignored) instead of yielding None,
    # which would make the multiplication below raise a TypeError.
    preference = user_genres.get(row["genre"], 0)
    # Contribution = (user's score for the genre)
    #              * (genre's average score on the service)
    #              * (genre's percentage of total on the service)
    genre_service_summaries[row["service"]].append(
        preference * row["mean_score"] * row["percentage_of_total"]
    )

In [13]:
# Collapse each service's list of per-genre contributions into one genre score.
# A plain sum is used: genres the user scored 0 contribute nothing, so they drop out
# of the comparison instead of dragging a service down.
for service in list(genre_service_summaries):
    genre_service_summaries[service] = np.sum(genre_service_summaries[service])

In [51]:
# File the genre results under the "genre" category, then display them
scores["genre"] = genre_service_summaries
genre_service_summaries

{'amazon': 4.0411956663620945,
 'disney': 3.620682730923695,
 'hbo': 4.193505929997107,
 'hulu': 4.314811529933482,
 'netflix': 4.276764138491216}

In [52]:
# Judged on genres alone, the best service is the one holding the largest genre score
print(max(genre_service_summaries.items(), key=lambda entry: entry[1])[0])

hulu


### Ratings

Finding the "best" streaming service for a user based on rating preferences.

In [16]:
# Default every rating present in the ratings table to 0 ("don't care")
user_ratings = dict.fromkeys(ratings.rating.unique(), 0)

In [17]:
# Preset answer sets for the rating question.
# Per rating: 1 = want it on the service, 0 = don't care, -1 = don't want it.
children = {'G': 1, 'NC-17': -1, 'NR': -1, 'PG': 1, 'PG-13': 0, 'R': -1}
no_adult_content = {'G': 0, 'NC-17': -1, 'NR': -1, 'PG': 0, 'PG-13': 0, 'R': 0}
middle_ground = {'G': 0, 'NC-17': -1, 'NR': -1, 'PG': 0, 'PG-13': 1, 'R': 1}
anything_goes = {'G': 0, 'NC-17': 0, 'NR': 0, 'PG': 0, 'PG-13': 0, 'R': 0}
# The example preset is pinned instead of drawn with an unseeded random.choice, so that a
# fresh "Restart & Run All" always scores the same profile. `children` is the profile the
# recorded outputs of this section were produced with; assign another preset to try it.
example_ratings = children

# To be asked about each rating interactively instead, uncomment the line below
#example_ratings = None

In [18]:
# Collect the user's stance on each rating, or fall back to the example answers.
if example_ratings is None:
    for rating in user_ratings:
        ans = None
        # Keep asking until the answer is exactly one of the three values the scoring
        # expects, so a typo neither raises in int() nor slips in an out-of-range weight.
        while ans not in ("1", "0", "-1"):
            ans = input(f"Are you okay with the {rating} rating being on your service? (1: want, 0: don't care, -1: don't want): ").strip()
        user_ratings[rating] = int(ans)
else:
    # Use the example answers, but keep a 0 ("don't care") entry for any dataset rating
    # they omit so that every rating still has a numeric preference when scoring.
    user_ratings = {
        **example_ratings,
        **{rating: 0 for rating in user_ratings if rating not in example_ratings},
    }

In [19]:
# Display the rating preferences that will be used for scoring
user_ratings

{'G': 1, 'NC-17': -1, 'NR': -1, 'PG': 1, 'PG-13': 0, 'R': -1}

In [20]:
# Build, for every service, the list of per-rating contributions to its rating score.
rating_service_summaries = {service: [] for service in ratings.service.unique()}
for _, row in ratings.iterrows():
    # A rating the user has no answer for counts as 0 ("don't care") instead of yielding
    # None, which would make the multiplication below raise a TypeError.
    preference = user_ratings.get(row["rating"], 0)
    # Contribution = (user's score for the rating)
    #              * (rating's average score on the service)
    #              * (rating's percentage of total on the service)
    rating_service_summaries[row["service"]].append(
        preference * row["mean_score"] * row["percentage_of_total"]
    )

In [22]:
# Collapse each service's list of per-rating contributions into one rating score.
# A plain sum is used: ratings the user scored 0 contribute nothing, while wanted (1)
# and unwanted (-1) ratings push the total up or down.
for service in list(rating_service_summaries):
    rating_service_summaries[service] = np.sum(rating_service_summaries[service])

In [54]:
# File the rating results under the "rating" category, then display them
scores["rating"] = rating_service_summaries
rating_service_summaries

{'amazon': -1.8167257360959654,
 'disney': 5.926157407407411,
 'hbo': -0.2796531302876506,
 'hulu': -1.8096704871060156,
 'netflix': -1.8732179226069232}

In [24]:
# Judged on ratings alone, the best service is the one holding the largest rating score
print(max(rating_service_summaries.items(), key=lambda entry: entry[1])[0])

disney


### Decades

Finding the "best" streaming service for a user based on decade preferences.

In [34]:
# Default every decade present in the decades table, in ascending order, to 0
user_decades = dict.fromkeys(sorted(decades.decade.unique()), 0)

In [35]:
# Canned decade answers covering 1910 through 2020: the 1950s-1980s are liked (1),
# every other decade is not (0)
example_decades = {
    decade: int(1950 <= decade <= 1980)
    for decade in range(1910, 2030, 10)
}

# To be asked about each decade interactively instead, uncomment the line below
#example_decades = None

In [36]:
# Ask the user whether they like each decade, or fall back to the example answers.
if example_decades is None:
    for decade in user_decades:
        ans = input(f"Do you like movies from the {decade}s? (y/n): ")
        # 1 if the user likes the decade and 0 otherwise; surrounding whitespace and
        # capitalisation in the answer are ignored.
        user_decades[decade] = 1 if ans.strip().lower() == "y" else 0
else:
    # Use the example answers, but keep a 0 entry for any dataset decade they omit so that
    # every decade still has a numeric preference when the services are scored.
    user_decades = {
        **example_decades,
        **{decade: 0 for decade in user_decades if decade not in example_decades},
    }

In [37]:
# Print the decade preferences that will be used for scoring
print(user_decades)

{1910: 0, 1920: 0, 1930: 0, 1940: 0, 1950: 1, 1960: 1, 1970: 1, 1980: 1, 1990: 0, 2000: 0, 2010: 0, 2020: 0}


In [46]:
# Build, for every service, the list of per-decade contributions to its decade score.
decade_service_summaries = {service: [] for service in decades.service.unique()}
for _, row in decades.iterrows():
    # A decade the user has no answer for counts as 0 (ignored) instead of yielding None,
    # which would make the multiplication below raise a TypeError.
    preference = user_decades.get(row["decade"], 0)
    # Contribution = (user's score for the decade)
    #              * (decade's average score on the service)
    #              * (decade's percentage of total on the service)
    decade_service_summaries[row["service"]].append(
        preference * row["mean_score"] * row["percentage_of_total"]
    )

In [48]:
# Collapse each service's list of per-decade contributions into one decade score.
# A plain sum is used: decades the user scored 0 contribute nothing, so they drop out
# of the comparison instead of dragging a service down.
for service in list(decade_service_summaries):
    decade_service_summaries[service] = np.sum(decade_service_summaries[service])

In [64]:
# File the decade results under the "decades" category, print the raw scores,
# then show the services ordered from highest to lowest decade score
scores["decades"] = decade_service_summaries
print(decade_service_summaries)
list(reversed(sorted(decade_service_summaries, key=decade_service_summaries.get)))

{'amazon': 0.8039797770287788, 'disney': 1.0868055555555556, 'hbo': 1.6203562340966915, 'hulu': 0.46923076923076923, 'netflix': 0.2829501525940997}


['hbo', 'disney', 'amazon', 'hulu', 'netflix']

## Final Evaluation

In [70]:
# Award points to each service for finishing in the top 3 of a category:
# first place earns 3 points, second earns 2 and third earns 1.
services = {service: 0 for service in movies.service.unique()}
for category in scores:
    # Rank best-first. The ascending sort is reversed (rather than sorted with
    # reverse=True) so tied services are ordered exactly as slicing the tail of the
    # ascending list would order them.
    ranked = sorted(scores[category], key=scores[category].__getitem__)[::-1]
    # zip stops after three placements (or sooner if fewer services were scored), so the
    # points awarded do not depend on there being exactly five services.
    for points, service in zip((3, 2, 1), ranked):
        services[service] += points

In [71]:
# Display the total placement points per service
services

{'netflix': 2, 'amazon': 1, 'hulu': 4, 'hbo': 6, 'disney': 5}