# Proof of Concept

This exists to decide whether the overall goal of this project is feasible or not. It will be doing some of the following tasks:
* Asking for user inputs for preference deciding
* Generating metric that represents a user
* Generating metric that represents a service
* Comparing user to services and finding best fit

In [1]:
import pandas as pd
import numpy as np
from pandasql import sqldf
import random

In [2]:
movies = pd.read_csv("data/modified/movies_api_imdb_merged.csv")
genres = pd.read_csv("data/modified/service_genres_counted.csv")
ratings = pd.read_csv("data/modified/ratings_counted.csv")

In [3]:
list(movies.columns)

['title',
 'release_year',
 'type',
 'rating',
 'service',
 'tmdb_id',
 'genres',
 'imdb_id',
 'popularity',
 'tmdb_score',
 'tmdb_count',
 'poster_path',
 'budget',
 'revenue',
 'runtime',
 'tconst',
 'imdb_score',
 'imdb_count',
 'mean_score',
 'mean_num_votes']

In [4]:
list(genres.columns)

['service',
 'type',
 'genre',
 'count',
 'mean_score',
 'mean_popularity',
 'total_on_service',
 'percentage_of_total']

In [5]:
list(ratings.columns)

['service',
 'type',
 'count',
 'rating',
 'mean_score',
 'mean_popularity',
 'total_on_service',
 'percentage_of_total']

## Genres

Finding the "best" streaming service for a user based on genre preferences.

In [6]:
# Dictionary of genres and corresponding user-input values initialized to 0
user_genres = {key: 0 for key in genres.genre.unique()}

In [7]:
# A dictionary generated to provide an example response set for genres
example_genres = {'Drama': 1,'Comedy': 1,'Thriller': 1,'Action': 1,'Romance': 0,'Horror': 1,'Crime': 0,'Documentary': 0,'Family': 0,'Adventure': 1,'TV Movie': 0,'Mystery': 1,'Science Fiction': 1,'Western': 1,'Fantasy': 0,'Music': 0,'History': 0,'War': 1,'Animation': 1}

# Uncomment this line to redo genre calculating
#example_genres = None

In [8]:
# Ask the user whether they like each genre or not.
if(example_genres is None):
    for genre in user_genres:
        ans = input(f"Do you like the {genre} genre? (y/n): ")
        if(ans == "y"):
            # Value is 1 if the user likes the genre and 0 otherwise
            user_genres[genre] += 1
else:
    user_genres = example_genres

In [9]:
user_genres

{'Drama': 1,
 'Comedy': 1,
 'Thriller': 1,
 'Action': 1,
 'Romance': 0,
 'Horror': 1,
 'Crime': 0,
 'Documentary': 0,
 'Family': 0,
 'Adventure': 1,
 'TV Movie': 0,
 'Mystery': 1,
 'Science Fiction': 1,
 'Western': 1,
 'Fantasy': 0,
 'Music': 0,
 'History': 0,
 'War': 1,
 'Animation': 1}

In [10]:
# Summarize how a service performs for each genre and whether the user likes the genre or not
genre_service_summaries = {key:[] for key in genres.service.unique()}
for index, row in genres.iterrows():
    # Append (genre's user score)*(genre's average score on service)*(genre's percentage of total on service)
    genre_service_summaries[row["service"]] += [user_genres.get(row["genre"])*row["mean_score"]*row["percentage_of_total"]]

In [11]:
# Generate one-number genre summary for each service by calculating sum of all genre values 
# Sum chosen because scores of 0 should not impact choice considering that the genre can be ignored on service
for key, value in genre_service_summaries.items():
    genre_service_summaries[key] = np.sum(value)

In [12]:
genre_service_summaries

{'amazon': 4.0411956663620945,
 'disney': 3.620682730923695,
 'hbo': 4.193505929997107,
 'hulu': 4.314811529933482,
 'netflix': 4.276764138491216}

In [13]:
# Ideal service based on genres alone is the one with the highest total genre score
print(max(genre_service_summaries, key=genre_service_summaries.get))

hulu


## Ratings

Finding the "best" streaming service for a user based on rating preferences.

In [14]:
user_ratings = {rating:0 for rating in ratings.rating.unique()}

In [15]:
children = {'G': 1, 'NC-17': -1, 'NR': -1, 'PG': 1, 'PG-13': 0, 'R': -1}
no_adult_content = {'G': 0, 'NC-17': -1, 'NR': -1, 'PG': 0, 'PG-13': 0, 'R': 0}
middle_ground = {'G': 0, 'NC-17': -1, 'NR': -1, 'PG': 0, 'PG-13': 1, 'R': 1}
anything_goes = {'G': 0, 'NC-17': 0, 'NR': 0, 'PG': 0, 'PG-13': 0, 'R': 0}
example_ratings = random.choice([children, no_adult_content, middle_ground, anything_goes])

#example_ratings = None

In [16]:
if(example_ratings is None):
    for rating in user_ratings:
        ans = input(f"Are you okay with the {rating} rating being on your service? (1: want, 0: don't care, -1: don't want): ")
        user_ratings[rating] = int(ans)
else:
    user_ratings = example_ratings

In [17]:
user_ratings

{'G': 0, 'NC-17': -1, 'NR': -1, 'PG': 0, 'PG-13': 1, 'R': 1}

In [18]:
# Summarize how a service performs for each genre and whether the user likes the genre or not
rating_service_summaries = {key:[] for key in ratings.service.unique()}
for index, row in ratings.iterrows():
    # Append (genre's user score)*(genre's average score on service)*(genre's percentage of total on service)
    rating_service_summaries[row["service"]] += [user_ratings.get(row["rating"])*row["mean_score"]*row["percentage_of_total"]]

In [19]:
rating_service_summaries

{'amazon': [0.0,
  -0.7583287895310798,
  -0.12324154852780807,
  0.0,
  1.9516766630316251,
  1.8507497273718654],
 'disney': [0.0, 0.0, 0.61568287037037],
 'hbo': [0.0, 0.0, 2.043612521150592, 2.4168358714044014],
 'hulu': [0.0,
  -0.00988538681948424,
  0.0,
  1.7259312320916917,
  3.176647564469913],
 'netflix': [0.0,
  -0.0026476578411405295,
  -0.08668024439918533,
  0.0,
  1.9851934826883906,
  2.9746639511201622]}

In [20]:
# Generate one-number genre summary for each service by calculating sum of all genre values 
# Sum chosen because scores of 0 should not impact choice considering that the genre can be ignored on service
for key, value in rating_service_summaries.items():
    rating_service_summaries[key] = np.sum(value)

In [21]:
rating_service_summaries

{'amazon': 2.9208560523446025,
 'disney': 0.61568287037037,
 'hbo': 4.460448392554993,
 'hulu': 4.892693409742121,
 'netflix': 4.870529531568227}

In [22]:
# Ideal service based on ratings alone is the one with the highest total rating score
print(max(rating_service_summaries, key=rating_service_summaries.get))

hulu
