In [50]:
import pandas as pd
import numpy as np
import _pickle as pickle
import datetime
import random
import timestring
import statistics

from bokeh.io import output_file, show, output_notebook
from bokeh.models import ColumnDataSource
from bokeh.palettes import *
from bokeh.plotting import figure

# Load the ratings dictionary that was pickled earlier.
# NOTE(review): unpickling can execute arbitrary code -- only load files you
# produced yourself.
# The context manager closes the file even if unpickling raises.
with open(b"../vagrant_shared_folder/movieRatings.obj", "rb") as fileHandler:
    movieRatings = pickle.load(fileHandler)

# Render Bokeh figures inline in the notebook.
output_notebook()

In [51]:
def random_HEX_colors(n=1):
    """Return a list of ``n`` random colours formatted as ``#RRGGBB``.

    Every colour is assembled from six hexadecimal digits drawn with
    ``random.choice``, so the result changes between calls unless the
    ``random`` module has been seeded.
    """
    hex_digits = '0123456789ABCDEF'
    palette = []

    for _colour in range(n):
        digits = [random.choice(hex_digits) for _digit in range(6)]
        palette.append("#" + ''.join(digits))

    return palette

In [52]:
# Report how many entries the loaded ratings dictionary contains.
print(f'There are {len(movieRatings)} movies.')

There are 276 movies.


In [53]:
# Print title and personal rating for every entry whose title contains "Silence".
for entry in movieRatings.values():
    title = entry['title']
    if 'Silence' not in title:
        continue
    print("{}, vote = {}".format(title, entry['personal_rating']))

The Silence of the Lambs, vote = 9


In [54]:
# Look up a single record by its key to inspect the fields stored per entry.
movieRatings['tt4550098']

{'ID': 'tt4550098',
 'type': 'movie',
 'title': 'Nocturnal Animals',
 'genre': 'Drama, Thriller',
 'year': '2016',
 'personal_rating': '9',
 'IMDB_rating': '7.5',
 'rated_on': '03 Jan 2021',
 'actors': ['Amy Adams',
  'Jake Gyllenhaal',
  'Michael Shannon',
  'Aaron Taylor-Johnson'],
 'director': 'Tom Ford',
 'runtime': '1 hr 56 min'}

## Best-rated {9-10}

In [55]:
# Collect the titles with a personal rating of 10 and of 9 into separate lists.
tens = []
nines = []

for entry in movieRatings.values():
    rating = int(entry['personal_rating'])
    if rating == 10:
        tens.append(entry['title'])
    elif rating == 9:
        nines.append(entry['title'])

In [56]:
# Show each list followed by its length.
print(f"10s: {tens}; {len(tens)}")
print(f"9s: {nines}; {len(nines)}")

10s: ['Soul', 'Heat', 'Se7en', 'The Last Dance', 'Ozark', 'When Harry Met Sally', 'Succession', 'Peaky Blinders', 'Game of Thrones', 'The Big Bang Theory']; 10
9s: ['The Departed', 'The Night Of', 'Promising Young Woman', 'Dark Waters', 'Night Stalker: The Hunt for a Serial Killer', 'Run', 'Inside Out', 'Nocturnal Animals', 'The Boys', 'Prisoners', 'There Will Be Blood', '1917', '12 Angry Men', 'Casino', 'Nightcrawler', 'The Shawshank Redemption', 'Goodfellas', 'Inglourious Basterds', 'The Lobster', 'Moneyball', 'Apollo 11', 'Joker', 'BoJack Horseman', 'James May: Our Man in Japan', 'The Mandalorian', 'Once Upon a Time in America', 'The Day After Tomorrow', 'Mindhunter', 'Billions', 'Chernobyl', 'Presumed Innocent', 'Luther', 'Black Mirror: Bandersnatch', 'The Staircase', 'Making a Murderer', 'Black Mirror', 'Icarus', 'The Silence of the Lambs', 'Spotlight', 'Last Week Tonight with John Oliver', 'Halt and Catch Fire', 'Inception', 'The Godfather: Part II', 'The Godfather', 'House of Ca

## Year-based stats

In [57]:
def retrieve_year_specific_data(year, debug=False, ratings=None):
    """Split the entries rated in ``year`` into movies and everything else.

    Parameters
    ----------
    year : int or str
        Year to select. It is compared with the last whitespace-separated
        token of each entry's ``'rated_on'`` string (e.g. ``'03 Jan 2021'``).
    debug : bool, optional
        When true, print how many entries, movies and TV series were found.
    ratings : dict, optional
        Mapping of ID -> entry dict to search. Defaults to the notebook-wide
        ``movieRatings``.

    Returns
    -------
    tuple of (list, list)
        ``(movies_year, tv_year)``: the entries whose ``'type'`` contains
        ``'movie'`` and all remaining entries, both in iteration order.
    """
    if ratings is None:
        ratings = movieRatings

    year = str(year)

    movies_year = []
    tv_year = []

    for entry in ratings.values():
        # Compare the year token exactly: a substring test on the whole date
        # would also match a day-of-month when called with a short value.
        # Slicing keeps an empty 'rated_on' from raising IndexError.
        if entry['rated_on'].split()[-1:] != [year]:
            continue

        if 'movie' in entry['type']:
            movies_year.append(entry)
        else:
            tv_year.append(entry)

    if debug:
        total = len(movies_year) + len(tv_year)
        print(f"Amount of movies/tv_series seen in {year}: {total}")
        print(f"Movies seen in {year}: {len(movies_year)}")
        print(f"TV Series seen in {year}: {len(tv_year)}")

    return movies_year, tv_year

In [95]:
# Number of movies vs. non-movie entries rated in 2021.
mov, tv = retrieve_year_specific_data(2021)
len(mov), len(tv)

(19, 6)

## Yearly-stats distribution

In [59]:
def yearly_stats_distribution(start_year=2014):
    """Count the entries rated in every year from ``start_year`` up to now.

    Parameters
    ----------
    start_year : int, optional
        First year to include. Defaults to 2014, the value that used to be
        hard-coded here.

    Returns
    -------
    tuple of four lists, aligned by index
        ``(years, counts_tot, counts_mov, counts_tv)``: the years as strings,
        the total number of entries per year, and the per-year counts of the
        two lists returned by ``retrieve_year_specific_data``.
    """
    last_year = datetime.datetime.now().year

    years = []
    counts_tot = []
    counts_mov = []
    counts_tv = []

    for year in range(start_year, last_year + 1):
        mov, tv = retrieve_year_specific_data(year)

        # Years are returned as strings so they can label a categorical axis.
        years.append(str(year))
        counts_mov.append(len(mov))
        counts_tv.append(len(tv))
        counts_tot.append(len(mov) + len(tv))

    return years, counts_tot, counts_mov, counts_tv

In [60]:
# output_file("bar_colors.html")

years, counts, _, _ = yearly_stats_distribution()

# The small Viridis palettes are only defined for a handful of sizes, so
# indexing them directly by the number of years raises KeyError once the year
# range outgrows the largest one. Fall back to sampling the full Viridis map.
n_years = len(years)
bar_colors = list(small_palettes['Viridis'].get(n_years) or viridis(n_years))

source = ColumnDataSource(data=dict(year=years, counts=counts, color=bar_colors))

p = figure(x_range=years, y_range=(min(counts),max(counts)+20), plot_height=350, title="Yearly Distribution",
           toolbar_location=None, tools="hover", tooltips="data points: @counts")

# One bar per year, coloured from the palette column of the data source.
p.vbar(x='year', top='counts', width=0.9, color='color', source=source)

p.xgrid.grid_line_color = None

show(p)

In [113]:
# Stacked bar chart: per-year counts split into movies and TV series.
years, counts, mov, tv = yearly_stats_distribution()

type_data = {'years': years, 'movies': mov, 'tv series': tv}

p = figure(
    x_range=years,
    y_range=(min(counts), max(counts) + 20),
    plot_height=250,
    title="Yearly Distribution by Type",
    toolbar_location=None,
    tools="hover",
    tooltips="$name in @years: @$name",
)

p.vbar_stack(
    ['movies', 'tv series'],
    x='years',
    width=0.9,
    color=["#FFC300", "#058C0B"],
    source=type_data,
    legend_label=['movies', 'tv series'],
)

# Axis and grid cosmetics.
p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None

# Legend laid out horizontally at the top of the plot.
p.legend.location = "top_center"
p.legend.orientation = "horizontal"

show(p)

## Genre-based analysis

In [62]:
def build_genre_dict():
    """Count how often each genre occurs across all entries of ``movieRatings``.

    An entry's ``'genre'`` value is split on commas and every
    whitespace-stripped piece is counted once for that entry.

    Returns
    -------
    dict
        Genre name -> number of occurrences, keyed in first-seen order.
    """
    tally = {}

    for entry in movieRatings.values():
        for piece in entry['genre'].split(','):
            label = piece.strip()
            tally[label] = tally.get(label, 0) + 1

    return tally

In [63]:
# Overall genre tally across every entry; displayed as the cell output.
genre_dict = build_genre_dict()
genre_dict

{'Crime': 109,
 'Drama': 187,
 'Thriller': 78,
 'Comedy': 53,
 'Mystery': 55,
 'Adventure': 34,
 'Fantasy': 11,
 'Biography': 29,
 'History': 20,
 'Documentary': 24,
 'Horror': 20,
 'Sci-Fi': 26,
 'Action': 46,
 'Animation': 5,
 'Romance': 18,
 'Family': 2,
 'War': 7,
 'Sport': 5,
 'News': 1,
 'Talk-Show': 2,
 'Music': 2}

In [64]:
# Genre counts broken down by the year in which entries were rated
def genre_yearly_data(start_year=2014):
    """Count genre occurrences separately for each year since ``start_year``.

    Parameters
    ----------
    start_year : int, optional
        First year to include. Defaults to 2014, the value that used to be
        hard-coded; pass the account's sign-up year to override it.

    Returns
    -------
    dict
        ``{year_str: {genre: count}}`` with one key per year from
        ``start_year`` to the current year. Years without entries map to an
        empty dict.
    """
    last_year = datetime.datetime.now().year
    yearly_genres = {}

    for year in (str(y) for y in range(start_year, last_year + 1)):
        counts = yearly_genres[year] = {}

        movies, shows = retrieve_year_specific_data(year)

        for entry in movies + shows:
            # 'genre' is a comma-separated string; count each piece once.
            for piece in entry['genre'].split(','):
                genre = piece.strip()
                counts[genre] = counts.get(genre, 0) + 1

    return yearly_genres

In [65]:
# Display the per-year genre counts.
genre_yearly_data()

{'2014': {},
 '2015': {'Crime': 4,
  'Mystery': 1,
  'Thriller': 6,
  'Comedy': 6,
  'Romance': 5,
  'Action': 4,
  'Adventure': 3,
  'Sci-Fi': 2,
  'Biography': 2,
  'Drama': 11,
  'Music': 1,
  'History': 1},
 '2016': {'Crime': 9,
  'Drama': 27,
  'Mystery': 8,
  'Action': 7,
  'Adventure': 5,
  'Fantasy': 4,
  'Thriller': 3,
  'Comedy': 9,
  'Romance': 2,
  'Sci-Fi': 2,
  'Biography': 8,
  'Horror': 3,
  'Documentary': 2,
  'History': 3,
  'War': 3,
  'Talk-Show': 1,
  'Music': 1,
  'Animation': 1},
 '2017': {'Drama': 30,
  'Sci-Fi': 6,
  'Thriller': 13,
  'Crime': 16,
  'Romance': 4,
  'Biography': 7,
  'Action': 11,
  'Adventure': 6,
  'Comedy': 5,
  'Horror': 4,
  'Documentary': 2,
  'History': 4,
  'Fantasy': 2,
  'Mystery': 7,
  'War': 1,
  'News': 1,
  'Talk-Show': 1},
 '2018': {'Drama': 19,
  'Mystery': 8,
  'Thriller': 13,
  'Action': 9,
  'Adventure': 6,
  'Sci-Fi': 6,
  'Crime': 12,
  'Biography': 2,
  'Sport': 2,
  'History': 2,
  'Comedy': 2,
  'Horror': 2,
  'Documentar

In [66]:
def build_data_source_for_graph():
    """Assemble the column dictionary used by the stacked genre chart.

    Returns
    -------
    tuple
        ``(all_possible_genres_seen, source)``. The first item is the keys
        view of the overall genre tally. ``source`` maps ``'years'`` to the
        list of year labels and every genre to a list of per-year counts
        aligned with those labels (0 where the genre did not occur).
    """
    all_possible_genres_seen = build_genre_dict().keys()
    yearly_data = genre_yearly_data()

    year_labels = [str(year) for year in yearly_data]
    source = {'years': year_labels}

    # Start every genre column at zero for every year.
    for genre in all_possible_genres_seen:
        source[genre] = [0] * len(year_labels)

    # Fill in the counts that actually occurred, column by column.
    for column, per_year in enumerate(yearly_data.values()):
        for genre, count in per_year.items():
            source[genre][column] = count

    return all_possible_genres_seen, source

In [67]:
# Stacked bar chart of genre counts per year.
all_possible_genres_seen, source = build_data_source_for_graph()

# vbar_stack receives the genre names as a plain list of strings.
labels = [str(l) for l in all_possible_genres_seen]

# NOTE(review): the y-axis is fixed to 0..200 rather than derived from the data.
p = figure(x_range=source['years'], plot_height=450, plot_width=1000, y_range=(0,200),
           title="Genre Distribution per Year",
           toolbar_location=None, tools="hover", tooltips="$name in @years: @$name")

# One random colour per genre; colours differ on every run because no seed is set.
p.vbar_stack(labels, x='years', width=0.3, 
             color=random_HEX_colors(len(labels)), source=source,
             legend_label=labels)

# p.y_range.start = 0
p.x_range.range_padding = .15
# p.min_border_bottom=10
p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None

p.legend.orientation = "horizontal"
# p.legend.label_text_font_size = "5px"

# Take the legend created by vbar_stack and lay it out below the plot area.
new_legend = p.legend[0]
p.add_layout(new_legend, 'below')

show(p)

## Most recent data

In [68]:
def retrieve_most_recent_seen():
    """Return the entries of ``movieRatings`` rated within the last 14 days.

    Every entry is checked, so the result does not depend on the order in
    which the dictionary happens to store its entries.

    Returns
    -------
    list
        The matching entry dicts, in dictionary iteration order.
    """
    print("""Retrieving the most recently seen shows/movies this past 2 weeks""")

    last_2_weeks = timestring.Range('last 14 days')
    recent_data = []

    for data in movieRatings.values():
        # Swap the first two tokens of 'rated_on' (e.g. '14 Feb 2021' ->
        # 'Feb 14 2021') before handing the string to timestring.
        d = data['rated_on'].split(' ')
        finalDate = timestring.Date(d[1] + ' ' + d[0] + ' ' + d[2])

        # Previously the loop stopped at the first entry outside the window,
        # which silently dropped later matches whenever the entries were not
        # stored newest-first.
        if finalDate in last_2_weeks:
            recent_data.append(data)

    return recent_data


retrieve_most_recent_seen()

Retrieving the most recently seen shows/movies this past 2 weeks


[{'ID': 'tt0407887',
  'type': 'movie',
  'title': 'The Departed',
  'genre': 'Crime, Drama, Thriller',
  'year': '2006',
  'personal_rating': '9',
  'IMDB_rating': '8.5',
  'rated_on': '14 Feb 2021',
  'actors': ['Leonardo DiCaprio',
   'Matt Damon',
   'Jack Nicholson',
   'Mark Wahlberg'],
  'director': 'Martin Scorsese',
  'runtime': '2 hr 31 min'},
 {'ID': 'tt0117951',
  'type': 'movie',
  'title': 'Trainspotting',
  'genre': 'Drama',
  'year': '1996',
  'personal_rating': '8',
  'IMDB_rating': '8.1',
  'rated_on': '12 Feb 2021',
  'actors': ['Ewan McGregor',
   'Ewen Bremner',
   'Jonny Lee Miller',
   'Kevin McKidd'],
  'director': 'Danny Boyle',
  'runtime': '1 hr 33 min'},
 {'ID': 'tt8980602',
  'type': 'movie',
  'title': 'The Kid Detective',
  'genre': 'Comedy, Drama, Mystery',
  'year': '2020',
  'personal_rating': '8',
  'IMDB_rating': '6.9',
  'rated_on': '11 Feb 2021',
  'actors': ['Kaitlyn Chalmers-Rizzato',
   'Adam Brody',
   'Kaleb Horn',
   'Wendy Crewson'],
  'dire

## TMDB API

In [69]:
import requests
import json
import shutil
import glob, os
from PIL import Image

# NOTE(review): API keys do not belong in a notebook. Export TMDB_API_KEY
# instead; the literal is kept only as a fallback so the notebook keeps
# running, and that key should be rotated.
TMDB_API = os.environ.get('TMDB_API_KEY', '2410c5007d1bd89e6f524c8659332ca4')
external_source = '&external_source=imdb_id'
img_url = []

recent_data = retrieve_most_recent_seen()

for recent in recent_data:
    ID_find = recent['ID']
    # `external_source` already starts with '&'; adding another one produced
    # an empty query parameter ('&&').
    r = requests.get(f'https://api.themoviedb.org/3/find/{ID_find}?api_key={TMDB_API}{external_source}')
    # Fail with a clear HTTP error instead of tripping over an error payload below.
    r.raise_for_status()
    data = r.json()

    # Reset for every title: otherwise a lookup without a poster reuses the
    # previous title's poster, or hits an undefined name on the first pass.
    img_TMDB = None
    for results in data.values():
        for data_inner in results:
            # Not every result type carries a poster; the last non-empty one wins.
            poster = data_inner.get('poster_path') if isinstance(data_inner, dict) else None
            if poster:
                img_TMDB = poster

    if img_TMDB is not None:
        img_url.append(f'https://image.tmdb.org/t/p/w500{img_TMDB}')


# Stream every poster to img_<n>.png in the working directory.
for n, img in enumerate(img_url):
    # The context manager releases the connection even if writing the file fails.
    with requests.get(img, stream=True) as response:
        with open(f'img_{n}.png', 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)

Retrieving the most recently seen shows/movies this past 2 weeks


## Retrieve people data

In [70]:
def retrieve_people(d):
    """Tally how often each director and each actor appears in ``d``.

    Parameters
    ----------
    d : dict
        Mapping of ID -> entry dict. Each entry must provide ``'actors'``
        (iterated name by name unless it is the empty string) and
        ``'director'`` (skipped when it is the empty string).

    Returns
    -------
    tuple of (dict, dict)
        ``(directors, actors)``, each mapping a name to its number of
        appearances, keyed in first-seen order.
    """
    actors = {}
    directors = {}

    for entry in d.values():
        cast = entry['actors']
        director = entry['director']

        if cast != '':
            for name in cast:
                if name == '':
                    # Point out entries whose cast list contains a blank name;
                    # the blank name is still counted.
                    print(f"This: {entry['title']}")
                actors[name] = actors.get(name, 0) + 1

        if director != '':
            directors[director] = directors.get(director, 0) + 1

    return directors, actors

In [71]:
def bulk_download_images(people_type, url):
    """Download every image in ``url`` to ``<people_type>_<n>.png``.

    Parameters
    ----------
    people_type : str
        Prefix for the output file names and label used in the return message.
    url : iterable of str
        Image URLs; the position of each URL becomes the ``<n>`` in its file
        name. Files are written to the current working directory.

    Returns
    -------
    str
        A short confirmation message.
    """
    for n, img in enumerate(url):
        # The context manager releases the streamed connection even when
        # opening or writing the output file fails.
        with requests.get(img, stream=True) as response:
            with open(f'{people_type}_{n}.png', 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)

    return f"Downloaded {people_type}-related data."

In [72]:
di, ac = retrieve_people(movieRatings)

# Sort ascending by appearance count; ties keep their first-seen order.
sorted_actors = dict(sorted(ac.items(), key=lambda item: item[1]))
sorted_directors = dict(sorted(di.items(), key=lambda item: item[1]))

# A negative slice keeps the three most frequent names and still returns
# everything when there are fewer than three; using len(...) - 3 as the start
# index went negative in that case and dropped entries.
top_three_actors = dict(list(sorted_actors.items())[-3:])
top_three_directors = dict(list(sorted_directors.items())[-3:])
top_three_actors, top_three_directors

({'Brad Pitt': 7, 'Jake Gyllenhaal': 8, 'Robert De Niro': 14},
 {'Christopher Nolan': 4, 'David Fincher': 5, 'Martin Scorsese': 8})

In [73]:
# Retrieving TMDB profile images for the top actors
# NOTE(review): API keys do not belong in a notebook. Export TMDB_API_KEY
# instead; the literal is kept only as a fallback so the notebook keeps
# running, and that key should be rotated.
TMDB_API = os.environ.get('TMDB_API_KEY', '2410c5007d1bd89e6f524c8659332ca4')
img_url = []

# Endpoint: https://api.themoviedb.org/3/search/person?api_key=<key>&query=<name>

for actor in top_three_actors.keys():
    query = actor
    page_number = 1
    # Pass the query through `params` so names containing spaces, '&' or '#'
    # are URL-encoded correctly.
    r = requests.get('https://api.themoviedb.org/3/search/person',
                     params={'api_key': TMDB_API, 'query': query, 'page': page_number})
    # Surface HTTP errors directly instead of a KeyError on 'results'.
    r.raise_for_status()
    data = r.json()

    # Use the first search hit that actually has a profile picture.
    for d in data['results']:
        img_TMDB = d['profile_path']
        if img_TMDB is not None:
            img_url.append(f'https://image.tmdb.org/t/p/w500{img_TMDB}')
            break


bulk_download_images('actor', img_url)

'Downloaded actor-related data.'

In [74]:
# Retrieving TMDB profile images for the top directors
# NOTE(review): API keys do not belong in a notebook. Export TMDB_API_KEY
# instead; the literal is kept only as a fallback so the notebook keeps
# running, and that key should be rotated.
TMDB_API = os.environ.get('TMDB_API_KEY', '2410c5007d1bd89e6f524c8659332ca4')
img_url = []

# Endpoint: https://api.themoviedb.org/3/search/person?api_key=<key>&query=<name>

for director in top_three_directors.keys():
    query = director
    page_number = 1
    # Pass the query through `params` so names containing spaces, '&' or '#'
    # are URL-encoded correctly.
    r = requests.get('https://api.themoviedb.org/3/search/person',
                     params={'api_key': TMDB_API, 'query': query, 'page': page_number})
    # Surface HTTP errors directly instead of a KeyError on 'results'.
    r.raise_for_status()
    data = r.json()

    # Use the first search hit that actually has a profile picture.
    for d in data['results']:
        img_TMDB = d['profile_path']
        if img_TMDB is not None:
            img_url.append(f'https://image.tmdb.org/t/p/w500{img_TMDB}')
            break

bulk_download_images('director', img_url)

'Downloaded director-related data.'

## Average rating yearly

In [75]:
def average_yearly_rating(years, debug=False):
    """Compute the mean personal rating for each year in ``years``.

    Parameters
    ----------
    years : iterable
        Years to evaluate; each one is passed to
        ``retrieve_year_specific_data``.
    debug : bool, optional
        When true, print mean, median and standard deviation per year, or a
        "No Data" line for years without entries.

    Returns
    -------
    list
        One value per year, aligned with ``years``: the mean rating rounded
        to two decimals, or ``np.nan`` when the year has no entries.
    """
    average_rating_data = []

    for y in years:
        current_m, current_t = retrieve_year_specific_data(y)
        ratings = [int(entry['personal_rating']) for entry in current_m + current_t]

        if not ratings:
            if debug:
                print(f"No Data in {y}.")
            average_rating_data.append(np.nan)
            continue

        mean = statistics.mean(ratings)

        if debug:
            median = statistics.median(ratings)
            # stdev needs at least two samples. Computing it unconditionally
            # used to raise for a single-entry year, which discarded that
            # year's perfectly valid mean.
            stdev = statistics.stdev(ratings) if len(ratings) > 1 else float('nan')
            print(f"In {y}:")
            print(f'Mean: {mean}, Median: {median}, Stdev: {stdev}')

        average_rating_data.append(round(mean, 2))

    return average_rating_data

In [76]:
years, counts, _, _ = yearly_stats_distribution()

avg = average_yearly_rating(years)

# The small Viridis palettes are only defined for a handful of sizes, so
# indexing them directly by the number of years raises KeyError once the year
# range outgrows the largest one. Fall back to sampling the full Viridis map.
n_years = len(years)
bar_colors = list(small_palettes['Viridis'].get(n_years) or viridis(n_years))

source = ColumnDataSource(data=dict(year=years, counts=counts, avg=avg, color=bar_colors))

p = figure(x_range=years, y_range=(min(counts),max(counts)+20), plot_height=350, title="Yearly Distribution w/ Ratings Average",
           toolbar_location=None, tools="hover", tooltips="data points: @counts, average: @avg")

# Bar height is the number of entries; the yearly average shows in the hover tooltip.
p.vbar(x='year', top='counts', width=0.9, color='color', source=source)

p.xgrid.grid_line_color = None

show(p)

In [77]:
# Summary statistics over the yearly averages; NaN entries are ignored.
global_mean, global_median = np.nanmean(avg), np.nanmedian(avg)
global_min, global_max = np.nanmin(avg), np.nanmax(avg)

print(global_mean, global_median, global_min, global_max)

7.525714285714286 7.61 7.07 8.17


## Longest Runtime Yearly

In [137]:
# Extrapolating minutes
def runtime_extrapolator(s):
    """Convert a runtime string such as ``'1 hr 56 min'`` into total minutes.

    Accepted shapes are ``'<h> hr <m> min'``, ``'<h> hr'`` and ``'<m> min'``.

    Parameters
    ----------
    s : str
        Runtime text.

    Returns
    -------
    int
        Total number of minutes.

    Raises
    ------
    ValueError
        If a numeric part is missing or is not an integer (this includes the
        empty string).
    """
    hours_text, marker, remainder = s.partition('hr')

    if not marker:
        # No hour component: whatever precedes 'min' is the minute count.
        return int(hours_text.split('min')[0])

    # The minutes part is optional after the hours ('2 hr' is valid).
    minutes_text = remainder.split('min')[0].strip()
    minutes = int(minutes_text) if minutes_text else 0

    return int(hours_text) * 60 + minutes


def longest_runtime_movie_yearly(debug=False, start_year=2014):
    """Find, for each year, the longest movie among those rated that year.

    Parameters
    ----------
    debug : bool, optional
        When true, print runtime, release year and title of every movie that
        was considered.
    start_year : int, optional
        First year to include. Defaults to 2014, the value that used to be
        hard-coded here.

    Returns
    -------
    list
        One ``[year, ID, runtime_minutes]`` list per year from ``start_year``
        to the current year. Years without a usable movie keep the
        placeholder ``[year, 'None', 0]``.
    """
    last_year = datetime.datetime.now().year
    runtimes_yearly = []

    for y in range(start_year, last_year + 1):
        mov, _ = retrieve_year_specific_data(y)

        # rated_on(year), ID, runtime
        max_runtime = [y, 'None', 0]

        for d in mov:
            try:
                # Skip entries without a runtime or whose 'year' field
                # contains an en dash followed by a space.
                if d['runtime'] == '' or '– ' in d['year']:
                    continue
                new_runtime = runtime_extrapolator(d['runtime'])
            except (KeyError, ValueError) as e:
                # Best effort: report the offending entry and carry on, so one
                # bad record no longer discards all remaining years.
                print("*** Error ***")
                print(e)
                print(d)
                continue

            if new_runtime > max_runtime[2]:
                max_runtime[1] = d['ID']
                max_runtime[2] = new_runtime
            if debug:
                print(f"{d['runtime']}, {d['year']}, {d['title']}")

        runtimes_yearly.append(max_runtime)

    return runtimes_yearly

# One [rated-on year, ID, runtime] list per year; displayed as the cell output.
r = longest_runtime_movie_yearly()
r
# movieRatings[r[1][1]]

{'ID': 'tt0087843',
 'type': 'movie',
 'title': 'Once Upon a Time in America',
 'genre': 'Crime, Drama',
 'year': '1984',
 'personal_rating': '9',
 'IMDB_rating': '8.4',
 'rated_on': '25 Dec 2019',
 'actors': ['Robert De Niro',
  'James Woods',
  'Elizabeth McGovern',
  'Treat Williams'],
 'director': 'Sergio Leone',
 'runtime': '3 hr 49 min'}