# Metrics for movies and actors

In [13]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [10]:
# Load the pre-processed movie and character tables. The context managers make
# sure both file handles are closed once the objects are loaded.
# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
with open('../data/pickles/movies.p', 'rb') as movies_file:
    movies = pickle.load(movies_file)
with open('../data/pickles/characters.p', 'rb') as characters_file:
    characters = pickle.load(characters_file)

# Attach movie-level attributes to every character row; the left join keeps
# character rows whose movie id has no match in `movies`.
characters_movies = pd.merge(
    characters[['movie_wiki_id', 'actor_name', 'combined_birth', 'combined_gender']],
    movies[['wikipedia_id', 'title', 'combined_release_year', 'combined_runtime', 'combined_box_office', 'averageRating_imdb', 'numVotes_imdb']],
    left_on='movie_wiki_id',
    right_on='wikipedia_id',
    how='left',
)

In [None]:
# Working copy of the columns that feed the movie metric.
df = movies[['title', 'averageRating_imdb', 'numVotes_imdb', 'combined_box_office']].copy()

# Natural-log transform of vote counts and box office.
for log_col in ('numVotes_imdb', 'combined_box_office'):
    df[log_col] = np.log(df[log_col])

# Standardise each feature to a z-score (subtract the mean, divide by the std).
for z_col in ('numVotes_imdb', 'combined_box_office', 'averageRating_imdb'):
    centered = df[z_col] - df[z_col].mean()
    df[z_col] = centered / df[z_col].std()



In [None]:
# Movie metric: standardised log vote count times the absolute standardised
# rating. Because of the absolute value, a rating far below the mean raises
# the metric just as much as a rating far above it.
df['metric'] = df['numVotes_imdb'].mul(df['averageRating_imdb'].abs())

In [None]:
# Show the 20 rows with the highest metric.
df.sort_values(by='metric', ascending=False).iloc[:20]

In [None]:
# Attach the metric to `movies` by index (`df` was built from `movies`, so the
# two share the same index).
# Any `metric` column left over from an earlier run of this cell is dropped
# first: merging again would otherwise produce `metric_x` / `metric_y` and
# break later lookups of `movies['metric']`.
movies = movies.drop(columns=['metric'], errors='ignore').merge(
    df[['metric']], left_index=True, right_index=True
)

In [12]:
# Titles of the 20 movies with the highest metric.
movies.loc[:, ['title', 'metric']].sort_values(by='metric', ascending=False).iloc[:20]

Unnamed: 0,title,metric
26366,The Shawshank Redemption,9.897882
36402,The Godfather,9.1588
38158,The Dark Knight,8.914723
36295,Justin Bieber: Never Say Never,8.731866
80920,The Lord of the Rings: The Return of the King,8.53233
7899,Disaster Movie,8.474059
66299,Pulp Fiction,8.329917
25386,Schindler's List,8.208227
60260,Inception,8.149376
11063,The Godfather Part II,8.139823


In [None]:
# Give every character row the metric of its movie, stored as `movie_metric`.
# The left join keeps characters whose movie has no metric (value is NaN).
# NOTE(review): this reassigns `characters`, so running the cell a second time
# merges onto the already-merged frame and produces suffixed/duplicate columns.
movie_metric_lookup = movies[['wikipedia_id', 'metric']]
characters = characters.merge(
    movie_metric_lookup,
    left_on='movie_wiki_id',
    right_on='wikipedia_id',
    how='left',
)
characters = characters.rename(columns={'metric':'movie_metric'})

In [None]:
def get_actor_metrics(metrics):
    """Aggregate an actor's movie metrics into one rank-weighted score.

    The non-missing values are sorted in ascending order and the value at
    0-based sorted position ``r`` is weighted by ``1 / (n - r)``: the largest
    value counts fully, the second largest by 1/2, ..., the smallest by 1/n.

    Parameters
    ----------
    metrics : pandas.Series, array-like of float, or None
        Movie metrics of a single actor. NaN entries are ignored and the
        input object is left unmodified.

    Returns
    -------
    float
        The weighted sum, or ``np.nan`` when there is no usable value.
    """
    if metrics is None:
        return np.nan

    # Clean a copy: an in-place dropna would alter the caller's data.
    values = pd.Series(metrics, dtype=float).dropna().to_numpy()
    if values.size == 0:
        return np.nan

    ascending = np.sort(values)
    # Weights follow the sorted rank, not the position in the input:
    # divisors run n, n-1, ..., 1 alongside the ascending values.
    weights = 1.0 / np.arange(values.size, 0, -1)
    return np.sum(ascending * weights)

def get_actor_metrics_dcg(metrics):
    """Aggregate an actor's movie metrics with a DCG-style rank discount.

    The non-missing values are sorted in descending order and the value at
    0-based rank ``i`` is divided by ``log2(i + 2)``. The top value is
    therefore taken undiscounted, since ``log2(2) == 1``.

    Parameters
    ----------
    metrics : pandas.Series, array-like of float, or None
        Movie metrics of a single actor. NaN entries are ignored and the
        input object is left unmodified.

    Returns
    -------
    float
        The discounted sum, or ``np.nan`` when there is no usable value.
    """
    if metrics is None:
        return np.nan

    # Clean a copy: an in-place dropna would alter the caller's data
    # (here, the group handed over by groupby/agg).
    values = pd.Series(metrics, dtype=float).dropna().to_numpy()
    if values.size == 0:
        return np.nan

    # sort the metrics in descending order
    descending = np.sort(values)[::-1]
    # compute the DCG score: discounts are log2(2), log2(3), ...
    discounts = np.log2(np.arange(values.size) + 2)
    return np.sum(descending / discounts)
# Aggregate the movie metric per actor with the DCG-style score.
# Duplicate (actor, movie, metric) rows are removed first so that each
# distinct combination contributes only once to the actor's score.
actors_metrics = (
    characters[['freebase_actor_id', 'movie_wiki_id', 'movie_metric']]
    .drop_duplicates()
    [['freebase_actor_id', 'movie_metric']]
    .groupby('freebase_actor_id')
    .agg(get_actor_metrics_dcg)
    .rename(columns={'movie_metric':'actor_metric'})
)


In [None]:
# Attach each actor's aggregated metric to the character rows; the left join
# keeps characters whose actor id has no entry in `actors_metrics`.
# Any `actor_metric` column left over from an earlier run of this cell is
# dropped first: merging again would otherwise produce `actor_metric_x` /
# `actor_metric_y` and break later lookups of `actor_metric`.
characters = characters.drop(columns=['actor_metric'], errors='ignore').merge(
    actors_metrics, left_on='freebase_actor_id', right_index=True, how='left'
)

In [11]:
# Age is measured against the fixed reference year 2022.
temp_characters = characters.assign(age=2022 - characters.combined_birth)

# First 50 distinct (name, gender, age, metric) rows, highest actor metric first.
(
    temp_characters
    .sort_values(by='actor_metric', ascending=False)
    .loc[:, ['actor_name', 'combined_gender', 'age', 'actor_metric']]
    .drop_duplicates()
    .iloc[:50]
    .reset_index()
)

Unnamed: 0,index,actor_name,combined_gender,age,actor_metric
0,302108,Samuel L. Jackson,male,74.0,47.284516
1,91262,Robert De Niro,male,79.0,45.917416
2,90041,Michael Caine,male,89.0,42.677866
3,282784,Morgan Freeman,male,85.0,42.38244
4,330651,Al Pacino,male,82.0,39.385457
5,274370,Bruce Willis,male,67.0,38.878472
6,164138,Gary Oldman,male,64.0,37.168668
7,77359,Robert Duvall,male,91.0,36.768965
8,180117,Tom Hanks,male,66.0,36.712741
9,382743,Brad Pitt,male,59.0,36.554337


In [None]:
# NOTE(review): `tempdf` is not assigned anywhere in the visible part of this
# notebook, so on a fresh kernel this cell raises NameError unless it is
# defined elsewhere — TODO confirm where it comes from.
tempdf

In [None]:

# vincent is imported in this cell rather than in the import cell at the top
# of the notebook.
import vincent
  
# Set vincent up for rendering inside the notebook.
vincent.core.initialize_notebook()

# Build a bar chart from the row of `tempdf` labelled 0.
# NOTE(review): `tempdf` is not defined in the visible part of this notebook —
# confirm it exists before this cell runs.
bar = vincent.Bar(tempdf.loc[0])
bar.axis_titles(x='Features', y='Percentage of filled entries')
# Presumably writes the chart specification to this JSON file — verify against
# the vincent documentation.
bar.to_json('../temp/movie_percentage.json')
  
# Display the bar chart
bar.display()

In [None]:
import plotly

# Enable inline rendering of plotly figures in the notebook.
plotly.offline.init_notebook_mode(connected=True)

import plotly.graph_objs as go

# One bar per column of `tempdf`, using the values of the row labelled 0.
# NOTE(review): `tempdf` is not defined in the visible part of this notebook —
# confirm it exists before this cell runs.
data = [go.Bar(
            x=tempdf.columns,
            y=tempdf.loc[0]
    )]

# Label the axes so the figure can be read on its own; the titles match the
# ones used for the vincent bar chart of the same row.
layout = go.Layout(
    xaxis=dict(title='Features'),
    yaxis=dict(title='Percentage of filled entries'),
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, filename='basic-bar')

# export to html
plotly.offline.plot(fig, filename='../temp/movie_percentage.html')
