In [1]:
%matplotlib inline
from utils import read, createGraph, readExtraInfo, readGenre, readBirthdays, getCharacterAges
from glob import iglob as glob
from os.path import exists, basename
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
import numpy as np
import scipy as sp
import warnings
import networkx as nx
import logging
from funcy import walk_values, partial
from scipy import stats
import itertools
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison


plt.rcParams["figure.figsize"] = (10, 8)

In [2]:
logging.basicConfig(format = "%(asctime)-15s %(message)s", level = logging.DEBUG)

In [3]:
th = 2
data_dir = "../data/utterances_with_charnames/*"
info_dir = "../data/charandmovie_info/"
birthdays_f = "../data/age/actor_birthdays.txt"

## Read all scripts

In [4]:
birthdays = readBirthdays(birthdays_f)

In [5]:
data = {}
for script in glob(data_dir):
    
    _, char_list, adj = read(script, threshold = th)

    extra_info = info_dir + basename(script)
    if exists(extra_info):
        genders, races, namesids, movieyear = readExtraInfo(extra_info)
        genres = readGenre(extra_info)
    else:
#         logging.warning("Info for {} not found".format(basename(script)))
#         gens = defaultdict(lambda: 'unknown')
#         races = defaultdict(lambda: 'unknown')  
        continue
    
    def splitRaces(x):
        r = x.split(",")
        if len(r) > 0:
            if len(r) > 1:
                return "mixed"
            else:
                return r[0]
    
    races = walk_values(splitRaces, races)
    ages = getCharacterAges(char_list, namesids, movieyear, birthdays)
        
        
    G = createGraph(char_list,
                    adj,
                    genders = genders,
                    races = races,
                    ages = ages)

    

    key = basename(script)

    data[key] = {}
    data[key]['graph'] = G
    data[key]['chars'] = char_list
    data[key]['genres'] = genres
    data[key]['ages'] = ages
    data[key]['races'] = races
    data[key]['year'] = movieyear
    

In [6]:
len(data)

986

In [7]:
np.sum([nx.number_of_nodes(d['graph']) for d in data.values()])

15133

In [8]:
types = Counter([y for x in [d['genres'] for script, d in data.items()] for y in x])
print(types)
types = list(types.keys())

Counter({'Drama': 559, 'Thriller': 368, 'Comedy': 287, 'Action': 252, 'Crime': 242, 'Romance': 194, 'Adventure': 170, 'Sci-Fi': 156, 'Mystery': 145, 'Horror': 116, 'Fantasy': 115, 'Biography': 70, 'Family': 49, 'History': 34, 'War': 34, 'Sport': 32, 'Animation': 32, 'Music': 22, 'Musical': 19, 'Western': 17, 'Short': 5, 'Film-Noir': 5})


In [9]:
races = Counter([y for x in [list(d['races'].values()) for d in data.values()] for y in x])
print(races)
races = list(races.keys())

Counter({'unknown': 7893, 'caucasian': 6887, 'african': 618, 'mixed': 449, 'latino': 165, 'eastasian': 78, 'asianindian': 44, 'other': 25, 'nativeamerican': 15, 'pacificislander': 7, 'others': 2})


In [10]:
centrality_measures = ['degree_cent', 'betweenness_cent']

In [11]:
xkcd_colors = sns.xkcd_palette(["grass green", "sand", "blue", "light red", "cerulean",
                                "red", "light blue", "teal", "orange", "light green",
                                "magenta", "yellow", "sky blue", "grey", "cobalt",
                                "grass", "algae green", "coral", "cerise", "steel",
                                "hot purple", "mango", "pale lime", "rouge"])
colors = itertools.cycle(xkcd_colors)

In [12]:
# Using Holm-Bonferroni method
def holmBonferroni(tests):
    res_2 = sorted(tests, key = lambda x: x[1].pvalue)
    m = len(res_2)

    k = 0
    while k < len(res_2) and res_2[k][1].pvalue < 0.05 / (m + 1 - k - 1):
        k += 1

    return res_2[:(k - 1)]

# Pre-check
In [GENDER BIAS WITHOUT BORDERS](http://seejane.org/wp-content/uploads/gender-bias-without-borders-executive-summary.pdf), there is a ratio of 2.25 men for every women on screen (women = $30.9$%). Lets check our numbers.

In [None]:
total, males, females = 0, 0, 0
for _, d in data.items():
    G = d['graph']
    
    for i in G.nodes():
        if G.node[i]['gender'] == 'male':
            males += 1
        elif G.node[i]['gender'] == 'female':
            females += 1
        
        total += 1
        
print("total: {}".format(total))
print("males: {:.2f}%".format(float(males) / total))
print("females: {:.2f}%".format(float(females) / total))


What if we drop unknown?... shouldn't matter right?

In [None]:
print("males: {:.2f}%".format(float(males) / (males + females)))
print("females: {:.2f}%".format(float(females) / (males + females)))

# Graph analysis

## Centrality Measurements

### Calculate and save centralities

In [13]:
for script, d in data.items():
    G = d['graph']
    
    degree = nx.degree_centrality(G)
    nx.set_node_attributes(G, 'degree_cent', degree)
    
    betweenness = nx.betweenness_centrality(G)
    nx.set_node_attributes(G, 'betweenness_cent', betweenness)
    
    #eigen = nx.eigenvector_centrality(G)
    #nx.set_node_attributes(G, 'eigen_cent', eigen)
        
    # closeness = nx.closeness_centrality(G)
    # nx.set_node_attributes(G, 'closeness_cent', closeness)
        
    # pagerank = nx.pagerank(G)
    # nx.set_node_attributes(G, 'pagerank_cent', pagerank)
    
    

### Some examples

Most prominent women / most prominent men

In [None]:
def averageCents(node):
    return np.mean([node['degree_cent'], node['betweenness_cent'], node['closeness_cent'], node['pagerank_cent']])  
    
male_cents, female_cents = [], []
for script, d in data.items():
    G = d['graph']
    char_list = d['chars']
       
    male_cents.extend([(averageCents(G.node[i]), char_list[i], script) for i in G.nodes() if G.node[i]['gender'] == 'male'])
    female_cents.extend([(averageCents(G.node[i]), char_list[i], script) for i in G.nodes() if G.node[i]['gender'] == 'female'])

In [None]:
sorted(male_cents, key=lambda x: x[0], reverse=True)[0:10]

In [None]:
sorted(female_cents, key=lambda x: x[0], reverse=True)[0:10]

## By Gender

In [14]:
for cent in centrality_measures:
    print(cent)
    print(stats.describe([vals for script, d in data.items() for vals in nx.get_node_attributes(d['graph'], cent).values()]))

degree_cent
DescribeResult(nobs=15133, minmax=(0.0, 1.0), mean=0.39085344359028013, variance=0.060658846118447539, skewness=0.8413833310233241, kurtosis=-0.17330570803736434)
betweenness_cent
DescribeResult(nobs=15133, minmax=(0.0, 1.0), mean=0.045594842347982223, variance=0.0087036555743516143, skewness=3.5561931822032755, kurtosis=15.501383735343612)


In [20]:
tests = {}
for cent in centrality_measures:
    
    male_cent, female_cent = [], []
    
    for script, d in data.items():
        G = d['graph']    
        male_cent.extend([G.node[i][cent] for i in G.nodes() if G.node[i]['gender'] == 'male'])
        female_cent.extend([G.node[i][cent] for i in G.nodes() if G.node[i]['gender'] == 'female'])
    
    #Remove nans
    male_cent, female_cent = np.array(male_cent), np.array(female_cent)
    male_cent = male_cent[~np.isnan(male_cent)]
    female_cent = female_cent[~np.isnan(female_cent)]

    tests[cent] = stats.mannwhitneyu(male_cent, female_cent)

In [23]:
for cent in centrality_measures:
    male_cent, female_cent = [], []
    for script, d in data.items():
        G = d['graph']
        male_cent.extend([G.node[i][cent] for i in G.nodes() if G.node[i]['gender'] == 'male'])
        female_cent.extend([G.node[i][cent] for i in G.nodes() if G.node[i]['gender'] == 'female'])
    print(cent)
    print(len(male_cent), len(female_cent))
    print(np.mean(male_cent), np.mean(female_cent))

degree_cent
8270 3168
0.439224457857 0.449661867503
betweenness_cent
8270 3168
0.0588726277977 0.0512860384073


In [22]:
# Using Holm-Bonferroni method
holmBonferroni(tests.items())

[]

### Split by genre

In [24]:
centr_byGenre = []
for script, d in data.items():
    G = d['graph']
    genres = d['genres']
        
    for centr_t in centrality_measures:
        centr_v = nx.get_node_attributes(G, centr_t)
        genders = nx.get_node_attributes(G, 'gender')
        
        for k in genders:
            if k in centr_v:
                centr_byGenre.extend([(t, centr_t, genders[k], centr_v[k]) for t in genres])
            else:
                centr_byGenre.extend([(t, centr_t, genders[k], np.nan) for t in genres])
                
centr_byGenre = pd.DataFrame(centr_byGenre, columns = ["genre", "centrality", "gender", "value"])
centr_byGenre.head()
    

Unnamed: 0,genre,centrality,gender,value
0,Drama,degree_cent,male,1.0
1,Sport,degree_cent,male,1.0
2,Thriller,degree_cent,male,1.0
3,Drama,degree_cent,male,0.181818
4,Sport,degree_cent,male,0.181818


In [25]:
#Drop nan's
centr_byGenre = centr_byGenre[(centr_byGenre.genre!="Documentary")]
centr_byGenre = centr_byGenre[(centr_byGenre.genre!="Short")]
centr_byGenre = centr_byGenre[(centr_byGenre.genre!="Reality-TV")]

centr_byGenre = centr_byGenre[(centr_byGenre.gender!="unknown")]

centr_byGenre = centr_byGenre.dropna()

centr_byGenre.head()

Unnamed: 0,genre,centrality,gender,value
0,Drama,degree_cent,male,1.0
1,Sport,degree_cent,male,1.0
2,Thriller,degree_cent,male,1.0
3,Drama,degree_cent,male,0.181818
4,Sport,degree_cent,male,0.181818


In [28]:
res = centr_byGenre.groupby(["centrality", "genre"]).apply(lambda x: stats.mannwhitneyu(x.ix[x['gender'] == 'male', 'value'].values,
                                                                     x.ix[x['gender'] == 'female', 'value'].values))

__ There is a real difference in the centrality of characters in: __

In [40]:
centr_byGenre[["genre", "gender", "value"]].groupby(["genre", "gender"]).agg([len, sp.median])

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,len,median
genre,gender,Unnamed: 2_level_2,Unnamed: 3_level_2
Action,female,1368.0,0.16
Action,male,4746.0,0.166667
Adventure,female,924.0,0.16
Adventure,male,3154.0,0.163023
Animation,female,212.0,0.1875
Animation,male,554.0,0.157287
Biography,female,510.0,0.137848
Biography,male,1292.0,0.136601
Comedy,female,2106.0,0.16
Comedy,male,4608.0,0.166667


In [30]:
# Using Holm-Bonferroni's method
bygenre = res.unstack(level=0)
for centr_t in centrality_measures:
    
    tmp = zip(bygenre[centr_t].index, bygenre[centr_t].values)

    print(centr_t)
    for g, ttest in holmBonferroni(tmp):
        print(g, ttest)
    print()

degree_cent
Horror MannwhitneyuResult(statistic=144206.0, pvalue=0.00098616602154669359)

betweenness_cent



In [None]:
# t = centr_byGenre[centr_byGenre.genre.isin(['Drama', 'Comedy', 'Horror'])]
# t = t[t.centrality == "betweenness_cent"]
# bplt = sns.boxplot(x = 'genre', y = 'value', hue = 'gender', data = t)
# bplt.axes.set_ylim([0, .3]);

## By Race
I can either agg by race (e.g., caucassian males get more important roles) or I could agg races by gender (e.g., latino women are more demanded than latino men).

In [31]:
# Agg by race
centr_race_byGenre = []
for script, d in data.items():
    G = d['graph']
    genres = d['genres']
        
    for centr_t in centrality_measures:
        centr_v = nx.get_node_attributes(G, centr_t)
        genders = nx.get_node_attributes(G, 'gender')
        races = nx.get_node_attributes(G, 'race')
        
        for k in genders:
            if k in centr_v:
                centr_race_byGenre.extend([(t, centr_t, genders[k], races[k], centr_v[k]) for t in genres])
            else:
                centr_race_byGenre.extend([(t, centr_t, genders[k], races[k], np.nan) for t in genres])
                
centr_race_byGenre = pd.DataFrame(centr_race_byGenre, columns = ["genre", "centrality", "gender", "race", "value"])
centr_race_byGenre.head()

Unnamed: 0,genre,centrality,gender,race,value
0,Drama,degree_cent,male,caucasian,1.0
1,Sport,degree_cent,male,caucasian,1.0
2,Thriller,degree_cent,male,caucasian,1.0
3,Drama,degree_cent,male,caucasian,0.181818
4,Sport,degree_cent,male,caucasian,0.181818


In [32]:
# Fix others -> other
centr_race_byGenre.ix[centr_race_byGenre.race == "others", "race"] = "other"

In [42]:
centr_race_byGenre.groupby(["centrality", "race"]).agg([len, sp.median])

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,len,median
centrality,race,Unnamed: 2_level_2,Unnamed: 3_level_2
betweenness_cent,african,1570.0,0.026181
betweenness_cent,asianindian,102.0,0.029853
betweenness_cent,caucasian,17878.0,0.031528
betweenness_cent,eastasian,212.0,0.02374
betweenness_cent,latino,412.0,0.02358
betweenness_cent,mixed,1256.0,0.029609
betweenness_cent,nativeamerican,53.0,0.006349
betweenness_cent,other,53.0,0.022894
betweenness_cent,pacificislander,11.0,0.088603
betweenness_cent,unknown,23628.0,0.00262


__ ~~ANOVA~~  Kruskal-Wallis was moved to R __

In [34]:
# Save to use in R
centr_race_byGenre.to_csv("../data/R/aggByRace.csv", index = False)

## By Age

In [35]:
# WHO THE FUCK HAD >!00 YRS!?
for script, d in data.items():
    G = d['graph']
    char_list = d['chars']
    
    for n in G.nodes():
        if G.node[n]['age'] > 100:
            print(script, char_list[n], G.node[n]['age'])

In [36]:
# Agg by age

centr_age_byGenre = []
for script, d in data.items():
    G = d['graph']
    genres = d['genres']
        
    for centr_t in centrality_measures:
        centr_v = nx.get_node_attributes(G, centr_t)
        genders = nx.get_node_attributes(G, 'gender')
        ages = nx.get_node_attributes(G, 'age')
        
        for k in genders:
            if k in centr_v:
                centr_age_byGenre.extend([(t, centr_t, genders[k], ages[k], centr_v[k]) for t in genres])
            else:
                centr_age_byGenre.extend([(t, centr_t, genders[k], ages[k], np.nan) for t in genres])
                
centr_age_byGenre = pd.DataFrame(centr_age_byGenre, columns = ["genre", "centrality", "gender", "age", "value"])
centr_age_byGenre.head()

Unnamed: 0,genre,centrality,gender,age,value
0,Drama,degree_cent,male,36,1.0
1,Sport,degree_cent,male,36,1.0
2,Thriller,degree_cent,male,36,1.0
3,Drama,degree_cent,male,36,0.181818
4,Sport,degree_cent,male,36,0.181818


In [37]:
# Drop the guy with more than 100 yrs
centr_age_byGenre = centr_age_byGenre[centr_age_byGenre.age < 100]

__ Analysis moved to R __

In [38]:
centr_age_byGenre.to_csv("../data/R/aggByAgeGender.csv", index=False)