In [1]:
%matplotlib inline
from utils import read, createGraph, readGenders, functionals
from glob import iglob as glob
from os.path import exists, basename
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
import numpy as np
import scipy as sp
import pickle
import warnings
import community
import networkx as nx
import itertools
import scipy as sp 
import logging

# Default size (width, height in inches) for every matplotlib figure drawn below.
plt.rcParams["figure.figsize"] = (10, 8)

In [2]:
# Configure the root logger: timestamped messages, everything from DEBUG upwards.
logging.basicConfig(format = "%(asctime)-15s %(message)s", level = logging.DEBUG)

In [3]:
# Value passed as `threshold` to utils.read() when each script is loaded.
th = 2
# Glob pattern matching the per-script utterance files.
data_dir = "../data/utterances_with_charnames/*"
# Directory with the character-info file of each script (same basename as the script).
info_dir = "../data/character_info/"
# Glob pattern matching the per-script genre files.
genre_dir = "../data/genres/*"

## Read genres

In [4]:
# Build {script file name -> list of genre labels} from the genre files.
# The genres are taken from the 4th line of each file, which is expected to
# look like "<label>: Genre A, Genre B"; spaces inside a genre are removed.
genres = {}
for genre_path in glob(genre_dir):
    with open(genre_path) as inpt:
        try:
            # Skip the first three lines; the fourth one carries the genres.
            for _skipped in range(3):
                next(inpt)
            genre_list = next(inpt).strip().split(": ")[1]
        except (StopIteration, IndexError, UnicodeDecodeError):
            # Fewer than four lines, no ": " separator, or undecodable text:
            # skip the file as before, but leave a trace instead of failing silently.
            logging.warning("Could not parse genres from {}".format(basename(genre_path)))
            continue

    # Genre files are *.tsv while scripts are keyed by their *.txt name.
    script_name = basename(genre_path).replace('.tsv', '.txt')
    genres[script_name] = [genre.replace(' ', '') for genre in genre_list.split(",")]

## Read all scripts

In [5]:
# Read every script, build its graph with createGraph(), and store per script
# (keyed by file name) the graph, the character list and the genres read above.
data = {}
for script in glob(data_dir):
    key = basename(script)

    _, char_list, adj = read(script, threshold = th)

    # Gender/race annotations live in info_dir under the script's file name.
    gender_file = info_dir + key
    if exists(gender_file):
        gens, races = readGenders(gender_file)
    else:
        # No annotation file: every lookup falls back to 'unknown'.
        gens = defaultdict(lambda: 'unknown')
        races = defaultdict(lambda: 'unknown')

        # logging.warn is a deprecated alias of logging.warning.
        logging.warning("Info for {} not found".format(key))

    data[key] = {
        'graph': createGraph(char_list, adj, gens, races),
        'chars': char_list,
        'genres': genres.get(key, []),  # scripts without a genre entry get []
    }



In [6]:
# Tally the occurrences of each genre label over all scripts, show the tally,
# then keep only the distinct genre names for the per-genre analyses.
genre_tally = Counter(
    itertools.chain.from_iterable(entry['genres'] for entry in data.values())
)
print(genre_tally)
types = list(genre_tally.keys())

Counter({'Drama': 650, 'Thriller': 404, 'Comedy': 336, 'Action': 282, 'Crime': 273, 'Romance': 231, 'Adventure': 195, 'Sci-Fi': 178, 'Mystery': 171, 'Horror': 137, 'Fantasy': 125, 'Biography': 75, 'History': 71, 'Family': 53, 'War': 42, 'Animation': 37, 'Sport': 35, 'Music': 25, 'Musical': 19, 'Western': 19, 'Short': 9, 'Documentary': 4, 'Film-Noir': 4})


In [13]:
# Sanity check: list the node ids of one script's graph.
# Graph.nodes_iter() was removed in networkx 2.0; list(G.nodes()) returns the
# same list on both 1.x and 2.x+.
G = data['12_monkeys.txt']['graph']
list(G.nodes())

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29]

In [None]:
# 24 xkcd colour names, turned into a seaborn palette (used for the per-genre
# box plots) and into an endless iterator over that same palette.
xkcd_color_names = [
    "grass green", "sand", "blue", "light red", "cerulean",
    "red", "light blue", "teal", "orange", "light green",
    "magenta", "yellow", "sky blue", "grey", "cobalt",
    "grass", "algae green", "coral", "cerise", "steel",
    "hot purple", "mango", "pale lime", "rouge",
]
xkcd_colors = sns.xkcd_palette(xkcd_color_names)
colors = itertools.cycle(xkcd_colors)

# Pre-check
In [GENDER BIAS WITHOUT BORDERS](http://seejane.org/wp-content/uploads/gender-bias-without-borders-executive-summary.pdf), there is a ratio of 2.25 men for every woman on screen (women = $30.9$%). Let's check our numbers.

In [None]:
# Count the characters (graph nodes) of every script by annotated gender.
total, males, females = 0, 0, 0
for d in data.values():
    # nodes(data=True) yields (node, attribute dict) pairs on every networkx
    # release; the G.node mapping used previously was removed in networkx 2.4.
    gender_counts = Counter(attrs['gender'] for _node, attrs in d['graph'].nodes(data=True))

    males += gender_counts['male']
    females += gender_counts['female']
    total += sum(gender_counts.values())  # includes every other gender label

print("total: {}".format(total))
if total:
    # '{:.2%}' scales the fraction by 100, so 0.309 prints as "30.90%"
    # (the previous '{:.2f}%' printed the raw fraction, e.g. "0.31%").
    print("males: {:.2%}".format(float(males) / total))
    print("females: {:.2%}".format(float(females) / total))
else:
    print("no characters found")


What if we drop the characters whose gender is unknown? It shouldn't matter, right?

In [None]:
# Same shares, computed only over the characters whose gender is male or female.
# '{:.2%}' scales the fraction by 100 so the printed number really is a percentage.
known = males + females
print("males: {:.2%}".format(float(males) / known))
print("females: {:.2%}".format(float(females) / known))

# Graph analysis

# Degree centrality

#### Average degree

In [None]:
# np.mean([]) (a script with no male or no female character) returns nan and
# emits a RuntimeWarning; those nans are filtered out afterwards, so warnings
# are silenced from here on.
warnings.simplefilter("ignore")

# Per script: mean degree of the male nodes and mean degree of the female nodes.
male_deg, female_deg = [], []
for script, d in data.items():
    G = d['graph']
    degrees = nx.degree(G)

    # nodes(data=True) works on every networkx release, whereas the G.node
    # attribute mapping was removed in networkx 2.4.
    degrees_by_gender = defaultdict(list)
    for node, attrs in G.nodes(data=True):
        degrees_by_gender[attrs['gender']].append(degrees[node])

    male_deg.append(np.mean(degrees_by_gender['male']))
    female_deg.append(np.mean(degrees_by_gender['female']))

In [None]:
# Discard the nan entries (scripts where a gender had no character to average).
male_deg = np.array(male_deg)
male_deg = male_deg[np.logical_not(np.isnan(male_deg))]

female_deg = np.array(female_deg)
female_deg = female_deg[np.logical_not(np.isnan(female_deg))]

In [None]:
# Summary statistics of the node degree, pooled over every node of every script.
# dict(...) is needed because nx.degree() returns a DegreeView (no .values())
# since networkx 2.0; on 1.x it already returns a dict, so this is a no-op there.
all_degrees = [
    degree
    for d in data.values()
    for degree in dict(nx.degree(d['graph'])).values()
]
sp.stats.describe(all_degrees)

In [None]:
# Density curve (no histogram) of the per-script average degree, one per gender.
for gender_label, avg_degrees in (("male", male_deg), ("female", female_deg)):
    sns.distplot(avg_degrees, hist = False, label = gender_label)
plt.legend()
plt.xlabel("average degree");

There is no real difference in the number of connections between men and women.

In [None]:
# Independent two-sample t-test: per-script average degree, male vs. female.
sp.stats.ttest_ind(male_deg, female_deg)

### Based on Genre

In [None]:
# res[genre][gender] -> list with one value per script of that genre: the mean
# degree of the script's characters of that gender (nan when there are none).
res = {t: {'male': [], 'female': []} for t in types}
for script, d in data.items():
    script_genres = set(d['genres']).intersection(res)
    if not script_genres:
        continue

    # Degrees are computed once per script and reused for each of its genres.
    G = d['graph']
    degrees = nx.degree(G)

    # nodes(data=True) works on every networkx release (G.node was removed in 2.4).
    degrees_by_gender = defaultdict(list)
    for node, attrs in G.nodes(data=True):
        degrees_by_gender[attrs['gender']].append(degrees[node])

    male_mean = np.mean(degrees_by_gender['male'])
    female_mean = np.mean(degrees_by_gender['female'])

    for t in script_genres:
        res[t]['male'].append(male_mean)
        res[t]['female'].append(female_mean)

In [None]:
# Flatten res into long format: one (genre, gender, value) row per stored value.
tmp = pd.DataFrame(
    [(genre, gender, value)
     for genre, per_gender in res.items()
     for gender, values in per_gender.items()
     for value in values],
    columns=["genre", "gender", "value"],
)

# Exclude the Documentary, Short and Reality-TV rows.
tmp = tmp[~tmp.genre.isin(["Documentary", "Short", "Reality-TV"])]

tmp.head()

In [None]:
# Grid of density curves of "value": one column per genre, one row per gender.
g = sns.FacetGrid(tmp, col = "genre", row = "gender").map(sns.distplot, "value", hist=False)

In [None]:
# Per genre: t-test of the male vs. female per-script values held in tmp.
# Genres whose test is undefined (nan p-value, e.g. no data) are left out.
res = {}
for t in types:
    in_genre = tmp.genre == t

    # .loc replaces .ix (deprecated in pandas 0.20, removed in 1.0); dropna()
    # discards the nan averages of scripts lacking characters of that gender.
    male_deg = tmp.loc[in_genre & (tmp.gender == "male"), "value"].dropna().values
    female_deg = tmp.loc[in_genre & (tmp.gender == "female"), "value"].dropna().values

    ttest = sp.stats.ttest_ind(male_deg, female_deg)

    if not np.isnan(ttest.pvalue):
        res[t] = ttest

res;

__There are no significant differences between the degrees based on gender.__

In [None]:
# Benjamini-Hochberg procedure at a false discovery rate of 0.05.
alpha = 0.05
res_2 = sorted(res.items(), key = lambda x: x[1].pvalue)
m = len(res_2)

# BH is a step-up rule: reject the hypotheses ranked 1..k, where k is the
# LARGEST rank with p_(k) <= k / m * alpha. Stopping at the first rank that
# fails the bound would miss later ranks that satisfy it.
n_rejected = 0
for rank, (genre, ttest) in enumerate(res_2, start = 1):
    if ttest.pvalue <= alpha * rank / m:
        n_rejected = rank

# Genres whose difference remains significant after the correction.
res_2[:n_rejected]


# Betweenness Centrality

#### Average centrality

In [None]:
# Per script: mean betweenness centrality of the male nodes and of the female
# nodes (nan when the script has no node of that gender).
male_cent, female_cent = [], []
for script, d in data.items():
    G = d['graph']
    cents = nx.betweenness_centrality(G)

    # nodes(data=True) works on every networkx release, whereas the G.node
    # attribute mapping was removed in networkx 2.4.
    cents_by_gender = defaultdict(list)
    for node, attrs in G.nodes(data=True):
        cents_by_gender[attrs['gender']].append(cents[node])

    male_cent.append(np.mean(cents_by_gender['male']))
    female_cent.append(np.mean(cents_by_gender['female']))

In [None]:
# Convert to arrays and discard the nan entries of each gender independently.
male_cent = np.array(male_cent)
female_cent = np.array(female_cent)

male_cent = male_cent[np.logical_not(np.isnan(male_cent))]
female_cent = female_cent[np.logical_not(np.isnan(female_cent))]

Just a quick note: the curves below show a probability _density_, not a probability, so the y-axis can exceed 1; it is the area under each curve that equals 1.

In [None]:
# Density curve (no histogram) of the per-script average centrality, one per gender.
for gender_label, avg_centralities in (("male", male_cent), ("female", female_cent)):
    sns.distplot(avg_centralities, hist = False, label = gender_label)
plt.legend()
plt.xlabel("average centrality");

In this case, there is a significant difference in the average centrality of male characters and female characters ($t(?) = 4.8805,\, p < 0.01$). In most of the scripts, males occupy a more central role than females.

In [None]:
# Independent two-sample t-test: per-script average centrality, male vs. female.
sp.stats.ttest_ind(male_cent, female_cent)

### Condition on Genre

In [None]:
# res[genre][gender] -> list with one value per script of that genre: the mean
# betweenness centrality of the script's characters of that gender.
res = {t: {'male': [], 'female': []} for t in types}
for script, d in data.items():
    script_genres = set(d['genres']).intersection(res)
    if not script_genres:
        continue

    # Betweenness centrality is expensive: compute it once per script and
    # reuse the result for each of the script's genres.
    G = d['graph']
    centralities = nx.betweenness_centrality(G)

    # nodes(data=True) works on every networkx release (G.node was removed in 2.4).
    cents_by_gender = defaultdict(list)
    for node, attrs in G.nodes(data=True):
        cents_by_gender[attrs['gender']].append(centralities[node])

    male_mean = np.mean(cents_by_gender['male'])
    female_mean = np.mean(cents_by_gender['female'])

    for t in script_genres:
        res[t]['male'].append(male_mean)
        res[t]['female'].append(female_mean)

In [None]:
# Long format: one (genre, gender, value) record per value stored in res.
records = [
    (genre, gender, value)
    for genre, per_gender in res.items()
    for gender, values in per_gender.items()
    for value in values
]
tmp = pd.DataFrame(records, columns=["genre", "gender", "value"])

# Exclude the Documentary, Short and Reality-TV rows.
excluded_genres = ["Documentary", "Short", "Reality-TV"]
tmp = tmp[~tmp.genre.isin(excluded_genres)]

tmp.head()

In [None]:
# Facet the "value" density curves by genre (columns) and gender (rows).
g = sns.FacetGrid(tmp, col = "genre", row = "gender").map(sns.distplot, "value", hist=False)

In [None]:
# Per genre: t-test of the male vs. female per-script average centralities in
# tmp. Genres whose test is undefined (nan p-value) are left out.
res = {}
for t in types:
    in_genre = tmp.genre == t

    # .loc replaces .ix (deprecated in pandas 0.20, removed in 1.0); dropna()
    # discards the nan averages of scripts lacking characters of that gender.
    male_centr = tmp.loc[in_genre & (tmp.gender == "male"), "value"].dropna().values
    female_centr = tmp.loc[in_genre & (tmp.gender == "female"), "value"].dropna().values

    ttest = sp.stats.ttest_ind(male_centr, female_centr)

    if not np.isnan(ttest.pvalue):
        res[t] = ttest

res;

__There is a real difference in the centrality of characters in:__

In [None]:
# Benjamini-Hochberg procedure at a false discovery rate of 0.05.
alpha = 0.05
res_2 = sorted(res.items(), key = lambda x: x[1].pvalue)
m = len(res_2)

# BH is a step-up rule: reject the hypotheses ranked 1..k, where k is the
# LARGEST rank with p_(k) <= k / m * alpha. Stopping at the first rank that
# fails the bound would miss later ranks that satisfy it.
n_rejected = 0
for rank, (genre, ttest) in enumerate(res_2, start = 1):
    if ttest.pvalue <= alpha * rank / m:
        n_rejected = rank

# Genres whose difference remains significant after the correction.
res_2[:n_rejected]

## Communities and Modularities

Extract the community structure and the modularity of each graph using the Louvain method, introduced by [Blondel et al.](https://arxiv.org/abs/0803.0476)

In [None]:
# Louvain partition of every script that has at least one genre.
# best_partition() is randomized, so each graph is partitioned exactly once and
# the result is shared by all genres of the script; re-running it per genre
# gave the same script different values under different genres.
# NOTE(review): no seed is set, so results still vary between notebook runs.
script_stats = {}
for script, d in data.items():
    if not d['genres']:
        continue

    G = d['graph']
    parts = community.best_partition(G)
    script_stats[script] = (len(set(parts.values())), community.modularity(parts, G))

# One row per (genre, script) pair, grouped by genre in the order of `types`.
community_rows, modularity_rows = [], []
for t in types:
    for script, d in data.items():
        if t not in d['genres']:
            continue

        n_communities, q = script_stats[script]
        community_rows.append((t, n_communities))
        modularity_rows.append((t, q))

communities = pd.DataFrame(community_rows, columns=["genre", "communities"])
modularities = pd.DataFrame(modularity_rows, columns=["genre", "modularity"])

In [None]:
# Summary statistics of the number of communities over all (genre, script) rows.
communities.communities.describe()

In [None]:
# Mean number of communities per genre, in ascending order.
# Selecting the numeric column and using the built-in mean() avoids
# apply(np.mean), which tries to average the string "genre" column as well and
# fails on current pandas.
communities.groupby("genre")[["communities"]].mean().sort_values(by="communities")

In [None]:
# Box plot of the number of communities per genre; genre labels are drawn
# vertically on the x axis.
axs = sns.boxplot(x = "genre", y = "communities", data = communities, palette=xkcd_colors)
plt.setp(axs.get_xticklabels(), rotation=90);

In [None]:
# Summary statistics of the modularity over all (genre, script) rows.
modularities.modularity.describe()

In [None]:
# Mean modularity per genre, in ascending order.
# Selecting the numeric column and using the built-in mean() avoids
# apply(np.mean), which tries to average the string "genre" column as well and
# fails on current pandas.
modularities.groupby("genre")[["modularity"]].mean().sort_values(by="modularity")

In [None]:
# Box plot of the modularity per genre; genre labels are drawn vertically on
# the x axis.
axs = sns.boxplot(x = "genre", y = "modularity", data = modularities, palette=xkcd_colors)
plt.setp(axs.get_xticklabels(), rotation=90);

## Can modularity predict genre?
The baselines are already incredibly high, so might not be much point in doing this.

In [None]:
# Majority-class baseline of each one-vs-rest genre problem: the share (in %)
# of the larger class among the rows of `modularities`.
tmp = []
for t in types:
    is_genre = modularities.genre == t
    n_pos = len(modularities[is_genre])
    n_neg = len(modularities[~is_genre])
    n_all = n_pos + n_neg

    base_score = 100*np.max([n_pos / n_all, n_neg / n_all])

    # Baseline is most popular class in genre
    print ("{} baseline: {:.2f}%".format(t, base_score))

    tmp.append(base_score)

# Mean and standard deviation of the baselines across genres.
np.mean(tmp), np.std(tmp)

Mayyyybeeeee for drama...

In [None]:
from sklearn.linear_model import LogisticRegression
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    # scikit-learn < 0.18 only ships the (since removed) cross_validation module.
    from sklearn.cross_validation import cross_val_score

# One-vs-rest logistic regression per genre with modularity as the only
# feature; prints the mean 10-fold cross-validated score as a percentage.
# NOTE(review): `modularities` has one row per (genre, script) pair, so a
# script with several genres also shows up among the negatives of each of its
# own genres - confirm this is intended.
for t in types:
    pos = modularities[modularities.genre == t].copy()
    neg = modularities[modularities.genre != t].copy()

    # Binary target: 1 for rows of the current genre, 0 for all other rows.
    pos['is_genre'] = 1
    neg['is_genre'] = 0

    labelled = pd.concat([pos, neg])

    # scikit-learn expects a 2-D feature matrix; Series.reshape no longer
    # exists in pandas, so reshape the underlying numpy array instead.
    X = labelled.modularity.values.reshape(-1, 1)
    Y = labelled.is_genre.values

    print("{} model: {:.2f}%".format(t, 100 * cross_val_score(LogisticRegression(), X, Y, cv = 10).mean()))