# Load 

In [1]:
import pandas as pd
import networkx as nx
from collections import Counter, defaultdict
import numpy as np
from scipy.stats import mannwhitneyu
from itertools import combinations, permutations
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

In [2]:
G = nx.read_gpickle("data/G_analysisready_03.pkl")

In [3]:
performers = [x for x in G.nodes() if G.node[x]['bipartite'] == 0]
fp = []

for p in performers:
    if 'a_pcat' in G.node[p]:
        if G.node[p]['a_pcat'] in ['f_gay', 'f_straight']:
            if 'unknown' not in p:
                fp.append(p)

# Table 1: Counts

In [4]:
total = len(fp)
out = [('all', total, 100, 0, 0, 0)]
out = pd.DataFrame(out, columns=['feature', 'count', '%total', '%subset', '%titles', '%titlegain'])

In [5]:
feature = 'b_ethnicity'
p_c = Counter()
for x in fp:
    f = G.node[x][feature]
    if pd.notnull(f):
        p_c.update([feature])
        if len(f) > 1:
            p_c.update(['multiple'])
        else:
            p_c.update(f)
        
t_c = defaultdict(list)
for x in fp:
    f = G.node[x][feature]
    if pd.notnull(f):
        nei = G.neighbors(x)
        t_c[feature] += list(set(nei))
        if len(f) > 1:
            t_c['multiple'] += list(set(nei))
        else:
            t_c[f[0]] += list(set(nei))

thisout = []
for k, v in p_c.most_common():
    this = [k, v, (v/total)*100, (v/p_c[feature])*100, (len(set(t_c[k]))/len(set(t_c[feature])))*100]
    this.append(this[-1] - this[-2])
    thisout.append(this)
thisout = pd.DataFrame(thisout, columns=['feature', 'count', '%total', '%subset', '%titles', '%titlegain'])
out = pd.concat([out, thisout])

In [6]:
feature = 'b_haircolor'
p_c = Counter()
for x in fp:
    f = G.node[x][feature]
    if pd.notnull(f):
        p_c.update([feature])
        if len(f) > 1:
            p_c.update(['multiple'])
        else:
            p_c.update(f)
        
t_c = defaultdict(list)
for x in fp:
    f = G.node[x][feature]
    if pd.notnull(f):
        nei = G.neighbors(x)
        t_c[feature] += list(set(nei))
        if len(f) > 1:
            t_c['multiple'] += list(set(nei))
        else:
            t_c[f[0]] += list(set(nei))

thisout = []
for k, v in p_c.most_common():
    this = [k, v, (v/total)*100, (v/p_c[feature])*100, (len(set(t_c[k]))/len(set(t_c[feature])))*100]
    this.append(this[-1] - this[-2])
    thisout.append(this)
thisout = pd.DataFrame(thisout, columns=['feature', 'count', '%total', '%subset', '%titles', '%titlegain'])
out = pd.concat([out, thisout])

In [7]:
feature = 'b_cup'
p_c = Counter()
for x in fp:
    f = G.node[x][feature]
    if pd.notnull(f):
        p_c.update([feature])
        p_c.update(f)

t_c = defaultdict(list)
for x in fp:
    f = G.node[x][feature]
    if pd.notnull(f):
        nei = G.neighbors(x)
        t_c[feature] += nei
        for y in f:
            t_c[y] += nei

thisout = []
for k, v in p_c.most_common():
    this = [k, v, (v/total)*100, (v/p_c[feature])*100, (len(set(t_c[k]))/len(set(t_c[feature])))*100]
    this.append(this[-1] - this[-2])
    thisout.append(this)
thisout = pd.DataFrame(thisout, columns=['feature', 'count', '%total', '%subset', '%titles', '%titlegain'])
thisout = thisout.sort_values(by='feature')
out = pd.concat([out, thisout])

In [8]:
out.round(2)

Unnamed: 0,feature,count,%total,%subset,%titles,%titlegain
0,all,46320,100.0,0.0,0.0,0.0
0,b_ethnicity,35368,76.36,100.0,100.0,0.0
1,caucasian,24806,53.55,70.14,87.45,17.31
2,black,4276,9.23,12.09,12.4,0.31
3,latin,3855,8.32,10.9,21.79,10.89
4,asian,1720,3.71,4.86,11.38,6.52
5,multiple,711,1.53,2.01,13.45,11.44
0,b_haircolor,34952,75.46,100.0,100.0,0.0
1,brown,11784,25.44,33.71,62.08,28.37
2,blond,10365,22.38,29.65,63.31,33.65


# Accolades

In [9]:
for x in fp:
    G.node[x]['b_awards'] = None
    out = []
    a = G.node[x]['awards']
    if pd.notnull(a):
        for v in a.values():
            for b in v:
                if 'Nominee' in b:
                    out.append('nominated')
                if 'Winner' in b:
                    out.append('won')
    if out:
        G.node[x]['b_awards'] = out

In [10]:
with_acc = [x for x in fp if G.node[x]['b_awards']]
with_acc = list(set(with_acc))
with_win = []
for x in with_acc:
    if 'won' in G.node[x]['b_awards']:
        with_win.append(x)
with_win = list(set(with_win))
with_nom = []
for x in with_acc:
    if 'nominated' in G.node[x]['b_awards']:
        with_nom.append(x)
with_nom = list(set(with_nom))
print("with acc", len(with_acc), np.round((len(with_acc)/total)*100, 2))
print("with nom", len(with_nom), np.round((len(with_nom)/total)*100, 2))
print("with win", len(with_win), np.round((len(with_win)/total)*100, 2))

with acc 1430 3.09
with nom 1326 2.86
with win 348 0.75


In [11]:
t_c = defaultdict(list)

for x in fp:
    t_c['all'] += G.neighbors(x)
    f = G.node[x]['b_awards']
    if f:
        nei = G.neighbors(x)
        t_c['acc'] += nei
        if 'nominated' in f:
            t_c['nom'] += nei
        if 'won' in f:
            t_c['won'] += nei
for k, v in t_c.items():
    print(k, len(set(v)))
    
for k, v in t_c.items():
    n = len(set(v))
    d = len(set(t_c['all']))
    prop = n/d
    per = prop * 100
    print(k, np.round(per, 2))

won 31974
all 122269
nom 59561
acc 63747
won 26.15
all 100.0
nom 48.71
acc 52.14


# Table 2: Averages

In [12]:
for f in ['proc_height', 'proc_weight', 'proc_bust', 'proc_waist', 'proc_hip']:
    v = []
    for x in fp:
        if pd.notnull(G.node[x][f]):
            add = G.node[x][f]
            if f in ['proc_bust', 'proc_waist', 'proc_hip']:
                add = add * 2.54
            v.append(add)
    print(f, len(v), np.round(len(v)/total*100, 2), np.round(np.mean(v), 2), np.round(np.std(v), 2))
    

proc_height 9771 21.09 165.23 6.87
proc_weight 7769 16.77 54.59 8.58
proc_bust 8713 18.81 88.18 7.94
proc_waist 7746 16.72 64.45 7.08
proc_hip 7734 16.7 89.4 8.04


# Fig 1: Title Distribution; Ethnicity

In [13]:
import powerlaw

Do we want "exclusive", or "has"?

In [14]:
d = defaultdict(list)
f = 'b_ethnicity'
for x in performers:
    deg = G.degree(x)
    #d['performers'].append(deg)
    if x in fp:
        d['fp'].append(deg)
        tf = G.node[x][f]
        if pd.notnull(tf):
            if len(tf) > 1:
                d['multiple'].append(deg)
            else:
                d[tf[0]].append(deg)

In [15]:
fit_d = {}
for k, v in d.items():
    fit_d[k] = powerlaw.Fit(v, discrete=True, xmin=1)

  (Theoretical_CDF * (1 - Theoretical_CDF))


In [16]:
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

In [17]:
plt.clf()
plt.cla()
ax = fit_d['fp'].plot_ccdf(color='k', linewidth=4, label='All')
fit_d['caucasian'].plot_ccdf(color='b', linewidth=2, linestyle='--', ax=ax, label='Caucasian')
fit_d['latin'].plot_ccdf(color='g', linewidth=2, linestyle='--', ax=ax, label='Latina')
fit_d['black'].plot_ccdf(color='c', linewidth=2, linestyle='--', ax=ax, label='Black')
fit_d['asian'].plot_ccdf(color='m', linewidth=2, linestyle='--', ax=ax, label='Asian')
fit_d['multiple'].plot_ccdf(color='y', linewidth=2, linestyle='--', ax=ax, label='Multiple')
ax.get_yaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(x, ",")))
ax.get_xaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ",")))
ax.set_xlabel(r"$x$: Number of Titles")
ax.set_ylabel(r"$p(X \geq x)$")
plt.legend()
fig = plt.gcf()
fig.set_size_inches(6.5, 4.5)
#plt.show()
plt.savefig('distribution_ccdf.png', dpi=1200)

# Truncated Power Law estimates

In [18]:
l = ['fp', 'caucasian', 'black', 'latin', 'asian', 'multiple']

for k in l:
    v = fit_d[k]
    print(k, np.round(v.truncated_power_law.parameter1, 4), np.round(v.truncated_power_law.parameter2, 4))

  (Theoretical_CDF * (1 - Theoretical_CDF))


fp 1.5008 0.0037
caucasian 1.2772 0.0062
black 1.4868 0.0077
latin 1.4856 0.0049
asian 1.5911 0.0022
multiple 1.1845 0.004


In [19]:
d = defaultdict(list)
f = 'b_haircolor'
for x in performers:
    deg = G.degree(x)
    #d['performers'].append(deg)
    if x in fp:
        tf = G.node[x][f]
        if pd.notnull(tf):
            if len(tf) > 1:
                d['multiple'].append(deg)
            else:
                d[tf[0]].append(deg)
fit_d = {}
for k, v in d.items():
    fit_d[k] = powerlaw.Fit(v, discrete=True, xmin=1)
l = ['brown', 'blond', 'black', 'red', 'multiple']

for k in l:
    v = fit_d[k]
    print(k, np.round(v.truncated_power_law.parameter1, 4), np.round(v.truncated_power_law.parameter2, 4))

  (Theoretical_CDF * (1 - Theoretical_CDF))


brown 1.4106 0.0061
blond 1.3074 0.0066
black 1.4974 0.0055
red 1.4076 0.0103
multiple 1.0 0.0046


In [20]:
d = defaultdict(list)
f = 'b_cup'
for x in performers:
    deg = G.degree(x)
    #d['performers'].append(deg)
    if x in fp:
        tf = G.node[x][f]
        if pd.notnull(tf):
            d['b_cup'].append(deg)
            d[tf[0]].append(deg)
fit_d = {}
for k, v in d.items():
    fit_d[k] = powerlaw.Fit(v, discrete=True, xmin=10)
l = ['A', 'B', 'C', 'D', 'E', '> E']

for k in l:
    v = fit_d[k]
    print(k, np.round(v.truncated_power_law.parameter1, 4), np.round(v.truncated_power_law.parameter2, 4))

  (Theoretical_CDF * (1 - Theoretical_CDF))


A 1.016 0.0082
B 1.0 0.0068
C 1.0 0.0055
D 1.0 0.0141
E 1.0 0.0055
> E 1.1019 0.0079


# Win/Nom comparison

In [21]:
top = set(with_acc)
bottom = set(fp) - top

In [22]:
f = 'b_ethnicity'
eths = set()
for x in fp:
    if pd.notnull(G.node[x][f]):
        eths.update(G.node[x][f])
eths

{'asian', 'black', 'caucasian', 'latin'}

In [63]:
from scipy.stats import chi2_contingency
from scipy.stats.contingency import margins

def stdres(observed, expected):
    n = observed.sum()
    rsum, csum = margins(observed)
    v = csum * rsum * (n - rsum) * (n - csum) / n**3
    return (observed - expected) / np.sqrt(v)

out = []

f = 'b_ethnicity'
eths = set()
for x in fp:
    if pd.notnull(G.node[x][f]):
        eths.update(G.node[x][f])

for eth in eths:
    m = np.zeros((2, 2))
    for x in top:
        this = G.node[x][f]
        if this:
            if eth in this:
                m[0, 0] += 1
            else:
                m[1, 0] += 1
                
    for x in bottom:
        this = G.node[x][f]
        if this:
            if eth in this:
                m[0, 1] += 1
            else:
                m[1, 1] += 1
    print(eth)
    print(m)
    N = m.sum().sum()
    chi2, p, dof, ex = chi2_contingency(m, correction=False)
    phi = (m[1, 1]*m[0, 0] - m[1, 0]*m[0, 1])/np.sqrt(m[1, :].sum() * m[0, :].sum() * m[:, 0].sum() * m[:, 1].sum())
    zm = m.copy()
    if m[0, 0] > m[1, 0]:
        zm = m[[1, 0]].copy()
    pr1 = zm[0, :].sum()/zm.sum().sum()
    pr2 = zm[1, :].sum()/zm.sum().sum()
    pc1 = zm[:, 0].sum()/zm.sum().sum()
    pc2 = zm[:, 1].sum()/zm.sum().sum()
    maxphi = np.sqrt((pr1 * (1-pc2))/(pc2 * (1-pr1)))
    if phi < 0:
        maxphi = -maxphi
    sr = stdres(m, ex)
    sr = sr[0, 0]
    oddsratio = (m[0,0]/m[0, 1])/(m[1, 0]/m[1, 1])
    lor = np.log(oddsratio)
    clef = (m[0,0]/m[0, :].sum()) * (m[1, 1]/m[1, :].sum())
    out.append([eth, chi2, p, phi, maxphi, sr, oddsratio, lor, clef])

caucasian
[[  1147.  24168.]
 [   225.   9828.]]
latin
[[   131.   4345.]
 [  1241.  29651.]]
asian
[[    62.   1777.]
 [  1310.  32219.]]
black
[[    99.   4359.]
 [  1273.  29637.]]


In [64]:
df = pd.DataFrame(out, columns=['category', 'chi2', 'p', 'phi', 'max_phi', 'adjusted_residual', 'oddsratio', 'logoddsratio', 'CLEF'])
df

Unnamed: 0,category,chi2,p,phi,max_phi,adjusted_residual,oddsratio,logoddsratio,CLEF
0,caucasian,101.443488,7.353160999999999e-24,0.053556,0.126597,10.071916,2.073029,0.729011,0.044295
1,latin,12.468575,0.0004138558,-0.018776,-0.076469,-3.531087,0.720359,-0.328005,0.028091
2,asian,1.341598,0.2467524,-0.006159,-0.047048,-1.158274,0.858114,-0.153018,0.032397
3,black,37.628273,8.559479e-10,-0.032618,-0.076293,-6.134189,0.528755,-0.637231,0.021293


In [59]:
f = 'b_haircolor'
eths = set()
for x in fp:
    if pd.notnull(G.node[x][f]):
        eths.update(G.node[x][f])
eths

out = []

for eth in eths:
    m = np.zeros((2, 2))
    for x in top:
        this = G.node[x][f]
        if this:
            if eth in this:
                m[0, 0] += 1
            else:
                m[1, 0] += 1
                
    for x in bottom:
        this = G.node[x][f]
        if this:
            if eth in this:
                m[0, 1] += 1
            else:
                m[1, 1] += 1
    print(eth)
    print(m)
    N = m.sum().sum()
    chi2, p, dof, ex = chi2_contingency(m, correction=False)
    phi = (m[1, 1]*m[0, 0] - m[1, 0]*m[0, 1])/np.sqrt(m[1, :].sum() * m[0, :].sum() * m[:, 0].sum() * m[:, 1].sum())
    zm = m.copy()
    if m[0, 0] > m[1, 0]:
        zm = m[[1, 0]].copy()
    pr1 = zm[0, :].sum()/zm.sum().sum()
    pr2 = zm[1, :].sum()/zm.sum().sum()
    pc1 = zm[:, 0].sum()/zm.sum().sum()
    pc2 = zm[:, 1].sum()/zm.sum().sum()
    maxphi = np.sqrt((pr1 * (1-pc2))/(pc2 * (1-pr1)))
    if phi < 0:
        maxphi = -maxphi
    sr = stdres(m, ex)
    sr = sr[0, 0]
    oddsratio = (m[0,0]/m[0, 1])/(m[1, 0]/m[1, 1])
    lor = np.log(oddsratio)
    clef = (m[0,0]/m[0, :].sum()) * (m[1, 1]/m[1, :].sum())
    out.append([eth, chi2, p, phi, maxphi, sr, oddsratio, lor, clef])

red
[[   194.   2798.]
 [  1215.  30745.]]
blond
[[   625.  11792.]
 [   784.  21751.]]
brown
[[   708.  14141.]
 [   701.  19402.]]
black
[[   350.   8532.]
 [  1059.  25011.]]


In [60]:
df = pd.DataFrame(out, columns=['category', 'chi2', 'p', 'phi', 'max_phi', 'adjusted_residual', 'oddsratio', 'logoddsratio', 'CLEF'])
df[['category', 'chi2', 'oddsratio', 'logoddsratio', 'p']]

Unnamed: 0,category,chi2,oddsratio,logoddsratio,p
0,red,50.88057,1.754495,0.562181,9.816048e-13
1,blond,49.99804,1.470468,0.385581,1.538996e-12
2,brown,36.223108,1.38574,0.326234,1.759715e-09
3,black,0.253152,0.968841,-0.031655,0.6148643


In [67]:
f = 'b_cup'
eths = set()
for x in fp:
    if pd.notnull(G.node[x][f]):
        eths.update(G.node[x][f])
eths

out = []

for eth in eths:
    m = np.zeros((2, 2))
    for x in top:
        this = G.node[x][f]
        if this:
            if eth in this:
                m[0, 0] += 1
            else:
                m[1, 0] += 1
                
    for x in bottom:
        this = G.node[x][f]
        if this:
            if eth in this:
                m[0, 1] += 1
            else:
                m[1, 1] += 1
    print(eth)
    print(m)
    N = m.sum().sum()
    chi2, p, dof, ex = chi2_contingency(m, correction=False)
    phi = (m[1, 1]*m[0, 0] - m[1, 0]*m[0, 1])/np.sqrt(m[1, :].sum() * m[0, :].sum() * m[:, 0].sum() * m[:, 1].sum())
    zm = m.copy()
    if m[0, 0] > m[1, 0]:
        zm = m[[1, 0]].copy()
    pr1 = zm[0, :].sum()/zm.sum().sum()
    pr2 = zm[1, :].sum()/zm.sum().sum()
    pc1 = zm[:, 0].sum()/zm.sum().sum()
    pc2 = zm[:, 1].sum()/zm.sum().sum()
    maxphi = np.sqrt((pr1 * (1-pc2))/(pc2 * (1-pr1)))
    if phi < 0:
        maxphi = -maxphi
    sr = stdres(m, ex)
    sr = sr[0, 0]
    oddsratio = (m[0,0]/m[0, 1])/(m[1, 0]/m[1, 1])
    lor = np.log(oddsratio)
    clef = (m[0,0]/m[0, :].sum()) * (m[1, 1]/m[1, :].sum())
    out.append([eth, chi2, p, phi, maxphi, sr, oddsratio, lor, clef])

A
[[  108.   766.]
 [ 1102.  6134.]]
C
[[  276.  1571.]
 [  934.  5329.]]
B
[[  329.  2157.]
 [  881.  4743.]]
D
[[  270.  1124.]
 [  940.  5776.]]
> E
[[   77.   423.]
 [ 1133.  6477.]]
E
[[  150.   859.]
 [ 1060.  6041.]]


In [69]:
df = pd.DataFrame(out, columns=['category', 'chi2', 'p', 'phi', 'max_phi', 'adjusted_residual', 'oddsratio', 'logoddsratio', 'CLEF'])
df[['category', 'chi2', 'oddsratio', 'logoddsratio', 'p']].sort_values(by='category')

Unnamed: 0,category,chi2,oddsratio,logoddsratio,p
4,> E,0.096775,1.040625,0.039822,0.7557345
0,A,5.068682,0.784797,-0.242331,0.0243619
2,B,8.02538,0.821151,-0.197048,0.004612634
1,C,0.001023,1.002378,0.002376,0.9744871
3,D,26.247009,1.476035,0.38936,3.004197e-07
5,E,0.002613,0.995179,-0.004833,0.9592335


## 1 vs. 1

In [25]:
fset = set(['caucasian', 'black', 'latin', 'asian', 'multiple'])
fset.add('multiple')
fset

{'asian', 'black', 'caucasian', 'latin', 'multiple'}

In [26]:
out=[]
for a, b in permutations(fset, 2):
    m = np.zeros((2, 2))
    for x in top:
        this = G.node[x][f]
        if pd.notnull(this):
            if len(this) == 1:
                if (a, ) == this:
                    m[0, 0] += 1
                elif (b, ) == this:
                    m[1, 0] += 1
            else:
                if a == 'multiple':
                    m[0, 0] += 1
                elif b == 'multiple':
                    m[1, 0] += 1

    for x in bottom:
        this = G.node[x][f]
        if pd.notnull(this):
            if len(this) == 1:
                if (a, ) == this:
                    m[0, 1] += 1
                elif (b, ) == this:
                    m[1, 1] += 1
            else:
                if a == 'multiple':
                    m[0, 1] += 1
                elif b == 'multiple':
                    m[1, 1] += 1
    thisn = m[0, :].sum()
    N = m.sum().sum()
    chi2, p, dof, ex = chi2_contingency(m, correction=False)
    phi = (m[1, 1]*m[0, 0] - m[1, 0]*m[0, 1])/np.sqrt(m[1, :].sum() * m[0, :].sum() * m[:, 0].sum() * m[:, 1].sum())
    zm = m.copy()
    if m[0, 0] > m[1, 0]:
        zm = m[[1, 0]].copy()
    pr1 = zm[0, :].sum()/zm.sum().sum()
    pr2 = zm[1, :].sum()/zm.sum().sum()
    pc1 = zm[:, 0].sum()/zm.sum().sum()
    pc2 = zm[:, 1].sum()/zm.sum().sum()
    maxphi = np.sqrt((pr1 * (1-pc2))/(pc2 * (1-pr1)))
    if phi < 0:
        maxphi = -maxphi
    sr = stdres(m, ex)
    sr = sr[0, 0]
    oddsratio = (m[0,0]/m[0, 1])/(m[1, 0]/m[1, 1])
    lor = np.log(oddsratio)
    clef = (m[0,0]/m[0, :].sum()) * (m[1, 1]/m[1, :].sum())
    out.append([(a, b), int(thisn), chi2, p, phi, maxphi, sr, oddsratio, lor, clef])

In [27]:
out = pd.DataFrame(out, columns=['comp', 'n', 'chi2', 'p', 'phi', 'maxphi', 'sr', 'oddsratio', 'lor', 'clef'])
out

Unnamed: 0,comp,n,chi2,p,phi,maxphi,sr,oddsratio,lor,clef
0,"(caucasian, multiple)",24806,33.630798,6.662938e-09,-0.036304,-0.036913,-5.799207,0.466416,-0.762677,0.040132
1,"(caucasian, latin)",24806,35.36398,2.734984e-09,0.035127,0.081868,5.946762,1.908347,0.646237,0.043061
2,"(caucasian, asian)",24806,18.062123,2.138129e-05,0.026094,0.055622,4.249956,1.988622,0.687442,0.043102
3,"(caucasian, black)",24806,56.135602,6.764113e-14,0.043935,0.085306,7.49237,2.302455,0.833976,0.043236
4,"(multiple, caucasian)",711,33.630798,6.662938e-09,0.036304,0.036913,5.799207,2.144009,0.762677,0.086044
5,"(multiple, latin)",711,80.724733,2.594544e-19,0.132964,0.080504,8.984694,4.091513,1.408915,0.087889
6,"(multiple, asian)",711,56.221764,6.474093e-14,0.152076,0.327157,7.498117,4.263623,1.450119,0.087973
7,"(multiple, black)",711,104.83317,1.3286449999999999e-24,0.144987,0.071313,10.238807,4.936483,1.596653,0.088246
8,"(latin, caucasian)",3855,35.36398,2.734984e-09,-0.035127,-0.081868,-5.946762,0.524014,-0.646237,0.022565
9,"(latin, multiple)",3855,80.724733,2.594544e-19,-0.132964,-0.080504,-8.984694,0.244408,-1.408915,0.021481


In [28]:
m = np.zeros((len(fset), len(fset)))
m = pd.DataFrame(m, columns=fset)
m.index = fset
for _, row in out.iterrows():
    a, b = row['comp'][0], row['comp'][1]
    m.loc[a, b] = row['lor']
m = m.loc[['caucasian', 'black', 'latin', 'asian', 'multiple'], ['caucasian', 'black', 'latin', 'asian', 'multiple']]
m.round(2)

Unnamed: 0,caucasian,black,latin,asian,multiple
caucasian,0.0,0.83,0.65,0.69,-0.76
black,-0.83,0.0,-0.19,-0.15,-1.6
latin,-0.65,0.19,0.0,0.04,-1.41
asian,-0.69,0.15,-0.04,0.0,-1.45
multiple,0.76,1.6,1.41,1.45,0.0


## hair color

In [29]:
f = 'b_haircolor'
fset = set()
for x in fp:
    if pd.notnull(G.node[x][f]):
        fset.update(G.node[x][f])
fset.add('multiple')
fset

{'black', 'blond', 'brown', 'multiple', 'red'}

In [30]:
out=[]
for a, b in permutations(fset, 2):
    m = np.zeros((2, 2))
    for x in top:
        this = G.node[x][f]
        if pd.notnull(this):
            if len(this) == 1:
                if (a, ) == this:
                    m[0, 0] += 1
                elif (b, ) == this:
                    m[1, 0] += 1
            else:
                if a == 'multiple':
                    m[0, 0] += 1
                elif b == 'multiple':
                    m[1, 0] += 1

    for x in bottom:
        this = G.node[x][f]
        if pd.notnull(this):
            if len(this) == 1:
                if (a, ) == this:
                    m[0, 1] += 1
                elif (b, ) == this:
                    m[1, 1] += 1
            else:
                if a == 'multiple':
                    m[0, 1] += 1
                elif b == 'multiple':
                    m[1, 1] += 1
    thisn = m[0, :].sum()
    N = m.sum().sum()
    chi2, p, dof, ex = chi2_contingency(m, correction=False)
    phi = (m[1, 1]*m[0, 0] - m[1, 0]*m[0, 1])/np.sqrt(m[1, :].sum() * m[0, :].sum() * m[:, 0].sum() * m[:, 1].sum())
    zm = m.copy()
    if m[0, 0] > m[1, 0]:
        zm = m[[1, 0]].copy()
    pr1 = zm[0, :].sum()/zm.sum().sum()
    pr2 = zm[1, :].sum()/zm.sum().sum()
    pc1 = zm[:, 0].sum()/zm.sum().sum()
    pc2 = zm[:, 1].sum()/zm.sum().sum()
    maxphi = np.sqrt((pr1 * (1-pc2))/(pc2 * (1-pr1)))
    if phi < 0:
        maxphi = -maxphi
    sr = stdres(m, ex)
    sr = sr[0, 0]
    oddsratio = (m[0,0]/m[0, 1])/(m[1, 0]/m[1, 1])
    lor = np.log(oddsratio)
    clef = (m[0,0]/m[0, :].sum()) * (m[1, 1]/m[1, :].sum())
    out.append([(a, b), int(thisn), chi2, p, phi, maxphi, sr, oddsratio, lor, clef])

In [31]:
out = pd.DataFrame(out, columns=['comp', 'n', 'chi2', 'p', 'phi', 'maxphi', 'sr', 'oddsratio', 'lor', 'clef'])

In [32]:
m = np.zeros((len(fset), len(fset)))
m = pd.DataFrame(m, columns=fset)
m.index = fset
for _, row in out.iterrows():
    a, b = row['comp'][0], row['comp'][1]
    s = np.round(row['lor'], 2)
    s = str(s)
    if row['p'] <= 0.0025:
        s += "*"
    m.loc[a, b] = s


In [33]:
m.loc[['brown', 'blond', 'black', 'red', 'multiple'], ['brown', 'blond', 'black', 'red', 'multiple']]

Unnamed: 0,brown,blond,black,red,multiple
brown,0,-0.18,0.27,-0.13,-1.25*
blond,0.18,0,0.45*,0.05,-1.07*
black,-0.27,-0.45*,0,-0.39,-1.51*
red,0.13,-0.05,0.39,0,-1.12*
multiple,1.25*,1.07*,1.51*,1.12*,0


In [34]:
f = 'b_cup'
fset = set()
for x in fp:
    if pd.notnull(G.node[x][f]):
        fset.update(G.node[x][f])
fset

{'> E', 'A', 'B', 'C', 'D', 'E'}

In [35]:
out=[]
for a, b in permutations(fset, 2):
    m = np.zeros((2, 2))
    for x in top:
        this = G.node[x][f]
        if pd.notnull(this):
            if len(this) == 1:
                if (a, ) == this:
                    m[0, 0] += 1
                elif (b, ) == this:
                    m[1, 0] += 1
            else:
                if a == 'multiple':
                    m[0, 0] += 1
                elif b == 'multiple':
                    m[1, 0] += 1

    for x in bottom:
        this = G.node[x][f]
        if pd.notnull(this):
            if len(this) == 1:
                if (a, ) == this:
                    m[0, 1] += 1
                elif (b, ) == this:
                    m[1, 1] += 1
            else:
                if a == 'multiple':
                    m[0, 1] += 1
                elif b == 'multiple':
                    m[1, 1] += 1
    thisn = m[0, :].sum()
    N = m.sum().sum()
    chi2, p, dof, ex = chi2_contingency(m, correction=False)
    phi = (m[1, 1]*m[0, 0] - m[1, 0]*m[0, 1])/np.sqrt(m[1, :].sum() * m[0, :].sum() * m[:, 0].sum() * m[:, 1].sum())
    zm = m.copy()
    if m[0, 0] > m[1, 0]:
        zm = m[[1, 0]].copy()
    pr1 = zm[0, :].sum()/zm.sum().sum()
    pr2 = zm[1, :].sum()/zm.sum().sum()
    pc1 = zm[:, 0].sum()/zm.sum().sum()
    pc2 = zm[:, 1].sum()/zm.sum().sum()
    maxphi = np.sqrt((pr1 * (1-pc2))/(pc2 * (1-pr1)))
    if phi < 0:
        maxphi = -maxphi
    sr = stdres(m, ex)
    sr = sr[0, 0]
    oddsratio = (m[0,0]/m[0, 1])/(m[1, 0]/m[1, 1])
    lor = np.log(oddsratio)
    clef = (m[0,0]/m[0, :].sum()) * (m[1, 1]/m[1, :].sum())
    out.append([(a, b), int(thisn), chi2, p, phi, maxphi, sr, oddsratio, lor, clef])

In [36]:
out = pd.DataFrame(out, columns=['comp', 'n', 'chi2', 'p', 'phi', 'maxphi', 'sr', 'oddsratio', 'lor', 'clef'])
out

Unnamed: 0,comp,n,chi2,p,phi,maxphi,sr,oddsratio,lor,clef
0,"(A, C)",874,3.273645,0.07040103,-0.034686,-0.278842,-1.809322,0.802532,-0.219984,0.105105
1,"(A, B)",874,0.439715,0.5072601,-0.01144,-0.229262,-0.66311,0.924377,-0.078635,0.107216
2,"(A, D)",874,19.015849,1.296372e-05,-0.091566,-0.354111,-4.360717,0.586945,-0.532824,0.099636
3,"(A, > E)",874,2.527693,0.1118636,-0.042891,-0.298349,-1.589872,0.774541,-0.255484,0.10454
4,"(A, E)",874,2.493785,0.1142966,-0.036392,-0.370846,-1.579172,0.807415,-0.213917,0.1052
5,"(C, A)",1847,3.273645,0.07040103,0.034686,0.278842,1.809322,1.246057,0.219984,0.130966
6,"(C, B)",1847,2.576499,0.1084613,0.024385,0.347234,1.605148,1.151827,0.141349,0.129656
7,"(C, D)",1847,11.106983,0.0008600327,-0.058541,-0.391034,-3.332714,0.731367,-0.31284,0.120489
8,"(C, > E)",1847,0.064268,0.799873,-0.005233,-0.218915,-0.253512,0.965123,-0.0355,0.126419
9,"(C, E)",1847,0.003044,0.9559992,0.001032,0.309467,0.055175,1.006085,0.006067,0.127217


In [37]:
m = np.zeros((len(fset), len(fset)))
m = pd.DataFrame(m, columns=fset)
m.index = fset
for _, row in out.iterrows():
    a, b = row['comp'][0], row['comp'][1]
    s = np.round(row['lor'], 2)
    m.loc[a, b] = s
m = m.loc[['A', 'B', 'C', 'D', 'E', '> E'], ['A', 'B', 'C', 'D', 'E', '> E']]
m

Unnamed: 0,A,B,C,D,E,> E
A,0.0,-0.08,-0.22,-0.53,-0.21,-0.26
B,0.08,0.0,-0.14,-0.45,-0.14,-0.18
C,0.22,0.14,0.0,-0.31,0.01,-0.04
D,0.53,0.45,0.31,0.0,0.32,0.28
E,0.21,0.14,-0.01,-0.32,0.0,-0.04
> E,0.26,0.18,0.04,-0.28,0.04,0.0


## Continuous, win/nom

In [38]:
from scipy.stats import ttest_ind
this = top.union(bottom)
for f in ['proc_height', 'proc_weight', 'proc_bust', 'proc_waist', 'proc_hip']:
    t = []
    b = []
    for x in this:
        if pd.notnull(G.node[x][f]):
            add = G.node[x][f]
            if f in ['proc_bust', 'proc_waist', 'proc_hip']:
                add = add * 2.54
            if x in top:
                t.append(add)
            elif x in bottom:
                b.append(add)
    print(f, np.round(np.mean(t), 2), np.round(np.mean(b), 2), np.round(ttest_ind(t, b, equal_var=False)[1], 4))

proc_height 164.89 165.28 0.056
proc_weight 53.81 54.73 0.0001
proc_bust 86.96 88.38 0.0
proc_waist 64.14 64.51 0.0847
proc_hip 89.48 89.38 0.7133


# Regression

Okay, we can use ridge or we can use statsmodels. If we use ridge, then we'll have to bootstrap the confidence intervals. If we use statsmodels, we can use the p-values presented. 

First and foremost, let's simply get the data (X) prepared. We'll consider bust, waist, hip, eth, hair, cup.

In [39]:
check = ['b_ethnicity', 'b_haircolor', 'b_cup']

for x in fp:
    this = G.node[x]
    for c in check:
        if pd.isnull(this[c]):
            this[c] = ()


In [40]:
out = []
y = []
for x in fp:
    data = G.node[x]
    vec = []
    vec.append(data['proc_bust'])
    vec.append(data['proc_waist'])
    vec.append(data['proc_hip'])
    vec.append(data['proc_height'])
    vec.append(data['proc_weight'])
    vec.append(data['b_ethnicity'])
    vec.append(data['b_haircolor'])
    vec.append(data['b_cup'])
    out.append(vec)
    y.append(data['degree_centrality'])
df = pd.DataFrame(out, columns=['bust', 'waist', 'hip', 'weight', 'height', 'ethnicity', 'haircolor', 'cup'])

In [41]:
from sklearn.preprocessing import MultiLabelBinarizer
X1 = df[['bust', 'waist', 'hip', 'height']]

mlb = MultiLabelBinarizer() 

X2 = mlb.fit_transform(df['ethnicity'])
ethnicity_labels = mlb.classes_
ethnicity_labels = ['eth_'+x for x in ethnicity_labels]
mlb = MultiLabelBinarizer() 
X3 = mlb.fit_transform(df['haircolor'])
haircolor_labels = mlb.classes_
haircolor_labels = ['hc_'+x for x in haircolor_labels]
mlb = MultiLabelBinarizer() 
X4 = mlb.fit_transform(df['cup'])
cup_labels = mlb.classes_

In [42]:
from sklearn.preprocessing import Imputer
imp = Imputer(strategy="mean")
X1 = imp.fit_transform(X1)
X1 = pd.DataFrame(X1, columns=['bust', 'waist', 'hip', 'height'])

In [43]:
X = np.hstack([X1, X2, X3, X4])

In [44]:
X = pd.DataFrame(X, columns = ['bust', 'waist', 'hip', 'height'] + list(ethnicity_labels) + list(haircolor_labels) + list(cup_labels))

In [45]:
y = np.log(y)

In [46]:
import statsmodels.api as sm
model = sm.OLS(y, sm.add_constant(X))
results = model.fit()
results.summary()

  from pandas.core import datetools


0,1,2,3
Dep. Variable:,y,R-squared:,0.41
Model:,OLS,Adj. R-squared:,0.41
Method:,Least Squares,F-statistic:,1789.0
Date:,"Fri, 18 May 2018",Prob (F-statistic):,0.0
Time:,19:33:11,Log-Likelihood:,-67542.0
No. Observations:,46320,AIC:,135100.0
Df Residuals:,46301,BIC:,135300.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.1718,0.161,7.265,0.000,0.856,1.488
bust,0.0189,0.004,4.278,0.000,0.010,0.027
waist,-0.0677,0.006,-11.688,0.000,-0.079,-0.056
hip,0.0170,0.005,3.312,0.001,0.007,0.027
height,-0.0090,0.002,-5.882,0.000,-0.012,-0.006
eth_asian,0.0519,0.027,1.894,0.058,-0.002,0.106
eth_black,0.1096,0.021,5.281,0.000,0.069,0.150
eth_caucasian,0.1078,0.016,6.835,0.000,0.077,0.139
eth_latin,-0.0182,0.020,-0.914,0.360,-0.057,0.021

0,1,2,3
Omnibus:,2765.079,Durbin-Watson:,2.008
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3513.172
Skew:,0.578,Prob(JB):,0.0
Kurtosis:,3.695,Cond. No.,2620.0


In [47]:
Xsub = X[['eth_asian', 'eth_caucasian', 'eth_black', 'eth_latin']]
#Xsub = X[['hc_black', 'hc_blond', 'hc_brown', 'hc_red']]
#Xsub = X[['A', 'B', 'C', 'D', 'E', '> E']]
model = sm.OLS(y, sm.add_constant(Xsub))
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.115
Model:,OLS,Adj. R-squared:,0.115
Method:,Least Squares,F-statistic:,1504.0
Date:,"Fri, 18 May 2018",Prob (F-statistic):,0.0
Time:,19:33:11,Log-Likelihood:,-76940.0
No. Observations:,46320,AIC:,153900.0
Df Residuals:,46315,BIC:,153900.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3691,0.012,31.175,0.000,0.346,0.392
eth_asian,0.7023,0.032,22.145,0.000,0.640,0.764
eth_caucasian,1.0935,0.014,77.437,0.000,1.066,1.121
eth_black,0.6662,0.022,29.961,0.000,0.623,0.710
eth_latin,0.6789,0.022,31.380,0.000,0.637,0.721

0,1,2,3
Omnibus:,6424.721,Durbin-Watson:,1.996
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9500.027
Skew:,1.038,Prob(JB):,0.0
Kurtosis:,3.782,Cond. No.,6.7


In [48]:
Xsub = X[['hc_black', 'hc_blond', 'hc_brown', 'hc_red']]
model = sm.OLS(y, sm.add_constant(Xsub))
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.204
Model:,OLS,Adj. R-squared:,0.204
Method:,Least Squares,F-statistic:,2974.0
Date:,"Fri, 18 May 2018",Prob (F-statistic):,0.0
Time:,19:33:11,Log-Likelihood:,-74474.0
No. Observations:,46320,AIC:,149000.0
Df Residuals:,46315,BIC:,149000.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.2349,0.010,23.489,0.000,0.215,0.254
hc_black,0.9054,0.015,59.005,0.000,0.875,0.936
hc_blond,1.1997,0.014,86.729,0.000,1.173,1.227
hc_brown,1.0386,0.013,80.085,0.000,1.013,1.064
hc_red,0.9485,0.023,41.168,0.000,0.903,0.994

0,1,2,3
Omnibus:,5876.229,Durbin-Watson:,2.006
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8473.031
Skew:,0.971,Prob(JB):,0.0
Kurtosis:,3.789,Cond. No.,4.71


In [49]:
Xsub = X[['A', 'B', 'C', 'D', 'E', '> E']]
model = sm.OLS(y, sm.add_constant(Xsub))
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.328
Model:,OLS,Adj. R-squared:,0.328
Method:,Least Squares,F-statistic:,3764.0
Date:,"Fri, 18 May 2018",Prob (F-statistic):,0.0
Time:,19:33:11,Log-Likelihood:,-70570.0
No. Observations:,46320,AIC:,141200.0
Df Residuals:,46313,BIC:,141200.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.7685,0.006,135.286,0.000,0.757,0.780
A,1.9072,0.038,50.210,0.000,1.833,1.982
B,2.0388,0.023,88.714,0.000,1.994,2.084
C,2.0615,0.026,77.930,0.000,2.010,2.113
D,2.2640,0.030,74.778,0.000,2.205,2.323
E,1.9921,0.035,56.253,0.000,1.923,2.062
> E,1.5474,0.050,30.960,0.000,1.449,1.645

0,1,2,3
Omnibus:,4150.319,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5494.925
Skew:,0.767,Prob(JB):,0.0
Kurtosis:,3.703,Cond. No.,9.73
