In [102]:
import pandas as pd
import networkx as nx
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import matplotlib
import statsmodels.api as sm
import numpy as np
from scipy.stats import chi2_contingency
matplotlib.style.use('ggplot')

# Load the graph used throughout this notebook from a pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load files from
# a trusted source. `nx.read_gpickle` is also unavailable in NetworkX >= 3.0
# — TODO confirm the NetworkX version this notebook is pinned to.
G = nx.read_gpickle("data/G_analysisready_03.pkl")

In [103]:
# checks: identifiers of a few individual performers
# (presumably node keys in G — TODO confirm).
sgrey = "_person.rme_perfid=sashagrey_gender=f_sasha-grey.htm"
nhartley = "_person.rme_perfid=hartley_gender=f_nina-hartley.htm"
bstarr = "_person.rme_perfid=bobbistar_gender=f_bobbi-starr.htm"
jfire = "_person.rme_perfid=jada_gender=f_jada-fire.htm"
vperidot = "_person.rme_perfid=virgoperidot_gender=f_virgo-peridot.htm"
rlinares = "_person.rme_perfid=rebecalinares_gender=f_rebeca-linares.htm"

# These three are written as slash-separated paths and converted to the same
# underscore-separated form as the identifiers above.
bolsen, acarrera, achrist = (
    path.replace("/", "_")
    for path in (
        "/person.rme/perfid=breeolson/gender=f/bree-olson.htm",
        "/person.rme/perfid=asiac/gender=f/asia-carrera.htm",
        "/person.rme/perfid=anastasia_eu_01/gender=f/anastasia-christ.htm",
    )
)

In [104]:
# Nodes on the `bipartite == 0` side of the graph.
performers = [node for node in G.nodes() if G.node[node]['bipartite'] == 0]

# Keep performers whose 'a_pcat' attribute is 'f_gay' or 'f_straight' and whose
# node id does not contain 'unknown'. Nodes without an 'a_pcat' attribute are
# dropped (`.get` yields None, which matches neither category).
fp = [
    p for p in performers
    if G.node[p].get('a_pcat') in ('f_gay', 'f_straight') and 'unknown' not in p
]

# 1. Find the cutoff

In [105]:
# Share of all performances that the most active performers may account for
# before the cutoff is reached.
CUTOFF = 0.8

# dict() keeps this working whether G.degree(fp) returns a plain dict or an
# iterable of (node, degree) pairs.
degree = dict(G.degree(fp))
degfreq = Counter(degree.values())  # degree -> number of performers with it
N = sum(degree.values())            # total of all degrees
n = 0                               # running total, most active performers first
t = 0                               # last degree seen before the running total passed CUTOFF

# Walk the degrees from highest to lowest. The order is made explicit with
# sorted(): Counter.most_common() orders by frequency, which only approximates
# degree order and is arbitrary among degrees that share a frequency.
for k, v in sorted(degfreq.items(), reverse=True):
    n += k * v
    if n > CUTOFF * N:
        break
    t = k
print("80/20 cutoff is {} performances.".format(t))

80/20 cutoff is 14 performances.


# 2. Divide into top and bottom

Key consideration: are we dividing into top and bottom BEFORE or AFTER filtering on category?

I think dividing before filtering is better.

In [106]:
# Split performers on the cutoff `t`: strictly above it is "top",
# at or below it is "bottom".
top, bottom = [], []
for performer, n_perf in degree.items():
    if n_perf > t:
        top.append(performer)
    elif n_perf <= t:
        bottom.append(performer)

# 3. Pick a category, get unique values.

In [107]:
# Gather every distinct label found under 'b_ethnicity' for the selected
# performers; null attribute values are ignored.
eths = set()
for performer in fp:
    labels = G.node[performer]['b_ethnicity']
    if pd.notnull(labels):
        eths.update(labels)
eths

{'asian', 'black', 'caucasian', 'latin'}

# 4. Calculate Statistics

In [108]:
from scipy.stats import chi2_contingency
from scipy.stats.contingency import margins

def stdres(observed, expected):
    """Return the adjusted residual of every cell of a contingency table.

    Each residual is ``(observed - expected)`` divided by the square root of
    ``row_total * col_total * (n - row_total) * (n - col_total) / n**3``,
    where ``n`` is the grand total of ``observed``.

    Parameters
    ----------
    observed : numpy.ndarray
        Table of observed counts.
    expected : numpy.ndarray
        Table of expected counts, same shape as ``observed``.

    Returns
    -------
    numpy.ndarray
        Array of residuals with the same shape as ``observed``.
    """
    total = observed.sum()
    # margins() returns one array of marginal sums per axis, shaped so that
    # they broadcast against each other and against the table.
    row_totals, col_totals = margins(observed)
    variance = col_totals * row_totals * (total - row_totals) * (total - col_totals) / total**3
    return (observed - expected) / np.sqrt(variance)

out = []

for eth in eths:
    # 2x2 contingency table for this label:
    #   rows    -> 0: label present, 1: label absent
    #   columns -> 0: top group,     1: bottom group
    # Performers whose 'b_ethnicity' is null are left out of the table.
    m = np.zeros((2, 2))
    for col, group in enumerate((top, bottom)):
        for x in group:
            this = G.node[x]['b_ethnicity']
            if pd.notnull(this):
                m[0 if eth in this else 1, col] += 1

    # Chi-square test with the continuity correction switched off.
    chi2, p, dof, ex = chi2_contingency(m, correction=False)

    # Phi coefficient: (ad - bc) / sqrt(product of the four marginal totals).
    phi = (m[1, 1]*m[0, 0] - m[1, 0]*m[0, 1])/np.sqrt(m[1, :].sum() * m[0, :].sum() * m[:, 0].sum() * m[:, 1].sum())

    # Bound on phi computed from marginal proportions; the rows are swapped
    # first when the present/top cell exceeds the absent/top cell, and the
    # result takes the sign of phi.
    # NOTE(review): confirm the row swap matches the intended max-phi formula.
    zm = m.copy()
    if m[0, 0] > m[1, 0]:
        zm = m[[1, 0]].copy()
    pr1 = zm[0, :].sum()/zm.sum().sum()
    pc2 = zm[:, 1].sum()/zm.sum().sum()
    maxphi = np.sqrt((pr1 * (1-pc2))/(pc2 * (1-pr1)))
    if phi < 0:
        maxphi = -maxphi

    # Adjusted residual of the present/top cell.
    sr = stdres(m, ex)[0, 0]

    oddsratio = (m[0,0]/m[0, 1])/(m[1, 0]/m[1, 1])
    lor = np.log(oddsratio)
    # Product of (present & top) / present and (absent & bottom) / absent.
    clef = (m[0,0]/m[0, :].sum()) * (m[1, 1]/m[1, :].sum())
    out.append([eth, chi2, p, phi, maxphi, sr, oddsratio, lor, clef])

In [109]:
# One row per label, in the order the statistics were appended to `out`.
df = pd.DataFrame(
    out,
    columns=['category', 'chi2', 'p', 'phi', 'max_phi',
             'adjusted_residual', 'oddsratio', 'logoddsratio', 'CLEF'],
)

In [110]:
df

Unnamed: 0,category,chi2,p,phi,max_phi,adjusted_residual,oddsratio,logoddsratio,CLEF
0,latin,43.728056,3.773237e-11,-0.035162,-0.184899,-6.612719,0.749366,-0.288528,0.124274
1,asian,11.678133,0.0006323896,-0.018171,-0.113761,-3.417328,0.801059,-0.221821,0.129521
2,caucasian,309.165481,3.3190950000000004e-69,0.093495,0.306105,17.583102,1.781721,0.57758,0.185679
3,black,137.888397,7.708977e-32,-0.062439,-0.184473,-11.742589,0.57736,-0.54929,0.101003


# Consolidate into a function

In [121]:
def effectstable(G, fp, top, bottom, category, exact=False):
    """Tabulate top-vs-bottom association statistics for each value of `category`.

    For every distinct label found under the node attribute `category`, a 2x2
    table is built (rows: label present / absent; columns: `top` / `bottom`)
    and summarised with chi-square and its p-value, the phi coefficient, a
    marginal bound on phi, the adjusted residual of the present/top cell, the
    odds ratio, its logarithm, and a product of row proportions reported as
    'CLEF'.

    Parameters
    ----------
    G : graph-like
        Object exposing ``G.node[x][category]`` for every node used here.
    fp : iterable
        Nodes whose attribute values define the set of labels to test.
    top, bottom : iterable
        The two groups of nodes being compared.
    category : hashable
        Key of the node attribute to analyse. Non-null attribute values must
        be iterables of labels (presumably tuples — TODO confirm).
    exact : bool, default False
        If False, a node counts as having a label when the label is contained
        in its attribute value. If True, only nodes whose attribute value is
        exactly the one-element tuple ``(label,)`` count.

    Returns
    -------
    pandas.DataFrame
        One row per label with columns 'category', 'n', 'chi2', 'p', 'phi',
        'max_phi', 'adjusted_residual', 'oddsratio', 'logoddsratio', 'CLEF'.
        'n' is the number of nodes (top and bottom) counted as having the label.

    Notes
    -----
    Nodes whose attribute value is null are excluded from every table.
    Relies on the notebook-level helper ``stdres``.
    """
    # Every distinct label carried by the reference population.
    vals = set()
    for x in fp:
        labels = G.node[x][category]
        if pd.notnull(labels):
            vals.update(labels)

    columns = ['category', 'n', 'chi2', 'p', 'phi', 'max_phi',
               'adjusted_residual', 'oddsratio', 'logoddsratio', 'CLEF']
    out = []

    for val in vals:
        # rows -> 0: label present, 1: absent; columns -> 0: top, 1: bottom
        m = np.zeros((2, 2))
        for col, group in enumerate((top, bottom)):
            for x in group:
                this = G.node[x][category]
                if pd.notnull(this):
                    present = ((val, ) == this) if exact else (val in this)
                    m[0 if present else 1, col] += 1

        thisn = m[0, :].sum()
        # Chi-square test with the continuity correction switched off.
        chi2, p, dof, ex = chi2_contingency(m, correction=False)
        # Phi coefficient: (ad - bc) / sqrt(product of the four marginal totals).
        phi = (m[1, 1]*m[0, 0] - m[1, 0]*m[0, 1])/np.sqrt(m[1, :].sum() * m[0, :].sum() * m[:, 0].sum() * m[:, 1].sum())

        # Bound on phi from marginal proportions; rows are swapped first when
        # the present/top cell exceeds the absent/top cell, and the result
        # takes the sign of phi.
        # NOTE(review): confirm the row swap matches the intended formula.
        zm = m.copy()
        if m[0, 0] > m[1, 0]:
            zm = m[[1, 0]].copy()
        pr1 = zm[0, :].sum()/zm.sum().sum()
        pc2 = zm[:, 1].sum()/zm.sum().sum()
        maxphi = np.sqrt((pr1 * (1-pc2))/(pc2 * (1-pr1)))
        if phi < 0:
            maxphi = -maxphi

        # Adjusted residual of the present/top cell.
        sr = stdres(m, ex)[0, 0]
        oddsratio = (m[0,0]/m[0, 1])/(m[1, 0]/m[1, 1])
        lor = np.log(oddsratio)
        # Product of (present & top) / present and (absent & bottom) / absent.
        clef = (m[0,0]/m[0, :].sum()) * (m[1, 1]/m[1, :].sum())
        out.append([val, int(thisn), chi2, p, phi, maxphi, sr, oddsratio, lor, clef])

    return pd.DataFrame(out, columns=columns)

In [122]:
# Statistics for 'b_ethnicity', ordered by ascending phi.
df = effectstable(G, fp, top, bottom, 'b_ethnicity').sort_values(by='phi')
df

Unnamed: 0,category,n,chi2,p,phi,max_phi,adjusted_residual,oddsratio,logoddsratio,CLEF
3,black,4458,137.888397,7.708977e-32,-0.062439,-0.184473,-11.742589,0.57736,-0.54929,0.101003
0,latin,4476,43.728056,3.773237e-11,-0.035162,-0.184899,-6.612719,0.749366,-0.288528,0.124274
1,asian,1839,11.678133,0.0006323896,-0.018171,-0.113761,-3.417328,0.801059,-0.221821,0.129521
2,caucasian,25315,309.165481,3.3190950000000004e-69,0.093495,0.306105,17.583102,1.781721,0.57758,0.185679


In [123]:
# Statistics for 'b_cup', ordered by ascending phi.
df = effectstable(G, fp, top, bottom, 'b_cup').sort_values(by='phi')
df

Unnamed: 0,category,n,chi2,p,phi,max_phi,adjusted_residual,oddsratio,logoddsratio,CLEF
4,> E,500,30.942171,2.658323e-08,-0.061768,-0.27998,-5.562569,0.597652,-0.514746,0.189992
2,A,874,7.259266,0.007053611,-0.029918,-0.379613,-2.694303,0.824601,-0.192856,0.225917
3,E,1009,0.448657,0.5029735,-0.007438,-0.411737,-0.669819,0.955822,-0.045184,0.242835
5,B,2486,1.190857,0.2751568,0.012118,0.72621,1.091264,1.054242,0.052822,0.254421
1,C,1847,1.654559,0.1983396,0.014283,0.593166,1.286297,1.070909,0.068508,0.256188
0,D,1394,12.839701,0.0003393418,0.039789,0.497634,3.583253,1.238029,0.213521,0.273201


In [124]:
# Statistics for 'b_haircolor', ordered by ascending phi.
df = effectstable(G, fp, top, bottom, 'b_haircolor').sort_values(by='phi')
df

Unnamed: 0,category,n,chi2,p,phi,max_phi,adjusted_residual,oddsratio,logoddsratio,CLEF
3,black,8882,0.509669,0.4752821,0.003819,0.287995,0.713911,1.022277,0.022032,0.159715
1,red,2992,114.634113,9.464389000000001e-27,0.057269,0.150966,10.706732,1.589279,0.463281,0.219059
0,brown,14849,146.481415,1.0188210000000001e-33,0.064737,0.424052,12.102951,1.386352,0.326676,0.186471
2,blond,12417,368.955719,3.159129e-82,0.102743,0.366252,19.20822,1.68684,0.522857,0.209208


In [136]:
def effectstable(G, fp, top, bottom, category, exact=True):
    """Tabulate top-vs-bottom association statistics for each value of `category`.

    For every distinct label found under the node attribute `category`, a 2x2
    table is built (rows: label present / absent; columns: `top` / `bottom`)
    and summarised with chi-square and its p-value, the phi coefficient, a
    marginal bound on phi, the adjusted residual of the present/top cell, the
    odds ratio, its logarithm, and a product of row proportions reported as
    'CLEF'.

    Parameters
    ----------
    G : graph-like
        Object exposing ``G.node[x][category]`` for every node used here.
    fp : iterable
        Nodes whose attribute values define the set of labels to test.
    top, bottom : iterable
        The two groups of nodes being compared.
    category : hashable
        Key of the node attribute to analyse. Non-null attribute values must
        be iterables of labels (presumably tuples — TODO confirm).
    exact : bool, default True
        If True, a node counts as having a label only when its attribute value
        is exactly the one-element tuple ``(label,)``, so nodes carrying
        several labels fall in the "absent" row. If False, containment
        (``label in value``) is used instead.

    Returns
    -------
    pandas.DataFrame
        One row per label with columns 'category', 'n', 'chi2', 'p', 'phi',
        'max_phi', 'adjusted_residual', 'oddsratio', 'logoddsratio', 'CLEF'.
        'n' is the number of nodes (top and bottom) counted as having the label.

    Notes
    -----
    Nodes whose attribute value is null are excluded from every table.
    Relies on the notebook-level helper ``stdres``.
    """
    # Every distinct label carried by the reference population.
    vals = set()
    for x in fp:
        labels = G.node[x][category]
        if pd.notnull(labels):
            vals.update(labels)

    columns = ['category', 'n', 'chi2', 'p', 'phi', 'max_phi',
               'adjusted_residual', 'oddsratio', 'logoddsratio', 'CLEF']
    out = []

    for val in vals:
        # rows -> 0: label present, 1: absent; columns -> 0: top, 1: bottom
        m = np.zeros((2, 2))
        for col, group in enumerate((top, bottom)):
            for x in group:
                this = G.node[x][category]
                if pd.notnull(this):
                    present = ((val, ) == this) if exact else (val in this)
                    m[0 if present else 1, col] += 1

        thisn = m[0, :].sum()
        # Chi-square test with the continuity correction switched off.
        chi2, p, dof, ex = chi2_contingency(m, correction=False)
        # Phi coefficient: (ad - bc) / sqrt(product of the four marginal totals).
        phi = (m[1, 1]*m[0, 0] - m[1, 0]*m[0, 1])/np.sqrt(m[1, :].sum() * m[0, :].sum() * m[:, 0].sum() * m[:, 1].sum())

        # Bound on phi from marginal proportions; rows are swapped first when
        # the present/top cell exceeds the absent/top cell, and the result
        # takes the sign of phi.
        # NOTE(review): confirm the row swap matches the intended formula.
        zm = m.copy()
        if m[0, 0] > m[1, 0]:
            zm = m[[1, 0]].copy()
        pr1 = zm[0, :].sum()/zm.sum().sum()
        pc2 = zm[:, 1].sum()/zm.sum().sum()
        maxphi = np.sqrt((pr1 * (1-pc2))/(pc2 * (1-pr1)))
        if phi < 0:
            maxphi = -maxphi

        # Adjusted residual of the present/top cell.
        sr = stdres(m, ex)[0, 0]
        oddsratio = (m[0,0]/m[0, 1])/(m[1, 0]/m[1, 1])
        lor = np.log(oddsratio)
        # Product of (present & top) / present and (absent & bottom) / absent.
        clef = (m[0,0]/m[0, :].sum()) * (m[1, 1]/m[1, :].sum())
        out.append([val, int(thisn), chi2, p, phi, maxphi, sr, oddsratio, lor, clef])

    return pd.DataFrame(out, columns=columns)

In [133]:
# NOTE(review): leftover debugging expression. `this` is only defined as a
# by-product of the top-level loops in the ethnicity statistics cell, so this
# cell depends on hidden state — consider deleting it.
this

In [137]:
# Statistics for 'b_ethnicity' again, now with the most recent definition of
# effectstable; ordered by ascending phi.
df = effectstable(G, fp, top, bottom, 'b_ethnicity').sort_values(by='phi')
df

Unnamed: 0,category,n,chi2,p,phi,max_phi,adjusted_residual,oddsratio,logoddsratio,CLEF
3,black,4276,154.30175,1.989688e-35,-0.066051,-0.180138,-12.421826,0.548315,-0.600905,0.096661
0,latin,3855,74.601771,5.7591520000000005e-18,-0.045927,-0.169894,-8.637232,0.658784,-0.41736,0.111827
1,asian,1720,37.502866,9.127875e-10,-0.032563,-0.109824,-6.123958,0.645364,-0.437941,0.108274
2,caucasian,24806,234.828266,5.277456e-53,0.081484,0.316961,15.324107,1.625949,0.486092,0.181764
