In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pymc as pm

# Switch on seaborn's default theme for the matplotlib figures drawn below.
sns.set()

In [None]:
# NOTE on naming: the two helpers defined below carry swapped names relative to
# the usual convention -- `logit_pure` evaluates exp(x) / (1 + exp(x)) and
# `logistic_pure` evaluates log(p / (1 - p)). The names were chosen that way
# to match the author's reading of McElreath's text.

# Placeholder coefficients; both names are rebound when the models are built.
alpha = beta = 1

# 20 evenly spaced points on [0, 1], and the 20 integers -10..9.
x = np.linspace(0, 1, num=20)
y = np.arange(-10, 10)

def logit_pure(x):
    """Map log-odds to probabilities: ``exp(x) / (1 + exp(x))``.

    Despite its name this is the logistic (inverse-logit) function; the name
    is kept because other cells call it.

    The value is computed as ``exp(-logaddexp(0, -x))``, which is algebraically
    identical to the quotient but cannot overflow. The plain quotient turns
    into ``inf / inf = nan`` as soon as ``exp(x)`` exceeds the float range
    (x larger than roughly 709).

    Parameters
    ----------
    x : scalar or array-like (numpy array, pandas Series, ...)
        Value(s) on the log-odds scale.

    Returns
    -------
    Element-wise probabilities in [0, 1]; numpy ufuncs are used throughout, so
    array-like inputs are handled element by element.
    """
    return np.exp(np.negative(np.logaddexp(0, np.negative(x))))

In [None]:
def logistic_pure(p):
    """Return the log-odds ``log(p / (1 - p))`` of the probability ``p``.

    Despite its name this is the logit function, i.e. the inverse of
    ``exp(x) / (1 + exp(x))``. ``p`` may be a scalar or a numpy array.
    """
    odds = p / (1 - p)
    return np.log(odds)

In [None]:
# Three stacked panels sharing the x axis:
#   top    - the raw values y
#   middle - y pushed through logit_pure (squashed into 0..1)
#   bottom - the middle curve mapped back with logistic_pure
fig, axes = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(18, 12))
top_panel, middle_panel, bottom_panel = axes

top_panel.plot(x, y)
middle_panel.plot(x, logit_pure(x=y))
bottom_panel.plot(x, logistic_pure(logit_pure(x=y)))

In [None]:
# Load the admissions table (semicolon-separated file).
df = pd.read_csv('UCBAdmit.csv', sep=';')

# Indicator column: 1 where applicant.gender equals 'male', 0 otherwise.
df['male'] = df['applicant.gender'].map(lambda gender: int(gender == 'male'))

#### discriminate
# Optional experiment, disabled: halve the admit counts of the female rows.
#mask = df['applicant.gender'] == 'female'
#df.loc[mask,'admit'] = df.loc[mask,'admit'] // 2
df

In [None]:
# Totals of admissions and applications per department and gender.
# margins=True adds an 'All' row and an 'All' column with the overall totals.
# The aggregation is named by the string 'sum': passing the builtin `sum` is
# deprecated in recent pandas releases (FutureWarning), although it currently
# resolves to the same group-wise sum.
pivot = pd.pivot_table(df, index='dept', columns='applicant.gender',
                       values=['admit', 'applications'],
                       aggfunc='sum', margins=True)

# Share of each department's applications coming from women / men.
pivot['applications_f'] = pivot[('applications','female')] / pivot[('applications','All')]
pivot['applications_m'] = pivot[('applications','male')] / pivot[('applications','All')]
# Share of all female (male) applications that went to each department.
pivot['f_choice'] = pivot[('applications','female')] / pivot.loc['All',('applications','female')]
pivot['m_choice'] = pivot[('applications','male')] / pivot.loc['All',('applications','male')]
# Admission rate per gender and overall, and whether women fare worse.
pivot['admit_pct_f'] = pivot[('admit','female')] / pivot[('applications','female')]
pivot['admit_pct_m'] = pivot[('admit','male')] / pivot[('applications','male')]
pivot['admit_pct_tot'] = pivot[('admit','All')] / pivot[('applications','All')]
pivot['female_less'] = pivot['admit_pct_f'] < pivot['admit_pct_m']

# Work on a copy of the raw counts so the columns derived below stay out of `pivot`.
raw_data = pivot[['admit','applications']].copy()
raw_data[('pct_appl_from','female')] = raw_data[('applications','female')] / raw_data[('applications','All')]
raw_data[('pct_appl_from','male')] = raw_data[('applications','male')] / raw_data[('applications','All')]

# "Fair" admissions: the seats of a department split in proportion to the
# applications each gender sent to it (left as floats, not rounded).
raw_data[('fair_admit','female')] = raw_data[('pct_appl_from','female')] * raw_data[('admit','All')]
raw_data[('fair_admit','male')] = raw_data[('pct_appl_from','male')] * raw_data[('admit','All')]

# Actual minus fair admissions, in absolute numbers ...
raw_data[('excess_admit','female')] = raw_data[('admit','female')] - raw_data[('fair_admit','female')]
raw_data[('excess_admit','male')] = raw_data[('admit','male')] - raw_data[('fair_admit','male')]

# ... as a fraction of the actual admissions ...
raw_data[('excess_pct','female')] = raw_data[('excess_admit','female')] / raw_data[('admit','female')]
raw_data[('excess_pct','male')] = raw_data[('excess_admit','male')] / raw_data[('admit','male')]

# ... and as a ratio actual / fair.
raw_data[('bias_factor','female')] = raw_data[('admit','female')] / raw_data[('fair_admit','female')]
raw_data[('bias_factor','male')] = raw_data[('admit','male')] / raw_data[('fair_admit','male')]

# Sum the excess over the department rows only; the label slice :'F' stops
# before the 'All' margin row.
excess_female_admissions = raw_data.loc[:'F',('excess_admit','female')].sum()
print ('excess female admissions',excess_female_admissions)
total_female_admissions = raw_data.loc['All',('admit','female')]
print ('total female admissions',total_female_admissions)

pct_excess_female_admissions = excess_female_admissions / total_female_admissions
print ('pct excess female admissions',pct_excess_female_admissions)

#### SUMMARY ###

# unbiased admittance : pct admitted == pct applied per dept. That is, if 11% of the applicants to A are women,
# then 11% of the available seats (0.11 * 601 == 69) should go to women.

# 15 of the women admitted should not have been admitted. An additional 15 men should have been admitted.
# of the 557 women admitted, 15 of them, 2.6%, should not have been admitted
# The number of men admitted, 1755, should have been 1770, that is, 0.85% larger
# NOTE(review): 1755 looks like the total number of admissions (women + men)
# rather than the men only -- check against raw_data.loc['All', 'admit'].

raw_data

In [None]:

# Keep only the derived ratio columns: drop the raw 'admit' / 'applications'
# groups, round-trip the index through a column, then strip the second column
# level so the remaining columns have plain names.
pivot = (
    pivot
    .drop(['admit', 'applications'], axis=1)
    .reset_index()
    .set_index('dept', drop=True)
)
pivot.columns = pivot.columns.droplevel(level=1)

pivot

In [None]:
# Bar chart of the female (red) and male (blue) admission rates per row of `pivot`.
admit_rates = pivot[['admit_pct_f', 'admit_pct_m']]
admit_rates.plot.bar(
    color=['red', 'blue'],
    rot=0,
    figsize=(18, 12),
    title="SImpson's Paradox UCB Admission",
)
plt.ylabel('percent of applicants admitted, by gender')

In [None]:
# model:
# admit ~ Binomial(applications, p)
# p = inverse-logit(alpha + beta * male), i.e. logit(p) = alpha + beta * male
# (the deterministic node below is named `logit` but evaluates exp(z) / (1 + exp(z)))

# The probability depends on gender only:
# for women male == 0, so the linear predictor is just alpha;
# for men beta * 1 is added to alpha before the transform.

# The transform maps the unbounded linear predictor into the interval 0..1;
# without it nothing would keep alpha + beta * male inside 0..1.

# Priors for intercept and male offset. 1 / 10 ** 2 == 0.01 -- presumably the
# precision (tau = 1 / sd**2) expected by PyMC 2's Normal, i.e. sd = 10.
# TODO confirm against the installed pymc version.
alpha = pm.Normal('alpha',0, 1 / 10 ** 2)
beta = pm.Normal('beta',0,1 / 10 ** 2)

# Gender indicator, one entry per row of df.
x = df['male']

# Admission probability per row of df; alpha, beta and x are bound to this
# node through the default arguments.
@pm.deterministic
def logit(alpha=alpha,beta=beta,x=x):
    return (np.exp(alpha+beta*x)) / (1 + np.exp(alpha+beta*x))

# Likelihood: observed admits out of the applications of each row.
lkh = pm.Binomial('lkh',n=df['applications'],p=logit,observed=True,value=df['admit'])

model = pm.Model([alpha,beta,logit,lkh])

mcmc = pm.MCMC(model)
# Positional arguments are presumably (iterations, burn-in, thinning) -- confirm
# with the PyMC 2 MCMC.sample documentation.
sample = mcmc.sample(50000,10000,2)

In [None]:
# Posterior draws of intercept and male offset.
post_alpha = mcmc.trace('alpha')[:]
post_beta = mcmc.trace('beta')[:]

result = pd.DataFrame({'post_alpha': post_alpha, 'post_beta': post_beta})

# Move each draw to the probability scale: women use alpha alone, men alpha + beta.
male_log_odds = result['post_alpha'] + result['post_beta']
result['male_p'] = logit_pure(male_log_odds)
result['female_p'] = logit_pure(result['post_alpha'])
# Difference of the two admission probabilities, draw by draw.
result['male_advantage'] = result['male_p'] - result['female_p']

print (result.head())
result.describe()



In [None]:
# Admit counts plotted against the 0/1 gender indicator.
plt.scatter(x=df['male'], y=df['admit'])

In [None]:
nr_rows = 100000
nr_applications = 4000

# Resample posterior draws (with replacement) and simulate, for each one, how
# many of nr_applications male / female applicants would be admitted.
rows = np.random.choice(result.index, size=nr_rows, replace=True)
sampled_draws = result.iloc[rows]

m_admitted = pm.rbinomial(n=nr_applications, p=sampled_draws.male_p, size=nr_rows)
f_admitted = pm.rbinomial(n=nr_applications, p=sampled_draws.female_p, size=nr_rows)

# Ratio of simulated male to female admissions.
male_advantage = m_admitted / f_admitted
print (male_advantage.mean()) # same as 0.44 / 0.30 above
print (m_admitted.mean() / nr_applications)
print (f_admitted.mean() / nr_applications)
plt.hist(male_advantage)

In [None]:
### Control for department: every department gets its own intercept alpha[dept_id],
# which fixes the admission probability of women in that department (on the log-odds scale);
# one common, university-wide offset beta is then added for men.

# model:
# admit ~ Binomial(applications, p)
# logit(p) = alpha[dept_id] + beta * male, i.e. p = inverse-logit(alpha[dept_id] + beta * male)
# alpha[dept_id] ~ Normal(0, 10)


def assign_dept_id(d):
    """Return the 1-based id of department letter ``d`` ('A' -> 1 ... 'F' -> 6).

    Any other value yields ``None``.
    """
    for position, letter in enumerate('ABCDEF', start=1):
        if d == letter:
            return position
    return None


# Numeric department id (1..6) for every row of df.
df['dept_id'] = df['dept'].apply(assign_dept_id)

# One intercept per department. 1 / 10 ** 2 == 0.01 is presumably the precision
# (tau = 1 / sd**2) of PyMC 2's Normal, i.e. sd = 10 -- TODO confirm.
alpha = pm.Normal('alpha',0, 1 / 10 ** 2,size=6) #six departments

# Single offset added for male applicants, shared by all departments.
beta = pm.Normal('beta',0,1 / 10 ** 2)

x = df['male']
dept_id = df['dept_id']


# Admission probability per row of df. The parents are bound through the
# default arguments; note that dept_id is shifted to 0..5 right there.
# Despite the name, the expression is exp(z) / (1 + exp(z)) (inverse logit).
@pm.deterministic
def logit_dept(dept_id=dept_id-1,alpha=alpha,beta=beta,x=x):
    
    # index for alpha : subtract 1 from dept_id that goes 1..6 to get zero-based index of python
    
    return (np.exp(alpha[dept_id]+beta*x)) / (1 + np.exp(alpha[dept_id]+beta*x))

# Likelihood: observed admits out of the applications of each row.
lkh_dept = pm.Binomial('lkh_dept',n=df['applications'],p=logit_dept,observed=True,value=df['admit'])

model = pm.Model([alpha,beta,logit_dept,lkh_dept])

mcmc = pm.MCMC(model)
# Positional arguments are presumably (iterations, burn-in, thinning) -- confirm
# with the PyMC 2 MCMC.sample documentation.
sample = mcmc.sample(50000,20000,2)

# Posterior draws of each department intercept (column i of the alpha trace) ...
alpha_post_0 = mcmc.trace('alpha')[:,0]
alpha_post_1 = mcmc.trace('alpha')[:,1]
alpha_post_2 = mcmc.trace('alpha')[:,2]
alpha_post_3 = mcmc.trace('alpha')[:,3]
alpha_post_4 = mcmc.trace('alpha')[:,4]
alpha_post_5 = mcmc.trace('alpha')[:,5]

# ... and of the male offset.
beta_post = mcmc.trace('beta')[:]

# All draws side by side, still on the log-odds scale.
result_dept = pd.DataFrame({'alpha_post_0' : alpha_post_0,
                          'alpha_post_1' : alpha_post_1,
                           'alpha_post_2': alpha_post_2,
                           'alpha_post_3': alpha_post_3,
                           'alpha_post_4' : alpha_post_4,
                           'alpha_post_5' : alpha_post_5,
                           'beta_post' : beta_post})



In [None]:
# Per department: admission probability for women (intercept only) and for
# men (intercept + beta), obtained by pushing the draws through logit_pure.
for dept_pos, dept_letter in enumerate('ABCDEF'):
    dept_intercept = result_dept['alpha_post_{}'.format(dept_pos)]
    result_dept['Dep_{}_F_p'.format(dept_letter)] = logit_pure(dept_intercept)
    result_dept['Dep_{}_M_p'.format(dept_letter)] = logit_pure(dept_intercept + result_dept['beta_post'])

# NOTE(review): this transforms beta alone, i.e. it is the male probability for
# an intercept of 0 -- confirm that "university wide" is the intended reading.
result_dept['M_p'] = logit_pure(result_dept['beta_post']) #University wide male probability

# Draws on the log-odds scale: the intercept columns plus beta.
result_dept_logit_scale = result_dept.loc[:, 'alpha_post_0':'beta_post']
print (result_dept_logit_scale.describe())

# Draws on the probability scale: everything after the first 7 columns,
# re-selected by name in a fixed order.
natural_scale_columns = [
    'Dep_A_F_p', 'Dep_A_M_p',
    'Dep_B_F_p', 'Dep_B_M_p',
    'Dep_C_F_p', 'Dep_C_M_p',
    'Dep_D_F_p', 'Dep_D_M_p',
    'Dep_E_F_p', 'Dep_E_M_p',
    'Dep_F_F_p', 'Dep_F_M_p',
    'M_p',
]
result_dept_natural_scale = result_dept.iloc[:, 7:]
result_dept_natural_scale = result_dept_natural_scale[natural_scale_columns]
result_dept_natural_scale.describe()

In [None]:
# Male minus female admission probability, draw by draw, one column per department.
advantage = pd.DataFrame({
    letter: (result_dept_natural_scale['Dep_{}_M_p'.format(letter)]
             - result_dept_natural_scale['Dep_{}_F_p'.format(letter)])
    for letter in 'ABCDEF'
})
advantage.describe(percentiles=[0.055,0.945])


In [None]:
# Simpsons paradox : UCB-admissions with fake data
# two departments
# 200 applicants, 100 men, 100 women

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pymc as pm

# Switch on seaborn's default theme again; this cell repeats the imports above it.
sns.set()

# Scenario in use -- biased: women are admitted at a rate about 7 percentage
# points above men in both departments, yet overall their rate is 22 points lower.
men = 100
women = 100
men_apply_a = 90
men_admit_a = 75
men_admit_b = 1
women_apply_a = 50
women_admit_a = 45
women_admit_b = 9

# Alternative scenarios (assign these values instead of the ones above to try them).
#
# perfectly equal
#   men_apply_a = 50
#   men_admit_a = 25
#   men_admit_b = 25
#   women_apply_a = 50
#   women_admit_a = 25
#   women_admit_b = 25
#
# equal, but with different preferences
#   men_apply_a = 80
#   men_admit_a = 40 # 40 %
#   men_admit_b = 2 # 10 %
#   women_apply_a = 20
#   women_admit_a = 10 # 50%
#   women_admit_b = 8 # 10%

# One row per department; whoever does not apply to A applies to B.
df = pd.DataFrame({'M_applied': [men_apply_a, men - men_apply_a],
                   'M_admit': [men_admit_a, men_admit_b],
                   'F_applied': [women_apply_a, women - women_apply_a],
                   'F_admit': [women_admit_a, women_admit_b]}, index=['A', 'B'])

df['tot_applied'] = df[['M_applied', 'F_applied']].sum(axis=1)
df['tot_admit'] = df[['M_admit', 'F_admit']].sum(axis=1)

# "Fair" admissions: a department's seats split in proportion to its applicants.
df['fair_admit_M'] = df['M_applied'] / df['tot_applied'] * df['tot_admit']
df['fair_admit_F'] = df['F_applied'] / df['tot_applied'] * df['tot_admit']

df['F_admit_pct'] = df['F_admit'] / df['F_applied']
df['M_admit_pct'] = df['M_admit'] / df['M_applied']

df['tot_admit_pct'] = df['tot_admit'] / df['tot_applied']

# Totals row. A whole row is added with .loc: .at is the scalar accessor and is
# not meant to take a slice. Only the count columns are summed; the other
# columns of the new row start as NaN and are filled in just below.
count_columns = ['M_applied', 'M_admit', 'F_applied', 'F_admit', 'tot_applied', 'tot_admit']
df.loc['All', :] = df[count_columns].sum()
df.loc['All', 'M_admit_pct'] = df.loc['All', 'M_admit'] / df.loc['All', 'M_applied']
df.loc['All', 'F_admit_pct'] = df.loc['All', 'F_admit'] / df.loc['All', 'F_applied']
df.loc['All', 'tot_admit_pct'] = (
    (df.loc['All', 'M_admit'] + df.loc['All', 'F_admit'])
    / (df.loc['All', 'M_applied'] + df.loc['All', 'F_applied'])
)

df.loc['All', 'fair_admit_M'] = df.loc['All', 'M_applied'] / df.loc['All', 'tot_applied'] * df.loc['All', 'tot_admit']
df.loc['All', 'fair_admit_F'] = df.loc['All', 'F_applied'] / df.loc['All', 'tot_applied'] * df.loc['All', 'tot_admit']

# Male minus female admission rate (negative = women admitted at a higher rate).
df['male_pct_points_diff'] = df['M_admit_pct'] - df['F_admit_pct']

#df['Male_advantage'] = df['M_admit_pct'] / df['tot_admit_pct']
#df['Female_advantage'] = df['F_admit_pct'] / df['tot_admit_pct']

print('unfair admitted women')
print (df['F_admit'] - df['fair_admit_F'])

print ('unfair admitted men')
print (df['M_admit'] - df['fair_admit_M'])

print ('ratio of unfair admiited women of admitted women')
print ( (df['F_admit'] - df['fair_admit_F']) / df['F_admit'])
df.index.name = 'Department'


df


In [None]:
# Bar chart of the female (red) and male (blue) admission rates per row of df,
# saved next to the notebook as a JPEG.
admit_rate_columns = df.loc[:, ['F_admit_pct', 'M_admit_pct']]
ax = admit_rate_columns.plot.bar(
    color=['red', 'blue'],
    rot=0,
    title='Simpsons Paradox Illustration',
    figsize=(18, 12),
)
ax.set_ylabel('ratio admitted/applied')
ax.set_xlabel('Department')
ax.get_figure().savefig('simpsons_paradox_fake_data.jpg', format='jpg')

In [None]:
# construct a dataframe a bit easier to pass to Bayesian Inference

def logit_pure(x):
    """Map log-odds to probabilities: ``exp(x) / (1 + exp(x))``.

    Despite its name this is the logistic (inverse-logit) function; the name
    is kept because other cells call it.

    The value is computed as ``exp(-logaddexp(0, -x))``, which is algebraically
    identical to the quotient but cannot overflow. The plain quotient turns
    into ``inf / inf = nan`` as soon as ``exp(x)`` exceeds the float range
    (x larger than roughly 709).

    Parameters
    ----------
    x : scalar or array-like (numpy array, pandas Series, ...)
        Value(s) on the log-odds scale.

    Returns
    -------
    Element-wise probabilities in [0, 1]; numpy ufuncs are used throughout, so
    array-like inputs are handled element by element.
    """
    return np.exp(np.negative(np.logaddexp(0, np.negative(x))))

def assign_dept_idx(d):
    """Return the 1-based index of department letter ``d`` ('A' -> 1, 'B' -> 2).

    Any other value yields ``None``.
    """
    for position, letter in enumerate('AB', start=1):
        if d == letter:
            return position
    return None
    
# Long format: one row per (department, gender) with applications and admits.
# Whoever does not apply to department A applies to B.
dept_gender_rows = [
    ('A', 1, men_apply_a, men_admit_a),
    ('A', 0, women_apply_a, women_admit_a),
    ('B', 1, men - men_apply_a, men_admit_b),
    ('B', 0, women - women_apply_a, women_admit_b),
]
data = pd.DataFrame(dept_gender_rows, columns=['dept', 'male', 'applied', 'admit'])

# 1-based numeric department index, used to pick the matching intercept.
data['dept_idx'] = data['dept'].apply(assign_dept_idx)
data

In [None]:
x = data['male']
dept_idx = data['dept_idx']

### Control for department: every department gets its own intercept alpha,
# which fixes the admission probability of women in that department (on the
# log-odds scale); one common offset beta is then added for men.

# model:
# admit ~ Binomial(applied, p)
# logit(p) = alpha[dept_idx] + beta * male, i.e. p = inverse-logit(alpha[dept_idx] + beta * male)
# alpha[dept_idx] ~ Normal(0, 10)

# One intercept per department present in `data`. dept_idx is 1-based, so its
# maximum is the number of intercepts needed. The previous hard-coded size=6
# ("six departments") came from the UCB model; this data has two departments,
# and the four extra intercepts were never touched by the likelihood.
n_departments = int(dept_idx.max())

# 1 / 10 ** 2 == 0.01 is presumably the precision (tau = 1 / sd**2) of
# PyMC 2's Normal, i.e. sd = 10 -- TODO confirm.
alpha = pm.Normal('alpha', 0, 1 / 10 ** 2, size=n_departments)

beta = pm.Normal('beta', 0, 1 / 10 ** 2)

# Admission probability per row of `data`. The parents are bound through the
# default arguments; dept_idx is shifted there from 1..2 to the zero-based
# positions used to index alpha.
@pm.deterministic
def logit_dept(dept_idx=dept_idx-1, alpha=alpha, beta=beta, x=x):
    return (np.exp(alpha[dept_idx]+beta*x)) / (1 + np.exp(alpha[dept_idx]+beta*x))

# Likelihood: observed admits out of the applications of each row.
lkh_dept = pm.Binomial('lkh_dept', n=data['applied'], p=logit_dept, observed=True, value=data['admit'])

model = pm.Model([alpha, beta, logit_dept, lkh_dept])

mcmc = pm.MCMC(model)
# Positional arguments are presumably (iterations, burn-in, thinning) -- confirm
# with the PyMC 2 MCMC.sample documentation.
sample = mcmc.sample(50000, 20000, 2)

In [None]:
# Posterior draws: columns 0 and 1 of the alpha trace are departments A and B.
# Note that `beta` is rebound here from the model variable to its trace.
alpha_A = mcmc.trace('alpha')[:,0]
alpha_B = mcmc.trace('alpha')[:,1]
beta = mcmc.trace('beta')[:]

result = pd.DataFrame({'alpha_A': alpha_A, 'alpha_B': alpha_B, 'beta': beta})

# Probability scale: women use the department intercept, men intercept + beta.
for dept_letter in ('A', 'B'):
    dept_intercept = result['alpha_' + dept_letter]
    result[dept_letter + '_female_p'] = logit_pure(dept_intercept)
    result[dept_letter + '_male_p'] = logit_pure(dept_intercept + result['beta'])

# Male minus female admission probability per department, draw by draw.
for dept_letter in ('A', 'B'):
    result['male_advantage_' + dept_letter] = (
        result[dept_letter + '_male_p'] - result[dept_letter + '_female_p']
    )
result.describe(percentiles=[0.055,0.945]).round(2)



In [None]:
# Batting averages of two players (A and B) over the two halves of a season.

df = pd.DataFrame(
    {'A_bats': [4, 40],
     'A_hits': [1, 15],
     'B_bats': [10, 5],
     'B_hits': [3, 2]},
    index=['1st_half', 'second_half'],
)

# Batting average per half = hits / at-bats.
for player in ('A', 'B'):
    df[player + '_batting_average'] = df[player + '_hits'] / df[player + '_bats']

# 'All' row: the counts are summed, while the two average columns receive the
# plain (unweighted) mean of the two half-season averages.
df.loc['All', :] = df.loc[:, 'A_bats':'B_hits'].sum()
df.loc['All', 'A_batting_average':'B_batting_average'] = (
    df.loc[:, 'A_batting_average':'B_batting_average'].mean()
)

# Season averages computed from the summed counts instead.
true_averages = pd.DataFrame(
    {player: [df.loc['All', player + '_hits'] / df.loc['All', player + '_bats']]
     for player in ('A', 'B')},
    index=['all_season_batting_average'],
)
print (true_averages)
df

In [None]:
def _at_bat_outcomes(bats, hits):
    """Return a 0/1 array with one entry per at-bat; the last `hits` entries are 1."""
    n_bats = int(bats)
    outcomes = np.zeros(n_bats)
    outcomes[n_bats - int(hits):] = 1
    return outcomes


def _running_average(outcomes):
    """Return the cumulative mean of `outcomes` after each entry."""
    return outcomes.cumsum() / np.arange(1, len(outcomes) + 1)


def _season_running_average(player):
    """Running batting average of `player` ('A' or 'B') over both halves of the season."""
    halves = [
        _at_bat_outcomes(df.loc[half, player + '_bats'], df.loc[half, player + '_hits'])
        for half in ('1st_half', 'second_half')
    ]
    return _running_average(np.concatenate(halves))


# Batting average after every at-bat of the season, first half followed by second half.
a_bats_1 = _season_running_average('A')
b_bats_1 = _season_running_average('B')

plt.figure(figsize=(18,12))
plt.title('Fooled by averages - Seasonal Baseball Batting Averages')
plt.plot(a_bats_1,'x--',label='Player A')
plt.plot(b_bats_1,'x--',label='Player B')

# Each plotted point is one at-bat, so the x axis counts at-bats rather than hits.
plt.xlabel('at bat number')
plt.ylabel('batting average')

plt.legend(loc='upper left')
plt.savefig('joe_baseball.jpg',format='jpg')

In [None]:
# Weighted means:
# wm = (nr_events_1 * value_event_1 + nr_events_2 * value_event_2) / (nr_events_1 + nr_events_2)

def _weighted_batting_average(player):
    """Average the two half-season batting averages of `player`, weighted by at-bats."""
    bats_column = player + '_bats'
    average_column = player + '_batting_average'
    first_bats = df.loc['1st_half', bats_column]
    second_bats = df.loc['second_half', bats_column]
    weighted_sum = (df.loc['1st_half', average_column] * first_bats
                    + second_bats * df.loc['second_half', average_column])
    return weighted_sum / (first_bats + second_bats)


wm_A = _weighted_batting_average('A')
wm_B = _weighted_batting_average('B')

print (wm_A)
print (wm_B)

In [None]:

# nothing to do with UCB!
# McElreath lec 13 2015 : rationale of the Exponential dist - y machines with n parts each : system failure day
# when mean component failure rate is once per period
# as n increases, the distribution becomes exponential

import scipy.stats as sps

systems = 100000
n = 10
# Upper bound handed to the uniform draw of each part's failure day.
mean_part_life_expectancy = 365

# A system fails on the day its first part fails: the minimum of n draws.
np.random.seed(4711)
failure_day = np.array([min(pm.rdiscrete_uniform(0,mean_part_life_expectancy,n)) for i in range(systems)])

plt.figure(figsize=(18,12))
plt.title('System fail day')
plt.xlabel('day number')
plt.ylabel('Probability of failed systems per day')
# The weights turn counts into fractions of all simulated systems.
_=plt.hist(failure_day,bins=365,weights=np.ones_like(failure_day) / len (failure_day))
print (failure_day.mean())
print (1 / failure_day.mean())

# Rate of the matching exponential distribution: 1 / mean failure day.
lambda_ = 1 / failure_day.mean()

# Exponential density with that rate. In scipy the rate enters through
# scale = 1 / lambda_; loc is a shift of the whole distribution and has to be 0
# (it used to be lambda_, which moved the curve and zeroed the density at day 0).
days = np.arange(365)
plt.plot(days, sps.expon.pdf(days, loc=0, scale=1 / lambda_))