## Effect Size

### Effect size for a t-test

In [5]:
import math as ma
import pandas as pd
import scipy.stats as stats
import numpy as np
import statsmodels as sm
from statsmodels.stats import power as pwr

In [2]:
# Load the raw athlete data set.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file exists. Consider a configurable, relative data directory.
data = pd.read_csv('/Users/xingkaiwu/OneDrive - York University/MBAN Winter/MBAN 6400/Datacamp Experiment Design/olyathswim.csv')

# Keep rows with a recorded Weight, Team 'United States', Sport in
# {Athletics, Swimming} and Sex 'M'.
# .copy() makes `athletes` an independent frame, so adding a column to it later
# does not trigger pandas' SettingWithCopyWarning.
athletes = data[
    data['Weight'].notna()
    & (data['Team'] == 'United States')
    & data['Sport'].isin(['Athletics', 'Swimming'])
    & (data['Sex'] == 'M')
].copy()

In [3]:
# Inputs to the power calculation
alpha = 0.05
power = 0.8

# Group 1 is Athletics; `ratio` is the Swimming count relative to the Athletics count
samp_size = len(athletes[athletes.Sport == "Athletics"])
ratio = len(athletes[athletes.Sport == "Swimming"]) / samp_size

# Leave effect_size as None so that solve_power solves for it
# given the power, sample sizes and alpha above
analysis = pwr.TTestIndPower()
esresult = analysis.solve_power(
    effect_size=None,
    power=power,
    nobs1=samp_size,
    ratio=ratio,
    alpha=alpha,
)
print(esresult)

0.12239214635469942


In [4]:
# Inputs to the power calculation: two equally sized groups of 300
alpha = 0.05
power = 0.8
ratio = 1
samp_size = 300

# Leave effect_size as None so that solve_power solves for it
analysis = pwr.TTestIndPower()
esresult = analysis.solve_power(
    effect_size=None,
    power=power,
    nobs1=samp_size,
    ratio=ratio,
    alpha=alpha,
)
print(esresult)

0.2291164925076639


### Computing Cohen's d

In [6]:
# Weight samples for each sport
athl = athletes.loc[athletes.Sport == 'Athletics', 'Weight']
swim = athletes.loc[athletes.Sport == 'Swimming', 'Weight']

# Mean difference, Swimming minus Athletics
diff = swim.mean() - athl.mean()

# Square root of the average of the two sample variances.
# NOTE(review): both variances get equal weight regardless of group size;
# confirm this is the intended pooling if the groups are unbalanced.
pooledstdev = ma.sqrt((athl.std() ** 2 + swim.std() ** 2) / 2)

# Cohen's d = mean difference expressed in units of the pooled standard deviation
cohend = diff / pooledstdev
print(cohend)

0.42144949362633766


### Effect size for a Fisher exact test

In [8]:
medal = ['Gold', 'Silver', 'Bronze']  # NOTE(review): not referenced in this cell

# Flag rows whose Medal entry is non-null.
# assign() builds a new frame rather than writing a column into a filtered slice
# of `data`, which avoids pandas' SettingWithCopyWarning; notna() is the
# vectorised equivalent of mapping pd.notnull over every value.
athletes = athletes.assign(MedalTF=athletes['Medal'].notna())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [9]:
# Contingency table: medal flag (rows) by sport (columns)
table = pd.crosstab(athletes['MedalTF'], athletes['Sport'])
print(table)

# Two-sided Fisher exact test on the table;
# chi[0] is reported below as the odds ratio and chi[1] as the p-value
chi = stats.fisher_exact(table, alternative='two-sided')

# p-value, rounded to 5 decimal places
print("p-value of test: " + str(round(chi[1], 5)))

# Odds ratio, rounded to 1 decimal place
print("Odds ratio between groups: " + str(round(chi[0], 1)))

Sport    Athletics  Swimming
MedalTF                     
False         1355       190
True           780       505
p-value of test: 0.0
Odds ratio between groups: 4.6


### Effect sizes for Pearson correlation

In [16]:
# Rebuild `athletes` from the raw data: rows with both Weight and Height recorded,
# Sport in {Athletics, Swimming} and Sex 'M' (no Team filter this time)
athletes = data[
    data['Weight'].notna()
    & data['Height'].notna()
    & data['Sport'].isin(['Athletics', 'Swimming'])
    & (data['Sex'] == 'M')
]

# Rows for each team whose Event name ends with '10,000 metres'
ken = athletes[(athletes['Team'] == 'Kenya') & athletes['Event'].str.endswith('10,000 metres')]
eth = athletes[(athletes['Team'] == 'Ethiopia') & athletes['Event'].str.endswith('10,000 metres')]

In [19]:
# Pearson correlation between Weight and Height for the Kenya subset
pearsonken = stats.pearsonr(ken['Weight'], ken['Height'])
print(pearsonken)

(0.745306450549595, 6.49711052150277e-07)


In [20]:
# Pearson correlation between Weight and Height for the Ethiopia subset
pearsoneth = stats.pearsonr(eth['Weight'], eth['Height'])
print(pearsoneth)

(0.37284506395004674, 0.03885112030532928)
