# Power Analysis
Investigate how much statistical power we have to detect results at various study sizes.

In [1]:
%matplotlib notebook
import scipy
import numpy
from IPython.display import display, HTML
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import re
import matplotlib.patches as mpatches
from scipy.cluster import hierarchy
import pylab

In [2]:
# Cohort number; it is embedded in the output directory path below.
COHORT = 1
OUTDIR = "../power_analysis/cohort{}/".format(COHORT)

In [3]:
import pandas

# Both tab-separated activity tables are parsed with the same options:
# first column as the index, tab delimiter, and no chunked dtype inference.
_TSV_OPTIONS = dict(index_col=0, sep="\t", low_memory=False)

full_activity = pandas.read_csv("../processed/activity_features_aggregate.txt", **_TSV_OPTIONS)
activity_summary = pandas.read_csv("../processed/activity_summary_aggregate.txt", **_TSV_OPTIONS)

# HDF5 tables
ukbb = pandas.read_hdf("../processed/ukbb_data_table.h5", low_memory=False)
full_mental_health = pandas.read_hdf("../processed/ukbb_mental_health.h5", low_memory=False)

In [4]:
# Remove the activity variables that we don't want to use:
# any column whose name matches one of these regular expressions is excluded.
bad_columns = ["_IV$", "_IS$", "^temp_", "^light_"]
good_columns = [
    column
    for column in full_activity.columns
    if not any(re.search(pattern, column) for pattern in bad_columns)
]
activity = full_activity[good_columns]

In [5]:
# Drop activity for people who fail basic QC.
# A record passes when it is well calibrated, has good wear time, and does not
# span a daylight-savings crossover (flags taken from the activity summary table).
okay = (
    activity_summary['quality-goodCalibration'].astype(bool)
    & (~activity_summary['quality-daylightSavingsCrossover'].astype(bool))
    & (activity_summary['quality-goodWearTime'].astype(bool))
)
activity.columns = activity.columns.str.replace("-","_") # Can't use special characters easily
# The mask is indexed like `activity_summary`, not `activity`. Align it explicitly
# instead of relying on pandas' implicit (and warned-about) reindexing of boolean
# keys. Rows of `activity` with no summary entry cannot be shown to pass QC and
# are therefore dropped.
activity = activity.loc[okay.reindex(activity.index, fill_value=False)]
print(f"Dropping {(~okay).sum()} entries out of {len(okay)} due to bad quality or wear-time")

Dropping 11363 entries out of 103688 due to bad quality or wear-time


  """


In [6]:
# Clean up column names for mental variables that contain special characters.
# The pattern is a regex character class, so `regex=True` must be passed explicitly:
# the default changed to literal matching in pandas 2.0, which would silently
# leave every column name untouched.
full_mental_health.columns = full_mental_health.columns.str.replace("[-',()&/:]", "_", regex=True) # Can't use special characters easily
full_mental_health.drop(columns=["date_of_mental_health_questionnaire",
                                 "birth_month",
                                 "actigraphy_file"],
                        inplace=True)

In [7]:
# Start from the QC-filtered activity table and attach the mental-health columns;
# the left join keeps every activity row.
data = activity.copy().join(full_mental_health, how="left")

# Covariates currently in use. Candidates that are switched off for now:
#   "ethnicity", "overall_health", "household_income", "smoking", "BMI",
#   'education_Prefer_not_to_answer' (this answer causes problems for some reason),
#   'education_None_of_the_above',
#   'education_College_or_University_degree',
#   'education_A_levels/AS_levels_or_equivalent',
#   'education_O_levels/GCSEs_or_equivalent',
#   'education_CSEs_or_equivalent',
#   'education_NVQ_or_HND_or_HNC_or_equivalent',
#   'education_Other_professional_qualifications_eg:_nursing,_teaching'
covariates = [
    "sex",
    "birth_year",
]
#covariates = ["BMI"]
#data = data.join(ukbb[covariates], how="inner")

print(f"Data starting size: {data.shape}")

Data starting size: (92325, 225)


In [8]:
# Many activity variables have a highly non-normal shape (long tails and skew).
# Each numeric column is mapped onto a standard normal: values are ranked
# (ties broken by order of appearance), the ranks are scaled into (0, 1) by
# dividing by (number of rows + 1), and the matching normal quantile is taken.
import scipy.stats

activity_norm = activity.select_dtypes("number")
rank_fraction = activity_norm.rank(method="first") / (len(activity_norm) + 1)
# isf(1 - q) is the standard-normal quantile at q
activity_norm.iloc[:,:] = scipy.stats.norm(0,1).isf(1 - rank_fraction)
# Overlapping column names from `activity_norm` receive the "_norm" suffix
data = data.join(activity_norm, rsuffix="_norm")

  cond1 = (0 < q) & (q < 1)
  cond1 = (0 < q) & (q < 1)


In [9]:
# Determine a basic set of cases and controls for selecting the samples from.

# Controls answered "No" to every one of these lifetime screening questions.
control = (
    (full_mental_health.ever_prolonged_depression == "No")
    & (full_mental_health.ever_prolonged_loss_of_interest == "No")
    & (full_mental_health.ever_extreme_irritability == "No")
    & (full_mental_health.ever_felt_worried_more_than_month == "No")
    & (full_mental_health.ever_mania == "No")
)

# Building blocks of the case definition.
more_than_two_periods = full_mental_health.number_depressed_periods > 2
depression_recurrent = (
    (full_mental_health.ever_prolonged_depression == "Yes") & more_than_two_periods
)
loss_of_interest_recurrent = (
    (full_mental_health.ever_prolonged_loss_of_interest == "Yes") & more_than_two_periods
)
worry_with_impact = (
    (full_mental_health.ever_worried_much_more == "Yes")
    & (full_mental_health.impact_normal_roles_worst_anxiety.isin(["A lot", "Somewhat"]))
)
self_harm_or_suicidality = (
    (full_mental_health.every_thought_life_not_worth_living == "Yes")
    | (full_mental_health.ever_self_harmed == "Yes")
    | (full_mental_health.ever_attempted_suicide == "Yes")
    | (full_mental_health.recent_thoughts_of_suicide == "Yes")
)

# A case meets at least one criterion and is never also a control.
case = (
    depression_recurrent
    | loss_of_interest_recurrent
    | worry_with_impact
    | self_harm_or_suicidality
) & (~control)
print(f"Found {sum(control)} controls and {sum(case)} cases total.")

case_control_flags = pandas.DataFrame({"control": control, "case": case})
data = data.join(case_control_flags, how='left')
print(f"And {sum(data.control)} controls and {sum(data.case)} cases with actigraphy")

Found 51257 controls and 44229 cases total.
And 21498 controls and 18110 cases with actigraphy


In [10]:
def BH_FDR(ps):
    ''' Benjamini-Hochberg FDR control

    Converts p values to q values.

    Parameters
    ----------
    ps : one-dimensional array-like of p-values (list, numpy array or pandas
        Series). NaN entries are allowed.

    Returns
    -------
    numpy.ndarray of q-values in the same order as `ps`. NaN p-values yield NaN
    q-values; they still count towards the number of tests, so the correction
    for the finite entries is unchanged by where the NaNs sit.
    '''

    # Work on a plain float array: lists have no `.shape`, and argsort on a
    # pandas Series with NaNs may mark missing entries with -1 instead of
    # sorting them last, which would corrupt the positional scatter below.
    p_values = numpy.asarray(ps, dtype=float)
    n_tests = len(p_values)

    # numpy sorts NaN to the end, so finite p-values get ranks 1..k
    sort_order = numpy.argsort(p_values)

    adjusted = numpy.zeros(p_values.shape)
    adjusted[sort_order] = p_values[sort_order] * n_tests / numpy.arange(1, n_tests + 1)

    # Make monotone (walking from largest to smallest p), skipping NaNs.
    # Starting the running minimum at 1 also caps every q-value at 1.
    running_min = 1
    for position in sort_order[::-1]:
        if numpy.isfinite(adjusted[position]):
            running_min = min(adjusted[position], running_min)
            adjusted[position] = running_min

    return adjusted # the q-values

## Define the tests to perform
Here we create a function that performs the tests we'll run on a given input dataset.
That way we can easily run the same tests on a variety of samples and sample sizes to determine power.

In [11]:
def perform_tests(data, control_expression, mental_health_vars=None):
    '''Run the association tests on one (sub)sample of the data.

    Relies on the notebook-level names `activity`, `activity_norm`,
    `full_mental_health` and `covariates`.

    Parameters
    ----------
    data : DataFrame holding the activity columns, their "_norm" counterparts,
        a boolean `case` column and the mental-health / covariate columns.
    control_expression : formula fragment with the covariates to adjust for,
        appended to the right-hand side of every model.
    mental_health_vars : optional iterable of mental-health column names to test.
        Defaults to every `full_mental_health` column present in `data`,
        excluding `covariates`.

    Returns
    -------
    (binarized_univariate, univariate, fits)
        binarized_univariate : DataFrame indexed by activity variable with
            columns "p", "coef" and "coef_std" for the case term.
        univariate : DataFrame indexed by (mental_health_var, activity_var) with
            columns "p" (F-test against the covariate-only model), "N" and
            "cond_num".
        fits : dict mapping (mental_health_var, activity_var) to the fitted
            full model.
    '''
    # Perform univariate association between the actigraphy traits and the binary case/control value
    binarized_univariate = {}
    for var in activity.columns:
        fit = smf.ols(f"Q('{var}') ~ case + {control_expression}", data=data).fit()
        case_coef = fit.params["case[T.True]"]
        binarized_univariate[var] = {
            "p": fit.pvalues["case[T.True]"],
            "coef": case_coef,
            # effect size in units of the trait's standard deviation within `data`
            "coef_std": case_coef / data[var].std()
        }
    binarized_univariate = pandas.DataFrame(binarized_univariate).T

    if mental_health_vars is None:
        mental_health_vars = full_mental_health.columns.intersection(data.columns).difference(covariates)

    # Perform univariate associations between a selection of actigraphy traits
    # and a selection of the mental health traits
    univariate = {}
    fits = {}
    for mental_health_var in mental_health_vars:
        observed = data[mental_health_var].notna()
        N_var = observed.sum()
        if N_var <= 20:
            print(f"Skipping {mental_health_var} due to low data")
            continue

        # The formula interface drops rows with a missing mental-health value from
        # the full model. Fit the reduced model on the same rows; otherwise the
        # F-test below would compare models estimated on different observations.
        observed_data = data[observed]

        for activity_var in (activity_norm.columns + "_norm"):
            try:
                fit = smf.ols(f"{activity_var} ~ Q('{mental_health_var}') + {control_expression}", data=observed_data).fit()
                fit2 = smf.ols(f"{activity_var} ~ {control_expression}", data=observed_data).fit()
            except ValueError:
                print(f"Skipping {mental_health_var}-{activity_var} associations due to missing data")
                continue
            if fit.condition_number > 1e4:# or fit.rsquared < 0:
                print(f"Skipping {mental_health_var} associations due to bad numerics")
                # give up on the remaining activity traits for this variable
                break
            f, p, df = fit.compare_f_test(fit2)

            univariate[(mental_health_var, activity_var)] = {
                "p": p,
                "N": N_var,
                "cond_num": fit.condition_number
            }

            fits[(mental_health_var, activity_var)] = fit

    univariate = pandas.DataFrame(univariate).T
    return binarized_univariate, univariate, fits

In [41]:
def sample(data, N_cases, N_controls, random_state=None):
    '''Draw a random case/control subsample of `data`.

    Parameters
    ----------
    data : DataFrame with boolean `case` and `control` columns.
    N_cases : number of rows to draw (without replacement) from the cases.
    N_controls : number of rows to draw (without replacement) from the controls.
    random_state : optional seed or random state forwarded to
        `DataFrame.sample` for both draws; None (the default) keeps the
        previous unseeded behaviour.

    Returns
    -------
    DataFrame with the sampled cases followed by the sampled controls.
    '''
    case_sample = data[data.case].sample(N_cases, random_state=random_state)
    control_sample = data[data.control].sample(N_controls, random_state=random_state)
    return pandas.concat([case_sample, control_sample])

# Sample sizes to evaluate. Each entry is (N_cases, N_controls), i.e. the
# positional argument order of sample().
Ns = [(100,25), (100,100), (200, 200), (300, 300), (400,400), (500, 500), (1000, 1000), (2000, 2000), (5000, 5000)]

# Seed numpy's global generator, which DataFrame.sample falls back to when no
# random_state is given, so the subsamples are reproducible across runs.
SAMPLING_SEED = 0
numpy.random.seed(SAMPLING_SEED)

results = {}
full_results = {}
for N in Ns:
    n_cases, n_controls = N

    results_binary_univariate, results_univariate, fits = perform_tests(sample(data, *N), "sex + standardize(birth_year)")

    # Bonferroni-style threshold: p times the number of tests must be below 0.05
    best_p_binary = results_binary_univariate.p.min()
    num_significant_binary = numpy.sum(results_binary_univariate.p*len(results_binary_univariate) < 0.05)

    best_p = results_univariate.p.min()
    num_significant = numpy.sum(results_univariate.p*len(results_univariate) < 0.05)

    results[N] = {
        # labels follow the (cases, controls) order used for sampling
        "N_control": n_controls,
        "N_case": n_cases,
        # case/control ("binarized") tests
        "best_p_binarized": best_p_binary,
        "num_significant_binarized": num_significant_binary,
        # individual mental-health variable tests
        "best_p": best_p,
        "num_significant": num_significant,
    }
    full_results[N] = (results_binary_univariate, results_univariate, fits)
results = pandas.DataFrame(results).T

Skipping actions_taken_following_self_harm_Prefer_not_to_answer associations due to bad numerics
Skipping activities_to_treat_anxiety_Prefer_not_to_answer associations due to bad numerics
Skipping activities_to_treat_depression_Prefer_not_to_answer associations due to bad numerics
Skipping assessment_center associations due to bad numerics
Skipping attempted_suicide_past_year associations due to bad numerics
Skipping avoided_activities_because_of_stressful_experience_past_month associations due to bad numerics
Skipping been_in_combat_war_zone associations due to bad numerics
Skipping been_in_series_accident associations due to bad numerics
Skipping behavior_misc_addictions_A_behaviour due to low data
Skipping behavior_misc_addictions_Prefer_not_to_answer due to low data
Skipping behavior_misc_addictions_Something_else_not_mentioned due to low data
Skipping belittlement_by_partner_as_adult associations due to bad numerics
Skipping contemplated_self_harm_last_year associations due to bad

ValueError: Length of values does not match length of index

In [42]:
# Attach Benjamini-Hochberg q-values to both result tables of every sample size
for binary, all_, fits in full_results.values():
    for table in (binary, all_):
        table['q'] = BH_FDR(table['p'])

In [74]:
# Case/control association table for the (500, 500) sample, smallest p first
full_results[(500, 500)][0].sort_values("p")

(53, 4)

In [71]:
# Per sample size, summarise the case/control table (first element of each
# full_results entry): its smallest q-value and how many traits have q < 0.1.
best_q = pandas.Series(
    {sizes: tables[0]["q"].min() for sizes, tables in full_results.items()},
    name="Best q",
)
n_significant = pandas.Series(
    {sizes: (tables[0]["q"] < 0.1).sum() for sizes, tables in full_results.items()},
    name="Num q < 0.1",
)

In [73]:
# One count per line, in sample-size order. Series.iteritems() was removed in
# pandas 2.0; items() is available in every pandas version.
print("\n".join([str(count) for _, count in n_significant.items()]))

0
0
2
1
12
30
35
40
