# Blood Work Analysis
Investigate time-of-sampling data on the bloodwork to check if there are circadian changes and whether those associate with activity changes.

In [1]:
%matplotlib notebook
import os
import re

import matplotlib.patches as mpatches
import numpy
import pandas
import pylab
import scipy
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from IPython.display import display, HTML
from scipy.cluster import hierarchy

In [2]:
# Which cohort to analyse; used as the key into `cohort_id_ranges` when `data` is built.
COHORT = 1
# Directory that receives every figure saved by this notebook.
OUTDIR = f"../blood_work_timing/cohort{COHORT}/"
# `fig.savefig` does not create missing directories, so make sure the target exists up front.
os.makedirs(OUTDIR, exist_ok=True)

In [3]:
# Load the processed, tab-separated activity tables and the UK Biobank data table.
# The unnamed first column of the feature table is read as a string because it is
# later split on '.' into a participant ID and a run number.
full_activity = pandas.read_csv(
    "../processed/activity_features_aggregate_seasonal.txt",
    sep="\t",
    dtype={'Unnamed: 0': str},
)
activity_summary = pandas.read_csv(
    "../processed/activity_summary_aggregate.txt",
    sep="\t",
    index_col=0,
)
activity_summary_seasonal = pandas.read_csv(
    "../processed/activity_summary_aggregate_seasonal.txt",
    sep="\t",
    index_col=0,
)
# Expose the (integer-cast) index as an explicit ID column
activity_summary_seasonal = activity_summary_seasonal.assign(
    ID=activity_summary_seasonal.index.astype(int)
)
ukbb = pandas.read_hdf("../processed/ukbb_data_table.h5")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Each run_id looks like "<user id>.<run number>": run 0 is the original recording,
# runs 1-4 are the seasonal repeats. Only the original recordings are kept in `activity`.
full_activity = full_activity.rename(columns={"Unnamed: 0": "run_id"})
run_id_parts = full_activity.run_id.apply(lambda run_id: run_id.split('.'))
full_activity['id'] = run_id_parts.apply(lambda parts: int(parts[0]))
full_activity['run'] = run_id_parts.apply(lambda parts: int(parts[1]))
activity = full_activity[full_activity.run == 0].set_index('id')

In [5]:
# Replace ',', ':' and '/' in column names with '_' (special characters are awkward to use).
# The pattern is a regex character class, so `regex=True` must be explicit: pandas >= 2.0
# treats the pattern as a literal string by default, which would leave the names untouched.
ukbb.columns = ukbb.columns.str.replace("[,:/]", "_", regex=True)

In [6]:
# Drop activity records that fail basic QC. A record passes when it is well calibrated,
# has good wear time, and does not cross a daylight-savings change.
good_calibration = activity_summary['quality-goodCalibration'].astype(bool)
crosses_dst = activity_summary['quality-daylightSavingsCrossover'].astype(bool)
good_wear_time = activity_summary['quality-goodWearTime'].astype(bool)
okay = good_calibration & (~crosses_dst) & good_wear_time

# Look up each remaining participant's QC verdict by ID and keep the passing ones
passes_qc = activity.index.map(okay)
activity = activity[passes_qc]
activity.columns = activity.columns.str.replace("-","_") # Can't use special characters easily

n_dropped = (~okay).sum()
print(f"Dropping {n_dropped} entries out of {len(okay)} due to bad quality or wear-time")

Dropping 11363 entries out of 103688 due to bad quality or wear-time


In [7]:
# Covariate column names. 'education_Prefer_not_to_answer' is deliberately left out
# (it was noted to cause problems).
covariates = [
    "sex",
    "ethnicity",
    "overall_health",
    "household_income",
    "smoking",
    "birth_year",
    "BMI",
    'education_None_of_the_above',
    'education_College_or_University_degree',
    'education_A_levels_AS_levels_or_equivalent',
    'education_O_levels_GCSEs_or_equivalent',
    'education_CSEs_or_equivalent',
    'education_NVQ_or_HND_or_HNC_or_equivalent',
    'education_Other_professional_qualifications_eg__nursing__teaching',
]

# Participants with both QC-passing activity data and a UK Biobank record
data_full = activity.join(ukbb, how="inner")
print(f"Data starting size: {data_full.shape}")

# Down-sample for testing: shuffle all IDs with a fixed seed, then take the slice
# belonging to the configured cohort (cohort 1 = first 25000, cohort 2 = next 25000).
numpy.random.seed(0)
cohort_id_ranges = {
    1: slice(0, 25000),
    2: slice(25000, 50000),
}
shuffled_ids = numpy.random.choice(data_full.index, size=data_full.shape[0], replace=False)
selected_ids = shuffled_ids[cohort_id_ranges[COHORT]]
data = data_full.loc[selected_ids].copy()
print(f"Data size after selecting test set: {data.shape}")

Data starting size: (92325, 434)
Data size after selecting test set: (25000, 434)


In [8]:
# Q-value utility
def BH_FDR(ps):
    ''' Benjamini-Hochberg FDR control

    Converts p values to q values.

    Parameters
    ----------
    ps : 1-D array-like of float (list, numpy array or pandas Series)
        The p-values. NaN entries are allowed; they stay NaN in the output but
        are still counted in the number of tests, which is `len(ps)`.

    Returns
    -------
    numpy.ndarray
        The q-values, in the same order as `ps`, each at most 1.
    '''
    # Work on a plain float array. Calling numpy.argsort directly on a pandas Series
    # dispatches to Series.argsort, which marks NaNs as -1 in some pandas versions and
    # would then index the wrong element; numpy.argsort on an array sorts NaNs last.
    ps = numpy.asarray(ps, dtype=float)
    n_tests = len(ps)
    sort_order = numpy.argsort(ps)

    # Raw BH adjustment: p * n / rank, written back in the original order
    adjusted = numpy.zeros(ps.shape)
    adjusted[sort_order] = ps[sort_order] * n_tests / numpy.arange(1, n_tests + 1)

    # Make monotone, skipping NaNs: walk from the largest p-value down, carrying the
    # running minimum (starting at 1, which also caps the q-values at 1).
    running_min = 1
    for position in sort_order[::-1]:
        if numpy.isfinite(adjusted[position]):
            running_min = min(adjusted[position], running_min)
            adjusted[position] = running_min

    return adjusted # the q-values

## Plot bloodwork values by time to check for time-dependence

In [9]:
# Turn each blood-sample collection timestamp into a time of day, in hours since midnight.
collection_datetime = pandas.to_datetime(data.blood_sample_time_collected_V0)
elapsed_since_midnight = collection_datetime - collection_datetime.dt.normalize()
data['bloodwork_time'] = elapsed_since_midnight.dt.total_seconds() / 60 / 60

# Same conversion for the V6 collection timestamp
collection_datetime7 = pandas.to_datetime(data.blood_sample_time_collected_V6)
elapsed_since_midnight7 = collection_datetime7 - collection_datetime7.dt.normalize()
data['bloodwork_time_7'] = elapsed_since_midnight7.dt.total_seconds() / 60 / 60

In [10]:
# Names of the bloodwork measurements analysed in this notebook, one per line of the
# literal below. Each name is used verbatim as a column of `data` in the model formulas,
# so the spellings must be kept exactly as they are.
bloodwork_fields = [c.strip() for c in """
alanine_aminotransferase
albumin
alkaline_phosphatase
apolipoprotein_A
apolipoprotein_B
aspartate_aminotransferase
c_reactive_protein
calcium
cholesterol
creatinine
cystatin_C
direct_bilirubin
gamma_glutamyltransferase
glucose
glycated_heamoglobin
hdl_cholesterol
igf_1
ldl_direct
lipoprotein_A
oestradiol
phosphate
rheumatoid_factor
shbg
testosterone
total_bilirubin
total_protein
triglycerides
urate
urea
vitamin_D
basophill_count
basophill_percent
eosinophill_count
eosinophill_percent
haematocrit_percent
haemoglobin_concentration
high_light_scatter_reticulocyte_count
high_light_scatter_reticulocyte_percent
immature_reticulocyte_fraction
lymphocyte_count
lymphocyte_percent
mean_corpuscular_haemoglobin
mean_corpuscular_haemoglobin_conc
mean_corpuscular_volume
mean_platelt_volume
mean_reticulocyte_volume
mean_sphered_cell_volume
monocyte_count
monocyte_percent
neutrophil_count
neutrophil_percent
nucleated_red_blood_cell_count
nucleated_red_blood_cell_percent
platelet_count
platelet_crit
platelet_distribution_width
red_blood_cell_count
red_blood_cell_distribution_width
reticulocyte_count
reticulocyte_percentage
white_blood_cell_count
""".strip().splitlines()]

In [11]:
# Linear model: regress every bloodwork measure on collection time alone and keep
# the p-value of the time coefficient.
linear_pvalue_by_field = {}
for field in bloodwork_fields:
    linear_formula = f"Q('{field}') ~ bloodwork_time"
    results = smf.ols(linear_formula, data=data).fit()
    linear_pvalue_by_field[field] = results.pvalues['bloodwork_time']
time_dependent_pvalues = pandas.Series(linear_pvalue_by_field)

In [12]:
# Quadratic model: add a squared time term, keep every fitted model, and collect the
# p-value of the squared term for each measure.
time_dependent_results = {}
for field in bloodwork_fields:
    quadratic_formula = f"Q('{field}') ~ bloodwork_time + I(bloodwork_time**2)"
    results = smf.ols(quadratic_formula, data=data).fit()
    time_dependent_results[field] = results
quadratic_time_dependent_pvalues = pandas.Series({
    name: fit.pvalues['I(bloodwork_time ** 2)']
    for name, fit in time_dependent_results.items()
})

In [13]:
# One row per bloodwork measure, one column per model's p-value
time_dependence = time_dependent_pvalues.to_frame(name="linear")
time_dependence["quadratic"] = quadratic_time_dependent_pvalues

In [14]:
# Benjamini-Hochberg q-values, computed separately for each model's p-values
for model_name in ("linear", "quadratic"):
    time_dependence[f"{model_name}_q"] = BH_FDR(time_dependence[model_name])

In [15]:
# Measures ordered by the p-value of the linear time term, smallest first
time_dependence.sort_values("linear", ascending=True)

Unnamed: 0,linear,quadratic,linear_q,quadratic_q
phosphate,4.152599e-302,1.619309e-38,2.533085e-300,3.292595e-37
white_blood_cell_count,7.915337e-285,2.440631e-14,2.414178e-283,1.654206e-13
lymphocyte_count,2.771686e-231,1.621831e-03,5.635761e-230,3.957268e-03
neutrophil_count,2.398207e-128,1.128924e-38,3.657266e-127,3.292595e-37
total_bilirubin,2.352184e-91,2.144902e-01,2.869664e-90,2.616781e-01
...,...,...,...,...
alanine_aminotransferase,6.557407e-01,6.775546e-04,7.017576e-01,1.796993e-03
shbg,6.934514e-01,1.656338e-02,7.280002e-01,3.157395e-02
cholesterol,7.041314e-01,4.716815e-02,7.280002e-01,7.058298e-02
glucose,7.853763e-01,3.722230e-17,7.984659e-01,3.243657e-16


In [16]:
# Count the measures whose q-value is below 0.05 under each model, and under either
linear_significant = time_dependence.linear_q < 0.05
quadratic_significant = time_dependence.quadratic_q < 0.05
either_significant = linear_significant | quadratic_significant

print(f"Of {len(time_dependence)} variables:")
print(f"{linear_significant.sum()} had significant linear associations")
print(f"{quadratic_significant.sum()} had significant quadratic associations")
print(f"{either_significant.sum()} had significant associations in either")

Of 61 variables:
41 had significant linear associations
37 had significant quadratic associations
49 had significant associations in either


In [29]:
# Participants in the bottom and top 20% of acceleration_RA
low_RA = data[data.acceleration_RA < data.acceleration_RA.quantile(0.2)]
high_RA = data[data.acceleration_RA > data.acceleration_RA.quantile(0.8)]
# Joint density of collection time and phosphate in the low-RA group.
# x/y are passed by keyword and the kind is spelled "kde": recent seaborn releases
# reject positional x/y and the non-standard kind name "kdeplot".
sns.jointplot(x="bloodwork_time", y="phosphate", data=low_RA, kind="kde")

<IPython.core.display.Javascript object>

<seaborn.axisgrid.JointGrid at 0x2cd7358ac50>

In [28]:
# Joint density of collection time and phosphate in the high-RA group.
# Keyword x/y and kind="kde" keep this working on recent seaborn releases.
sns.jointplot(x="bloodwork_time", y="phosphate", data=high_RA, kind="kde")

<IPython.core.display.Javascript object>

<seaborn.axisgrid.JointGrid at 0x2cd737acb38>

In [21]:

# NOTE(review): this repeats the high-RA joint plot; consider removing one copy.
# Keyword x/y and kind="kde" keep this working on recent seaborn releases.
sns.jointplot(x="bloodwork_time", y="phosphate", data=high_RA, kind="kde")

<IPython.core.display.Javascript object>

<seaborn.axisgrid.JointGrid at 0x2cd764f61d0>

## Time Association by Rhythmicity

In [20]:
# For each bloodwork measure, test the time-of-day terms (linear + quadratic) against a
# sex + birth_year baseline, separately in the bottom and top 20% of acceleration_RA.
rhythmicity_associations_data = {}
ra_scores = data.acceleration_RA
low_RA = data[ra_scores < ra_scores.quantile(0.20)]
high_RA = data[ra_scores > ra_scores.quantile(0.80)]
ra_groups = (("low_RA", low_RA), ("high_RA", high_RA))
reported_stats = ("pvalue", "rsquared", "mse_resid", "mse_model")

for field in bloodwork_fields:
    full_formula = f"Q('{field}') ~ bloodwork_time + I(bloodwork_time**2) + sex + birth_year"
    reduced_formula = f"Q('{field}') ~ sex + birth_year"

    stats_by_group = {}
    for group_name, group_data in ra_groups:
        full_fit = smf.ols(full_formula, data=group_data).fit()
        reduced_fit = smf.ols(reduced_formula, data=group_data).fit()
        stats_by_group[group_name] = {
            # F-test of the full model against the model without the time terms
            "pvalue": full_fit.compare_f_test(reduced_fit)[1],
            "rsquared": full_fit.rsquared,
            "mse_resid": full_fit.mse_resid,
            "mse_model": full_fit.mse_model,
        }

    # Flatten to "<group>_<stat>" keys, statistic by statistic, low group before high group
    rhythmicity_associations_data[field] = {
        f"{group_name}_{stat}": stats_by_group[group_name][stat]
        for stat in reported_stats
        for group_name, _ in ra_groups
    }
rhythmicity_associations = pandas.DataFrame(rhythmicity_associations_data).T

In [24]:
# Measures ordered by the time-association p-value in the high-RA group, smallest first
rhythmicity_associations.sort_values("high_RA_pvalue", ascending=True)

Unnamed: 0,low_RA_pvalue,high_RA_pvalue,low_RA_rsquared,high_RA_rsquared,low_RA_mse_resid,high_RA_mse_resid,low_RA_mse_model,high_RA_mse_model
white_blood_cell_count,4.979274e-41,1.867291e-68,0.041026,0.067697,3.511772,2.624002,179.531890,228.976283
phosphate,2.423971e-77,3.730428e-63,0.124051,0.126511,0.024194,0.020960,3.655858,3.262601
lymphocyte_count,1.620682e-32,3.325683e-54,0.034712,0.054956,0.730154,0.507568,31.323852,35.411667
neutrophil_count,3.591837e-24,2.760498e-38,0.025421,0.041596,2.029407,1.549201,63.152912,80.667331
monocyte_count,7.660381e-10,1.258934e-23,0.042128,0.059354,0.051756,0.028561,2.715610,2.162165
...,...,...,...,...,...,...,...,...
aspartate_aminotransferase,4.603180e-01,9.344303e-01,0.033358,0.049863,105.051450,65.201222,4236.056515,4025.703656
gamma_glutamyltransferase,9.838977e-01,9.443946e-01,0.026745,0.057278,2070.486692,687.570982,66740.160648,49264.182704
nucleated_red_blood_cell_percent,1.316192e-01,9.621183e-01,0.002865,0.000635,0.197129,0.125018,0.675806,0.095227
lipoprotein_A,7.615299e-01,9.703481e-01,0.001818,0.001115,2438.313454,2367.735695,4219.993154,2497.769850


In [31]:
# Scatter of the time-association strength (-log10 p) in the low-RA group against the
# high-RA group, one '+' per bloodwork measure, with a dashed y = x reference line.
fig, ax = pylab.subplots()

neg_log_p_low = -numpy.log10(rhythmicity_associations.low_RA_pvalue)
neg_log_p_high = -numpy.log10(rhythmicity_associations.high_RA_pvalue)
ax.scatter(neg_log_p_low, neg_log_p_high, marker="+")
ax.set_xlabel("-log10 p-value in low RA")
ax.set_ylabel("-log10 p-value in high RA")

# Diagonal spanning the combined extent of both axes
x_low, x_high = ax.get_xlim()
y_low, y_high = ax.get_ylim()
bottom = min(x_low, y_low)
top = max(x_high, y_high)
ax.plot([bottom, top], [bottom, top], c="k", linestyle="--", zorder=-1)

ax.set_aspect("equal")
ax.set_title("Time-Value associations in top/bottom quintile of RA scores")
fig.savefig(OUTDIR+"time_associations.by_RA.png")

<IPython.core.display.Javascript object>

In [41]:
# Scatter of the model mean square of the time-of-day fit in the low-RA group against
# the high-RA group, one '+' per bloodwork measure.
fig, ax = pylab.subplots()

# Divide by each measure's overall variance so measures on different scales are comparable
scales = data[rhythmicity_associations.index].var()
# Marker size grows with the stronger (smaller p-value) of the two groups' associations
scale = -numpy.log10(numpy.minimum(rhythmicity_associations.low_RA_pvalue, rhythmicity_associations.high_RA_pvalue))
ax.scatter(rhythmicity_associations.low_RA_mse_model / scales,
           rhythmicity_associations.high_RA_mse_model / scales,
           s=scale*3+1,
           marker="+")
ax.set_xlabel("Model MSE in low RA")
ax.set_ylabel("Model MSE in high RA")

#Diagonal line
bottom = min(ax.get_xlim()[0], ax.get_ylim()[0])
top = max(ax.get_xlim()[1], ax.get_ylim()[1])
ax.plot([bottom, top], [bottom, top], c="k", linestyle="--", zorder=-1)
ax.set_aspect("equal")
ax.set_title("Time-Value associations in top/bottom quintile of RA scores")
# Saved under its own name: this figure previously reused the file name of the
# p-value scatter and overwrote it.
fig.savefig(OUTDIR+"time_associations.by_RA.mse_model.png")

<IPython.core.display.Javascript object>

## Time associations by self-reported values

In [32]:
# Self-reported categorical variables whose response groups are compared
self_report_variables = [
    "daytime_dozing",
    "getting_up_in_morning",
    "morning_evening_person",
    "nap_during_day",
    "sleeplessness",
    "snoring",
    "IPAQ_activity_group",
]

# For every (self-report variable, bloodwork measure, response category) combination,
# fit the time-of-day model on a random sample of 500 participants giving that response,
# so that all categories are compared at equal sample size. Categories with fewer than
# 500 participants are skipped.
self_report_associations_data = []
for var in self_report_variables:
    for field in bloodwork_fields:
        full_formula = f"Q('{field}') ~ bloodwork_time + I(bloodwork_time**2) + sex + birth_year"
        reduced_formula = f"Q('{field}') ~ sex + birth_year"
        for cat in data[var].cat.categories:
            d = data[data[var] == cat]
            if len(d) >= 500:
                d = d.sample(500)
                results = smf.ols(full_formula, data=d).fit()
                reduced = smf.ols(reduced_formula, data=d).fit()

                record = dict(
                    self_report_var=var,
                    bloodwork_var=field,
                    response=cat,
                    # F-test of the full model against the model without the time terms
                    pvalue=results.compare_f_test(reduced)[1],
                    rsquared=results.rsquared,
                    mse_model=results.mse_model,
                    mse_resid=results.mse_resid,
                    mse_total=results.mse_total,
                    N=len(d),
                )
                self_report_associations_data.append(record)
self_report_associations = pandas.DataFrame(self_report_associations_data)

In [57]:
# Plot by variable in two categories
# For each self-report variable this picks the two response categories that are
# plotted against each other:
#   "ones"  - response shown on the x-axis of the comparison plots
#   "zeros" - response shown on the y-axis of the comparison plots
#   "name"  - a display name for the variable
#             (NOTE(review): not read by the plotting loop in this cell - confirm whether it is still needed)
self_report_var_details = {
    "daytime_dozing": {
        "name": "daytime_dozing",
        "zeros": "Never/rarely",
        "ones": "Often",
    },
    "getting_up_in_morning": {
        "name": "getting_up_in_morning",
        "zeros": "Very easy",
        "ones": "Not at all easy",
    },
    "morning_evening_person": {
        "name": "chronotype",
        "zeros": "Definitely a 'morning' person",
        "ones": "Definitely an 'evening' person",
    },
    "nap_during_day": {
        "name": "nap_during_day",
        "zeros": "Never/rarely",
        "ones": "Usually",
    },
    "sleeplessness": {
        "name": "sleeplessness",
        "zeros": "Never/rarely",
        "ones": "Usually",
    },
    "snoring": {
        "name": "snoring",
        "zeros": "No",
        "ones": "Yes",
    },
    "IPAQ_activity_group": {
        "name": "IPAQ Activity Group",
        "zeros": "low",
        "ones": "high",
    }
}
# For every self-report variable draw a 2x2 grid comparing, per bloodwork measure, the
# fit statistics obtained in the 'ones' response group (x-axis) and the 'zeros'
# response group (y-axis).
scales = data[bloodwork_fields].var()
# (column of self_report_associations, caption prefix), in panel order
stat_captions = [
    ('mse_resid', "Residual MSE"),
    ('rsquared', "R-squared"),
    ('mse_model', "Model MSE"),
    ('mse_total', "Total MSE"),
]
for var in self_report_var_details:
    d = self_report_associations.query(f"self_report_var == '{var}'")
    details = self_report_var_details[var]

    # Pair the two groups by bloodwork measure rather than by row position: a response
    # category with fewer than 500 participants has no rows at all, and positional
    # pairing would then fail or silently compare different measures.
    A = d[d.response == details['ones']].set_index("bloodwork_var")
    B = d[d.response == details['zeros']].set_index("bloodwork_var")
    shared_fields = A.index.intersection(B.index)
    A = A.loc[shared_fields]
    B = B.loc[shared_fields]
    field_scales = scales.reindex(shared_fields)

    fig, axes = pylab.subplots(ncols=2, nrows=2, figsize=(9,10))
    for ax, (stat, caption) in zip(axes.flatten(), stat_captions):
        if stat == 'rsquared':
            x_values, y_values = A[stat], B[stat]
        else:
            # The plotted value is sqrt(mean square / overall variance of the measure)
            x_values = numpy.sqrt(A[stat] / field_scales)
            y_values = numpy.sqrt(B[stat] / field_scales)
        ax.scatter(x_values, y_values, marker="+")
        ax.set_xlabel(f"{caption} of fit in '{details['ones']}'")
        ax.set_ylabel(f"{caption} of fit in '{details['zeros']}'")

        #Diagonal line
        bottom = min(ax.get_xlim()[0], ax.get_ylim()[0])
        top = max(ax.get_xlim()[1], ax.get_ylim()[1])
        ax.plot([bottom, top], [bottom, top], c="k", linestyle="--", zorder=-1)

        ax.set_aspect("equal")
        ax.set_title(f"Time-Value associations\nBy {var}")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# All (variable, measure, response) fits ordered by p-value, smallest first
self_report_associations.sort_values("pvalue", ascending=True)

In [None]:
# Number of participants per response category of `var`.
# NOTE(review): `var` is not defined in this cell; it is whatever an earlier
# `for var in ...` loop left behind, so the output depends on which cell ran last.
data[var].value_counts()

## Phase-correction

In [40]:
# Estimate a per-participant phase offset as the deviation of `phase` from its mean.
# (A sleep-midpoint definition and a chronotype-based definition were tried as
# alternatives and are currently disabled.)
data['phase_offset'] = data.phase - data.phase.mean()

# Nominal offset per self-reported chronotype; only relevant to the disabled
# chronotype-based definition of phase_offset.
chronotype_to_offset = {
    "Definitely a 'morning' person": -0.5,
    "More a 'morning' than 'evening' person": -0.25,
    "Do not know": 0,
    "More an 'evening' than a 'morning' person": 0.25,
    "Definitely an 'evening' person": 0.5
}

# Offsets with magnitude above 3 are treated as outliers and marked as missing (NaN)
is_outlier = data.phase_offset.abs() > 3
data.loc[is_outlier, 'phase_offset'] = numpy.nan

# Control: random offsets with the same standard deviation as the estimated ones
offset_spread = data.phase_offset.std()
data['random_offset'] = numpy.random.normal(loc=0, scale=offset_spread, size=len(data))

# Collection time shifted by the estimated offset and by the random control offset
for offset_kind in ('phase', 'random'):
    data[f'bloodwork_time_{offset_kind}_corrected'] = data.bloodwork_time - data[f'{offset_kind}_offset']

In [35]:
# Summary statistics of the phase offsets after outliers were marked as missing
data['phase_offset'].describe()

count    24637.000000
mean        -0.184453
std          0.866367
min         -2.734379
25%         -0.751046
50%         -0.205454
75%          0.342833
max          2.999142
Name: phase_offset, dtype: float64

In [42]:
# Compare, per bloodwork measure, the time-of-day association using the raw collection
# time, the phase-corrected time, and the randomly "corrected" control time.
phase_associations_data = {}
# Restrict to participants with cosinor_rsquared above 0.3 AND a value for every time
# variable. phase_offset is missing for outliers, so without the dropna the
# phase-corrected model would be fitted on fewer rows than the reduced model and the
# other two models, which invalidates the F-test and the R-squared comparison.
time_columns = [
    "bloodwork_time",
    "bloodwork_time_phase_corrected",
    "bloodwork_time_random_corrected",
]
d = data[data.cosinor_rsquared > 0.3].dropna(subset=time_columns)
for field in bloodwork_fields:
    # The baseline without time terms is the same for all three comparisons: fit it once
    reduced_results = smf.ols(f"Q('{field}') ~ sex + birth_year", data=d).fit()
    regular_results = smf.ols(f"Q('{field}') ~ bloodwork_time + I(bloodwork_time**2) + sex + birth_year", data=d).fit()
    phase_results = smf.ols(f"Q('{field}') ~ bloodwork_time_phase_corrected + I(bloodwork_time_phase_corrected**2) + sex + birth_year", data=d).fit()
    random_results = smf.ols(f"Q('{field}') ~ bloodwork_time_random_corrected + I(bloodwork_time_random_corrected**2) + sex + birth_year", data=d).fit()

    phase_associations_data[field] = {
        # p-values: F-test of each full model against the shared baseline
        "pvalue": regular_results.compare_f_test(reduced_results)[1],
        "phase_corrected_pvalue": phase_results.compare_f_test(reduced_results)[1],
        "random_corrected_pvalue": random_results.compare_f_test(reduced_results)[1],
        "mse_resid": regular_results.mse_resid,
        "phase_mse_resid": phase_results.mse_resid,
        "random_corrected_mse_resid": random_results.mse_resid,
        "rsquared": regular_results.rsquared,
        "phase_corrected_rsquared": phase_results.rsquared,
        "random_corrected_rsquared": random_results.rsquared,
    }
phase_associations = pandas.DataFrame(phase_associations_data).T

In [44]:
# Number of measures whose R-squared is higher without the phase correction than with it
phase_associations.rsquared.gt(phase_associations.phase_corrected_rsquared).sum()

30

In [43]:
# Scatter of each measure's R-squared without (x) and with (y) phase correction,
# with a dashed y = x reference line.
fig, ax = pylab.subplots()

rsquared_uncorrected = phase_associations.rsquared
rsquared_corrected = phase_associations.phase_corrected_rsquared
ax.scatter(rsquared_uncorrected, rsquared_corrected, marker="+")
ax.set_xlabel("R-squared without correcting for phase")
ax.set_ylabel("R-squared after correcting for phase")

# Diagonal spanning the combined extent of both axes
x_low, x_high = ax.get_xlim()
y_low, y_high = ax.get_ylim()
bottom = min(x_low, y_low)
top = max(x_high, y_high)
ax.plot([bottom, top], [bottom, top], c="k", linestyle="--", zorder=-1)

ax.set_aspect("equal")
ax.set_title("Time-Value associations with/without phase correction")
fig.savefig(OUTDIR+"before_after_phase_correction.png")

<IPython.core.display.Javascript object>

In [None]:
# Scatter of -log10 p-values: random control "correction" (x) against the phase
# correction (y), one '+' per measure, with a dashed y = x reference line.
fig, ax = pylab.subplots()

neg_log_p_random = -numpy.log10(phase_associations.random_corrected_pvalue)
neg_log_p_phase = -numpy.log10(phase_associations.phase_corrected_pvalue)
ax.scatter(neg_log_p_random, neg_log_p_phase, marker="+")

# Diagonal spanning the combined extent of both axes
x_low, x_high = ax.get_xlim()
y_low, y_high = ax.get_ylim()
bottom = min(x_low, y_low)
top = max(x_high, y_high)
ax.plot([bottom, top], [bottom, top], c="k", linestyle="--", zorder=-1)

ax.set_aspect("equal")
ax.set_xlabel("-log10 p-value with a random 'correction'")
ax.set_ylabel("-log10 p-value after correcting for phase")
ax.set_title("Time-Value associations with/without phase correction")

In [None]:
### Try an interaction model
# Tests, per bloodwork measure, whether the slope over collection time differs between
# self-reported chronotypes (time x chronotype interaction).

interaction_associations_data = {}
interaction_associations_results = []
# Chronotype = morning/evening answer, with the non-informative answers treated as missing
data['chronotype'] = data['morning_evening_person'].copy()
data.loc[data.chronotype.isin(['Prefer not to answer', 'Do not know', ]), "chronotype"] = float("NaN")
# Drop rows without a chronotype so the full and the reduced model are fitted on exactly
# the same participants; otherwise the F-test between them is not valid.
d = data[data.cosinor_rsquared > 0.3].dropna(subset=["chronotype"])
for field in bloodwork_fields:
    results = smf.ols(f"Q('{field}') ~ bloodwork_time + bloodwork_time : chronotype + sex + birth_year", data=d).fit()
    reduced = smf.ols(f"Q('{field}') ~ bloodwork_time + sex + birth_year", data=d).fit()

    # Keep the interaction fit of this measure (previously a stale result from an
    # unrelated cell was appended here)
    interaction_associations_results.append(results)
    interaction_associations_data[field] = {
        "time_pvalue": reduced.pvalues['bloodwork_time'],
        # F-test of the model with interaction terms against the one without
        "interaction_pvalue":  results.compare_f_test(reduced)[1],
    }
interaction_associations = pandas.DataFrame(interaction_associations_data).T

In [None]:
# Bonferroni-adjusted p-values, ordered by the interaction p-value. The number of tests
# is taken from the table instead of being hard-coded, and adjusted values are capped at 1.
n_tests = len(interaction_associations)
(interaction_associations.sort_values(by="interaction_pvalue") * n_tests).clip(upper=1)

In [None]:
# Regression of nucleated_red_blood_cell_percent on collection time, one line per chronotype
fig, ax = pylab.subplots()
# Skip the missing-value category: it matches no rows and would yield an empty subset
for cat in d.chronotype.dropna().unique():
    sns.regplot(y="nucleated_red_blood_cell_percent", x="bloodwork_time", data=d[d.chronotype==cat], label=cat, ax=ax)
# The labels set above are only shown once a legend is drawn
ax.legend()

## Test sex differences

In [46]:
# For each bloodwork measure, test the time-of-day terms (linear + quadratic) against a
# birth_year-only baseline, separately in males and in females.
sex_associations_data = {}
# Select the two subsets once instead of on every iteration
males = data[data.sex == "Male"]
females = data[data.sex == "Female"]
for field in bloodwork_fields:
    full_formula = f"Q('{field}') ~ bloodwork_time + I(bloodwork_time**2) + birth_year"
    # `sex` is constant within each subset and is not a term of the full model, so the
    # baseline nested in the full model is birth_year alone.
    reduced_formula = f"Q('{field}') ~ birth_year"
    male_results = smf.ols(full_formula, data=males).fit()
    male_reduced = smf.ols(reduced_formula, data=males).fit()
    female_results = smf.ols(full_formula, data=females).fit()
    female_reduced = smf.ols(reduced_formula, data=females).fit()

    sex_associations_data[field] = {
        # F-test of the full model against the model without the time terms
        "male_pvalue": male_results.compare_f_test(male_reduced)[1],
        "female_pvalue": female_results.compare_f_test(female_reduced)[1],
        "male_mse_resid": male_results.mse_resid,
        "female_mse_resid": female_results.mse_resid,
        "male_rsquared": male_results.rsquared,
        "female_rsquared": female_results.rsquared,
    }
sex_associations = pandas.DataFrame(sex_associations_data).T

In [47]:
# Scatter of each measure's R-squared in males (x) against females (y),
# with a dashed y = x reference line.
fig, ax = pylab.subplots()

rsquared_males = sex_associations.male_rsquared
rsquared_females = sex_associations.female_rsquared
ax.scatter(rsquared_males, rsquared_females, marker="+")
ax.set_xlabel("R-squared in males")
ax.set_ylabel("R-squared in females")

# Diagonal spanning the combined extent of both axes
x_low, x_high = ax.get_xlim()
y_low, y_high = ax.get_ylim()
bottom = min(x_low, y_low)
top = max(x_high, y_high)
ax.plot([bottom, top], [bottom, top], c="k", linestyle="--", zorder=-1)

ax.set_aspect("equal")
ax.set_title("Time-Value associations by sex")

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Time-Value associations by sex')

In [None]:
# Measures ordered by R-squared among males, largest first
sex_associations.sort_values("male_rsquared", ascending=False)

## Time differences for all measures
Since the time of the blood draw is the only time recorded, we use it as a proxy for the time of the other measurements taken at the assessment center.

In [None]:
import fields_of_interest
# Groups of field names, taken from the project-local `fields_of_interest` module, that
# are looked up in `data` and tested for a time-of-day association below.
assessment_center_field_groups = [fields_of_interest.physical_measures,
                                    fields_of_interest.urine,
                                    fields_of_interest.arterial_stiffness,
                                    fields_of_interest.impedance,
                                    fields_of_interest.hearing_test,]
def find_var(var):
    """Return the name of the numeric column of `data` that holds `var`.

    Tries `var` itself first and then `var + "_V0"`; the first candidate that is a
    column of `data` with a numeric dtype is returned. If neither qualifies, a
    message is printed and None is returned.
    """
    candidates = (var, var+"_V0")
    match = next(
        (name for name in candidates
         if name in data.columns and pandas.api.types.is_numeric_dtype(data[name].dtype)),
        None,
    )
    if match is None:
        print("Failed to find:", var)
    return match
# Resolve every requested field to a numeric column of `data`, dropping the ones that
# cannot be found. Each name is looked up exactly once (previously every field that
# was found was looked up a second time).
assessment_center_fields = [
    column
    for block in assessment_center_field_groups
    for column in (find_var(c) for c in block)
    if column is not None
]

# For each assessment-center measure, F-test the time-of-day terms (linear + quadratic)
# against a BMI + sex + birth_year baseline and record the p-value.
all_associations_data = {}
for field in assessment_center_fields:
    full_formula = f"Q('{field}') ~ bloodwork_time + I(bloodwork_time**2) + BMI +  sex + birth_year"
    reduced_formula = f"Q('{field}') ~ BMI + sex + birth_year"
    results = smf.ols(full_formula, data=data).fit()
    reduced = smf.ols(reduced_formula, data=data).fit()

    f_test_pvalue = results.compare_f_test(reduced)[1]
    all_associations_data[field] = dict(pvalue=f_test_pvalue)
all_associations = pandas.DataFrame(all_associations_data).T

In [None]:
# (An incomplete expression `ukbb.` stood here; it is a SyntaxError and stopped
# "Run All", so it has been disabled. Delete this cell if it is no longer needed.)

In [None]:
# Assessment-center measures ordered by time-association p-value, smallest first
all_associations.sort_values("pvalue", ascending=True)