# Shiftwork analysis

Check if there are associations of phenotypes with shift work

In [256]:
%matplotlib notebook
import pathlib
import scipy
import numpy
from IPython.display import display, HTML
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import re
import matplotlib.patches as mpatches
from scipy.cluster import hierarchy
import pylab

# Configure plot settings
# Make it so that plots autolayout to fit everything (particularly text labels)
# NOTE: this can mess with dynamic zooming
#pylab.rcParams['figure.autolayout'] = True

In [2]:
COHORT = 2
OUTDIR = f"../shiftwork/cohort{COHORT}/"

In [3]:
import pandas
ukbb_full = pandas.read_hdf("../processed/ukbb_data_table.h5")
employment_full = pandas.read_csv("../processed/ukbb_employment_history.txt", sep="\t")

In [4]:
def OLS(*args, **kwargs):
    # For some reason, on my current version of numpy I get itermitten SVD convergence problems
    # that go away when re-run. So we replace the standard OLS function with one that retries up to 3 times
    for i in range(3): # Up to 3 retries
        print(f"Attempt {i}")
        try:
            return smf.ols(*args, **kwargs)
        except Exception as e:
            print(e)
            continue # SVD failed to converge, try again
    return smf.ols(*args, **kwargs)

In [5]:
ukbb_full.columns = ukbb_full.columns.str.replace("[,:/]","_") # Can't use special characters easily

In [6]:
all_ids = sorted(employment_full.ID.unique())

# Down sample for testing
# Total with employment history is 120,000
numpy.random.seed(0)
cohort_id_ranges = {1: slice(0, 30_000),
           2: slice(30_000,120299)}
selected_ids = numpy.random.choice(all_ids, size=len(all_ids), replace=False)[cohort_id_ranges[COHORT]]

print(f"Data size before selecting test set: {ukbb_full.shape}")
print(f"of which {len(employment_full.ID.unique())} had detailed employment information")

employment = employment_full[employment_full.ID.isin(selected_ids)].copy()
data = ukbb_full[ukbb_full.index.isin(selected_ids)].copy()

print(f"Data size after selecting test set: {data.shape}")
print(f"Employment data size before selecting test set: {employment_full.shape}")
print(f"Employment data size after selecting test set: {employment.shape}")

Data size before selecting test set: (502507, 187)
of which 120299 had detailed employment information
Data size after selecting test set: (90299, 187)
Employment data size before selecting test set: (402689, 20)
Employment data size after selecting test set: (302295, 20)


In [7]:
# Find an ever-shiftwork cohort and ever-night-shift cohort
data['ever_shiftwork'] = employment.groupby("ID").job_involved_shift_work.any()
employment['involved_night_shift'] = ((employment.night_shifts_worked.isin([0,1]))
                                     | (employment.mixture_of_day_and_night_shifts.isin([0,1])))
data['ever_night_shiftwork'] = employment.groupby("ID").involved_night_shift.any()
print(f"Of {data.shape[0]}, {sum(data.ever_shiftwork)} have some shiftwork, of which {sum(data.ever_night_shiftwork)} have some night shiftwork")

Of 90299, 25780 have some shiftwork, of which 20893 have some night shiftwork


In [9]:
def BH_FDR(ps):
    ''' Benjamini-Hochberg FDR control

    Converts p values to q values'''

    # For the purposes of comparison, an implementation of Benjamini Hochberg correction
    sort_order = numpy.argsort(ps)

    adjusted = numpy.zeros(ps.shape)
    adjusted[sort_order] = numpy.array(ps)[sort_order]*len(ps)/numpy.arange(1,len(ps)+1)

    # Make monotone, skipping NaNs
    m = 1;
    for i, r in enumerate(sort_order[::-1]):
        if numpy.isfinite(adjusted[r]):
            m = min(adjusted[r], m)
            adjusted[r] = m

    return adjusted # the q-values

In [10]:
# Add the amount of time at the job to the employment data
# if the year_job_ended == -313, then the job was ongoing at the time of the questionairre.
# We treat this as the maximum of 2017 and one year after the job started
employment['year_job_ended_corrected'] = employment['year_job_ended'].copy()
employment.loc[employment.year_job_ended == -313.0, 'year_job_ended_corrected'] = 2017
employment['duration'] = employment['year_job_ended_corrected'] - employment['year_job_started']
employment.loc[employment.year_job_ended == -313.0, 'duration'] = numpy.maximum(1, employment.loc[employment.year_job_ended == -313.0, 'duration'])

In [11]:
# Add duration of night-shiftwork jobs up
def years_employed(data):
    # Sum number of years employed, not double-counting years when two jobs were listed
    data_sorted = data.sort_values(by="year_job_started")
    starts, ends = data.year_job_started, data.year_job_ended_corrected
    duration = 0
    last_end = float("-inf")
    for start, end in zip(starts, ends):
        duration += max(end - max(start, last_end), 0)
        last_end = max(end, last_end)
    # If employed at all, we give at least 0.5 year duration since
    # may start and end on the same year but 0 is clearly a bad choice
    if len(data) > 0:
        duration = max(duration, 0.5)
    return duration
data['duration_night_shiftwork'] = employment[employment.involved_night_shift & (~employment.duplicated())].groupby("ID").apply(years_employed)
data.duration_night_shiftwork.fillna(0, inplace=True)

## Summarize Job Types

In [12]:
job_code_info = pandas.read_csv("../coding497.tsv", sep="\t", engine="python", index_col='node_id')

In [13]:
job_code_info['level'] = float("NaN")
def get_level(index):
    level = job_code_info.loc[index].level
    if pandas.isna(level):
        parent = job_code_info.loc[index].parent_id
        if parent == 0:
            level = 1
        else:
            level = get_level(parent) + 1
        job_code_info.loc[index, 'level'] = level
    return level
        
[get_level(index) for index in job_code_info.index]

def get_level_meaning(index, level=1):
    if job_code_info.loc[index].level <= level:
        return job_code_info.loc[index].meaning
    return get_level_meaning(job_code_info.loc[index].parent_id, level=level)
job_code_info['level1_meaning'] = job_code_info.index.map(get_level_meaning)
job_code_info['level2_meaning'] = job_code_info.index.map(lambda x: get_level_meaning(x,2))

job_code_info = job_code_info.reset_index().set_index('coding')

In [14]:
employment['job_category'] = employment.job_code.map(job_code_info.level1_meaning)
employment['job_category_level2'] = employment.job_code.map(job_code_info.level2_meaning)

In [15]:
# Gather the shift-work data and compute frequencies of the job categories
pandas.DataFrame({
    "all_night_shift": employment[employment.involved_night_shift].job_category.value_counts(),
    "male": employment[employment.involved_night_shift & (employment.ID.map(data.sex) == "Male")].job_category.value_counts(),
    "female": employment[employment.involved_night_shift & (employment.ID.map(data.sex) == "Female")].job_category.value_counts(),
}).sort_values(by="all_night_shift", ascending=False)

Unnamed: 0,all_night_shift,male,female
"health (human or animal), residential/social/religious care, undertaking (including managers)",11770,1934,9836
"armed forces, emergency services, security, health & safety (including managers)",4256,3548,708
"science, research, engineering, computer technology (including managers)",2762,2396,366
routine factory-based manufacturing (including managers),2509,2308,201
"transport (road, rail, air, water), work with other mobile machinery (including managers)",2006,1767,239
skilled manual work (including managers),1742,1691,51
"office-based work: professional, managerial, administrative or general office/clerical",1469,822,647
"mining, quarrying, energy production, water treatment (including managers)",839,825,14
"personal services, travel/tourism, hospitality (including managers)",766,284,482
"sport, culture, arts, media, entertainment (including managers)",764,428,336


## Test association with metabolic syndrome

Following the definition in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5436094/ for determining metabolic syndrome cases:
    defined as waist circumference ⩾94 cm for males and ⩾80 cm for females)
    as well as at least two of the following four factors: 
    (1) raised triglycerides (⩾1.7 mmol l−1),2
    (2) reduced HDL cholesterol (<1.03 mmol/l in males and <1.29 mmol/l for females),
    (3) raised blood pressure (BP) (systolic BP ⩾130 mm Hg or diastolic BP ⩾85 mm Hg),
    (4) raised fasting plasma glucose (defined as HbA1c levels⩾5.7 mmol/l

In [16]:
def at_least_two(*args):
    df = pandas.DataFrame({i:arg for i, arg in enumerate(args)})
    return df.sum(axis=1) >= 2
data['metabolic_syndrome'] = (
    (data.sex == "Male") & (
        (data.waist_circumference >= 94) & at_least_two(
            (data.triglycerides >= 1.7),
            (data.hdl_cholesterol < 1.03),
            ((data.systolic_blood_pressure_V0 > 130) |
             (data.diastolic_blood_pressure_V0 > 85)),
            ((data.glycated_heamoglobin > 5.7) &
             (data.blood_sample_fasting_time >= 8))
        )
    )
    |
    (data.sex == "Female") & (
        (data.waist_circumference >= 80) & at_least_two(
            (data.triglycerides >= 1.7),
            (data.hdl_cholesterol < 1.29),
            ((data.systolic_blood_pressure_V0 > 130) |
             (data.diastolic_blood_pressure_V0 > 85)),
            ((data.glycated_heamoglobin > 5.7) &
             (data.blood_sample_fasting_time >= 8))
        )
    )
)

In [17]:
print(f"Identified {data.metabolic_syndrome.sum()} cases out of {len(data)}")
print(f"Male: {(data.metabolic_syndrome & (data.sex == 'Male')).sum()}")
print(f"Female: {(data.metabolic_syndrome & (data.sex == 'Female')).sum()}")

Identified 21456 cases out of 90299
Male: 11398
Female: 10058


### Simple model without using job codes

In [18]:
results = smf.ols("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork * sex", data=data).fit()
reduced = smf.ols("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork + sex", data=data).fit()
_, nested_p, _ = results.compare_f_test(reduced)
male_shiftwork_p = results.f_test("ever_night_shiftwork[T.True] + ever_night_shiftwork[T.True]:sex[T.Male] = 0").pvalue
results.summary()

0,1,2,3
Dep. Variable:,metabolic_syndrome.astype(int),R-squared:,0.017
Model:,OLS,Adj. R-squared:,0.017
Method:,Least Squares,F-statistic:,393.9
Date:,"Mon, 26 Oct 2020",Prob (F-statistic):,0.0
Time:,09:58:34,Log-Likelihood:,-50213.0
No. Observations:,90299,AIC:,100400.0
Df Residuals:,90294,BIC:,100500.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1983,0.002,95.534,0.000,0.194,0.202
ever_night_shiftwork[T.True],0.0240,0.005,4.852,0.000,0.014,0.034
sex[T.Male],0.0660,0.003,20.206,0.000,0.060,0.072
ever_night_shiftwork[T.True]:sex[T.Male],0.0334,0.007,4.940,0.000,0.020,0.047
center(birth_year),-0.0043,0.000,-23.046,0.000,-0.005,-0.004

0,1,2,3
Omnibus:,15339.344,Durbin-Watson:,2.008
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22478.065
Skew:,1.2,Prob(JB):,0.0
Kurtosis:,2.535,Cond. No.,43.8


In [19]:
print("P-Values without controlling for job types")
#print("Total shift-work effect:", reduced.pvalues["ever_night_shiftwork[T.True]"])
print("Female shift-work effect:", results.pvalues["ever_night_shiftwork[T.True]"])
print("Male shift-work effect:", male_shiftwork_p)
print("Differential Male/Female effect:", results.pvalues["ever_night_shiftwork[T.True]:sex[T.Male]"])

P-Values without controlling for job types
Total shift-work effect: 1.9038045410721858e-35
Female shift-work effect: 1.2270071398407404e-06
Male shift-work effect: 1.1521168034295289e-35
Differential Male/Female effect: 7.816028154221922e-07


In [20]:
results.pvalues

Intercept                                    0.000000e+00
ever_night_shiftwork[T.True]                 1.227007e-06
sex[T.Male]                                  1.380199e-90
ever_night_shiftwork[T.True]:sex[T.Male]     7.816028e-07
center(birth_year)                          3.546972e-117
dtype: float64

In [21]:
# Basic percent-case for each group
percent_metabolic_syndrome = data.groupby(["sex", "ever_night_shiftwork"]).metabolic_syndrome.mean().reset_index()
percent_metabolic_syndrome

Unnamed: 0,sex,ever_night_shiftwork,metabolic_syndrome
0,Female,False,0.197441
1,Female,True,0.21361
2,Male,False,0.267503
3,Male,True,0.323708


In [265]:
# Plot the raw data by sex and shiftwork (percentage metabolic)
data['metabolic_syndrome_percent'] = data.metabolic_syndrome * 100
fig, ax = pylab.subplots()
g = sns.barplot(x="sex", y="metabolic_syndrome_percent", hue="ever_night_shiftwork",  data = data,
            saturation=0.85, capsize=0.15, errwidth=1.5)
ax.set_ylabel("Percent with metabolic syndrome")
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
legend = [c for c in ax.get_children() if type(c) is pylab.mpl.legend.Legend][0]
legend.set_title("")
legend.texts[0].set_text("No night shifts")
legend.texts[1].set_text("Night shifts")
fig.savefig(OUTDIR+"/percent_metabolic_syndrome.svg")

<IPython.core.display.Javascript object>

### Model with covariates, but not controlling for job type
Logistic model that controls for age, ethnicity (as either white or other), smoking, and household income.

In [284]:
data['ethnicity_white'] = data.ethnicity.isin(["White", "British", "Irish", "Any other white background"])
results = smf.logit("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork * sex + ethnicity_white + smoking + household_income"
                    , data=data).fit()
male_shiftwork_p = results.f_test("ever_night_shiftwork[T.True] + ever_night_shiftwork[T.True]:sex[T.Male] = 0").pvalue
print("Male shiftwork p", male_shiftwork_p)
results.summary()

Optimization terminated successfully.
         Current function value: 0.538180
         Iterations 6
Male shiftwork p 5.298485820490509e-20


0,1,2,3
Dep. Variable:,metabolic_syndrome.astype(int),No. Observations:,89707.0
Model:,Logit,Df Residuals:,89707.0
Method:,MLE,Df Model:,-1.0
Date:,"Thu, 29 Oct 2020",Pseudo R-squ.:,0.02094
Time:,12:17:43,Log-Likelihood:,-48279.0
converged:,True,LL-Null:,-49311.0
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.3415,0.195,-6.883,0.000,-1.724,-0.960
ever_night_shiftwork[T.True],0.1285,0.029,4.429,0.000,0.072,0.185
sex[T.Male],0.4009,0.019,21.369,0.000,0.364,0.438
ethnicity_white[T.True],0.1647,0.053,3.116,0.002,0.061,0.268
smoking[T.Never],-0.2472,0.182,-1.361,0.173,-0.603,0.109
smoking[T.Previous],-0.0394,0.182,-0.217,0.828,-0.395,0.317
smoking[T.Current],0.0957,0.184,0.522,0.602,-0.264,0.455
household_income[T.Prefer not to answer],-0.0501,0.063,-0.798,0.425,-0.173,0.073
"household_income[T.Less than 18,000]",0.1574,0.059,2.674,0.008,0.042,0.273


In [285]:
print("P-Values when controlling for covariates")
print("Female shift-work effect:", results.pvalues["ever_night_shiftwork[T.True]"])
print("Male shift-work effect:", male_shiftwork_p)
print("Differential Male/Female effect:", results.pvalues["ever_night_shiftwork[T.True]:sex[T.Male]"])

P-Values when controlling for covarites
Female shift-work effect: 9.464259298164164e-06
Male shift-work effect: 5.298485820490509e-20
Differential Male/Female effect: 0.014499917571988086


In [290]:
# Same model but with downsampled data
# to check how it affects p-values for comparison with the controlling below
p_values = []
for i in range(100):
    d = data.sample(19760)
    data['ethnicity_white'] = data.ethnicity.isin(["White", "British", "Irish", "Any other white background"])
    results = smf.logit("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork * sex + ethnicity_white + smoking + household_income"
                        , data=d).fit()
    male_shiftwork_p = results.f_test("ever_night_shiftwork[T.True] + ever_night_shiftwork[T.True]:sex[T.Male] = 0").pvalue
    #print("Male shiftwork p", male_shiftwork_p)
    #results.summary()
    p_values.append(results.pvalues['ever_night_shiftwork[T.True]:sex[T.Male]'])
p_values = pandas.Series(sorted(p_values))
p_values.describe()

Optimization terminated successfully.
         Current function value: 0.537251
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.538948
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.535529
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.533477
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.539044
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.534677
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.539120
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.543361
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.533446
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.534803
  

Optimization terminated successfully.
         Current function value: 0.535827
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.540461
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.534404
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.534972
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.535930
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.534069
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.535765
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.536455
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.542438
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.534170
  

count    100.000000
mean       0.351582
std        0.283752
min        0.002012
25%        0.123309
50%        0.308086
75%        0.522183
max        0.973695
dtype: float64

## Control for job type via pairings

In [26]:
## Try to find 'tetrads' of male/female night/day workers...
# WARNING: slow if not already computed!!!
tetrad_list_file = pathlib.Path(OUTDIR+"/tetrad_id_list.txt")
if not tetrad_list_file.exists():
    used_ids = set()
    for ID in data[data.ever_night_shiftwork].index:
        if ID not in used_ids:
            # Try to find matches
            sex, shiftwork = data.loc[ID,['sex', 'ever_night_shiftwork']]
            job_history = employment[employment.ID == ID]
            # Find the/a job that is shiftwork for them
            jobtype = job_history[job_history.involved_night_shift].sort_values(by="duration").job_category_level2.iloc[0]

            # Find those with the same job type and haven't been used yet
            possible_ids = employment[employment.job_category_level2 == jobtype].ID.unique()

            tetrad = {}
            tetrad[(sex, shiftwork)] = ID
            for ID2 in set(possible_ids).difference(used_ids):
                sex2, shiftwork2 = data.loc[ID2, ['sex', 'ever_night_shiftwork']]
                if (sex2, shiftwork2) not in tetrad:
                    tetrad[(sex2, shiftwork2)] = ID2
                    if len(tetrad) == 4:
                        break
            if len(tetrad) == 4:
                used_ids.update(tetrad.values())
    pandas.Series(list(used_ids)).to_csv(tetrad_list_file, sep="\t", index=0)
else:
    # If output already exists, we just use that list of ids instead
    print(f"Reading tetrad list in from {tetrad_list_file}")
    used_ids = set(int(x) for x in tetrad_list_file.read_text().splitlines())

Reading tetrad list in from ..\shiftwork\cohort2\tetrad_id_list.txt


In [27]:
tetrad_data = data[data.index.isin(used_ids)]
print(f"Identified N={len(used_ids)} individuals in {len(used_ids)//4} 'tetrads' with similar job histories\nand one each of the four combinations (male/female) (shiftwork/noshiftwork)")

Identified N=19760 individuals in 4940 'tetrads' with similar job histories
and one each of the four combinations (male/female) (shiftwork/noshiftwork)


In [270]:
results = OLS("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data).fit()
reduced = OLS("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork + sex", data=tetrad_data).fit()
_, nested_p, _ = results.compare_f_test(reduced)
male_shiftwork_p = results.f_test("ever_night_shiftwork[T.True] + ever_night_shiftwork[T.True]:sex[T.Male] = 0").pvalue
print(results.summary())
#reduced.summary()

Attempt 0
Attempt 0
                                  OLS Regression Results                                  
Dep. Variable:     metabolic_syndrome.astype(int)   R-squared:                       0.015
Model:                                        OLS   Adj. R-squared:                  0.015
Method:                             Least Squares   F-statistic:                     76.37
Date:                            Thu, 29 Oct 2020   Prob (F-statistic):           2.27e-64
Time:                                    12:00:08   Log-Likelihood:                -11274.
No. Observations:                           19760   AIC:                         2.256e+04
Df Residuals:                               19755   BIC:                         2.260e+04
Df Model:                                       4                                         
Covariance Type:                        nonrobust                                         
                                               coef    std err        

In [271]:
print("P-Values when controlling for job types")
print("Total shift-work effect:", reduced.pvalues["ever_night_shiftwork[T.True]"])
print("Female shift-work effect:", results.pvalues["ever_night_shiftwork[T.True]"])
print("Male shift-work effect:", male_shiftwork_p)
print("Differential Male/Female effect:", results.pvalues["ever_night_shiftwork[T.True]:sex[T.Male]"])

P-Values when controlling for job types
Total shift-work effect: 2.481102321468442e-05
Female shift-work effect: 0.08692020779813747
Male shift-work effect: 2.0917516663821606e-05
Differential Male/Female effect: 0.07267068201629065


In [272]:
# Figure for the controlled model
fig, ax = pylab.subplots()
means = {"male": results.params["Intercept"] + results.params["sex[T.Male]"],
         "male_shift": results.params["Intercept"] + results.params["ever_night_shiftwork[T.True]"] + results.params["sex[T.Male]"] + results.params["ever_night_shiftwork[T.True]:sex[T.Male]"],
         "female": results.params["Intercept"],
         "female_shift": results.params["Intercept"] + results.bse["ever_night_shiftwork[T.True]"],}
def error_from_results(components):
    cov = results.cov_params()
    vec = numpy.array([1 if var in components else 0 for var in cov.columns])[:,None]
    return numpy.sqrt(vec.T @ cov.values @ vec)[0,0] * 1.96 # 1.96 standard deviations
std_errors = {"male": error_from_results(["Intercept", "sex[T.Male]"]),
         "male_shift": error_from_results(["Intercept", "ever_night_shiftwork[T.True]", "sex[T.Male]", "ever_night_shiftwork[T.True]:sex[T.Male]"]),
         "female": error_from_results(["Intercept"]),
         "female_shift": error_from_results(["Intercept", "ever_night_shiftwork[T.True]"]),
             }
width = 0.5
error_kw = {"capsize": 15, "capthick": 1}
ax.bar([-0.25, 1],
       [means['female'], means['male']],
       width,
       yerr=[std_errors['female'], std_errors['male']],
       label="No Shiftwork",
       error_kw=error_kw)
ax.bar([0.25, 1.5],
       [means['female_shift'], means['male_shift']],
       width,
       yerr=[std_errors['female_shift'], std_errors['male_shift']],
       label="Shiftwork",
       error_kw=error_kw)
ax.set_xticks([0,1.25])
ax.set_xticklabels(["Female", "Male"])
ax.yaxis.set_major_formatter(pylab.matplotlib.ticker.PercentFormatter(xmax=1, decimals=0))
ax.legend()
ax.set_ylabel("Percent with Metabolic Syndrome")
fig.savefig(OUTDIR+"/controlled_for_job.model.svg")

<IPython.core.display.Javascript object>

In [266]:
# Figure for the RAW data of the "tetrad" dataset
fig, ax = pylab.subplots()
model_results = pandas.DataFrame({
    "Sex": ["Male", "Male", "Female", "Female"],
    "Shiftwork": [False, True, False, True],
    "metabolic_syndrome": [
        tetrad_data[(tetrad_data.sex == "Male") & (tetrad_data.ever_night_shiftwork == False)].metabolic_syndrome.mean(),
        tetrad_data[(tetrad_data.sex == "Male") & (tetrad_data.ever_night_shiftwork == True)].metabolic_syndrome.mean(),
        tetrad_data[(tetrad_data.sex == "Female") & (tetrad_data.ever_night_shiftwork == False)].metabolic_syndrome.mean(),
        tetrad_data[(tetrad_data.sex == "Female") & (tetrad_data.ever_night_shiftwork == True)].metabolic_syndrome.mean()
    ]
})
model_results['metabolic_syndrome'] *= 100 # Convert to percents
sns.barplot(x="sex", y="metabolic_syndrome", hue="ever_night_shiftwork",  data = tetrad_data,
           saturation=0.85, capsize=0.15, errwidth=1.5)
ax.set_ylabel("Percent with Metabolic Syndrome")
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
legend = [c for c in ax.get_children() if type(c) is pylab.mpl.legend.Legend][0]
legend.set_title("")
legend.texts[0].set_text("No night shifts")
legend.texts[1].set_text("Night shifts")
fig.savefig(OUTDIR+"/controlled_for_job.raw.svg")

<IPython.core.display.Javascript object>

In [33]:
# Logistic regression
results_logit = smf.logit("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data).fit()
reduced_logit = smf.logit("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork + sex", data=tetrad_data).fit()
results_logit.summary()

Optimization terminated successfully.
         Current function value: 0.551631
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.551678
         Iterations 5


0,1,2,3
Dep. Variable:,metabolic_syndrome.astype(int),No. Observations:,19760.0
Model:,Logit,Df Residuals:,19760.0
Method:,MLE,Df Model:,-1.0
Date:,"Mon, 26 Oct 2020",Pseudo R-squ.:,0.01364
Time:,11:53:26,Log-Likelihood:,-10900.0
converged:,True,LL-Null:,-11051.0
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.4002,0.036,-39.155,0.000,-1.470,-1.330
ever_night_shiftwork[T.True],0.0874,0.050,1.738,0.082,-0.011,0.186
sex[T.Male],0.3984,0.048,8.293,0.000,0.304,0.493
ever_night_shiftwork[T.True]:sex[T.Male],0.0921,0.067,1.374,0.170,-0.039,0.223
center(birth_year),-0.0186,0.002,-8.555,0.000,-0.023,-0.014


In [280]:
tetrad_data = tetrad_data.copy()
tetrad_data['ethnicity_white'] = tetrad_data.index.map(data.ethnicity_white)
results = smf.logit("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork * sex + ethnicity_white + smoking + household_income", data=tetrad_data).fit()
male_shiftwork_p = results.f_test("ever_night_shiftwork[T.True] + ever_night_shiftwork[T.True]:sex[T.Male] = 0").pvalue
print("Male shiftwork effect p:", male_shiftwork_p)
results.summary()

Optimization terminated successfully.
         Current function value: 0.547688
         Iterations 6
Male shiftwork effect p: 0.0008197027234422647


0,1,2,3
Dep. Variable:,metabolic_syndrome.astype(int),No. Observations:,19631.0
Model:,Logit,Df Residuals:,19631.0
Method:,MLE,Df Model:,-1.0
Date:,"Thu, 29 Oct 2020",Pseudo R-squ.:,0.02305
Time:,12:15:39,Log-Likelihood:,-10752.0
converged:,True,LL-Null:,-11005.0
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.9498,0.438,-4.456,0.000,-2.808,-1.092
ever_night_shiftwork[T.True],0.0830,0.051,1.637,0.102,-0.016,0.182
sex[T.Male],0.4346,0.049,8.913,0.000,0.339,0.530
ethnicity_white[T.True],0.4100,0.102,4.005,0.000,0.209,0.611
smoking[T.Never],0.0790,0.411,0.192,0.847,-0.726,0.884
smoking[T.Previous],0.3082,0.411,0.751,0.453,-0.497,1.113
smoking[T.Current],0.3973,0.414,0.961,0.337,-0.413,1.208
household_income[T.Prefer not to answer],0.0108,0.141,0.077,0.939,-0.265,0.287
"household_income[T.Less than 18,000]",0.2268,0.131,1.736,0.082,-0.029,0.483


In [282]:
print("P-Values when controlling for job types and covariates")
#print("Total shift-work effect:", reduced.pvalues["ever_night_shiftwork[T.True]"])
print("Female shift-work effect:", results.pvalues["ever_night_shiftwork[T.True]"])
print("Male shift-work effect:", male_shiftwork_p)
print("Differential Male/Female effect:", results.pvalues["ever_night_shiftwork[T.True]:sex[T.Male]"])

P-Values when controlling for job types and covariates
Female shift-work effect: 0.10160505384739825
Male shift-work effect: 0.0008197027234422647
Differential Male/Female effect: 0.3201661097011902


## Check associations with the five criteria for Metabolic Syndrome

In [34]:
results_waist = smf.ols("waist_circumference ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data).fit()
print(results_waist.summary())

                             OLS Regression Results                            
Dep. Variable:     waist_circumference   R-squared:                       0.225
Model:                             OLS   Adj. R-squared:                  0.225
Method:                  Least Squares   F-statistic:                     1430.
Date:                 Mon, 26 Oct 2020   Prob (F-statistic):               0.00
Time:                         11:53:26   Log-Likelihood:                -76651.
No. Observations:                19734   AIC:                         1.533e+05
Df Residuals:                    19729   BIC:                         1.534e+05
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------

In [35]:
results_triglycerides = smf.ols("triglycerides ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data).fit()
print(results_triglycerides.summary())
print("Interaction p-value:", results_triglycerides.pvalues["ever_night_shiftwork[T.True]:sex[T.Male]"])

                            OLS Regression Results                            
Dep. Variable:          triglycerides   R-squared:                       0.061
Model:                            OLS   Adj. R-squared:                  0.061
Method:                 Least Squares   F-statistic:                     303.0
Date:                Mon, 26 Oct 2020   Prob (F-statistic):          5.91e-253
Time:                        11:53:26   Log-Likelihood:                -25519.
No. Observations:               18579   AIC:                         5.105e+04
Df Residuals:                   18574   BIC:                         5.109e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

In [36]:
results_hdl = smf.ols("hdl_cholesterol ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data).fit()
print(results_hdl.summary())

                            OLS Regression Results                            
Dep. Variable:        hdl_cholesterol   R-squared:                       0.189
Model:                            OLS   Adj. R-squared:                  0.189
Method:                 Least Squares   F-statistic:                     994.1
Date:                Mon, 26 Oct 2020   Prob (F-statistic):               0.00
Time:                        11:53:26   Log-Likelihood:                -6025.4
No. Observations:               17017   AIC:                         1.206e+04
Df Residuals:                   17012   BIC:                         1.210e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

In [37]:
results_systolic = smf.ols("systolic_blood_pressure_V0 ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data).fit()
print(results_systolic.summary())

                                OLS Regression Results                                
Dep. Variable:     systolic_blood_pressure_V0   R-squared:                       0.130
Model:                                    OLS   Adj. R-squared:                  0.130
Method:                         Least Squares   F-statistic:                     693.6
Date:                        Mon, 26 Oct 2020   Prob (F-statistic):               0.00
Time:                                11:53:26   Log-Likelihood:                -79517.
No. Observations:                       18535   AIC:                         1.590e+05
Df Residuals:                           18530   BIC:                         1.591e+05
Df Model:                                   4                                         
Covariance Type:                    nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
---------------------

In [38]:
results_diastolic = smf.ols("diastolic_blood_pressure_V0 ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data).fit()
print(results_diastolic.summary())

                                 OLS Regression Results                                
Dep. Variable:     diastolic_blood_pressure_V0   R-squared:                       0.032
Model:                                     OLS   Adj. R-squared:                  0.032
Method:                          Least Squares   F-statistic:                     155.2
Date:                         Mon, 26 Oct 2020   Prob (F-statistic):          7.33e-131
Time:                                 11:53:26   Log-Likelihood:                -69626.
No. Observations:                        18535   AIC:                         1.393e+05
Df Residuals:                            18530   BIC:                         1.393e+05
Df Model:                                    4                                         
Covariance Type:                     nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------

In [39]:
results_gh = smf.ols("glycated_heamoglobin ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data[tetrad_data.blood_sample_fasting_time >= 8]).fit()
print(results_gh.summary())

                             OLS Regression Results                             
Dep. Variable:     glycated_heamoglobin   R-squared:                       0.027
Model:                              OLS   Adj. R-squared:                  0.020
Method:                   Least Squares   F-statistic:                     4.096
Date:                  Mon, 26 Oct 2020   Prob (F-statistic):            0.00277
Time:                          11:53:26   Log-Likelihood:                -2042.7
No. Observations:                   605   AIC:                             4095.
Df Residuals:                       600   BIC:                             4117.
Df Model:                             4                                         
Covariance Type:              nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------

## Dose-response
Does the 'dose' of the shiftwork matter? Do more years in shiftwork imply a larger metabolic syndrome risk?

In [40]:
results = smf.logit("(metabolic_syndrome.astype(int)) ~ center(birth_year) + duration_night_shiftwork * sex",
                  data=tetrad_data).fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.551725
         Iterations 5


0,1,2,3
Dep. Variable:,metabolic_syndrome.astype(int),No. Observations:,19760.0
Model:,Logit,Df Residuals:,19755.0
Method:,MLE,Df Model:,4.0
Date:,"Mon, 26 Oct 2020",Pseudo R-squ.:,0.01347
Time:,11:53:26,Log-Likelihood:,-10902.0
converged:,True,LL-Null:,-11051.0
Covariance Type:,nonrobust,LLR p-value:,3.512e-63

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.3800,0.030,-46.462,0.000,-1.438,-1.322
sex[T.Male],0.4138,0.040,10.235,0.000,0.335,0.493
center(birth_year),-0.0178,0.002,-8.211,0.000,-0.022,-0.014
duration_night_shiftwork,0.0031,0.002,1.492,0.136,-0.001,0.007
duration_night_shiftwork:sex[T.Male],0.0025,0.003,0.963,0.336,-0.003,0.008


In [41]:
female_duration_effect_p = results.f_test("duration_night_shiftwork = 0").pvalue
male_duration_effect_p = results.f_test("duration_night_shiftwork:sex[T.Male] + duration_night_shiftwork = 0").pvalue
print("Logistic model of shiftwork duration versus metabolic syndrome:")
print(f"Duration effect in females: {female_duration_effect_p:0.2e}")
print(f"Duration effect in males: {male_duration_effect_p:0.2e}")
print(f"Interaction effect of duration and sex: {results.pvalues['duration_night_shiftwork:sex[T.Male]']:0.2e}")

Logistic model of shiftwork duration versus metabolic syndrome:
Duration effect in females: 1.36e-01
Duration effect in males: 2.38e-04
Interaction effect of duration and sex: 3.36e-01


In [255]:
# Plot the percent metabolic syndrome versus duration of night shiftwork
#fig, ax = pylab.subplots()
baseline = tetrad_data[~tetrad_data.ever_night_shiftwork].metabolic_syndrome.mean()
smoothed = sm.nonparametric.lowess(data['metabolic_syndrome'], data['duration_night_shiftwork'],
                                  return_sorted=True,
                                   frac=0.3,
                                  delta=1,
                                  it=0)
#ax.plot(smoothed[:,0], smoothed[:,1])
#ax.axhline(baseline, c="k",)
sns.lmplot(x="duration_night_shiftwork", y="metabolic_syndrome", data=data, logistic=False, ci = 95, y_jitter=0.1, hue="sex")

  frac=frac, it=it, delta=delta, given_xvals=False)


<IPython.core.display.Javascript object>

<seaborn.axisgrid.FacetGrid at 0x1f523d859b0>

In [109]:
#fig, ax = pylab.subplots()

sns.lmplot(x="duration_night_shiftwork", y="triglycerides", data=tetrad_data[tetrad_data.ever_night_shiftwork], hue="sex")
results = smf.ols("triglycerides ~ center(birth_year) + duration_night_shiftwork * sex",
                  data=tetrad_data[tetrad_data.ever_night_shiftwork]).fit()
results.summary()

<IPython.core.display.Javascript object>

0,1,2,3
Dep. Variable:,triglycerides,R-squared:,0.068
Model:,OLS,Adj. R-squared:,0.067
Method:,Least Squares,F-statistic:,168.0
Date:,"Mon, 26 Oct 2020",Prob (F-statistic):,4.54e-139
Time:,14:59:25,Log-Likelihood:,-12808.0
No. Observations:,9281,AIC:,25630.0
Df Residuals:,9276,BIC:,25660.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.4729,0.022,67.830,0.000,1.430,1.515
sex[T.Male],0.4909,0.032,15.252,0.000,0.428,0.554
center(birth_year),-0.0056,0.001,-4.354,0.000,-0.008,-0.003
duration_night_shiftwork,-0.0013,0.001,-1.166,0.244,-0.003,0.001
duration_night_shiftwork:sex[T.Male],0.0005,0.001,0.374,0.708,-0.002,0.003

0,1,2,3
Omnibus:,4163.812,Durbin-Watson:,2.024
Prob(Omnibus):,0.0,Jarque-Bera (JB):,29492.909
Skew:,2.02,Prob(JB):,0.0
Kurtosis:,10.743,Cond. No.,97.1


## Investigate the more specific job types

In [254]:
 top_shiftwork_job_categories = {
     "healthcare": "health (human or animal), residential/social/religious care, undertaking (including managers)",
    "security": "armed forces, emergency services, security, health & safety (including managers)",
    #"science_IT": "science, research, engineering, computer technology (including managers)",
}
figs = []
for name, job_category in top_shiftwork_job_categories.items():
    in_job_category = employment[employment.job_category == job_category].ID.unique()
    
    # test the interaction
    results = OLS("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork * sex", data=data[data.index.isin(in_job_category)]).fit()
    p = results.pvalues["ever_night_shiftwork[T.True]:sex[T.Male]"]
    
    fig, ax = pylab.subplots()
    sns.barplot(x="sex", y="metabolic_syndrome", hue="ever_night_shiftwork", data=data[data.index.isin(in_job_category)],
                saturation=0.85, capsize=0.15, errwidth=1.5)
    ax.set_ylabel("Percent with metabolic syndrome")
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    legend = [c for c in ax.get_children() if type(c) is pylab.mpl.legend.Legend][0]
    legend.set_title("")
    legend.texts[0].set_text("No night shifts")
    legend.texts[1].set_text("Night shifts")
    ax.set_title(name + f"\nInteraction p={p:0.2e}")
    pylab.show()
    figs.append(fig)
for fig in figs:
    fig.savefig(OUTDIR+f"by_job_category.{name}.svg")

Attempt 0


<IPython.core.display.Javascript object>

Attempt 0


<IPython.core.display.Javascript object>

# Activity differences by sex

In [106]:
full_activity = pandas.read_csv("../processed/activity_features_aggregate_seasonal.txt", sep="\t", dtype={'Unnamed: 0': str})
activity_summary = pandas.read_csv("../processed/activity_summary_aggregate.txt", index_col=0, sep="\t")

# Separate out the user ID from the run number (0 = original, 1-4 are seasonal repeats)
full_activity.rename(columns={"Unnamed: 0": "run_id"}, inplace=True)
full_activity['id'] = full_activity.run_id.apply(lambda x: int(x.split('.')[0]))
full_activity['run'] = full_activity.run_id.apply(lambda x: int(x.split('.')[1]))
activity = full_activity[full_activity.run == 0]
activity.set_index('id', inplace=True)
activity = activity.join(activity_summary)

# drop activity for people who fail basic QC
okay = (activity_summary['quality-goodCalibration'].astype(bool)
            & (~activity_summary['quality-daylightSavingsCrossover'].astype(bool))
            & (activity_summary['quality-goodWearTime'].astype(bool))
       )
activity = activity[okay]
activity.columns = activity.columns.str.replace("-","_") # Can't use special characters easily
print(f"Dropping {(~okay).sum()} entries out of {len(okay)} due to bad quality or wear-time")

## Process activity variables that need cleaning
activity.phase = activity.phase % 24
print(f"Have activity data on {len(activity)}")


  interactivity=interactivity, compiler=compiler, result=result)


Dropping 11363 entries out of 103688 due to bad quality or wear-time
Have activity data on 92325


In [107]:
activity = activity.join(ukbb_full[["sex", "birth_year", "smoking", "ethnicity", "household_income", "BMI"]])
def selected_ethnicity(ethnicity):
    if ethnicity in ["White", "British", "Irish", "Any other white background"]:
        return "White"
    return "Nonwhite or mixed"                       
activity['ethnicity_selected'] = activity.ethnicity.apply(selected_ethnicity)

In [108]:
activity.ethnicity_selected.value_counts()

White                89161
Nonwhite or mixed     3124
Name: ethnicity_selected, dtype: int64

In [274]:
## Test for sex differences in particular variables of interest
activity_variables = ["acceleration_RA", "mesor", "phase", "cosinor_rsquared", "total_sleep_mean", "WASO_mean", "amplitude"]
activity_results = []
for var in activity_variables:
    #results = smf.ols(f"{var} ~ center(birth_year) * sex + ethnicity_selected + household_income + smoking", data=activity).fit()
    #results = smf.ols(f"{var} ~ sex + (center(birth_year) + ethnicity_selected + household_income + smoking)", data=activity).fit()
    results = smf.ols(f"{var} ~ sex + (center(birth_year) + ethnicity_selected + household_income + smoking)", data=activity).fit()
    activity_results.append({
        "var": var,
        "sex_difference_pvalue": results.pvalues['sex[T.Male]'],
        "male_female_difference": results.params['sex[T.Male]'],
        "male_median": activity[activity.sex == "Male"][var].median(),
        "female_median": activity[activity.sex == "Female"][var].median(),
        "male_mean": activity[activity.sex == "Male"][var].mean(),
        "female_mean": activity[activity.sex == "Female"][var].mean(),
    })

In [279]:
print(f"Number of participants with actigraphy data being used:\n{results.nobs}")

Number of participants with actigraphy data being used:
91672.0


In [103]:
activity_results_df = pandas.DataFrame(activity_results)
for i, results in activity_results_df.iterrows():
    print(results['var'])
    print(f"Sex difference: p = {results.sex_difference_pvalue:0.2e}")
    print(f"Controlled male-female difference: {results.male_female_difference}")
    print(f"Median in males: {results.male_median}")
    print(f"Median in females: {results.female_median}")
    print(f"Mean in males: {results.male_mean}")
    print(f"Mean in females: {results.female_mean}")
    print()

acceleration_RA
Sex difference: p = 1.32e-126
Controlled male-female difference: -0.01198271882746957
Median in males: 0.8553137825079963
Median in females: 0.8638872838026901
Mean in males: 0.8368277296807988
Mean in females: 0.8493796968427771

mesor
Sex difference: p = 1.54e-146
Controlled male-female difference: -0.015742660889922187
Median in males: 1.0204073921270946
Median in females: 1.0394918111203402
Mean in males: 1.0204612273141571
Mean in females: 1.038698813631847

phase
Sex difference: p = 8.29e-33
Controlled male-female difference: -0.08561740669622803
Median in males: 14.467717058744762
Median in females: 14.56429485930841
Mean in males: 14.49494147155485
Mean in females: 14.580589329767

cosinor_rsquared
Sex difference: p = 0.00e+00
Controlled male-female difference: -0.03252540700881763
Median in males: 0.2932380317918082
Median in females: 0.32822914457211094
Mean in males: 0.29305907877311627
Mean in females: 0.32507992917639983

total_sleep_mean
Sex difference: p 

In [104]:
# Plot config by variables
plot_config = {
    "acceleration_RA": {
        "xbottom": 0.6,
        "xtop": 1.0,
        "point_width": 0.01,
        "bandwidth": 0.15,
        "label": "RA",
    },
    "amplitude": {
        "xbottom": 0.1,
        "xtop": 0.9,
        "point_width": 0.01,
        "bandwidth": 0.25,
        "label": "Amplitude",
    },
    "mesor": {
        "xbottom": 0.0,
        "xtop": 2.0,
        "point_width": 0.01,
        "bandwidth": 0.25,
        "label": "Mesor",     
    },
    "WASO_mean": {
        "xbottom": 0,
        "xtop": 1,
        "point_width": 0.01,
        "bandwidth": 0.25,
        "label": "WASO",
    },
    "total_sleep_mean": {
        "xbottom": 5,
        "xtop": 12,
        "point_width": 0.01,
        "bandwidth": 0.25,
        "label": "Sleep Duration",
    },
    "cosinor_rsquared": {
        "xbottom": 0,
        "xtop": 1,
        "point_width": 0.01,
        "bandwidth": 0.25,
        "label": "Cosinor R-squared",
    },
    "phase": {
        "xbottom": 11,
        "xtop": 19,
        "point_width": 0.01,
        "bandwidth": 0.25,
        "label": "Cosinor Phase",
    },
}

def density_plot(data, var="acceleration_RA", normalize=False, confidence_interval=True, rescale=True, annotate=False):
    CONTROL_COLOR = "teal"
    CASE_COLOR = "orange"
    UNCERTAIN_COLOR = (0.8, 0.8, 0.8)
    
    config = plot_config[var]
    xbottom = config['xbottom']
    xtop = config['xtop']
    point_width = config['point_width']
    bandwidth = config['bandwidth']
    eval_x = numpy.linspace(xbottom, xtop, int(0.5/point_width + 1))
    
    case = data.sex == "Male"

    case_scaling = (case).sum() * point_width if rescale else 1
    control_scaling = (~case).sum() * point_width if rescale else 1
    case_avg = data[var][case].mean()
    control_avg = data[var][~case].mean()
    total_incidence = case.sum()/len(case)

    def densities_and_incidence(data):
        cases = data[var][case].dropna()
        controls = data[var][~case].dropna()
        case_density = scipy.stats.gaussian_kde(cases, bw_method=bandwidth)(eval_x) * case_scaling
        control_density = scipy.stats.gaussian_kde(controls, bw_method=bandwidth)(eval_x) * control_scaling
        if not normalize:
            #incidence = local_regression(data[var], data[code], eval_x, bw=0.1)
            incidence = case_density / (control_density  + case_density)
        if normalize:
            incidence = case_density / total_incidence / 2 / (control_density + case_density / total_incidence / 2)
        return case_density, control_density, incidence
    
    case_density, control_density, incidence = densities_and_incidence(data)

    fig, (ax1,ax3) = pylab.subplots(nrows=2, sharex=True,
                                    figsize=(6,3),
                                    gridspec_kw = {"hspace":0.05, "bottom":0.2})

    # Plot the data
    ax1.fill_between(eval_x, 0, control_density, color=CONTROL_COLOR)
    ax3.fill_between(eval_x, 0, case_density, color=CASE_COLOR)

    # Plot avgs
    ax1.axvline(control_avg, c='k', linestyle="--")
    ax3.axvline(case_avg, c='k', linestyle="--")

    # Label plot
    ax1.set_ylabel(f"Femles\nN={(~case).sum()}")
    ax3.set_ylabel(f"Males\nN={case.sum()}") 
    ax3.set_xlabel(config['label'])

    ax1.spines['left'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.spines['top'].set_visible(False)
    ax1.tick_params(bottom=False)
    ax3.tick_params(bottom=False)
    ax1.yaxis.set_ticks([])
    ax3.spines['left'].set_visible(False)
    ax3.spines['right'].set_visible(False)
    ax3.yaxis.set_ticks([])

    # Set axis limits
    ax1.set_xlim(xbottom, xtop)
    if not normalize:
        max_density = max(numpy.max(case_density), numpy.max(control_density))
        ax1.set_ylim(0, max_density)
        ax3.set_ylim(0, max_density)
    else:
        ax1.set_ylim(0)
        ax3.set_ylim(0)
    ax3.invert_yaxis()

    if annotate:
        ax1.annotate("Female mean",
                        xy=(control_avg, numpy.max(control_density)/2),
                        xytext=(-50,0),
                        textcoords="offset pixels",
                        ha="right",
                        va="center",
                        arrowprops={"arrowstyle": "->"})
        ax3.annotate("Male mean",
                        xy=(case_avg, numpy.max(control_density)/2),
                        xytext=(-50,0),
                        textcoords="offset pixels",
                        ha="right",
                        va="center",
                        arrowprops={"arrowstyle": "->"})


    #ax1.set_title(var)
    return fig
fig = density_plot(activity, annotate=True )
fig.savefig(OUTDIR+"RA.by_sex.svg")

<IPython.core.display.Javascript object>

In [105]:
for var in activity_variables:
    fig = density_plot(activity, var=var,
                annotate=True if var == activity_variables[0] else False)
    fig.savefig(OUTDIR+f"{var}.by_sex.svg")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>