# Shiftwork analysis

Check if there are associations of phenotypes with shift work

In [1]:
%matplotlib notebook
import pathlib
import scipy
import numpy
from IPython.display import display, HTML
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import re
import matplotlib.patches as mpatches
from scipy.cluster import hierarchy
import pylab

# Configure plot settings
# Make it so that plots autolayout to fit everything (particularly text labels)
# NOTE: this can mess with dynamic zooming
#pylab.rcParams['figure.autolayout'] = True

In [2]:
COHORT = 1
OUTDIR = f"../shiftwork/cohort{COHORT}/"

In [3]:
import pandas
#full_activity = pandas.read_csv("../processed/activity_features_aggregate.txt", index_col=0, sep="\t")
#activity_summary = pandas.read_csv("../processed/activity_summary_aggregate.txt", index_col=0, sep="\t")
ukbb_full = pandas.read_hdf("../processed/ukbb_data_table.h5")
employment_full = pandas.read_csv("../processed/ukbb_employment_history.txt", sep="\t")

In [4]:
ukbb_full.columns = ukbb_full.columns.str.replace("[,:/]","_") # Can't use special characters easily

In [5]:
covariates = [
              "sex", "ethnicity", "overall_health", "household_income", "smoking", "birth_year", "BMI",
               #'education_Prefer_not_to_answer', # This answer causes problems for some reason
               'education_None_of_the_above',
               'education_College_or_University_degree',
               'education_A_levels_AS_levels_or_equivalent', 
               'education_O_levels_GCSEs_or_equivalent',
               'education_CSEs_or_equivalent',
               'education_NVQ_or_HND_or_HNC_or_equivalent',
               'education_Other_professional_qualifications_eg__nursing__teaching',
                ]

all_ids = sorted(employment_full.ID.unique())

# Down sample for testing
# Total with employment history is 120,000
numpy.random.seed(0)
cohort_id_ranges = {1: slice(0, 30_000),
           2: slice(30_000,120299)}
selected_ids = numpy.random.choice(all_ids, size=len(all_ids), replace=False)[cohort_id_ranges[COHORT]]

print(f"Data size before selecting test set: {ukbb_full.shape}")
print(f"of which {len(employment_full.ID.unique())} had detailed employment information")

employment = employment_full[employment_full.ID.isin(selected_ids)].copy()
data = ukbb_full[ukbb_full.index.isin(selected_ids)].copy()

print(f"Data size after selecting test set: {data.shape}")
print(f"Employment data size before selecting test set: {employment_full.shape}")
print(f"Employment data size after selecting test set: {employment.shape}")

Data size before selecting test set: (502507, 187)
of which 120299 had detailed employment information
Data size after selecting test set: (30000, 187)
Employment data size before selecting test set: (402689, 20)
Employment data size after selecting test set: (100394, 20)


In [6]:
# Find an ever-shiftwork cohort and ever-night-shift cohort
data['ever_shiftwork'] = employment.groupby("ID").job_involved_shift_work.any()
employment['involved_night_shift'] = ((employment.night_shifts_worked.isin([0,1]))
                                     | (employment.mixture_of_day_and_night_shifts.isin([0,1])))
data['ever_night_shiftwork'] = employment.groupby("ID").involved_night_shift.any()
print(f"Of {data.shape[0]}, {sum(data.ever_shiftwork)} have some shiftwork, of which {sum(data.ever_night_shiftwork)} have some night shiftwork")

Of 30000, 8633 have some shiftwork, of which 6995 have some night shiftwork


In [7]:
data.head()

Unnamed: 0_level_0,alcohol_frequency,education_Prefer_not_to_answer,education_None_of_the_above,education_College_or_University_degree,education_A_levels_AS_levels_or_equivalent,education_O_levels_GCSEs_or_equivalent,education_CSEs_or_equivalent,education_NVQ_or_HND_or_HNC_or_equivalent,education_Other_professional_qualifications_eg__nursing__teaching,ethnicity,...,morning_evening_person,nap_during_day,sleep_duration,sleeplessness,snoring,age_at_death,date_of_death,primary_cause_of_death_ICD10,ever_shiftwork,ever_night_shiftwork
f.eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000124,Once or twice a week,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,British,...,More a 'morning' than 'evening' person,Sometimes,8.0,Sometimes,Yes,,,,False,False
1000264,Three or four times a week,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Other ethnic group,...,Do not know,Never/rarely,7.0,Never/rarely,No,,,,False,False
1000322,Three or four times a week,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,Irish,...,More a 'morning' than 'evening' person,Never/rarely,7.0,Sometimes,Yes,,,,True,True
1000572,Once or twice a week,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,British,...,More a 'morning' than 'evening' person,Sometimes,8.0,Sometimes,No,,,,False,False
1000620,,,,,,,,,,,...,,,,,,,,,False,False


In [8]:
def BH_FDR(ps):
    ''' Benjamini-Hochberg FDR control

    Converts p values to q values'''

    # For the purposes of comparison, an implementation of Benjamini Hochberg correction
    sort_order = numpy.argsort(ps)

    adjusted = numpy.zeros(ps.shape)
    adjusted[sort_order] = numpy.array(ps)[sort_order]*len(ps)/numpy.arange(1,len(ps)+1)

    # Make monotone, skipping NaNs
    m = 1;
    for i, r in enumerate(sort_order[::-1]):
        if numpy.isfinite(adjusted[r]):
            m = min(adjusted[r], m)
            adjusted[r] = m

    return adjusted # the q-values

In [9]:
# Add the amount of time at the job to the employment data
# if the year_job_ended == -313, then the job was ongoing at the time of the questionairre.
# We treat this as the maximum of 2017 and one year after the job started
employment['year_job_ended_corrected'] = employment['year_job_ended'].copy()
employment.loc[employment.year_job_ended == -313.0, 'year_job_ended_corrected'] = 2017
employment['duration'] = employment['year_job_ended_corrected'] - employment['year_job_started']
employment.loc[employment.year_job_ended == -313.0, 'duration'] = numpy.maximum(1, employment.loc[employment.year_job_ended == -313.0, 'duration'])

In [10]:
# Add duration of night-shiftwork jobs up
def years_employed(data):
    # Sum number of years employed, not double-counting years when two jobs were listed
    data_sorted = data.sort_values(by="year_job_started")
    starts, ends = data.year_job_started, data.year_job_ended_corrected
    duration = 0
    last_end = float("-inf")
    for start, end in zip(starts, ends):
        duration += max(end - max(start, last_end), 0)
        last_end = max(end, last_end)
    # If employed at all, we give at least 0.5 year duration since
    # may start and end on the same year but 0 is clearly a bad choice
    if len(data) > 0:
        duration = max(duration, 0.5)
    return duration
data['duration_night_shiftwork'] = employment[employment.involved_night_shift & (~employment.duplicated())].groupby("ID").apply(years_employed)
data.duration_night_shiftwork.fillna(0, inplace=True)

## Summarize Job Types

In [11]:
job_code_info = pandas.read_csv("../coding497.tsv", sep="\t", engine="python", index_col='node_id')

In [12]:
job_code_info['level'] = float("NaN")
def get_level(index):
    level = job_code_info.loc[index].level
    if pandas.isna(level):
        parent = job_code_info.loc[index].parent_id
        if parent == 0:
            level = 1
        else:
            level = get_level(parent) + 1
        job_code_info.loc[index, 'level'] = level
    return level
        
[get_level(index) for index in job_code_info.index]

def get_level_meaning(index, level=1):
    if job_code_info.loc[index].level <= level:
        return job_code_info.loc[index].meaning
    return get_level_meaning(job_code_info.loc[index].parent_id, level=level)
job_code_info['level1_meaning'] = job_code_info.index.map(get_level_meaning)
job_code_info['level2_meaning'] = job_code_info.index.map(lambda x: get_level_meaning(x,2))

job_code_info = job_code_info.reset_index().set_index('coding')

In [13]:
employment['job_category'] = employment.job_code.map(job_code_info.level1_meaning)
employment['job_category_level2'] = employment.job_code.map(job_code_info.level2_meaning)

In [14]:
# Gather the shift-work data and compute frequencies of the job categories
pandas.DataFrame({
    "all_night_shift": employment[employment.involved_night_shift].job_category.value_counts(),
    "male": employment[employment.involved_night_shift & (employment.ID.map(data.sex) == "Male")].job_category.value_counts(),
    "female": employment[employment.involved_night_shift & (employment.ID.map(data.sex) == "Female")].job_category.value_counts(),
}).sort_values(by="all_night_shift", ascending=False)

Unnamed: 0,all_night_shift,male,female
"health (human or animal), residential/social/religious care, undertaking (including managers)",3905,577,3328
"armed forces, emergency services, security, health & safety (including managers)",1388,1137,251
"science, research, engineering, computer technology (including managers)",914,789,125
routine factory-based manufacturing (including managers),863,779,84
"transport (road, rail, air, water), work with other mobile machinery (including managers)",640,548,92
skilled manual work (including managers),594,580,14
"office-based work: professional, managerial, administrative or general office/clerical",565,321,244
"mining, quarrying, energy production, water treatment (including managers)",303,301,2
"personal services, travel/tourism, hospitality (including managers)",283,102,181
"sport, culture, arts, media, entertainment (including managers)",265,149,116


## Test association with metabolic syndrome

Following the definition in https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5436094/ for determining metabolic syndrome cases:
    defined as waist circumference ⩾94 cm for males and ⩾80 cm for females)
    as well as at least two of the following four factors: 
    (1) raised triglycerides (⩾1.7 mmol l−1),2
    (2) reduced HDL cholesterol (<1.03 mmol l−1 in males and <1.29 mmol l−1 for females),
    (3) raised blood pressure (BP) (systolic BP ⩾130 mm Hg or diastolic BP ⩾85 mm Hg),
    (4) raised fasting plasma glucose (defined as HbA1c levels⩾5.7 mmol l−1

In [15]:
def at_least_two(*args):
    df = pandas.DataFrame({i:arg for i, arg in enumerate(args)})
    return df.sum(axis=1) >= 2
data['metabolic_syndrome'] = (
    (data.sex == "Male") & (
        (data.waist_circumference >= 94) & at_least_two(
            (data.triglycerides >= 1.7),
            (data.hdl_cholesterol < 1.03),
            ((data.systolic_blood_pressure_V0 > 130) |
             (data.diastolic_blood_pressure_V0 > 85)),
            ((data.glycated_heamoglobin > 5.7) &
             (data.blood_sample_fasting_time >= 8))
        )
    )
    |
    (data.sex == "Female") & (
        (data.waist_circumference >= 80) & at_least_two(
            (data.triglycerides >= 1.7),
            (data.hdl_cholesterol < 1.29),
            ((data.systolic_blood_pressure_V0 > 130) |
             (data.diastolic_blood_pressure_V0 > 85)),
            ((data.glycated_heamoglobin > 5.7) &
             (data.blood_sample_fasting_time >= 8))
        )
    )
)

In [16]:
print(f"Identified {data.metabolic_syndrome.sum()} cases out of {len(data)}")
print(f"Male: {(data.metabolic_syndrome & (data.sex == 'Male')).sum()}")
print(f"Female: {(data.metabolic_syndrome & (data.sex == 'Female')).sum()}")

Identified 7207 cases out of 30000
Male: 3805
Female: 3402


### Simply model without using job codes

In [17]:
results = smf.ols("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork * sex", data=data).fit()
results.summary()

0,1,2,3
Dep. Variable:,metabolic_syndrome.astype(int),R-squared:,0.018
Model:,OLS,Adj. R-squared:,0.018
Method:,Least Squares,F-statistic:,139.5
Date:,"Mon, 28 Sep 2020",Prob (F-statistic):,2.6200000000000003e-118
Time:,10:44:10,Log-Likelihood:,-16778.0
No. Observations:,30000,AIC:,33570.0
Df Residuals:,29995,BIC:,33610.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.2009,0.004,55.371,0.000,0.194,0.208
ever_night_shiftwork[T.True],0.0340,0.009,3.984,0.000,0.017,0.051
sex[T.Male],0.0566,0.006,9.939,0.000,0.045,0.068
ever_night_shiftwork[T.True]:sex[T.Male],0.0471,0.012,4.031,0.000,0.024,0.070
center(birth_year),-0.0043,0.000,-13.490,0.000,-0.005,-0.004

0,1,2,3
Omnibus:,5106.417,Durbin-Watson:,1.992
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7302.235
Skew:,1.182,Prob(JB):,0.0
Kurtosis:,2.5,Cond. No.,43.7


In [18]:
results.pvalues

Intercept                                   0.000000e+00
ever_night_shiftwork[T.True]                6.787441e-05
sex[T.Male]                                 3.050027e-23
ever_night_shiftwork[T.True]:sex[T.Male]    5.577871e-05
center(birth_year)                          2.356116e-41
dtype: float64

In [19]:
# Basic percent-case for each group
percent_metabolic_syndrome = data.groupby(["sex", "ever_night_shiftwork"]).metabolic_syndrome.mean().reset_index()
percent_metabolic_syndrome

Unnamed: 0,sex,ever_night_shiftwork,metabolic_syndrome
0,Female,False,0.199516
1,Female,True,0.226084
2,Male,False,0.261491
3,Male,True,0.340463


In [23]:
# Plot the raw data by sex and shiftwork (percentage metabolic)
#fig, ax = pylab.subplots()
#ax.bar(x=numpy.arange(4), height=percent_metabolic_syndrome*100)
#ax.set_title("Metabolic Syndrome and Shiftwork")
#ax.set_ylabel("Percent with metabolic syndrome")
#ax.set_xticks(numpy.arange(4))
#ax.set_xticklabels([f"{a} {'Shiftwork' if b else 'None'}" for a,b in percent_metabolic_syndrome.index])
data['metabolic_syndrome_percent'] = data.metabolic_syndrome * 100
fig, ax = pylab.subplots()
g = sns.barplot(x="sex", y="metabolic_syndrome_percent", hue="ever_night_shiftwork",  data = data,
            saturation=0.85, capsize=0.15, errwidth=1.5)
fig.savefig(OUTDIR+"/percent_metabolic_syndrome.png")
ax.set_ylabel("Percent with metabolic syndrome")
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
legend = [c for c in ax.get_children() if type(c) is pylab.mpl.legend.Legend][0]
legend.set_title("")
legend.texts[0].set_text("No night shifts")
legend.texts[1].set_text("Night shifts")

<IPython.core.display.Javascript object>

## Mixed Model account for Job Type

In [21]:
# First we gather a lifetime history of job employment type
# each individual is given a False/True for each job type if they have ever worked in that job
job_types = employment.groupby("ID").job_category.value_counts().unstack().fillna(0) > 0
data_employment = data.join(job_types)

In [24]:
job_terms = ' + '.join(f'Q("{term}")' for term in job_types.columns)
results = smf.ols(data = data_employment, formula=f"metabolic_syndrome.astype(int) ~ center(birth_year) + {job_terms} + sex * ever_night_shiftwork").fit()
results.summary()

LinAlgError: SVD did not converge

In [25]:
results_reduced = smf.ols(data = data_employment, formula=f"metabolic_syndrome.astype(int) ~ center(birth_year) + {job_terms} + sex + ever_night_shiftwork").fit()
_, pvalue, _ = results.compare_f_test(results_reduced)
print(f"Interaction term p-value:", pvalue)

Interaction term p-value: nan


## Control for job type via pairings

In [26]:
## Try to find 'tetrads' of male/female night/day workers...
# WARNING: slow!!
tetrad_list_file = pathlib.Path(OUTDIR+"/tetrad_id_list.txt")
if not tetrad_list_file.exists():
    used_ids = set()
    for ID in data[data.ever_night_shiftwork].index:
        if ID not in used_ids:
            # Try to find matches
            sex, shiftwork = data.loc[ID,['sex', 'ever_night_shiftwork']]
            job_history = employment[employment.ID == ID]
            # Find the/a job that is shiftwork for them
            jobtype = job_history[job_history.involved_night_shift].sort_values(by="duration").job_category_level2.iloc[0]

            # Find those with the same job type and haven't been used yet
            possible_ids = employment[employment.job_category_level2 == jobtype].ID.unique()

            tetrad = {}
            tetrad[(sex, shiftwork)] = ID
            for ID2 in set(possible_ids).difference(used_ids):
                sex2, shiftwork2 = data.loc[ID2, ['sex', 'ever_night_shiftwork']]
                if (sex2, shiftwork2) not in tetrad:
                    tetrad[(sex2, shiftwork2)] = ID2
                    if len(tetrad) == 4:
                        break
            if len(tetrad) == 4:
                used_ids.update(tetrad.values())
    pandas.Series(list(used_ids)).to_csv(tetrad_list_file, sep="\t", index=0)
else:
    # If output already exists, we just use that list of ids instead
    print(f"Reading tetrad list in from {tetrad_list_file}")
    used_ids = set(int(x) for x in tetrad_list_file.read_text().splitlines())

Reading tetrad list in from ..\shiftwork\cohort1\tetrad_id_list.txt


In [27]:
tetrad_data = data[data.index.isin(used_ids)]
print(f"Identified N={len(used_ids)} individuals in {len(used_ids)//4} 'tetrads' with similar job histories\nand one each of the four combinations (male/female) (shiftwork/noshiftwork)")

Identified N=6732 individuals in 1683 'tetrads' with similar job histories
and one each of the four combinations (male/female) (shiftwork/noshiftwork)


In [33]:
results = smf.ols("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data).fit()
reduced = smf.ols("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork + sex", data=tetrad_data).fit()
_, nested_p, _ = results.compare_f_test(reduced)
male_shiftwork_p = results.f_test("ever_night_shiftwork[T.True] + ever_night_shiftwork[T.True]:sex[T.Male] = 0").pvalue
results.summary()
#reduced.summary()

0,1,2,3
Dep. Variable:,metabolic_syndrome.astype(int),R-squared:,0.016
Model:,OLS,Adj. R-squared:,0.015
Method:,Least Squares,F-statistic:,27.28
Date:,"Mon, 28 Sep 2020",Prob (F-statistic):,1.72e-22
Time:,11:15:50,Log-Likelihood:,-3936.2
No. Observations:,6732,AIC:,7882.0
Df Residuals:,6727,BIC:,7916.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.2181,0.011,20.601,0.000,0.197,0.239
ever_night_shiftwork[T.True],0.0055,0.015,0.366,0.715,-0.024,0.035
sex[T.Male],0.0434,0.015,2.887,0.004,0.014,0.073
ever_night_shiftwork[T.True]:sex[T.Male],0.0630,0.021,2.974,0.003,0.021,0.105
center(birth_year),-0.0037,0.001,-5.395,0.000,-0.005,-0.002

0,1,2,3
Omnibus:,1352.367,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1461.156
Skew:,1.077,Prob(JB):,0.0
Kurtosis:,2.244,Cond. No.,40.8


In [34]:
print("P-Values when controlling for job types")
print("Total shift-work effect:", reduced.pvalues["ever_night_shiftwork[T.True]"])
print("Female shift-work effect:", results.pvalues["ever_night_shiftwork[T.True]"])
print("Male shift-work effect:", male_shiftwork_p)
print("Differential Male/Female effect:", results.pvalues["ever_night_shiftwork[T.True]:sex[T.Male]"])

P-Values when controlling for job types
Total shift-work effect: 0.0004967242289831486
Female shift-work effect: 0.7147403780533731
Male shift-work effect: 4.984012989995691e-06
Differential Male/Female effect: 0.002949996564679227


In [32]:
#results = smf.logit("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data).fit()
#reduced = smf.logit("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork + sex", data=tetrad_data).fit()
#results.summary()

In [35]:
# Figure for the controlled model
fig, ax = pylab.subplots()
model_results = pandas.DataFrame({
    "Sex": ["Male", "Male", "Female", "Female"],
    "Shiftwork": [False, True, False, True],
    "metabolic_syndrome": [
        results.params["Intercept"] + results.params["sex[T.Male]"],
        results.params["Intercept"] + results.params["ever_night_shiftwork[T.True]"] + results.params["sex[T.Male]"] + results.params["ever_night_shiftwork[T.True]:sex[T.Male]"],
        results.params["Intercept"],
        results.params["Intercept"] + results.params["ever_night_shiftwork[T.True]"],
    ]
})
model_results['metabolic_syndrome'] *= 100 # Convert to percents
sns.barplot(x="Sex", y="metabolic_syndrome", hue="Shiftwork",  data = model_results)
ax.set_ylabel("Percent with Metabolic Syndrome")

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Percent with Metabolic Syndrome')

In [None]:
# Figure for the RAW data of the "tetrad" dataset
fig, ax = pylab.subplots()
model_results = pandas.DataFrame({
    "Sex": ["Male", "Male", "Female", "Female"],
    "Shiftwork": [False, True, False, True],
    "metabolic_syndrome": [
        tetrad_data[(tetrad_data.sex == "Male") & (tetrad_data.ever_night_shiftwork == False)].metabolic_syndrome.mean(),
        tetrad_data[(tetrad_data.sex == "Male") & (tetrad_data.ever_night_shiftwork == True)].metabolic_syndrome.mean(),
        tetrad_data[(tetrad_data.sex == "Female") & (tetrad_data.ever_night_shiftwork == False)].metabolic_syndrome.mean(),
        tetrad_data[(tetrad_data.sex == "Female") & (tetrad_data.ever_night_shiftwork == True)].metabolic_syndrome.mean()
    ]
})
model_results['metabolic_syndrome'] *= 100 # Convert to percents
sns.barplot(x="Sex", y="metabolic_syndrome", hue="Shiftwork",  data = model_results)
ax.set_ylabel("Percent with Metabolic Syndrome")

## Check associations with the five criteria for Metabolic Syndrome

In [None]:
results_waist = smf.ols("waist_circumference ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data).fit()
print(results_waist.summary())

In [None]:
results_triglycerides = smf.ols("triglycerides ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data).fit()
print(results_triglycerides.summary())
print("Interaction p-value:", results_triglycerides.pvalues["ever_night_shiftwork[T.True]:sex[T.Male]"])

In [None]:
results_hdl = smf.ols("hdl_cholesterol ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data).fit()
print(results_hdl.summary())

In [None]:
results_systolic = smf.ols("systolic_blood_pressure_V0 ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data).fit()
print(results_systolic.summary())

In [None]:
results_diastolic = smf.ols("diastolic_blood_pressure_V0 ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data).fit()
print(results_diastolic.summary())

In [None]:
results_gh = smf.ols("glycated_heamoglobin ~ center(birth_year) + ever_night_shiftwork * sex", data=tetrad_data[tetrad_data.blood_sample_fasting_time >= 8]).fit()
print(results_gh.summary())

## Dose-response
Does the 'dose' of the shiftwork matter? Do more years in shiftwork imply a larger metabolic syndrome risk?

In [None]:
results = smf.ols("(metabolic_syndrome.astype(int)) ~ center(birth_year) + duration_night_shiftwork * sex",
                  data=tetrad_data).fit()
results.summary()

In [None]:
# Plot the percent metabolic syndrome versus duration of night shiftwork
#fig, ax = pylab.subplots()
baseline = tetrad_data[~tetrad_data.ever_night_shiftwork].metabolic_syndrome.mean()
smoothed = sm.nonparametric.lowess(tetrad_data['metabolic_syndrome'], tetrad_data['duration_night_shiftwork'],
                                  return_sorted=True,
                                   frac=0.3,
                                  delta=1,
                                  it=0)
#ax.plot(smoothed[:,0], smoothed[:,1])
#ax.axhline(baseline, c="k",)
sns.lmplot(x="duration_night_shiftwork", y="metabolic_syndrome", data=tetrad_data, logistic=False, ci = 95, y_jitter=0.1, hue="sex")

In [None]:
#fig, ax = pylab.subplots()

sns.lmplot(x="duration_night_shiftwork", y="triglycerides", data=tetrad_data[tetrad_data.ever_night_shiftwork], hue="sex")
results = smf.ols("triglycerides ~ center(birth_year) + duration_night_shiftwork * sex",
                  data=tetrad_data[tetrad_data.ever_night_shiftwork]).fit()
results.summary()

In [None]:
((tetrad_data.duration_night_shiftwork == 0.5) & tetrad_data.ever_night_shiftwork).idxmax()

In [None]:
print(data.loc[1059503, ["duration_night_shiftwork", "ever_night_shiftwork"]])

years_employed(employment[employment.ID == 1059503])

In [None]:
tetrad_data.duration_night_shiftwork.describe()

In [None]:
results = smf.ols("(waist_circumference) ~ center(birth_year) + duration_night_shiftwork * sex",
                  data=tetrad_data).fit()
results.summary()

In [None]:
### PERMUTATION TEST
ITERS = 10
results = []
for i in range(ITERS):
    resampled = tetrad_data.sample(n=len(tetrad_data), replace=True)
    results.append(smf.ols("(metabolic_syndrome.astype(int)) ~ center(birth_year) + ever_night_shiftwork * sex", data=resampled).fit())
interaction_params = [res.params["ever_night_shiftwork[T.True]:sex[T.Male]"] for res in results]