In [1]:
%matplotlib notebook
import scipy
import numpy
from IPython.display import display, HTML
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import re
import matplotlib.patches as mpatches
from scipy.cluster import hierarchy
import pylab
import pandas

In [2]:
# Load the three input tables used throughout the notebook.
# Both activity tables are tab-separated and indexed by their first column.
full_activity = pandas.read_csv(
    "../processed/activity_features_aggregate.txt",
    sep="\t",
    index_col=0,
)
activity_summary = pandas.read_csv(
    "../processed/activity_summary_aggregate.txt",
    sep="\t",
    index_col=0,
)
# The UKBB table is stored as HDF5.
ukbb = pandas.read_hdf("../processed/ukbb_data_table.h5")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Replace commas, colons and slashes in the UKBB column names with underscores
# (can't use special characters easily downstream).
# "[,:/]" is a regular-expression character class, so regex=True must be passed
# explicitly: the default for `regex` in pandas' str.replace changed to False
# in pandas 2.0, under which the pattern would be matched literally and no
# column would be renamed.
ukbb.columns = ukbb.columns.str.replace("[,:/]", "_", regex=True)

In [4]:
# Exclude activity variables we don't want to use: any column whose name
# matches one of these regular expressions is dropped.
bad_columns = ["_IV$", "_IS$", "^temp_", "^light_"]
good_columns = [
    column
    for column in full_activity.columns
    if not any(re.search(pattern, column) for pattern in bad_columns)
]
activity = full_activity[good_columns]

In [5]:
# Keep only participants who pass basic QC: good calibration, no
# daylight-savings crossover, and good wear time.
good_calibration = activity_summary['quality-goodCalibration'].astype(bool)
dst_crossover = activity_summary['quality-daylightSavingsCrossover'].astype(bool)
good_wear_time = activity_summary['quality-goodWearTime'].astype(bool)
okay = good_calibration & ~dst_crossover & good_wear_time

# Hyphens in column names become underscores (can't use special characters
# easily); "-" is a plain literal, so no regex matching is needed.
activity.columns = activity.columns.str.replace("-", "_", regex=False)
activity = activity[okay]
print(f"Dropping {(~okay).sum()} entries out of {len(okay)} due to bad quality or wear-time")

Dropping 11363 entries out of 103688 due to bad quality or wear-time


  """


In [6]:
# Start the analysis table from a copy of the QC-filtered activity features and
# attach every activity_summary column whose name ends in "overall-avg".
overall_avg_columns = activity_summary.columns[
    activity_summary.columns.str.endswith("overall-avg")
]
data = activity.copy().join(activity_summary[overall_avg_columns], how="left")

# Covariates currently in use; the commented-out entries are kept for reference.
covariates = [
    "sex",
    #"ethnicity",
    #"overall_health",
    #"household_income",
    #"smoking",
    "birth_year",
    #"BMI",
    #'education_Prefer_not_to_answer', # This answer causes problems for some reason
    #'education_None_of_the_above',
    #'education_College_or_University_degree',
    #'education_A_levels/AS_levels_or_equivalent',
    #'education_O_levels/GCSEs_or_equivalent',
    #'education_CSEs_or_equivalent',
    #'education_NVQ_or_HND_or_HNC_or_equivalent',
    #'education_Other_professional_qualifications_eg:_nursing,_teaching',
]

print(f"Data starting size: {data.shape}")

Data starting size: (92325, 62)


In [7]:
# ICD10 entries: load the tab-separated table and shorten the code column
# name from "ICD10_code" to "ICD10".
icd10_entries = pandas.read_csv(
    "../processed/ukbb_icd10_entries.txt", sep="\t"
).rename(columns={"ICD10_code": "ICD10"})

# ICD9 entries: same treatment, "ICD9_code" becomes "ICD9".
icd9_entries = pandas.read_csv(
    "../processed/ukbb_icd9_entries.txt", sep="\t"
).rename(columns={"ICD9_code": "ICD9"})

In [8]:
# Self-reported conditions (tab-separated); condition_code is parsed as int.
self_reported = pandas.read_csv(
    "../processed/ukbb_self_reported_conditions.txt",
    sep="\t",
    dtype={"condition_code": int},
)
# Data dictionary, indexed by FieldID.
data_fields = pandas.read_csv(
    "../Data_Dictionary_Showcase.csv",
    index_col="FieldID",
)
# Codings table; the Coding column is parsed as int.
codings = pandas.read_csv(
    "../Codings_Showcase.csv",
    dtype={"Coding": int},
)

In [9]:
SELF_REPORTED_CONDITION_FIELD = 20002

# Find which coding the self-reported condition field uses, then build a
# Value -> Meaning lookup from the matching rows of `codings`.
# keep=False discards every Value that occurs more than once (all copies),
# so the resulting index is unique.
condition_coding = data_fields.loc[SELF_REPORTED_CONDITION_FIELD].Coding
condition_code_to_meaning = (
    codings[codings.Coding == condition_coding]
    .drop_duplicates(subset=["Value"], keep=False)
    .set_index("Value")
)

# Codes are converted to strings before the lookup into the Value index.
condition_codes_as_text = self_reported.condition_code.astype(str)
self_reported["condition"] = condition_codes_as_text.map(condition_code_to_meaning.Meaning)

In [10]:
## Flag participants with HIV, from either ICD10 entries or self-report

# ICD10 code prefixes treated as HIV.
HIV_CODES = ["B20", "B21", "B22", "B23", "B24"]

# IDs with at least one ICD10 entry starting with any of the prefixes.
hiv_ids = {
    participant_id
    for prefix in HIV_CODES
    for participant_id in icd10_entries[icd10_entries.ICD10.str.startswith(prefix)].ID.unique()
}

# IDs whose self-reported condition is "hiv/aids"; merged into the same set.
reported_hiv_rows = self_reported[self_reported.condition.isin(["hiv/aids"])]
self_reported_hiv_ids = set(reported_hiv_rows.ID.unique())
hiv_ids |= self_reported_hiv_ids

# A row of `data` is a case when its index value is one of the collected IDs.
data['hiv_case'] = data.index.isin(hiv_ids)

print(f"Identified {data.hiv_case.sum()} cases with actigraphy out of {len(hiv_ids)} with HIV throughout UKBB")

Identified 79 cases with actigraphy out of 491 with HIV throughout UKBB


In [21]:
### Build the per-ID summary: HIV status plus birth year and sex from the UKBB
### table, cases listed first, with the index named "ID"
summary_data = (
    data[['hiv_case']]
    .join(ukbb[['birth_year', 'sex']])
    .sort_values('hiv_case', ascending=False)
    .rename_axis("ID")
)

In [22]:
# Write the summary table to disk as a tab-separated file.
summary_data.to_csv(
    path_or_buf="../processed/HIV/ID_list.txt",
    sep="\t",
)