In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
cd ~/demres

/Users/zurfarosa/demres


In [17]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import numpy as np
from datetime import date, timedelta

import demres
from demres.common.constants import entry_type
from demres.common import codelists
from demres.common.helper_functions import *
from demres.common.process_pt_features import *
from demres.common.process_entries import *
from demres.demins.constants import Study_Design as sd
from demres.demins.statistical_functions import *
from common.helper_functions import *
from pprint import pprint
from IPython.display import display

In [18]:
pd.set_option('display.max_columns', None)

In [19]:
pd.set_option('display.max_rows', None)

## Process raw CSV files

In [20]:
# create_pegmed()

In [21]:
# create_pegprod()

In [22]:
# create_prescriptions()

In [23]:
# create_consultations()

In [24]:
# create_clinicals()

In [25]:
# create_tests()

In [26]:
# create_referrals()

In [27]:
# create_immunisations()

In [28]:
# create_medcoded_entries()

## Create processed CVS files to share with DEMINS teams
*convert the newly created HDF files to CVS (for use in other projects)*

In [29]:
# prescriptions = pd.read_hdf('data/processed_data/hdf/prescriptions.hdf')
# prescriptions.to_csv('data/processed_data/to_share_with_DEMINS/prescriptions.csv',index=False)

In [30]:
# consultations = pd.read_hdf('data/processed_data/hdf/consultations.hdf')
# consultations.to_csv('data/processed_data/to_share_with_DEMINS/consultations.csv',index=False)

In [31]:
# immunisations = pd.read_hdf('data/processed_data/hdf/immunisations.hdf')
# immunisations.to_csv('data/processed_data/to_share_with_DEMINS/immunisations.csv',index=False)

In [32]:
# clinicals = pd.read_hdf('data/processed_data/hdf/clinicals.hdf')
# clinicals.to_csv('data/processed_data/to_share_with_DEMINS/clinicals.csv',index=False)

In [33]:
# tests = pd.read_hdf('data/processed_data/hdf/tests.hdf')
# tests.to_csv('data/processed_data/to_share_with_DEMINS/tests.csv',index=False)

In [34]:
# referrals = pd.read_hdf('data/processed_data/hdf/referrals.hdf')
# referrals.to_csv('data/processed_data/to_share_with_DEMINS/referrals.csv',index=False)

In [35]:
# medcoded_entries = pd.read_hdf('data/processed_data/hdf/medcoded_entries.hdf')
# medcoded_entries.to_csv('data/processed_data/to_share_with_DEMINS/medcoded_entries.csv',index=False)

## Create basic pt_features dataframe
*pt_features will contain all the variables (e.g. age, female gender, insomnia) used in the logistic regression*

In [37]:
all_encounters = get_all_encounters()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  all_encounters = pd.concat([consultations,medcoded_entries],ignore_index=True)


In [43]:
all_entries = get_all_entries(all_encounters)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  all_entries = pd.concat([all_encounters,prescriptions],ignore_index=True)


In [86]:
pt_features = create_pt_features()

In [88]:
pt_features = get_index_date_and_caseness_and_add_final_dementia_subtype(all_entries,pt_features)

Number of patients prescribed antidementia drugs but not diagnosed with dementia: 321


In [89]:
pt_features = add_data_start_and_end_dates(all_encounters,pt_features)

calculating latest_sysdate
calculating earliest_sysdate
resampling all_encounters - may take some time...
locating converted codes
choosing most appropriate measure of data_start
removing patients without any events
There are 14 patients without any events


In [90]:
pt_features.to_csv('data/processed_data/pt_features.csv',index=False)

In [91]:
#For sensitivity analysis, remove specific dementias (e.g. FTLD, DLB) from pt_features, but save this to a different CSV file
pt_features_avoid_specific_dementia = avoid_specific_dementia_subtypes(pt_features) 
pt_features_avoid_specific_dementia.to_csv('data/processed_data/pt_features_avoid_specific_dementia.csv',index=False)

removing cases where final dementia subtype is a specific, non-Alzheimer, non-VaD dementia
Number of cases removed:  6442
Number of patients (cases and controls) 88058


In [42]:
for window in [sd.exposure_windows[0],sd.exposure_windows[2]]:
    printbold(window['name'])  
    pt_features = pd.read_csv('data/processed_data/pt_features_avoid_specific_dementia.csv',delimiter=',',parse_dates=['index_date','data_end','data_start'],infer_datetime_format=True)
    pt_features = match_cases_and_controls(pt_features,window)
    pt_features.to_csv('data/processed_data/pt_features_avoid_specific_dementia_'+ window['name'] +'.csv',index=False)


**
12_to_7**

All cases 40130
Number of cases with 10 years of data 10374
Number of cases without 10 years of data (to be discarded): 29756
Number of controls 47928
32157  cases being removed as unmatchable
39955  controls being removed as unmatchable
72112  total patients being removed as unmatchable


**
8_to_3**

All cases 40130
Number of cases with 10 years of data 22094
Number of cases without 10 years of data (to be discarded): 18036
Number of controls 47928
21527  cases being removed as unmatchable
29325  controls being removed as unmatchable
50852  total patients being removed as unmatchable


## Add derived variables to pt_features 
*e.g. insomnia count, history of stroke, consultation count*

In [44]:
medcoded_entries = pd.read_hdf('data/processed_data/hdf/medcoded_entries.hdf')

In [45]:
prescriptions = pd.read_hdf('data/processed_data/hdf/prescriptions.hdf')

In [22]:
pt_features = pd.read_csv('data/processed_data/pt_features_avoid_specific_dementia_'+ sd.exposure_windows[1]['name'] +'.csv',delimiter=',',parse_dates=['index_date','data_end','data_start'],infer_datetime_format=True)

In [24]:
create_pdd_for_each_drug(prescriptions,pt_features,sd.exposure_windows[1])

In [None]:
for window in [sd.exposure_windows[0],sd.exposure_windows[2]]:
    print(window['name'],'...')
    pt_features = pd.read_csv('data/processed_data/pt_features_avoid_specific_dementia_' + window['name'] +'.csv',delimiter=',',parse_dates=['index_date','data_end','data_start'],infer_datetime_format=True)
    pt_features = get_multiple_condition_statuses(pt_features,medcoded_entries,prescriptions,window,codelists.all_codelists)
#     pt_features = create_PDD_columns_for_each_pt(pt_features,window,[codelists.insomnia['medications']],prescriptions)
#     pt_features = create_quantiles_and_booleans(pt_features)
    display(pt_features.sample(2))    
    pt_features.to_csv('data/processed_data/pt_features_avoid_specific_dementia_'+ window['name'] +'.csv',index=False)

12_to_7 ...
insomnia
stroke
