In [9]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project's .py files
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Make the project checkout the working directory; the relative 'data/...'
# paths used throughout the notebook are resolved from here.
# NOTE(review): this relies on IPython automagic (no leading %).
cd ~/demres

/Users/zurfarosa/demres


In [11]:
import os
import sys

# Put the parent of the current working directory on sys.path (only once), so
# that packages located one level up can be imported by the statements below.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import numpy as np
from datetime import date, timedelta

import demres
from demres.common.constants import entry_type
from demres.common import codelists
from demres.common.helper_functions import *
from demres.common.process_pt_features import *
from demres.common.process_entries import *
from demres.demins.constants import Study_Design as sd
from demres.demins.statistical_functions import *
from common.helper_functions import *
from pprint import pprint
from IPython.display import display

In [12]:
# Show every column when a DataFrame is rendered (None = no limit).
pd.options.display.max_columns = None

In [13]:
# Show every row when a DataFrame is rendered (None = no limit).
pd.options.display.max_rows = None

## Process raw CSV files

In [6]:
# create_pegmed()

In [7]:
# create_pegprod()

In [8]:
# create_prescriptions()

In [9]:
# create_consultations()

In [10]:
# create_clinicals()

In [11]:
# create_tests()

In [12]:
# create_referrals()

In [13]:
# create_immunisations()

In [14]:
# create_medcoded_entries()

## Create processed CSV files to share with DEMINS teams
*Convert the newly created HDF files to CSV (for use in other projects).*

In [15]:
# prescriptions = pd.read_hdf('data/processed_data/hdf/prescriptions.hdf')
# prescriptions.to_csv('data/processed_data/to_share_with_DEMINS/prescriptions.csv',index=False)

In [16]:
# consultations = pd.read_hdf('data/processed_data/hdf/consultations.hdf')
# consultations.to_csv('data/processed_data/to_share_with_DEMINS/consultations.csv',index=False)

In [17]:
# immunisations = pd.read_hdf('data/processed_data/hdf/immunisations.hdf')
# immunisations.to_csv('data/processed_data/to_share_with_DEMINS/immunisations.csv',index=False)

In [18]:
# clinicals = pd.read_hdf('data/processed_data/hdf/clinicals.hdf')
# clinicals.to_csv('data/processed_data/to_share_with_DEMINS/clinicals.csv',index=False)

In [19]:
# tests = pd.read_hdf('data/processed_data/hdf/tests.hdf')
# tests.to_csv('data/processed_data/to_share_with_DEMINS/tests.csv',index=False)

In [20]:
# referrals = pd.read_hdf('data/processed_data/hdf/referrals.hdf')
# referrals.to_csv('data/processed_data/to_share_with_DEMINS/referrals.csv',index=False)

In [21]:
# medcoded_entries = pd.read_hdf('data/processed_data/hdf/medcoded_entries.hdf')
# medcoded_entries.to_csv('data/processed_data/to_share_with_DEMINS/medcoded_entries.csv',index=False)

## Create basic pt_features dataframe
*pt_features will contain all the variables (e.g. age, female gender, insomnia) used in the logistic regression*

In [14]:
# Gather all encounters once; the result is passed to get_all_entries() and
# add_data_start_and_end_dates() further down.
all_encounters = get_all_encounters()

In [15]:
# Derive all_entries from the encounters; it is consumed later by
# get_index_date_and_caseness_and_add_final_dementia_subtype().
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the
# recorded run did not finish - confirm all_entries is defined before relying on it.
all_entries = get_all_entries(all_encounters)

KeyboardInterrupt: 

In [86]:
# Create the base per-patient table; the preview in the next cell shows
# patid, yob, pracid and female columns.
pt_features = create_pt_features()

In [87]:
# Peek at the first rows of the freshly created table (head() defaults to 5 rows).
pt_features.head()

Unnamed: 0,patid,yob,pracid,female
0,57001,32,1,0
1,60001,24,1,0
2,149001,19,1,1
3,364001,37,1,1
4,432001,20,1,1


In [88]:
# Add index date, case/control status and final dementia subtype to pt_features
# (as the function name states - TODO confirm the exact columns against the
# demres source). The call also prints how many patients were prescribed
# antidementia drugs without a dementia diagnosis.
pt_features = get_index_date_and_caseness_and_add_final_dementia_subtype(all_entries,pt_features)

Number of patients prescribed antidementia drugs but not diagnosed with dementia: 321


In [89]:
# Add data_start / data_end dates derived from all_encounters. According to the
# logged output, patients without any events are removed in this step.
pt_features = add_data_start_and_end_dates(all_encounters,pt_features)

calculating latest_sysdate
calculating earliest_sysdate
resampling all_encounters - may take some time...
locating converted codes
choosing most appropriate measure of data_start
removing patients without any events
There are 14 patients without any events


In [90]:
# Persist the (not yet matched) feature table; the matching cell below reads
# this file back for each exposure window.
pt_features.to_csv(
    'data/processed_data/pt_features.csv',
    index=False,
)

In [91]:
# Sensitivity analysis: drop cases whose final dementia subtype is a specific,
# non-Alzheimer, non-VaD dementia (e.g. FTLD, DLB). The result is kept under a
# separate variable and written to its own CSV file, alongside pt_features.csv.
pt_features_avoid_specific_dementia = avoid_specific_dementia_subtypes(pt_features) 
pt_features_avoid_specific_dementia.to_csv('data/processed_data/pt_features_avoid_specific_dementia.csv',index=False)

removing cases where final dementia subtype is a specific, non-Alzheimer, non-VaD dementia
Number of cases removed:  6442
Number of patients (cases and controls) 88058


In [19]:
# Match cases and controls once per exposure window and write one CSV per
# (cohort, window) pair, named '<cohort>_<window name>.csv'.
# The sensitivity-analysis cohort (specific dementia subtypes removed) is only
# matched for the second window, sd.exposure_windows[1].
for window in sd.exposure_windows:
    print('\n' + window['name'])
    files = ['pt_features']
    if window == sd.exposure_windows[1]:
        files.append('pt_features_avoid_specific_dementia')
    for file in files:
        print(file)
        # Always start from the un-matched CSV so that every window is matched
        # against the full cohort. `infer_datetime_format` is not passed: it is
        # deprecated in pandas 2.x and the date columns parse the same without it.
        pt_features = pd.read_csv(
            'data/processed_data/' + file + '.csv',
            delimiter=',',
            parse_dates=['index_date', 'data_end', 'data_start'],
        )
        pt_features = match_cases_and_controls(pt_features, window)
        pt_features.to_csv(
            'data/processed_data/' + file + '_' + window['name'] + '.csv',
            index=False,
        )



12_to_7
pt_features
All cases 46572
Number of cases with 10 years of data 12021
Number of cases without 10 years of data (to be discarded): 34551
Number of controls 47928
37349  cases being removed as unmatchable
38705  controls being removed as unmatchable
76054  total patients being removed as unmatchable

10_to_5
pt_features
All cases 46572
Number of cases with 10 years of data 18527
Number of cases without 10 years of data (to be discarded): 28045
Number of controls 47928
31650  cases being removed as unmatchable
33006  controls being removed as unmatchable
64656  total patients being removed as unmatchable
pt_features_avoid_specific_dementia
All cases 40130
Number of cases with 10 years of data 15973
Number of cases without 10 years of data (to be discarded): 24157
Number of controls 47928
27251  cases being removed as unmatchable
35049  controls being removed as unmatchable
62300  total patients being removed as unmatchable

8_to_3
pt_features
All cases 46572
Number of cases wit

## Add derived variables to pt_features 
*e.g. insomnia count, history of stroke, consultation count*

In [20]:
# Load the processed medcoded entries. No `key` is given, so read_hdf expects
# the HDF file to contain a single pandas object.
medcoded_entries = pd.read_hdf('data/processed_data/hdf/medcoded_entries.hdf')

In [21]:
# Load the processed prescriptions. No `key` is given, so read_hdf expects
# the HDF file to contain a single pandas object.
prescriptions = pd.read_hdf('data/processed_data/hdf/prescriptions.hdf')

In [22]:
# Load the matched cohort for the second exposure window (sd.exposure_windows[1]).
# The file stem 'pt_features' is spelled out instead of being taken from the
# `file` loop variable left behind by an earlier cell, so this cell no longer
# depends on which loop happened to run last.
# `infer_datetime_format` is not passed: it is deprecated in pandas 2.x and the
# date columns parse the same without it.
pt_features = pd.read_csv(
    'data/processed_data/pt_features_' + sd.exposure_windows[1]['name'] + '.csv',
    delimiter=',',
    parse_dates=['index_date', 'data_end', 'data_start'],
)

In [24]:
# Run the per-drug PDD step on the prescriptions for the second exposure
# window (sd.exposure_windows[1]). The return value is not captured here.
# NOTE(review): presumably PDD = prescribed daily dose and the function stores
# its results itself - confirm against create_pdd_for_each_drug.
create_pdd_for_each_drug(prescriptions,pt_features,sd.exposure_windows[1])

In [42]:
# For every exposure window, reload each matched cohort file, (optionally) add
# the derived variables, show two random rows and write the file back in place.
# The sensitivity-analysis cohort only exists for sd.exposure_windows[1].
for window in sd.exposure_windows:
    print(window['name'],'...')
    files = ['pt_features']
    if window == sd.exposure_windows[1]:
        files.append('pt_features_avoid_specific_dementia')
    for file in files:
        print(file,':')
        # `infer_datetime_format` is not passed: it is deprecated in pandas 2.x
        # and the date columns parse the same without it.
        pt_features = pd.read_csv(
            'data/processed_data/' + file + '_' + window['name'] + '.csv',
            delimiter=',',
            parse_dates=['index_date', 'data_end', 'data_start'],
        )
        # The derivation steps below are currently disabled; while they stay
        # commented out the loop only re-reads, previews and re-saves each file.
#         pt_features = get_multiple_condition_statuses(pt_features,medcoded_entries,prescriptions,window,codelists.all_codelists)
#         pt_features = create_PDD_columns_for_each_pt(pt_features,window,[codelists.insomnia['medications']],prescriptions)
#         pt_features = create_quantiles_and_booleans(pt_features)
        display(pt_features.sample(2))
        pt_features.to_csv(
            'data/processed_data/' + file + '_' + window['name'] + '.csv',
            index=False,
        )

12_to_7 ...
pt_features :


Unnamed: 0,patid,yob,pracid,female,index_date,isCase,final dementia medcode,data_end,data_start,matchid,age_at_index_date,insomnia,stroke,heart_failure,mental_illness,sleep_apnoea,chronic_pulmonary_disease,hypnotics_100_pdds,age_at_index_date:65-69,age_at_index_date:70-74,age_at_index_date:75-79,age_at_index_date:80-84,age_at_index_date:85-89,age_at_index_date:90-99,age_at_index_date:above_99,hypnotic_pdds:00000,hypnotic_pdds:00001_10,hypnotic_pdds:00011_100,hypnotic_pdds:00101_1000,hypnotic_pdds:01001_10000,hypnotic_pdds:10000_and_above
7328,7884266,13,266,1,2008-04-17,False,,2011-10-03,1996-01-31,60669,95,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4252,13285343,33,343,0,2009-10-01,False,,2012-08-01,1997-09-16,6693,76,0,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


10_to_5 ...
pt_features :


Unnamed: 0,patid,yob,pracid,female,index_date,isCase,final dementia medcode,data_end,data_start,matchid,age_at_index_date,insomnia,stroke,heart_failure,mental_illness,sleep_apnoea,chronic_pulmonary_disease,hypnotics_100_pdds,age_at_index_date:65-69,age_at_index_date:70-74,age_at_index_date:75-79,age_at_index_date:80-84,age_at_index_date:85-89,age_at_index_date:90-99,age_at_index_date:above_99,hypnotic_pdds:00000,hypnotic_pdds:00001_10,hypnotic_pdds:00011_100,hypnotic_pdds:00101_1000,hypnotic_pdds:01001_10000,hypnotic_pdds:10000_and_above
14362,289282,17,282,1,2007-02-19,True,4693.0,2010-03-25,1995-10-31,42684,90,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
906,4654159,28,159,1,2007-08-08,True,7664.0,2008-08-11,1996-10-31,23656,79,0,0,0,0,0,0,0.30036,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


pt_features_avoid_specific_dementia :


Unnamed: 0,patid,yob,pracid,female,index_date,isCase,final dementia medcode,data_end,data_start,matchid,age_at_index_date,insomnia,stroke,heart_failure,mental_illness,sleep_apnoea,chronic_pulmonary_disease,hypnotics_100_pdds,age_at_index_date:65-69,age_at_index_date:70-74,age_at_index_date:75-79,age_at_index_date:80-84,age_at_index_date:85-89,age_at_index_date:90-99,age_at_index_date:above_99,hypnotic_pdds:00000,hypnotic_pdds:00001_10,hypnotic_pdds:00011_100,hypnotic_pdds:00101_1000,hypnotic_pdds:01001_10000,hypnotic_pdds:10000_and_above
10969,5169017,13,17,1,2011-04-18,True,1350.0,2012-07-12,1998-08-17,2519,98,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
16968,5734483,22,483,0,2007-07-11,False,,2011-11-03,1996-07-03,62956,85,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


8_to_3 ...
pt_features :


Unnamed: 0,patid,yob,pracid,female,index_date,isCase,final dementia medcode,data_end,data_start,matchid,age_at_index_date,insomnia,stroke,heart_failure,mental_illness,sleep_apnoea,chronic_pulmonary_disease,hypnotics_100_pdds,age_at_index_date:65-69,age_at_index_date:70-74,age_at_index_date:75-79,age_at_index_date:80-84,age_at_index_date:85-89,age_at_index_date:90-99,age_at_index_date:above_99,hypnotic_pdds:00000,hypnotic_pdds:00001_10,hypnotic_pdds:00011_100,hypnotic_pdds:00101_1000,hypnotic_pdds:01001_10000,hypnotic_pdds:10000_and_above
13466,14303416,20,416,1,2001-03-19,True,4693.0,2003-05-06,1991-01-31,61954,81,0,0,0,1,0,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
33258,32948229,25,229,1,2006-09-14,False,,2008-10-28,1993-01-31,16457,81,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
