In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
cd ~/demres

/Users/zurfarosa/demres


In [3]:
import os
import sys

# Put the parent of the current working directory on sys.path (once),
# presumably so the `demres` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import pandas as pd
import numpy as np
from datetime import date, timedelta

import demres
from demres.common.constants import entry_type
from demres.common import codelists,druglists
from demres.common.process_pt_features import *
from demres.common.process_entries import *
from demres.demins.constants import Study_Design as sd
from demres.demins.statistical_functions import *
from common.helper_functions import *
from pprint import pprint

In [4]:
pd.set_option('display.max_columns', None)

## Process raw CSV files

In [None]:
# create_pegmed()

In [None]:
# create_pegprod()

In [None]:
# create_prescriptions()

In [None]:
# create_consultations()

In [None]:
# create_clinicals()

In [None]:
# create_tests()

In [None]:
# create_referrals()

In [None]:
# create_immunisations()

In [None]:
# create_medcoded_entries()

## Create basic pt_features dataframe
*pt_features will contain all the variables (e.g. age, female gender, insomnia) used in the logistic regression*

In [5]:
# Dementia subtype to analyse: 'vascular', 'alzheimers' or 'all_dementia'.
# The value is passed to add_data_start_and_end_dates and is embedded in the
# names of the processed pt_features CSV files read and written by later cells.
subtype = 'all_dementia'

In [6]:
# Build all_encounters (helper comes from one of the star imports); later cells pass it to
# get_all_entries, add_data_start_and_end_dates and get_consultation_count.
all_encounters = get_all_encounters()

In [7]:
# Derive all_entries from all_encounters; it is the input to the index-date / caseness step below.
all_entries = get_all_entries(all_encounters)

In [8]:
# Create the initial pt_features dataframe; the following cells extend it step by step.
pt_features = create_pt_features()

In [9]:
# Uses all_entries to extend pt_features; going by the helper's name this adds each patient's
# index date, caseness and final dementia subtype — TODO confirm exact columns against the helper.
pt_features = get_index_date_and_caseness_and_add_final_dementia_subtype(all_entries,pt_features)

In [None]:
# pt_features = only_include_specific_dementia_subtype(pt_features,subtype=subtype) #does nothing if we're interested in all dementia

In [14]:
# Add data start/end dates derived from all_encounters. Per the log recorded for this cell,
# the helper also resamples all_encounters and removes patients without any events.
# NOTE(review): the matching cell further down reads pt_features_demins_<subtype>.csv from disk,
# but no cell visible here writes that file — presumably this helper does (it receives
# `subtype`); confirm, otherwise the in-memory result of this cell is never used.
pt_features = add_data_start_and_end_dates(all_encounters,pt_features,subtype)

calculating latest_sysdate
calculating earliest_sysdate
resampling all_encounters - may take some time...
locating converted codes
choosing most appropriate measure of data_start
removing patients without any events
There are 14 patients without any events


In [16]:
# Match cases and controls once per exposure window. Every window starts from the
# same unmatched pt_features CSV and is saved to its own window-specific CSV.
csv_prefix = 'data/pt_data/processed_data/pt_features_demins_'+subtype
date_columns = ['index_date','data_end','data_start']
for window in sd.exposure_windows:
    print(window['name'],' being matched')
    # Re-read from disk on every pass so each window begins from an untouched copy.
    pt_features = pd.read_csv(
        csv_prefix+'.csv',
        delimiter=',',
        parse_dates=date_columns,
        infer_datetime_format=True,
    )
    pt_features = match_cases_and_controls(pt_features,window)
    pt_features.to_csv(csv_prefix+'_'+ window['name'] +'.csv',index=False)

12_to_7  being matched
All patients 94500
Number of suitable cases 12216
Number of controls 47020
75714  patients being removed as unmatchable
10_to_5  being matched
All patients 94500
Number of suitable cases 18849
Number of controls 47020
64082  patients being removed as unmatchable
8_to_3  being matched
All patients 94500
Number of suitable cases 26042
Number of controls 47020
50622  patients being removed as unmatchable


## Add derived variables to pt_features 
*e.g. insomnia count, history of stroke, consultation count*

In [17]:
medcoded_entries = pd.read_hdf('data/pt_data/processed_data/hdf/medcoded_entries.hdf')

In [18]:
prescriptions = pd.read_hdf('data/pt_data/processed_data/hdf/prescriptions.hdf')

In [21]:
# Load the matched pt_features CSV for the exposure window at index 1.
# Fix: the attribute is `sd.exposure_windows` (the spelling used by every other cell);
# the previous `sd.exposure_windowss` would fail with AttributeError unless Study_Design
# happened to define that misspelt name as well.
pt_features = pd.read_csv('data/pt_data/processed_data/pt_features_demins_'+subtype+'_'+sd.exposure_windows[1]['name']+'.csv',delimiter=',',parse_dates=['index_date','data_end','data_start'],infer_datetime_format=True)

In [40]:
# Reload the matched pt_features for the exposure window at index 1, then run
# create_pdd_for_each_drug over all drug lists for that window (the recorded
# output shows one "pdd" line per drug).
second_window = sd.exposure_windows[1]
second_window_csv = 'data/pt_data/processed_data/pt_features_demins_'+subtype+'_'+second_window['name']+'.csv'
pt_features = pd.read_csv(
    second_window_csv,
    delimiter=',',
    parse_dates=['index_date','data_end','data_start'],
    infer_datetime_format=True,
)
create_pdd_for_each_drug(prescriptions,druglists.all_druglists,pt_features,second_window)

CARBAMAZEPINE 	pdd: 329.571934268
GABAPENTIN 	pdd: 711.545805055
LAMOTRIGINE 	pdd: 187.326325773
PREGABALIN 	pdd: 169.484271346
SODIUM VALPROATE 	pdd: 721.028033855
VALPROIC ACID 	No prescriptions found
LITHIUM CITRATE 	pdd: 891.292307692
LITHIUM CARBONATE 	pdd: 464.293337538
VALPROATE SEMISODIUM 	pdd: 496.21653085
OXCARBAZEPINE 	pdd: 573.913043478
ALPRAZOLAM 	pdd: 0.239361702128
CHLORDIAZEPOXIDE HYDROCHLORIDE 	pdd: 10.8586489863
CLOBAZAM 	pdd: 11.7033750594
CLONAZEPAM 	pdd: 0.823051344315
DIAZEPAM 	pdd: 4.66742792872
FLUNITRAZEPAM 	pdd: 1.0
FLURAZEPAM 	No prescriptions found
FLURAZEPAM HYDROCHLORIDE 	pdd: 27.2297441592
LOPRAZOLAM 	No prescriptions found
LORAZEPAM 	pdd: 1.46318357973
LORMETAZEPAM 	pdd: 0.865905116329
MIDAZOLAM 	No prescriptions found
MIDAZOLAM HYDROCHLORIDE 	No prescriptions found
NITRAZEPAM 	pdd: 5.97603790155
OXAZEPAM 	pdd: 19.0568681231
TEMAZEPAM 	pdd: 14.2202967324
ZALEPLON 	pdd: 6.97840172786
ZOPICLONE 	pdd: 6.57257823592
ZOLPIDEM TARTRATE 	pdd: 7.23277765645
ZOLP

In [53]:
# Switch off pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None  # default='warn'

# For every exposure window: reload that window's matched pt_features, add the
# derived columns, and write the result back over the same CSV file.
for window in sd.exposure_windows:
    print(window['name'],'...')
    window_csv = 'data/pt_data/processed_data/pt_features_demins_'+subtype+'_'+ window['name'] +'.csv'
    pt_features = (
        pd.read_csv(
            window_csv,
            delimiter=',',
            parse_dates=['index_date','data_end','data_start'],
            infer_datetime_format=True,
        )
        # Condition statuses for the code lists given here (only codelists.mental_illness).
        .pipe(get_multiple_condition_statuses,medcoded_entries,window,[codelists.mental_illness])
        # Per-patient PDD columns for every drug list.
        .pipe(create_PDD_columns_for_each_pt,window,druglists.all_druglists,prescriptions)
        # Consultation count taken from all_encounters for this window.
        .pipe(get_consultation_count,all_encounters,window)
        .pipe(create_quantiles_and_booleans)
    )
    pt_features.to_csv(window_csv,index=False)