In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
cd ~/demres

/Users/zurfarosa/demres


In [3]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor 

from datetime import date, timedelta

import demres
from demres.common.constants import entry_type
from demres.demins.constants import Study_Design as sd
from demres.common import codelists
from demres.common.helper_functions import *
from demres.demins.statistical_functions import *

  from pandas.core import datetools


In [4]:
pd.set_option('display.max_columns', None)

## Specifiy dementia subtype

In [5]:
subtype = 'all_dementia' # options: 'alzheimers', 'vascular','all_dementia'

## Specify exposure window

In [16]:
window = '10_to_5' #options: '12_to_7','10_to_5','8_to_3'

## Load relevant dataframe and create intercept

In [17]:
pt_features = pd.read_csv('data/pt_data/processed_data/pt_features_demins_' + subtype + '_' + window +'.csv',delimiter=',',parse_dates=['index_date','data_end','data_start'],infer_datetime_format=True)

In [18]:
pt_features.columns

Index(['patid', 'yob', 'pracid', 'female', 'index_date', 'isCase',
       'final dementia medcode', 'data_end', 'data_start', 'matchid',
       'age_at_index_date', 'insomnia', 'insomnia_no_hypnotics', 'stroke',
       'heart_failure', 'mental_illness', 'sleep_apnoea',
       'chronic_pulmonary_disease', 'epilepsy', 'hypnotics_100_pdds',
       'age_at_index_date:65-69', 'age_at_index_date:70-74',
       'age_at_index_date:75-79', 'age_at_index_date:80-84',
       'age_at_index_date:85-89', 'age_at_index_date:90-99',
       'age_at_index_date:above_99', 'hypnotic_pdds:00000',
       'hypnotic_pdds:00001_10', 'hypnotic_pdds:00011_100',
       'hypnotic_pdds:00101_1000', 'hypnotic_pdds:01001_10000',
       'hypnotic_pdds:10000_and_above'],
      dtype='object')

In [19]:
pt_features.loc['intercept'] = 1.0

In [20]:
pt_features.sample(5)

Unnamed: 0,patid,yob,pracid,female,index_date,isCase,final dementia medcode,data_end,data_start,matchid,age_at_index_date,insomnia,insomnia_no_hypnotics,stroke,heart_failure,mental_illness,sleep_apnoea,chronic_pulmonary_disease,epilepsy,hypnotics_100_pdds,age_at_index_date:65-69,age_at_index_date:70-74,age_at_index_date:75-79,age_at_index_date:80-84,age_at_index_date:85-89,age_at_index_date:90-99,age_at_index_date:above_99,hypnotic_pdds:00000,hypnotic_pdds:00001_10,hypnotic_pdds:00011_100,hypnotic_pdds:00101_1000,hypnotic_pdds:01001_10000,hypnotic_pdds:10000_and_above
10746,17708027.0,15.0,27.0,1.0,2007-09-26 00:00:00,0.0,,2009-10-08 00:00:00,1996-01-31 00:00:00,45234.0,92.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
29400,6787373.0,20.0,373.0,1.0,2006-10-19 00:00:00,1.0,1350.0,2013-04-02 00:00:00,1992-11-30 00:00:00,55518.0,86.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
15973,18762227.0,28.0,227.0,0.0,2010-03-09 00:00:00,1.0,8634.0,2011-08-04 00:00:00,1996-10-31 00:00:00,34845.0,82.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
12941,13078075.0,28.0,75.0,1.0,2009-03-03 00:00:00,1.0,,2011-03-15 00:00:00,1997-01-31 00:00:00,9959.0,81.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5720,1245350.0,17.0,350.0,1.0,2005-07-04 00:00:00,1.0,1917.0,2006-01-06 00:00:00,1993-01-31 00:00:00,52661.0,88.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [21]:
columns_for_inclusion = [
    'age_at_index_date',
    'female',
    'stroke',
    'heart_failure',
    'mental_illness',
    'sleep_apnoea', 
    'chronic_pulmonary_disease', 
#     'epilepsy',
    'hypnotics_100_pdds',
    'insomnia'
    
]

## Insomnia consultations

### insomnia yes/no

In [22]:
# workaround for deprecated chisqprob function in statsmodels
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df) 

In [23]:
training_cols = [col for col in pt_features.columns if col in columns_for_inclusion]
summary_table,univariate_and_multivariate_results_formatted = calculate_univariate_and_multivariate_ORs(pt_features,covariates=training_cols,main_variables=columns_for_inclusion)

female  being retained as mean > 0
age_at_index_date  being retained as mean > 0
insomnia  being retained as mean > 0
stroke  being retained as mean > 0
heart_failure  being retained as mean > 0
mental_illness  being retained as mean > 0
sleep_apnoea  being retained as mean > 0
chronic_pulmonary_disease  being retained as mean > 0
hypnotics_100_pdds  being retained as mean > 0




In [24]:
univariate_and_multivariate_results_formatted

Unnamed: 0,Univariate OR,Multivariate OR
Age at index date,"1.00, (1.00, 1.00), P=1.000","1.00, (1.00, 1.00), P=1.000"
Chronic pulmonary disease,"1.37, (1.30, 1.45), P=0.000","1.39, (1.31, 1.47), P=0.000"
Female,"1.00, (0.97, 1.03), P=0.994","0.95, (0.90, 0.99), P=0.994"
Heart failure,"1.26, (1.14, 1.39), P=0.000","1.17, (1.05, 1.29), P=0.000"
Hypnotics (100 PDDs),"1.02, (1.01, 1.03), P=0.000","1.01, (1.00, 1.01), P=0.000"
Insomnia,"1.38, (1.30, 1.46), P=0.000","1.17, (1.09, 1.26), P=0.000"
Mental illness,"1.58, (1.51, 1.64), P=0.000","1.80, (1.71, 1.89), P=0.000"
Sleep apnoea,"1.37, (0.76, 2.47), P=0.299","1.02, (0.56, 1.87), P=0.299"
Stroke,"1.48, (1.38, 1.60), P=0.000","1.46, (1.34, 1.58), P=0.000"


In [25]:
summary_table

0,1,2,3
Dep. Variable:,isCase,No. Observations:,30419.0
Model:,Logit,Df Residuals:,30410.0
Method:,MLE,Df Model:,8.0
Date:,"Thu, 24 May 2018",Pseudo R-squ.:,0.02242
Time:,13:36:34,Log-Likelihood:,-20612.0
converged:,True,LL-Null:,-21085.0
,,LLR p-value:,8.709e-199
