In [1]:
import os
import sys
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split

In [2]:
from zcor.encoding import *
from zcor.pfsa import *

## **===========================**

#### **Load the phenotype list**

In [3]:
PHN_FOLDER = 'phenotypes'
DX = [phn.split('.')[0] for phn in os.listdir(PHN_FOLDER) if phn.split("_")[0] == 'DX']
RX = [phn.split('.')[0] for phn in os.listdir(PHN_FOLDER) if phn.split("_")[0] == 'RX']
PX = [phn.split('.')[0] for phn in os.listdir(PHN_FOLDER) if phn.split("_")[0] == 'PX']

phenotype_catalog = {
        'DX': DX,
        'RX': RX,
        'PX': PX,
        'TOTAL': DX}

phenotype_codes = {}
for PHN in phenotype_catalog['DX']:
    with open("%s/%s.phn" % (PHN_FOLDER, PHN), "r") as f:
        raw = f.readlines()[0]
        phenotype_codes[PHN] = set(raw[:-1].split())

In [4]:
raw_sample = pd.read_csv('RAW_SAMPLE.csv')

In [5]:
raw_sample.sample(2)

Unnamed: 0,patient_id,record,gender,age,prediction_point,target,target_week,target_codes
3299,P3300,1:401.9|1:211.3|1:272.0|32:401.9|32:272.0|32:2...,M,53,162,0,-1,
1351,P1352,1:401.9|2:465.8|3:465.8|9:266.2|9:436|10:V43.1...,F,76,206,0,-1,


In [6]:
ENCODINGS = ternary_encoding(
              raw_sample, 
              phenotype_codes, 
              INFERENCE_WINDOW = 104,
              verbose = True)
ENCODINGS = ENCODINGS.merge(raw_sample[['patient_id', 'target']], on = 'patient_id')

~ ~ ~ ~ ~
1058 patients excluded due to empty inference window
0 more patients excluded due to insufficient number of non-null weeks
~ ~ ~ ~ ~


In [7]:
ENCODINGS.sample()

Unnamed: 0,patient_id,gender,age_at_screening,sequence,sequence_weeks,sequence_codes,abs_sequence_weeks,prediction_point,first_week,last_week,...,DX_Rheumatism_codes,DX_Sleep_Disorders_codes,DX_Symptoms_Abs_Pelvis_codes,DX_Symptoms_Digestive_codes,DX_Symptoms_General_codes,DX_Symptoms_Respiratory_codes,DX_Symptoms_Skin_codes,DX_Symptoms_Urinary_codes,DX_Thyroid_codes,target
5691,P6016,F,63.019231,0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 ...,3 20 48 48 50 50 50 51 51 52 53 53 53 53 53 58...,339.42 V76.12 736.79 V18.19 733.90 V76.2 V72.3...,108 125 153 153 155 155 155 156 156 157 158 15...,209,108,189,...,,,,,,,,,,0


#### **Split the encoded dataset into PFSA set and LLK set**

In [8]:
X = ENCODINGS.drop(['target'], 1)
y = ENCODINGS.target

X_pfsa, X_llk, y_pfsa, y_llk = train_test_split(X, y, test_size=0.5)
PFSA_DATA = X_pfsa.copy()
PFSA_DATA['target'] = y_pfsa
LLK_DATA = X_llk.copy()
LLK_DATA['target'] = y_llk

print(PFSA_DATA.target.value_counts())
print(LLK_DATA.target.value_counts())

0    9096
1     375
Name: target, dtype: int64
0    9090
1     381
Name: target, dtype: int64


### **PFSA inference**

In [9]:
# folder to save the generated PFSA models
PFSA_PATH = 'PFSA/%s'
# Make binaries executable
os.chmod('bin/genESeSS', 0o777)
os.chmod('bin/llk', 0o777)

In [10]:
PFSA_DATA.sample()

Unnamed: 0,patient_id,gender,age_at_screening,sequence,sequence_weeks,sequence_codes,abs_sequence_weeks,prediction_point,first_week,last_week,...,DX_Rheumatism_codes,DX_Sleep_Disorders_codes,DX_Symptoms_Abs_Pelvis_codes,DX_Symptoms_Digestive_codes,DX_Symptoms_General_codes,DX_Symptoms_Respiratory_codes,DX_Symptoms_Skin_codes,DX_Symptoms_Urinary_codes,DX_Thyroid_codes,target
6000,P6345,M,63.653846,2 2 0 0 0 0 0 0 0 0 0 2 0 0 2 0 2 0 2 0 0 0 2 ...,1 2 2 12 12 15 17 19 23 24 27 32 32 34 34 34 3...,427.69 427.89 427.61 786.09 272.2 327.23 327.2...,451 452 452 462 462 465 467 469 473 474 477 48...,554,451,538,...,728.71,327.23 327.23 327.23 327.23 327.23 327.23 327....,,,,786.09,782.3,,,0


In [11]:
LLK_DATA.sample()

Unnamed: 0,patient_id,gender,age_at_screening,sequence,sequence_weeks,sequence_codes,abs_sequence_weeks,prediction_point,first_week,last_week,...,DX_Rheumatism_codes,DX_Sleep_Disorders_codes,DX_Symptoms_Abs_Pelvis_codes,DX_Symptoms_Digestive_codes,DX_Symptoms_General_codes,DX_Symptoms_Respiratory_codes,DX_Symptoms_Skin_codes,DX_Symptoms_Urinary_codes,DX_Thyroid_codes,target
11155,P11786,M,71.692308,0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 ...,12 12 39 47 47 47 47 48 54 70 74 74 75 76 76 7...,414.01 414.01 414.01 401.9 786.50 786.59 401.9...,100 100 127 135 135 135 135 136 142 158 162 16...,192,100,191,...,,,789.06 789.09 789.06 789.01 789.06 789.01,,,786.50 786.59 786.50,782.3,,,0


#### **Generate PFSA models**
will be saved at `PFSA/{phenotype}/`

In [12]:
infer_pfsa(
    phenotype_catalog = phenotype_catalog,
    PFSA_SET = PFSA_DATA,
    PFSA_PATH = PFSA_PATH,
    STOPLIST = [],
    POS_EPSILON = 0.5,
    NEG_EPSILON = 0.5,
    GENESESS_PATH = 'bin/genESeSS',
    verbose = True,
)

1/39 > DX_Abnormal_Findings
2/39 > DX_Allergic
3/39 > DX_Cardiovascular
4/39 > DX_CNS
5/39 > DX_Development
6/39 > DX_Digestive
7/39 > DX_Dorsopathy
8/39 > DX_Endocrine
9/39 > DX_Frailty
10/39 > DX_Health_Services
11/39 > DX_Hematologic
12/39 > DX_Hypertension
13/39 > DX_Immune
14/39 > DX_Infections_Bacterial
15/39 > DX_Infections_Fungal_and_Other
16/39 > DX_Infections_General
17/39 > DX_Infections_Respiratory
18/39 > DX_Injuries
19/39 > DX_Integumentary
20/39 > DX_Ischemic
21/39 > DX_Metabolic
22/39 > DX_Musculoskeletal
23/39 > DX_Neoplastic
24/39 > DX_Ophthalmological
25/39 > DX_Oth_Urinary
26/39 > DX_Otic
27/39 > DX_PNS
28/39 > DX_Psychiatric
29/39 > DX_Reproductive
30/39 > DX_Respiratory
31/39 > DX_Rheumatism
32/39 > DX_Sleep_Disorders
33/39 > DX_Symptoms_Abs_Pelvis
34/39 > DX_Symptoms_Digestive
35/39 > DX_Symptoms_General
36/39 > DX_Symptoms_Respiratory
37/39 > DX_Symptoms_Skin
38/39 > DX_Symptoms_Urinary
39/39 > DX_Thyroid


#### **Generate LLK features**
NaN values mark absence of codes for a given phenotype in the patient's inference window

In [13]:
llk_features = generate_pfsa_features(
                     phenotype_catalog = phenotype_catalog,
                     DF = LLK_DATA.drop('target', 1),
                     POS_PFSA_PATH = PFSA_PATH + "/POS.pfsa",
                     NEG_PFSA_PATH = PFSA_PATH + "/NEG.pfsa",
                     STOPLIST = [],
                     used_channels = ['DX'],
                     ID_COLUMN = 'patient_id',
                     FEATURE_COLUMNS = [],
                     STAT_COLUMNS = [],
                     LLK_PATH = 'bin/llk',
                     GENESESS_PATH = 'bin/genESeSS',
                     verbose = True)
llk_features = llk_features.merge(LLK_DATA[['patient_id', 'target']], on = 'patient_id')

1/39 > DX_Abnormal_Findings
2/39 > DX_Allergic
3/39 > DX_Cardiovascular
4/39 > DX_CNS
5/39 > DX_Development
6/39 > DX_Digestive
7/39 > DX_Dorsopathy
8/39 > DX_Endocrine
9/39 > DX_Frailty
10/39 > DX_Health_Services
11/39 > DX_Hematologic
12/39 > DX_Hypertension
13/39 > DX_Immune
14/39 > DX_Infections_Bacterial
15/39 > DX_Infections_Fungal_and_Other
16/39 > DX_Infections_General
17/39 > DX_Infections_Respiratory
18/39 > DX_Injuries
19/39 > DX_Integumentary
20/39 > DX_Ischemic
21/39 > DX_Metabolic
22/39 > DX_Musculoskeletal
23/39 > DX_Neoplastic
24/39 > DX_Ophthalmological
25/39 > DX_Oth_Urinary
26/39 > DX_Otic
27/39 > DX_PNS
28/39 > DX_Psychiatric
29/39 > DX_Reproductive
30/39 > DX_Respiratory
31/39 > DX_Rheumatism
32/39 > DX_Sleep_Disorders
33/39 > DX_Symptoms_Abs_Pelvis
34/39 > DX_Symptoms_Digestive
35/39 > DX_Symptoms_General
36/39 > DX_Symptoms_Respiratory
37/39 > DX_Symptoms_Skin
38/39 > DX_Symptoms_Urinary
39/39 > DX_Thyroid


In [14]:
llk_features.sample(3)

Unnamed: 0,patient_id,DX_Abnormal_Findings_sld,DX_Abnormal_Findings_ratio,DX_Abnormal_Findings_abs_pos,DX_Abnormal_Findings_abs_neg,DX_Allergic_sld,DX_Allergic_ratio,DX_Allergic_abs_pos,DX_Allergic_abs_neg,DX_Cardiovascular_sld,...,STD_DX_abs_pos,MEAN_DX_abs_neg,MAX_DX_abs_neg,RANGE_DX_abs_neg,STD_DX_abs_neg,MEAN_DX_ratio,MAX_DX_ratio,RANGE_DX_ratio,STD_DX_ratio,target
1356,P3484,0.094216,1.200168,0.5649,0.470684,,,,,0.025753,...,0.030601,0.48117,0.534286,0.083733,0.027656,1.096731,1.200168,0.166462,0.047445,0
3302,P4906,,,,,0.026817,1.053057,0.532252,0.505435,0.012329,...,0.044936,0.5396,0.630054,0.137837,0.047472,1.059232,1.08981,0.069253,0.020563,0
6969,P11975,,,,,,,,,0.060263,...,0.014386,0.284442,0.287921,0.006959,0.004921,1.235142,1.255796,0.041308,0.029209,0


In [15]:
llk_features.to_csv("llk_features.csv", index = False)