In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Display options: truncate large frames when they are rendered, and format
# floats with three decimal places.
pd.set_option('large_repr', 'truncate')
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns


from pathlib import Path

from sklearn.preprocessing import  LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_curve, roc_auc_score,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from typing import Tuple

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session (including pandas/sklearn deprecation and data-quality warnings).
# Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Plot settings
# Seaborn context/style are applied after plt.style.use('ggplot') in the
# imports cell, so they take effect on top of that matplotlib style.
sns.set_context('notebook') 
sns.set_style('ticks') 
# Ten hex colours used as the default seaborn palette for all later plots.
colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD', '#8C564B', '#E377C2','#7F7F7F', '#BCBD22', '#17BECF']
sns.set_palette(colours)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell execution so edits to local .py files are picked up automatically.
%load_ext autoreload
%autoreload 2

In [4]:
# Notebook-wide configuration.
# Directory (relative to the notebook) that input and output CSV files live in.
DATA_PATH = Path("../data/")
# NOTE(review): the two constants below are not used in the visible cells;
# presumably they feed a later train/test split and model seeding - confirm.
RANDOM_STATE_PARAMETER = 42
TEST_SIZE = .25

In [5]:
# Load the raw table from DATA_PATH and preview the first rows.
data = pd.read_csv(DATA_PATH / "PX_Diag_data.csv")
data.head()

Unnamed: 0,PATIENT_ID,BC_PATIENT,SN_PATIENT,mBC_PATIENT,SERVICE_DATE_y,DIAGNOSIS_CODE,SEASON,PRC_STD_CD,Blood,Bone,Breast,Heart,Lungs,Medication,Medication.1,Other,Skin,Stomach,Vagina
0,1184666769,True,True,True,09/13/2013,196.3,4,77336,0,0,0,0,0,0,0,1,0,0,0
1,1184666769,True,True,True,09/13/2013,174.9,4,77336,0,0,0,0,0,0,0,1,0,0,0
2,1184666769,True,True,True,09/05/2013,196.3,4,77336,0,0,0,0,0,0,0,1,0,0,0
3,1184666769,True,True,True,09/05/2013,174.9,4,77336,0,0,0,0,0,0,0,1,0,0,0
4,1184666769,True,True,True,09/27/2013,196.3,4,77336,0,0,0,0,0,0,0,1,0,0,0


In [6]:
# Clean and encode the raw table:
#   1. drop the BC_PATIENT / SN_PATIENT columns and normalise two column names,
#   2. select the working columns in a fixed order,
#   3. remove the duplicate 'Medication' column (keeping 'Medication ' with the
#      trailing space),
#   4. one-hot encode Season,
#   5. integer-encode mBC_PATIENT.
# No step uses inplace=True; each one rebinds `data` to a new frame.
data = (
    data
    .drop(columns=["BC_PATIENT", "SN_PATIENT"])
    .rename(columns={"SERVICE_DATE_y": "Service_date", "SEASON": "Season"})
)

cols = ['PATIENT_ID',
 'Service_date',
 'Season',
 'DIAGNOSIS_CODE',
 'PRC_STD_CD',
 'Blood',
 'Bone',
 'Breast',
 'Heart',
 'Lungs',
 'Medication',
 'Medication ',
 'Other',
 'Skin',
 'Stomach',
 'Vagina',
  'mBC_PATIENT']

# Two medication columns exist: 'Medication' and 'Medication ' (note the
# trailing space). Drop the first one by name rather than by its position in
# `cols`, so that reordering `cols` cannot silently remove a different column.
data = data[cols].drop(columns=["Medication"])

# One-hot encode Season into Season_<value> columns appended at the end.
ohe = pd.get_dummies(data["Season"], prefix="Season")
data = pd.concat([data.drop(columns=["Season"]), ohe], axis=1)

# Encode the mBC_PATIENT flag as integers (the preview shows True -> 1).
le = LabelEncoder()
data["mBC_PATIENT"] = le.fit_transform(data["mBC_PATIENT"])
data.head()

Unnamed: 0,PATIENT_ID,Service_date,DIAGNOSIS_CODE,PRC_STD_CD,Blood,Bone,Breast,Heart,Lungs,Medication,Other,Skin,Stomach,Vagina,mBC_PATIENT,Season_1,Season_2,Season_3,Season_4
0,1184666769,09/13/2013,196.3,77336,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1
1,1184666769,09/13/2013,174.9,77336,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1
2,1184666769,09/05/2013,196.3,77336,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1
3,1184666769,09/05/2013,174.9,77336,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1
4,1184666769,09/27/2013,196.3,77336,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1


In [7]:
# Fix the column layout explicitly: identifiers and codes first, then the
# category flags, then the Season_* dummies, with mBC_PATIENT in last position.
ordered_columns = [
    'PATIENT_ID', 'Service_date', 'DIAGNOSIS_CODE', 'PRC_STD_CD',
    'Blood', 'Bone', 'Breast', 'Heart', 'Lungs', 'Medication ',
    'Other', 'Skin', 'Stomach', 'Vagina',
    'Season_1', 'Season_2', 'Season_3', 'Season_4',
    'mBC_PATIENT',
]
data = data.loc[:, ordered_columns]
data.head()

Unnamed: 0,PATIENT_ID,Service_date,DIAGNOSIS_CODE,PRC_STD_CD,Blood,Bone,Breast,Heart,Lungs,Medication,Other,Skin,Stomach,Vagina,Season_1,Season_2,Season_3,Season_4,mBC_PATIENT
0,1184666769,09/13/2013,196.3,77336,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1
1,1184666769,09/13/2013,174.9,77336,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1
2,1184666769,09/05/2013,196.3,77336,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1
3,1184666769,09/05/2013,174.9,77336,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1
4,1184666769,09/27/2013,196.3,77336,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1


In [8]:
# Display the final column index; note 'Medication ' keeps its trailing space.
data.columns

Index(['PATIENT_ID', 'Service_date', 'DIAGNOSIS_CODE', 'PRC_STD_CD', 'Blood',
       'Bone', 'Breast', 'Heart', 'Lungs', 'Medication ', 'Other', 'Skin',
       'Stomach', 'Vagina', 'Season_1', 'Season_2', 'Season_3', 'Season_4',
       'mBC_PATIENT'],
      dtype='object')

In [9]:
# Keep only the rows whose diagnosis code is exactly "C79.81".
has_target_code = data["DIAGNOSIS_CODE"].eq("C79.81")
apld_patients = data.loc[has_target_code]
apld_patients.head()

Unnamed: 0,PATIENT_ID,Service_date,DIAGNOSIS_CODE,PRC_STD_CD,Blood,Bone,Breast,Heart,Lungs,Medication,Other,Skin,Stomach,Vagina,Season_1,Season_2,Season_3,Season_4,mBC_PATIENT
109712,134290358,10/15/2015,C79.81,99214,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1
109715,134290358,10/15/2015,C79.81,99214,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1
109718,134290358,10/15/2015,C79.81,99214,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1
112828,1057559593,11/25/2015,C79.81,85025,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1
112831,1057559593,11/25/2015,C79.81,36415,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1


In [11]:
# Count rows (records, not unique patients) in the filtered frame: total,
# those with mBC_PATIENT == 0, and those with mBC_PATIENT == 1.
mbc_flag = apld_patients["mBC_PATIENT"]
l0 = apld_patients.shape[0]
l1 = int((mbc_flag == 0).sum())
l2 = int((mbc_flag == 1).sum())
print(f"Number of APLD patient records:{l0}, nonMBC APLD patient records:{l1}, mBC APLD patient records:{l2}")

Number of APLD patient records:164210, nonMBC APLD patient records:66851, mBC APLD patient records:97359


In [12]:
# Persist the filtered records next to the input data; index=False keeps the
# row index out of the file.
apld_patients.to_csv(DATA_PATH / "apld_patients.csv", index=False)