[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tavakohr/accept-nlp/blob/master/app/ericeda.ipynb)

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from lightgbm.sklearn import LGBMClassifier
from xgboost import XGBClassifier

In [6]:
# Optional: uncomment to install the gradient-boosting libraries imported in the first cell.
#!pip install lightgbm
#!pip install xgboost


### 1. Read Data

In [2]:
# Load the cleaned time-to-event table; the path is relative to the notebook's working directory.
clean_csv_path = '../data/clean/Data_tte_DataScience.csv'
clean_df = pd.read_csv(clean_csv_path)

In [3]:
# Print the column names, dtypes and non-null counts of the loaded table.
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5134 entries, 0 to 5133
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       5134 non-null   object 
 1   trial                    5134 non-null   object 
 2   tte0                     5134 non-null   float64
 3   event                    5134 non-null   int64  
 4   gender                   5134 non-null   int64  
 5   age10                    5134 non-null   float64
 6   nowsmk                   5134 non-null   int64  
 7   oxygen                   5134 non-null   int64  
 8   fev1pp100                5134 non-null   float64
 9   bmi10                    5134 non-null   float64
 10  sgrq10                   5134 non-null   float64
 11  indicated_statin         5134 non-null   int64  
 12  randomized_azithromycin  5134 non-null   int64  
 13  LAMA                     5134 non-null   int64  
 14  LABA                    

In [9]:
# Count distinct ID values; a result below the row count means IDs repeat across rows.
clean_df['ID'].nunique()

2249

In [5]:
import matplotlib.pyplot as plt

In [6]:
from pylab import rcParams

In [7]:
#!pip install pandas_profiling
from pandas_profiling import ProfileReport

In [8]:


# NOTE(review): `unique_patients` is only assigned in a cell further down the notebook,
# so this cell raises NameError on a fresh kernel when the cells are run top to bottom.
# Run the cell that builds `unique_patients` first (or move this cell below it).
profile = ProfileReport(unique_patients, title='Pandas Profiling Report') #, minimal=True)
# Render the generated report inline in the notebook output.
profile.to_notebook_iframe()

NameError: name 'unique_patients' is not defined

In [6]:
# Collapse the repeated-rows-per-ID frame into one row per ID and attach per-ID outcomes.
#
# The aggregates are kept as ID-indexed Series and joined back by ID value. Assigning
# `groupby(...).tolist()` positionally only lines up when the CSV rows happen to be in
# the same (sorted) ID order that groupby produces; mapping by key is order-independent.
events_per_id = clean_df.groupby(by='ID')['event'].sum()
max_tte0_per_id = clean_df.groupby(by='ID')['tte0'].max()

unique_patients = clean_df.copy()
unique_patients = unique_patients.drop(columns=['tte0', 'event'])
# drop_duplicates keeps the first row encountered for each ID.
unique_patients = unique_patients.drop_duplicates(subset='ID')

# Summed `event` per ID and the largest `tte0` per ID, matched on the ID value.
unique_patients['exacerbation_frequency'] = unique_patients['ID'].map(events_per_id)
unique_patients['censor_time'] = unique_patients['ID'].map(max_tte0_per_id)

# Binary label: 1 when the ID has two or more summed events, otherwise 0.
unique_patients['target'] = (unique_patients['exacerbation_frequency'] >= 2).astype('int64')

In [7]:
# Display the one-row-per-ID table (pandas abbreviates long/wide frames in the output).
unique_patients

Unnamed: 0,ID,trial,gender,age10,nowsmk,oxygen,fev1pp100,bmi10,sgrq10,indicated_statin,...,LABA,ICS,randomized_LAMA,randomized_LABA,randomized_ICS,randomized_statin,YIS,exacerbation_frequency,censor_time,target
0,1001-OPTIM,OPTIMAL,0,5.716406,1,0,0.260000,4.927095,8.361236,1,...,0,1,1,1,0,0,0.996578,3,0.996578,1
3,1003-OPTIM,OPTIMAL,0,7.512500,0,1,0.310000,2.753906,3.041059,0,...,0,1,1,0,0,0,0.996578,8,0.996578,1
8,1004-OPTIM,OPTIMAL,1,7.104687,0,0,0.360000,2.713490,5.433148,0,...,0,0,1,1,0,0,0.999316,7,0.999316,1
15,1005-OPTIM,OPTIMAL,0,6.723438,0,0,0.340000,2.372529,3.592873,0,...,0,1,1,1,1,0,0.996578,1,0.996578,0
17,1006-OPTIM,OPTIMAL,0,6.112500,0,1,0.580000,3.565134,7.351983,0,...,1,1,1,1,1,0,0.996578,2,0.996578,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5124,S107029-STATC,STATCOPE,0,8.000000,0,1,0.246274,2.247659,4.419430,0,...,1,1,0,0,0,1,0.999316,3,0.999316,1
5128,S107037-STATC,STATCOPE,1,7.300000,0,1,0.346282,1.988571,5.341910,0,...,1,1,0,0,0,0,0.999316,1,0.999316,0
5130,S107045-STATC,STATCOPE,1,7.400000,0,1,0.462054,3.036735,5.154660,0,...,1,0,0,0,0,1,0.711841,0,0.711841,0
5131,S107052-STATC,STATCOPE,1,5.600000,0,1,0.197229,3.160011,6.807290,0,...,1,1,0,0,0,0,0.517454,1,0.517454,0


In [10]:
# Print the column names, dtypes and non-null counts of the one-row-per-ID table.
unique_patients.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2249 entries, 0 to 5133
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       2249 non-null   object 
 1   trial                    2249 non-null   object 
 2   gender                   2249 non-null   int64  
 3   age10                    2249 non-null   float64
 4   nowsmk                   2249 non-null   int64  
 5   oxygen                   2249 non-null   int64  
 6   fev1pp100                2249 non-null   float64
 7   bmi10                    2249 non-null   float64
 8   sgrq10                   2249 non-null   float64
 9   indicated_statin         2249 non-null   int64  
 10  randomized_azithromycin  2249 non-null   int64  
 11  LAMA                     2249 non-null   int64  
 12  LABA                     2249 non-null   int64  
 13  ICS                      2249 non-null   int64  
 14  randomized_LAMA         

In [9]:
# Hold out 20% of the per-ID rows for testing; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    unique_patients,
    test_size=0.2,
    random_state=123,
)

In [44]:
# Columns removed by the preprocessor (passed to its "drop" step).
drop_features = [
    'ID',
    'trial',
    'randomized_azithromycin',
    'randomized_LAMA',
    'randomized_LABA',
    'randomized_ICS',
    'randomized_statin',
    'YIS',
    'exacerbation_frequency',
    'censor_time',
]

# Columns routed to the one-hot encoder.
binary_features = [
    'gender',
    'nowsmk',
    'oxygen',
    'indicated_statin',
    'LAMA',
    'LABA',
    'ICS',
]

# Columns routed to the standard scaler.
numeric_features = [
    'age10',
    'fev1pp100',
    'bmi10',
    'sgrq10',
]

In [45]:
# Column-wise preprocessing: drop the listed columns, one-hot encode the binary
# columns (a single output column each via drop="if_binary"), and standardize the
# numeric columns.
binary_encoder = OneHotEncoder(drop="if_binary", dtype=int)
numeric_scaler = StandardScaler()

preprocessor = make_column_transformer(
    ("drop", drop_features),
    (binary_encoder, binary_features),
    (numeric_scaler, numeric_features),
)

In [46]:
# Separate the label column from the feature columns for both partitions.
target_column = 'target'

X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]

X_test = test_df.drop(columns=[target_column])
y_test = test_df[target_column]

In [47]:
# Baseline: always predict the most frequent class, scored with cross-validation.
dummy = DummyClassifier(strategy="most_frequent")
dummy_cv_scores = cross_validate(dummy, X_train, y_train, return_train_score=True)

# Average each reported metric (fit/score time, test/train score) across the folds.
pd.DataFrame(dummy_cv_scores).mean()

fit_time       0.001130
score_time     0.000302
test_score     0.635353
train_score    0.635353
dtype: float64

In [41]:
# Cross-validate logistic regression behind the shared preprocessor.
# NOTE(review): `results_df` is reset to an empty dict here and is not filled in this cell.
results_df = {}

logreg_model = LogisticRegression(max_iter=1000, random_state=123)
pipe = make_pipeline(preprocessor, logreg_model)
scores = cross_validate(
    pipe,
    X_train,
    y_train,
    return_train_score=True,
)

# Average each reported metric across the folds.
pd.DataFrame(scores).mean()

fit_time       0.018792
score_time     0.006240
test_score     0.634797
train_score    0.646887
dtype: float64

In [43]:
# Cross-validate a LightGBM classifier (default hyperparameters) behind the shared preprocessor.
# NOTE(review): `results_df` is reset to an empty dict here and is not filled in this cell.
results_df = {}

lgbm_model = LGBMClassifier(random_state=123)
pipe = make_pipeline(preprocessor, lgbm_model)
scores = cross_validate(
    pipe,
    X_train,
    y_train,
    return_train_score=True,
)

# Average each reported metric across the folds.
pd.DataFrame(scores).mean()

fit_time       0.080493
score_time     0.008247
test_score     0.613090
train_score    0.978598
dtype: float64

In [49]:
# Cross-validate an XGBoost classifier (default hyperparameters) behind the shared preprocessor.
# NOTE(review): `results_df` is reset to an empty dict here and is not filled in this cell.
results_df = {}

xgb_model = XGBClassifier(random_state=123)
pipe = make_pipeline(preprocessor, xgb_model)
scores = cross_validate(
    pipe,
    X_train,
    y_train,
    return_train_score=True,
)

# Average each reported metric across the folds.
pd.DataFrame(scores).mean()

fit_time       0.118199
score_time     0.012803
test_score     0.589208
train_score    0.998332
dtype: float64