In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('large_repr', 'truncate')
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns


from pathlib import Path

from sklearn.preprocessing import  LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_curve, roc_auc_score,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from typing import Tuple

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Plot settings
sns.set_context('notebook') 
sns.set_style('ticks') 
colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD', '#8C564B', '#E377C2','#7F7F7F', '#BCBD22', '#17BECF']
sns.set_palette(colours)
%matplotlib inline

In [3]:
import sys
sys.path.insert(0, '../src')
from ds_toolbox import create_balanced_dataset, plot_confusion_matrix_with_labels, compute_metrics, plot_roc_curves_with_classifiers

In [4]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# Notebook-wide configuration.
DATA_PATH = Path("..") / "data"   # directory the input CSV is read from
RANDOM_STATE_PARAMETER = 42       # seed handed to the train/test split and seeded estimators
TEST_SIZE = 0.25                  # fraction of rows held out as the test split

In [6]:
# Read the raw claims table from DATA_PATH and preview the first rows.
raw_csv_path = DATA_PATH / "apld_patients_diffent_diagnosis_code.csv"
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,PATIENT_ID,Service_date,DIAGNOSIS_CODE,PRC_STD_CD,Blood,Bone,Breast,Heart,Lungs,Medication,Other,Skin,Stomach,Vagina,Season_1,Season_2,Season_3,Season_4,mBC_PATIENT
0,275849394,09/03/2013,174.8,96402,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1
1,275849394,09/03/2013,174.8,J9395,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1
2,275849394,09/03/2013,174.8,96402,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1
3,275849394,09/03/2013,174.8,J9395,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1
4,275849394,09/03/2013,174.8,96402,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1


In [7]:
# One-hot encode the procedure code column (PRC_STD_CD) and replace the
# original column with its indicator columns.
#
# The guard makes the cell idempotent: the previous in-place drop raised a
# KeyError when the cell was executed a second time, because PRC_STD_CD had
# already been removed from `data`.
if "PRC_STD_CD" in data.columns:
    # dtype is pinned so the indicators are 0/1 integers on every pandas
    # version (pandas >= 2 would otherwise produce booleans).
    ohe = pd.get_dummies(data["PRC_STD_CD"], prefix="PRC_STD_CD", dtype=np.uint8)
    data = pd.concat([data.drop(columns=["PRC_STD_CD"]), ohe], axis=1)

In [8]:
# Inspect the column index of `data` after the one-hot encoding step.
data.columns

Index(['PATIENT_ID', 'Service_date', 'DIAGNOSIS_CODE', 'Blood', 'Bone',
       'Breast', 'Heart', 'Lungs', 'Medication ', 'Other',
       ...
       'PRC_STD_CD_T2046', 'PRC_STD_CD_T4522', 'PRC_STD_CD_T4526',
       'PRC_STD_CD_T4527', 'PRC_STD_CD_T4528', 'PRC_STD_CD_T4535',
       'PRC_STD_CD_T4537', 'PRC_STD_CD_T4540', 'PRC_STD_CD_T4541',
       'PRC_STD_CD_V2632'],
      dtype='object', length=3230)

In [9]:
# Number of rows with mBC_PATIENT == 0. Summing the boolean mask gives the
# same count as len(data[mask]) without first building a filtered copy of
# the (several-thousand-column) frame.
int((data["mBC_PATIENT"] == 0).sum())

494356

In [10]:
# Number of rows with mBC_PATIENT == 1. Summing the boolean mask gives the
# same count as len(data[mask]) without first building a filtered copy of
# the (several-thousand-column) frame.
int((data["mBC_PATIENT"] == 1).sum())

7118308

In [11]:
# Move the target column to the last position so that later cells can take
# `iloc[:, -1]` as the label and `iloc[:, 3:-1]` as the features.
#
# The column is selected by name instead of by the hard-coded positions
# 17/18: pop() removes it and the assignment re-appends it at the end. This
# is safe to re-run, whereas the positional version moved a different
# (one-hot) column into the label slot on a second execution.
target_column = "mBC_PATIENT"
data[target_column] = data.pop(target_column)
data.head()

Unnamed: 0,PATIENT_ID,Service_date,DIAGNOSIS_CODE,Blood,Bone,Breast,Heart,Lungs,Medication,Other,...,PRC_STD_CD_T4522,PRC_STD_CD_T4526,PRC_STD_CD_T4527,PRC_STD_CD_T4528,PRC_STD_CD_T4535,PRC_STD_CD_T4537,PRC_STD_CD_T4540,PRC_STD_CD_T4541,PRC_STD_CD_V2632,mBC_PATIENT
0,275849394,09/03/2013,174.8,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,275849394,09/03/2013,174.8,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,275849394,09/03/2013,174.8,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,275849394,09/03/2013,174.8,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,275849394,09/03/2013,174.8,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [12]:
# for col in data.columns.tolist():
#     print(f"Col:{col}")

In [13]:
# Preview the slice later passed to train_test_split as the feature matrix:
# every column from position 3 up to (but excluding) the last one.
data.iloc[:,3:-1]

Unnamed: 0,Blood,Bone,Breast,Heart,Lungs,Medication,Other,Skin,Stomach,Vagina,...,PRC_STD_CD_T2046,PRC_STD_CD_T4522,PRC_STD_CD_T4526,PRC_STD_CD_T4527,PRC_STD_CD_T4528,PRC_STD_CD_T4535,PRC_STD_CD_T4537,PRC_STD_CD_T4540,PRC_STD_CD_T4541,PRC_STD_CD_V2632
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7612659,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7612660,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7612661,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7612662,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Preview the last column, which is later passed to train_test_split as the label.
data.iloc[:,-1]

0          1
1          1
2          1
3          1
4          1
          ..
7612659    1
7612660    1
7612661    1
7612662    1
7612663    1
Name: mBC_PATIENT, Length: 7612664, dtype: int64

In [15]:
# Hold out TEST_SIZE of the rows for evaluation.
# Features: columns from position 3 up to the last one; label: last column.
y_all = data.iloc[:, -1].values

# The two classes are heavily imbalanced, so the split is stratified on the
# label to keep the class ratio identical in the train and test parts.
#
# NOTE(review): rows are split individually, so records belonging to the same
# PATIENT_ID can end up on both sides of the split. If patient-level
# generalisation is what should be measured, a group-aware split is needed --
# confirm the intended evaluation protocol.
x_train, x_test, y_train, y_test = train_test_split(
    data.iloc[:, 3:-1],
    y_all,
    random_state=RANDOM_STATE_PARAMETER,
    test_size=TEST_SIZE,
    stratify=y_all,
)

In [None]:
# Baseline random forest on the original (non-resampled) training split.
# The forest is seeded with the notebook-wide RANDOM_STATE_PARAMETER so the
# reported metrics are reproducible across runs, and n_jobs=-1 uses all cores
# (matching how the XGBoost model is configured).
rf = RandomForestClassifier(random_state=RANDOM_STATE_PARAMETER, n_jobs=-1)
rf = rf.fit(x_train, y_train)

# Evaluate on the held-out test split.
y_prediction = rf.predict(x_test)
(accuracy, sensitivity, specificity) = compute_metrics(y_test, y_prediction)
print(f"accuracy:{accuracy}, sensitivity:{sensitivity}, specificity:{specificity}")

In [None]:
# Confusion matrix of the random forest on the test split, via the project
# helper (presumably also written to the given file name -- see ds_toolbox).
plot_confusion_matrix_with_labels(
    x_test,
    y_test,
    rf,
    ["non_mBC", "mBC"],
    filename="random_forest_prc_std_cd.pdf",
)

# Logistic Regression

In [None]:
%%time
model = LogisticRegression(class_weight='balanced')
tuning_parameters = {
    'C':[1e-4,1e-3,1e-2,0.1,1],
    "max_iter": [2000, 3000, 4000, 5000]
}

lr_search = RandomizedSearchCV(model, tuning_parameters, cv = 5, n_iter= 5, n_jobs=4,
                              random_state = 20)
lr_search.fit(x_train, y_train)
print('Best parameters:', lr_search.best_params_)
best_lr_model = lr_search.best_estimator_
y_prediction = best_lr_model.predict(x_test)
(accuracy, sensitivity, specificity) = compute_metrics(y_test, y_prediction)
print(f"accuracy:{accuracy}, sensitivity:{sensitivity}, specificity:{specificity}")

In [None]:
# Confusion matrix of the tuned logistic regression on the test split, via the
# project helper (presumably also written to the given file name -- see ds_toolbox).
plot_confusion_matrix_with_labels(
    x_test,
    y_test,
    best_lr_model,
    ["non_mBC", "mBC"],
    filename="logistic_reg_prc_std_cd.pdf",
)

In [None]:
# ROC comparison of the tuned logistic regression and the random forest on
# the held-out test split.
#
# Fixes relative to the previous version of this cell:
#   * the plot helper was called before `scores` had been computed, which
#     raises NameError on a fresh kernel;
#   * `labels` was never defined anywhere in the notebook;
#   * the loop variable no longer rebinds the notebook-level name `model`.
models = [best_lr_model, rf]

# One display name per entry of `models`, in the same order.
# NOTE(review): assumes the helper accepts a pandas Series of names (works with
# positional indexing as well as iteration) -- confirm against ds_toolbox.
labels = pd.Series(["Logistic Regression", "Random Forest"])

# Column i holds model i's predicted probability of the positive class.
scores = np.column_stack(
    [classifier.predict_proba(x_test)[:, 1] for classifier in models]
)
plot_roc_curves_with_classifiers(scores, y_test, labels);

In [22]:
# XGBoost classifier fitted on the original (non-resampled) training split,
# evaluated on the test split and followed by its confusion matrix.
xgb = XGBClassifier(n_jobs=-1, random_state=RANDOM_STATE_PARAMETER)
xgb.fit(x_train, y_train)

y_prediction = xgb.predict(x_test)
accuracy, sensitivity, specificity = compute_metrics(y_test, y_prediction)
print(f"accuracy:{accuracy}, sensitivity:{sensitivity}, specificity:{specificity}")

plot_confusion_matrix_with_labels(
    x_test,
    y_test,
    xgb,
    ["non_mBC", "mBC"],
    filename="xgb_prc_std_cd.pdf",
)

KeyboardInterrupt: 

In [None]:
# Ratio of positive (label 1) to negative (label 0) rows in the training split.
n_positive = np.count_nonzero(y_train == 1)
n_negative = np.count_nonzero(y_train == 0)
mbc_ratio = n_positive / n_negative
mbc_ratio

# Oversampling the training data (SMOTE) — used by the Logistic Regression, Random Forest and XGBoost models below

In [None]:
# Build a resampled training set with the project helper, using the 'SMOTE'
# strategy. Only the training split is passed in; the test split is untouched.
# NOTE(review): the exact resampling behaviour lives in ds_toolbox -- presumably
# the minority class is oversampled to parity; confirm there.
X_sampled, y_sampled = create_balanced_dataset('SMOTE', x_train, y_train)

In [None]:
# Preview the first rows of the resampled feature set.
X_sampled.head()

# Random Forest on oversampled data

In [None]:
# Random forest trained on the resampled training data and evaluated on the
# original, untouched test split.
# The forest is seeded with the notebook-wide RANDOM_STATE_PARAMETER so the
# reported metrics are reproducible across runs, and n_jobs=-1 uses all cores
# (matching how the XGBoost models are configured).
oversampling_rf = RandomForestClassifier(random_state=RANDOM_STATE_PARAMETER, n_jobs=-1)
oversampling_rf = oversampling_rf.fit(X_sampled, y_sampled)

y_prediction = oversampling_rf.predict(x_test)
(accuracy, sensitivity, specificity) = compute_metrics(y_test, y_prediction)
print(f"accuracy:{accuracy}, sensitivity:{sensitivity}, specificity:{specificity}")

In [None]:
# XGBoost classifier trained on the resampled training data and evaluated on
# the original test split.
oversampling_xgb = XGBClassifier(n_jobs=-1, random_state=RANDOM_STATE_PARAMETER)
oversampling_xgb.fit(X_sampled, y_sampled)

y_prediction = oversampling_xgb.predict(x_test)
accuracy, sensitivity, specificity = compute_metrics(y_test, y_prediction)
print(f"accuracy:{accuracy}, sensitivity:{sensitivity}, specificity:{specificity}")

In [None]:
%%time
model = LogisticRegression()
#model = LogisticRegression(class_weight='balanced')
tuning_parameters = {
    'C':[2, 5, 10],
    "max_iter": [2000, 3000, 4000, 5000]
}

lr_search = RandomizedSearchCV(model, tuning_parameters, cv = 5, n_iter= 5, n_jobs=4,
                              random_state = 20)
lr_search.fit(X_sampled, y_sampled)
print('Best parameters:', lr_search.best_params_)
oversampling_best_lr_model = lr_search.best_estimator_
y_prediction = oversampling_best_lr_model.predict(x_test)
(accuracy, sensitivity, specificity) = compute_metrics(y_test, y_prediction)
print(f"accuracy:{accuracy}, sensitivity:{sensitivity}, specificity:{specificity}")