In [1]:
!unzip -q training_setA.zip
!unzip -q training_setB.zip

!pip install -U imbalanced-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.0-py3-none-any.whl (225 kB)
[K     |████████████████████████████████| 225 kB 30.2 MB/s 
Installing collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.8.1
    Uninstalling imbalanced-learn-0.8.1:
      Successfully uninstalled imbalanced-learn-0.8.1
Successfully installed imbalanced-learn-0.10.0


In [1]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, balanced_accuracy_score

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler

import pandas as pd
import numpy as np
import os
from tqdm import tqdm

We observe that there are a lot of NaN values in the dataset. We begin by inspecting what percentage they occupy in each column, and discard columns with too many missing values.

In [12]:
def nanpercent():
  """
  check the percentage of NaN for every feature.
  """
  # total number of data instances
  num_data = 0

  # total numer of NaN numbers for every feature
  nan_array = np.zeros(41)

  for file_name in tqdm(os.listdir("./training/")):
    data = pd.read_csv(f"./training/{file_name}", sep='|')
    num_data += len(data)
    columns = data.columns
    for col in columns:
      num = data[col].isna().sum()

      # index of a column
      index = columns.get_loc(col)

      nan_array[index] += num
      
  np.set_printoptions(suppress=True)
  print(nan_array / num_data)

In [13]:
nanpercent()

100%|██████████| 20336/20336 [03:28<00:00, 97.58it/s]

[0.07743336 0.12032042 0.66224255 0.15211177 0.10232405 0.48125763
 0.09776833 1.         0.89574863 0.91949406 0.85807027 0.88532868
 0.91231753 0.95044387 0.98504205 0.91840702 0.9854065  0.95024392
 0.91676063 0.93357884 0.9985042  0.87768392 0.96565112 0.92219712
 0.94951247 0.8913764  0.98773372 0.99877881 0.88223711 0.91164303
 0.95152459 0.92489639 0.99236917 0.93482913 0.         0.
 0.48868346 0.48868346 0.00001012 0.         0.        ]





In the training dataset, we see that only columns 1-7 and 35-41 are usable, as other columns have more than 85% of missing values.

As sklearn does not naturally come with online learning, we create a whole dataset here and fit the model on it. (This includes the missing-value-imputing step)

In [2]:
def prepare_train_dataset():

  X = np.zeros((0, 13))
  Y = np.zeros(0)
  for file_name in tqdm(os.listdir("./training/")):
    data = pd.read_csv(f"./training/{file_name}", sep='|', usecols = 
      ["HR", "O2Sat", "Temp", "SBP", "MAP", "DBP", "Resp", "Age", "Gender", "Unit1", "Unit2", "HospAdmTime", "ICULOS", "SepsisLabel"]
    )

    # These concatenations get slower as the dataset gets bigger
    X = np.concatenate((X, data.iloc[:, :13] .to_numpy()), axis=0)
    Y = np.concatenate((Y, data.iloc[:, 13].to_numpy()), axis=0)

  imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
  X = imp_mean.fit_transform(X)

  return X, Y

In [3]:
def prepare_test_dataset():

  X = np.zeros((0, 13))
  Y = np.zeros(0)
  for file_name in tqdm(os.listdir("./training_setB/")):
    data = pd.read_csv(f"./training_setB/{file_name}", sep='|', usecols = 
      ["HR", "O2Sat", "Temp", "SBP", "MAP", "DBP", "Resp", "Age", "Gender", "Unit1", "Unit2", "HospAdmTime", "ICULOS", "SepsisLabel"]
    )
    X = np.concatenate((X, data.iloc[:, :13] .to_numpy()), axis=0)
    Y = np.concatenate((Y, data.iloc[:, 13].to_numpy()), axis=0)

  imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
  X = imp_mean.fit_transform(X)

  return X, Y

In [4]:
X, y = prepare_train_dataset()
X_test, y_test = prepare_test_dataset()

print(X.shape)
print(X_test.shape)

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.fit_transform(X_test)

100%|██████████| 20336/20336 [06:07<00:00, 55.41it/s]
100%|██████████| 20000/20000 [05:19<00:00, 62.65it/s]


(790215, 13)
(761995, 13)


In [22]:
# majority baseline
np.count_nonzero(y_test == 0) / len(y_test)

0.9858529255441308

In [23]:
clf_log = LogisticRegression().fit(X, y)
clf_log.score(X_test, y_test)

0.9854907184430344

In [24]:
clf_nb = GaussianNB().fit(X, y)
clf_nb.score(X_test, y_test)

0.957878988707275

In [25]:
# we use dual=False because n_samples >> n_features
clf_svm = LinearSVC(dual=False).fit(X, y)
clf_svm.score(X_test, y_test)

0.9858529255441308

In [26]:
clf_rf = RandomForestClassifier().fit(X, y)
clf_rf.score(X_test, y_test)

0.9840550134843404

In [27]:
clf_gb = GradientBoostingClassifier().fit(X, y)
clf_gb.score(X_test, y_test)

0.9723252777249195

We find that all these models fall behind the majority baseline in terms of classification accuracy. This is mostly because of the strong imbalance between labels. Now let's make some modifications to fix this.

First we introduce balanced accuracy score to better balance the evaluation of two labels.




In [12]:
# majority baseline
balanced_accuracy_score(y_test, np.zeros(761995))

0.5

In [13]:
balanced_accuracy_score(y_test, clf_log.predict(X_test))

0.5028335932524005

In [14]:
# Note that naive bayes already takes class prior into account
balanced_accuracy_score(y_test, clf_nb.predict(X_test))

0.575736871203715

In [15]:
balanced_accuracy_score(y_test, clf_svm.predict(X_test))

0.5

In [16]:
balanced_accuracy_score(y_test, clf_rf.predict(X_test))

0.5099218946815126

In [18]:
balanced_accuracy_score(y_test, clf_gb.predict(X_test))

0.5093685079274852

We see that the balanced accuracies are very bad, just slightly above the performance of the majority baseline (except for Gaussian Naive Bayes). 

Note also that this is a typical medical application where a false negative does much more harm than a false positive. We introduce another evaluation metric, which weights more on false negative samples. We see similar results with balanced accuracy.

In [5]:
weight = np.array([[0, 1], [1000, 0]])

In [29]:
matrix = confusion_matrix(y_test, np.zeros(761995))
np.multiply(weight, matrix).sum()

10780000

In [30]:
matrix = confusion_matrix(y_test, clf_log.predict(X_test))
np.multiply(weight, matrix).sum()

10714342

In [31]:
matrix = confusion_matrix(y_test, clf_nb.predict(X_test))
np.multiply(weight, matrix).sum()

8836283

In [32]:
matrix = confusion_matrix(y_test, clf_svm.predict(X_test))
np.multiply(weight, matrix).sum()

10780000

In [33]:
matrix = confusion_matrix(y_test, clf_rf.predict(X_test))
np.multiply(weight, matrix).sum()

10544607

In [34]:
matrix = confusion_matrix(y_test, clf_gb.predict(X_test))
np.multiply(weight, matrix).sum()

10435663

The first way to mitigate this is to balance the class weight (coefficients in the loss function for different classes), but this option is not available for all models.

In [35]:
clf_log = LogisticRegression(class_weight="balanced").fit(X, y)
clf_svm = LinearSVC(dual=False, class_weight="balanced").fit(X, y)
clf_rf = RandomForestClassifier(class_weight="balanced").fit(X, y)

In [21]:
print(balanced_accuracy_score(y_test, clf_log.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_svm.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_rf.predict(X_test)))

0.6432234328933819
0.6435837212114642
0.500055282427625


In [36]:
matrix = confusion_matrix(y_test, clf_log.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_svm.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_rf.predict(X_test))
print(np.multiply(weight, matrix).sum())

5084337
5210384
10775149


This improves the performance of Logistic Regression and Support Vector Machine, but not Random Forest.

The second way is upsampling minority class or downsampling majority class. As naive upsampling/downsampling plays a similar role as changing class weights, we also try to use SMOTE and ADASYN, which are two data synthesis techniques by interpolation.

One could also try to use SMOTEENN or SMOTETomek to clean up the resampled dataset, but as our dataset has a large number of instances, this is not computationally feasible.

In [6]:
X_resampled_over, y_resampled_over = RandomOverSampler().fit_resample(X, y)
X_resampled_under, y_resampled_under = RandomUnderSampler().fit_resample(X, y)
X_resampled_smote, y_resampled_smote = SMOTE().fit_resample(X, y)
X_resampled_adasyn, y_resampled_adasyn = ADASYN().fit_resample(X, y)

In [7]:
# RandomOverSampler
clf_log = LogisticRegression().fit(X_resampled_over, y_resampled_over)
clf_nb = GaussianNB().fit(X_resampled_over, y_resampled_over)
clf_svm = LinearSVC(dual=False).fit(X_resampled_over, y_resampled_over)
clf_rf = RandomForestClassifier().fit(X_resampled_over, y_resampled_over)
clf_gb = GradientBoostingClassifier().fit(X_resampled_over, y_resampled_over)

print(balanced_accuracy_score(y_test, clf_log.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_nb.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_svm.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_rf.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_gb.predict(X_test)))

matrix = confusion_matrix(y_test, clf_log.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_nb.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_svm.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_rf.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_gb.predict(X_test))
print(np.multiply(weight, matrix).sum())

0.6434337974213375
0.6570116000823255
0.6412360757267722
0.5011370019035458
0.6473321243703938
5081230
6070221
5209194
10749452
6248499


In [8]:
# RandomUnderSampler
clf_log = LogisticRegression().fit(X_resampled_under, y_resampled_under)
clf_nb = GaussianNB().fit(X_resampled_under, y_resampled_under)
clf_svm = LinearSVC(dual=False).fit(X_resampled_under, y_resampled_under)
clf_rf = RandomForestClassifier().fit(X_resampled_under, y_resampled_under)
clf_gb = GradientBoostingClassifier().fit(X_resampled_under, y_resampled_under)

print(balanced_accuracy_score(y_test, clf_log.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_nb.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_svm.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_rf.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_gb.predict(X_test)))

matrix = confusion_matrix(y_test, clf_log.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_nb.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_svm.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_rf.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_gb.predict(X_test))
print(np.multiply(weight, matrix).sum())

0.643481719787105
0.6557399468025682
0.6413010943915878
0.6625568619652489
0.6672622957487905
5081158
6067480
5208166
5787447
5639900


In [9]:
# SMOTE
clf_log = LogisticRegression().fit(X_resampled_smote, y_resampled_smote)
clf_nb = GaussianNB().fit(X_resampled_smote, y_resampled_smote)
clf_svm = LinearSVC(dual=False).fit(X_resampled_smote, y_resampled_smote)
clf_rf = RandomForestClassifier().fit(X_resampled_smote, y_resampled_smote)
clf_gb = GradientBoostingClassifier().fit(X_resampled_smote, y_resampled_smote)

print(balanced_accuracy_score(y_test, clf_log.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_nb.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_svm.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_rf.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_gb.predict(X_test)))

matrix = confusion_matrix(y_test, clf_log.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_nb.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_svm.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_rf.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_gb.predict(X_test))
print(np.multiply(weight, matrix).sum())

0.6421063890720904
0.6556501371303535
0.6395277032160281
0.5041131712327946
0.5352864667216846
5067409
6224838
5197806
7496324
1976451


In [10]:
# ADASYN
clf_log = LogisticRegression().fit(X_resampled_adasyn, y_resampled_adasyn)
clf_nb = GaussianNB().fit(X_resampled_adasyn, y_resampled_adasyn)
clf_svm = LinearSVC(dual=False).fit(X_resampled_adasyn, y_resampled_adasyn)
clf_rf = RandomForestClassifier().fit(X_resampled_adasyn, y_resampled_adasyn)
clf_gb = GradientBoostingClassifier().fit(X_resampled_adasyn, y_resampled_adasyn)

print(balanced_accuracy_score(y_test, clf_log.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_nb.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_svm.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_rf.predict(X_test)))
print(balanced_accuracy_score(y_test, clf_gb.predict(X_test)))

matrix = confusion_matrix(y_test, clf_log.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_nb.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_svm.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_rf.predict(X_test))
print(np.multiply(weight, matrix).sum())
matrix = confusion_matrix(y_test, clf_gb.predict(X_test))
print(np.multiply(weight, matrix).sum())

0.634287085718909
0.6592768614041296
0.6314470005097617
0.5034027895835339
0.5292988240929718
4836345
5887267
4938295
7158757
1745426


Next we inspect some characteristics of our models and try to interpret them. We try to deduce the importance of features. Note that Gaussian Naive Bayes doesn't fit in this interpretation.

In [11]:
clf_log.coef_

array([[ 0.22310042,  0.06156067,  0.09078094, -0.07791875, -0.12009886,
         0.02910475,  0.19436963,  0.08632727,  0.12687336,  0.18212114,
        -0.18212114, -0.03622169,  0.44252532]])

In [12]:
clf_svm.coef_

array([[ 0.10075912,  0.02617217,  0.04007523, -0.03348258, -0.05318494,
         0.01290629,  0.08730159,  0.03822141,  0.05603859,  0.08192074,
        -0.08192074, -0.01585976,  0.17486564]])

In [13]:
clf_gb.feature_importances_

array([0.04529501, 0.28922028, 0.01627583, 0.01448059, 0.01513379,
       0.03415826, 0.18675176, 0.00203189, 0.00213415, 0.03056403,
       0.01643807, 0.00840552, 0.33911083])

In [14]:
clf_rf.feature_importances_

array([0.0795065 , 0.11253901, 0.03978732, 0.05573005, 0.05376415,
       0.06826277, 0.13752168, 0.10309944, 0.01614442, 0.02364708,
       0.02159104, 0.10319855, 0.18520799])