In [70]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
# IterativeImputer is still experimental in scikit-learn: this enabling import
# must run before IterativeImputer is imported from sklearn.impute below.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.impute import (SimpleImputer, KNNImputer, IterativeImputer)
# from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
# from sklearn.model_selection import StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MinMaxScaler
# from sklearn.svm import SVC
# Print NumPy floating point values with 6 digits of precision.
np.set_printoptions(precision=6)

# Single MinMaxScaler instance shared at module level; it is fitted later,
# wherever features are scaled.
scaler = MinMaxScaler()

In [29]:
# Read the three CSV files; the first CSV column becomes the DataFrame index.
df_train_features = pd.read_csv('train_features.csv', index_col=0)
df_train_labels = pd.read_csv('train_labels.csv', index_col=0)
df_test_features = pd.read_csv('test_features.csv', index_col=0)

# Row identifiers of each table (presumably the patient id - TODO confirm
# against the CSV headers).
IdF_, IdL_, IdF_test_ = (
    frame.index
    for frame in (df_train_features, df_train_labels, df_test_features)
)

# Plain arrays: the feature matrices drop their first remaining column,
# the label matrix is taken as-is.
x_ = df_train_features.values[:, 1:]
x_test = df_test_features.values[:, 1:]
y_ = df_train_labels.values

In [None]:
# Per-column population statistics (NaN entries ignored). The medians are used
# later as fill values for columns a patient has no measurement for.
pop_mean = np.nanmean(x_, axis=0)
pop_median = np.nanmedian(x_, axis=0)
pop_std = np.nanstd(x_, axis=0)

pop_mean_test = np.nanmean(x_test, axis=0)
pop_median_test = np.nanmedian(x_test, axis=0)
pop_std_test = np.nanstd(x_test, axis=0)

# Unique row identifiers, in order of first appearance
IdF_unique = pd.unique(IdF_)            # training features
IdL_unique = pd.unique(IdL_)            # training labels
IdF_test_unique = pd.unique(IdF_test_)  # test features

# Sanity check, should print True: features and labels must list exactly the
# same ids in the same order, because later cells pair them by position.
# np.array_equal also reports False (instead of raising a broadcasting error)
# when the two id lists have different lengths.
ids_match = np.array_equal(IdF_unique, IdL_unique)
print(ids_match)

In [41]:
# Imputing Strategy (depends on how many nan values in each column)
def impute_strategy(nb_Nan, n_rows=12, median_min_nan=5):
    """Pick the imputation strategy code for one column of one patient.

    Parameters
    ----------
    nb_Nan : int
        Number of missing values in the column; must be positive.
    n_rows : int, default 12
        Missing count from which the column is treated as entirely missing.
    median_min_nan : int, default 5
        Smallest missing count that is handled with the median strategy.

    Returns
    -------
    str
        'C' when nb_Nan >= n_rows,
        'M' when median_min_nan <= nb_Nan < n_rows,
        'I' when 0 < nb_Nan < median_min_nan.

    Raises
    ------
    ValueError
        If nb_Nan is not positive. Previously this case failed with an
        UnboundLocalError because no branch assigned a strategy.
    """
    if not nb_Nan > 0:
        raise ValueError(
            "impute_strategy expects a positive NaN count, got %r" % (nb_Nan,)
        )
    if nb_Nan >= n_rows:
        return 'C'
    if nb_Nan >= median_min_nan:
        return 'M'
    return 'I'

# Impute Data function
def impute_data(strategy, col_median):
    """Build an unfitted scikit-learn imputer for a strategy code.

    Parameters
    ----------
    strategy : str
        'C'   -> SimpleImputer filling every NaN with the constant col_median
        'M'   -> SimpleImputer filling every NaN with the median of the fitted data
        'KNN' -> KNNImputer (5 neighbours, distance weights, nan_euclidean metric)
        'I'   -> IterativeImputer driven by a BayesianRidge estimator
    col_median : float
        Fill value used by the 'C' strategy only; ignored by the others.

    Returns
    -------
    The imputer instance matching strategy (not fitted).

    Raises
    ------
    ValueError
        If strategy is not one of the codes above. Previously this case failed
        with an UnboundLocalError because no branch created an imputer.
    """
    if strategy == 'C':
        return SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=col_median)

    if strategy == 'M':
        return SimpleImputer(missing_values=np.nan, strategy='median', fill_value=None)

    if strategy == 'KNN':
        return KNNImputer(missing_values=np.nan, n_neighbors=5, weights='distance', metric='nan_euclidean')

    if strategy == 'I':
        return IterativeImputer(estimator=BayesianRidge(), missing_values=np.nan, max_iter=10, tol=0.001, n_nearest_features=5, initial_strategy='mean')

    raise ValueError("unknown imputation strategy: %r" % (strategy,))

# Functions to get one patient's rows from the patient's positional index (0-based position among the unique ids, not the id value itself)
def get_patient_data(pid_index, data=None, rows_per_patient=12):
    """Return the block of consecutive rows that belongs to one patient.

    Parameters
    ----------
    pid_index : int
        0-based position of the patient; block k starts at row
        k * rows_per_patient.
    data : 2-D array, optional
        Array to slice. Defaults to the training feature matrix x_.
    rows_per_patient : int, default 12
        Number of consecutive rows stored per patient.

    Returns
    -------
    2-D array
        The slice data[k*rows_per_patient:(k+1)*rows_per_patient, :]; it is
        shorter if the array ends early.
    """
    if data is None:
        data = x_
    start = pid_index * rows_per_patient
    return data[start:start + rows_per_patient, :]

def get_patient_data_test(pid_index, data=None, rows_per_patient=12):
    """Return the block of consecutive test-set rows that belongs to one patient.

    Parameters
    ----------
    pid_index : int
        0-based position of the patient; block k starts at row
        k * rows_per_patient.
    data : 2-D array, optional
        Array to slice. Defaults to the test feature matrix x_test.
    rows_per_patient : int, default 12
        Number of consecutive rows stored per patient.

    Returns
    -------
    2-D array
        The slice data[k*rows_per_patient:(k+1)*rows_per_patient, :]; it is
        shorter if the array ends early.
    """
    if data is None:
        data = x_test
    start = pid_index * rows_per_patient
    return data[start:start + rows_per_patient, :]

In [None]:
# Preprocessing for subtask C
def impute_per_patient(x, n_patients, col_medians, rows_per_patient=12):
    """Impute missing values patient by patient and column by column.

    Parameters
    ----------
    x : 2-D float array
        Feature matrix in which patient k occupies the consecutive rows
        k*rows_per_patient .. (k+1)*rows_per_patient - 1.
    n_patients : int
        Number of patient blocks to process.
    col_medians : 1-D array
        One fill value per column, handed to impute_data (only the constant
        strategy 'C' uses it).
    rows_per_patient : int, default 12
        Number of rows stored per patient.

    Returns
    -------
    2-D array
        Array with the shape and dtype of x. Columns of a patient without NaN
        are copied unchanged; the others are filled by the imputer selected
        through impute_strategy / impute_data.
    """
    x_out = np.zeros_like(x)
    n_cols = x.shape[1]
    for pid_idx in range(n_patients):
        rows = slice(pid_idx * rows_per_patient, (pid_idx + 1) * rows_per_patient)
        x_patient = x[rows, :]

        # Every column is handled separately.
        for col in range(n_cols):
            column = x_patient[:, col]
            nan_count = np.isnan(column).sum()

            if nan_count > 0:
                strategy = impute_strategy(nan_count)
                impute_estimator = impute_data(strategy, col_medians[col])
                # NOTE(review): the imputer is fitted on this single column
                # only (shape (rows, 1)), so it never sees the other features
                # or the other patients - confirm this is intended.
                x_out[rows, col] = impute_estimator.fit_transform(column.reshape(-1, 1)).reshape(-1)
            else:
                x_out[rows, col] = column
    return x_out


# ---------------------- TRAINING DATA PREPROCESSING ----------------------------
x_imputed = impute_per_patient(x_, len(IdF_unique), pop_median)

print("done")

# ---------------------- TEST DATA PREPROCESSING ----------------------------
# NOTE(review): the test set is filled with its own population medians
# (pop_median_test), not with the training medians - confirm this is intended.
x_test_imputed = impute_per_patient(x_test, len(IdF_test_unique), pop_median_test)

print("done")

# Once the data has been processed, save it to avoid recomputing it
np.save('x_imputed.npy', x_imputed)
np.save('x_test_imputed.npy', x_test_imputed)

In [None]:
# Preprocessing for subtask A, B
def summarize_patients(x, n_patients, rows_per_patient=12, empty_value=-1):
    """Collapse each patient's block of rows into one row of per-column means.

    Parameters
    ----------
    x : 2-D float array
        Feature matrix in which patient k occupies the consecutive rows
        k*rows_per_patient .. (k+1)*rows_per_patient - 1.
    n_patients : int
        Number of patient blocks, i.e. number of output rows.
    rows_per_patient : int, default 12
        Number of rows stored per patient.
    empty_value : float, default -1
        Value written for a column in which the patient has no measurement.

    Returns
    -------
    2-D float array of shape (n_patients, x.shape[1])
        NaN-ignoring mean of every column per patient, or empty_value where
        the whole column is NaN for that patient.
    """
    n_features = x.shape[1]  # taken from the data instead of a hard-coded 35
    summary = np.full((n_patients, n_features), empty_value, dtype=float)
    for pid_idx in range(n_patients):
        start = pid_idx * rows_per_patient
        x_patient = x[start:start + rows_per_patient, :]
        # The mean is computed once per patient, and only over columns that
        # hold at least one value, so all-NaN columns keep empty_value and do
        # not trigger NumPy's "Mean of empty slice" warning.
        has_data = ~np.isnan(x_patient).all(axis=0)
        summary[pid_idx, has_data] = np.nanmean(x_patient[:, has_data], axis=0)
    return summary


# ---------------------- TRAINING DATA PREPROCESSING ----------------------------
x_imputed = summarize_patients(x_, len(IdF_unique))

print(x_imputed)

print("done")

# ---------------------- TEST DATA PREPROCESSING ----------------------------
x_test_imputed = summarize_patients(x_test, len(IdF_test_unique))

print(x_test_imputed)

print("done")

# Once the data has been processed, save it to avoid recomputing it
np.save('x_imputed2.npy', x_imputed)
np.save('x_test_imputed2.npy', x_test_imputed)

In [None]:
# Column order of the prediction table: the id column followed by every label.
Labels = [
    'pid',
    'LABEL_BaseExcess',
    'LABEL_Fibrinogen',
    'LABEL_AST',
    'LABEL_Alkalinephos',
    'LABEL_Bilirubin_total',
    'LABEL_Lactate',
    'LABEL_TroponinI',
    'LABEL_SaO2',
    'LABEL_Bilirubin_direct',
    'LABEL_EtCO2',
    'LABEL_Sepsis',
    'LABEL_RRate',
    'LABEL_ABPm',
    'LABEL_SpO2',
    'LABEL_Heartrate',
]

# Prediction table, initialised with a single all-zero integer row.
_placeholder_row = [0] * len(Labels)
output = pd.DataFrame(np.array([_placeholder_row]), columns=Labels)

In [83]:
# Load the per-patient feature summaries (subtasks A, B), in case they have
# already been calculated and saved.
# The files hold plain numeric arrays written by np.save, so pickle support is
# not needed; allow_pickle takes a bool (the former string 'TRUE' only worked
# because any non-empty string is truthy).
x_imputed = np.load('x_imputed2.npy', allow_pickle=False)
x_test_imputed = np.load('x_test_imputed2.npy', allow_pickle=False)

In [None]:
# for cross-validation, skipped
# ---------------------- SUBTASK A,B: CLASSIFICATION ----------------------------
# c=RandomForestClassifier(min_samples_leaf=20, class_weight='balanced', n_estimators=100)
# brm_clf = OneVsRestClassifier(c)

# # each row contains the total data for one patient
# # x_imputed = x_imputed.reshape(-1, 36)
# cnt = 0

# # 10 fold cross-validation to choose hyperparameters
# num_fold = 10
# kf = KFold(n_splits=num_fold)
# score = 0
# for train_index, val_index in kf.split(x_imputed): # split the data into validation and training sets
#     print(cnt, 'starting .. ', end="")

#     x_train = x_imputed[train_index]
#     #x_train_scaled = scaler.fit_transform(x_train)
#     x_val = x_imputed[val_index]
#     #x_val_scaled = scaler.transform(x_val)

#     y_train = y_[train_index, 0:11]
#     y_val = y_[val_index, 0:11]

#     brm_clf.fit(x_train, y_train) # apply model to training data
#     y_est = brm_clf.predict_proba(x_val) # predict labels using validation data

#     score += np.mean([(roc_auc_score(y_val[:, k], y_est[:, k])) for k in range(11)])
#     print("Task A score :", np.mean([(roc_auc_score(y_val[:, k], y_est[:, k])) for k in range(10)]), "Task B score :", roc_auc_score(y_val[:, 10], y_est[:, 10]))
#     print("done")
#     cnt += 1
    
# # EVALUATION of model
# # Calculate average auroc score for all 10 folds
# avg_score = score/num_fold

# print(avg_score)

In [85]:
# Fit the Random Forest on the whole training set
x_tot = x_imputed
y_tot = y_[:, 0:11]  # first 11 label columns = the classification targets

# random_state is fixed so that re-running the cell reproduces the same forest
# and therefore the same predicted probabilities.
c = RandomForestClassifier(min_samples_leaf=20, class_weight='balanced', n_estimators=100, random_state=0)

# One independent binary classifier per label column
brm_clf = OneVsRestClassifier(c)
brm_clf.fit(x_tot, y_tot)

# Predict the label probabilities for the test features
x_test_tasksA_B = x_test_imputed
y_est_test = brm_clf.predict_proba(x_test_tasksA_B)

In [88]:
# Write in the output dataframe the obtained values
TESTS = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2','LABEL_Sepsis']

# Give the table one row per test patient once, then assign whole columns.
# Growing the frame row by row through .loc re-allocates it on every new row.
# pid is assigned as a whole column, so it keeps the dtype of the ids and is
# not affected by float_format when the table is written.
n_test = len(IdF_test_unique)
output = output.reindex(range(n_test))
output['pid'] = np.asarray(IdF_test_unique)
output[TESTS] = y_est_test  # raises if the prediction row count differs from n_test

In [62]:
# Load the row-wise imputed data (subtask C), in case it has already been
# calculated and saved. The files hold plain numeric arrays, so pickle support
# is not needed.
x_imputed = np.load('x_imputed.npy', allow_pickle=False)
x_test_imputed = np.load('x_test_imputed.npy', allow_pickle=False)

# Flatten the 12 rows of every patient into a single sample. The width is
# derived from the loaded array: the previous hard-coded 12*36 disagrees with
# the 35 feature columns assumed by the subtask A/B preprocessing, and a wrong
# width either fails or silently mixes rows of different patients.
x_imputed = x_imputed.reshape(-1, 12 * x_imputed.shape[1])
x_test_imputed = x_test_imputed.reshape(-1, 12 * x_test_imputed.shape[1])

In [None]:
# ---------------------- SUBTASK C: REGRESSION ----------------------------
def taskC_regressor(alpha_, num_fold=10):
    """Cross-validate a Ridge regressor for subtask C and return its mean score.

    Reads the module-level x_imputed (features), y_ (labels, columns 11:15 are
    the regression targets) and scaler (refitted on every training fold).

    Parameters
    ----------
    alpha_ : float
        Regularisation strength passed to Ridge.
    num_fold : int, default 10
        Number of consecutive (unshuffled) KFold splits.

    Returns
    -------
    float
        Average over the folds of mean_k(0.5 + 0.5 * max(0, R2_k)), where k
        runs over the target columns. The score of each fold is printed.
    """
    c = Ridge(alpha=alpha_)
    kf = KFold(n_splits=num_fold)
    score = 0
    for train_index, val_index in kf.split(x_imputed):
        # The scaler is fitted on the training fold only and then applied to
        # the validation fold, so no validation statistics leak into training.
        x_train_scaled = scaler.fit_transform(x_imputed[train_index])
        x_val_scaled = scaler.transform(x_imputed[val_index])

        y_train = y_[train_index, 11:15]
        y_val = y_[val_index, 11:15]

        c.fit(x_train_scaled, y_train)
        y_est = c.predict(x_val_scaled)

        # R2-based score of this fold, computed once and reused for both the
        # running total and the progress print.
        fold_score = np.mean([0.5 + 0.5*np.maximum(0, r2_score(y_val[:, k], y_est[:, k]))
                              for k in range(y_val.shape[1])])
        score += fold_score
        print(fold_score)

    # Average score over all folds
    avg_score = score/num_fold

    return avg_score

# Candidate regularisation strengths: 50 log-spaced values from 1e-3 to 1e+3
alpha_values = np.logspace(start=-3, stop=3, num=50, base=10.0)
print (alpha_values)

# Cross-validated score of every candidate, in the same order as alpha_values
avg_scores = list(map(taskC_regressor, alpha_values))
print(avg_scores)

# Keep the candidate with the highest average score
best_index = np.argmax(avg_scores)
best_alpha = alpha_values[best_index]
print(best_alpha)

In [64]:
# Refit on the complete training set with the selected alpha.
y_tot = y_[:, 11:15]
x_tot_scaled = scaler.fit_transform(x_imputed)

c = Ridge(alpha=best_alpha).fit(x_tot_scaled, y_tot)

# The test features are scaled with the statistics learned on the training set.
x_test_scaled = scaler.transform(x_test_imputed)
y_est_test = c.predict(x_test_scaled)

In [65]:
# Write in the output dataframe the obtained values
regression_labels = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']

# Make sure the table has one row per test patient (this is a no-op when the
# classification cell already created the rows), then assign whole columns
# instead of growing / filling the frame row by row through .loc.
n_test = len(IdF_test_unique)
output = output.reindex(range(n_test))
output['pid'] = np.asarray(IdF_test_unique).astype(int)
output[regression_labels] = y_est_test  # raises if the prediction row count differs from n_test

In [90]:
# Write the prediction table as 'prediction.csv' inside the archive
# 'prediction.zip'; floats are formatted with three decimals and the
# DataFrame index is left out.
compression_opts = {'method': 'zip', 'archive_name': 'prediction.csv'}
output.to_csv(
    'prediction.zip',
    index=False,
    float_format='%.3f',
    compression=compression_opts,
)