## Import stuff

In [1]:
import tensorflow as tf
from keras.losses import categorical_crossentropy, mean_squared_error
from tensorflow.keras import layers

import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import random
from PIL import Image
import logging
import time

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import sklearn.metrics as metrics
from sklearn.svm import SVC
from sklearn import svm
from sklearn import datasets
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet
from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.kernel_approximation import Nystroem
from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRFClassifier, XGBRegressor, XGBRFRegressor

from flaml import AutoML

import eli5
from eli5.sklearn import PermutationImportance

import shap


logger = tf.get_logger()
logger.setLevel(logging.ERROR)

print(tf.__version__)

def get_score(df_true, df_submission):
    for i in range(df_true.shape[1]):
        print("score for col ",i,": " , metrics.roc_auc_score(df_true[:,i], df_submission[:,i]))

def get_type(model):
    return str(type(model)).split(".")[-1].split("'")[0]

ny = Nystroem(random_state=1, n_components=1000,kernel='poly',degree=2)
scaler = MinMaxScaler()


2.8.0


## Load training data from csv


In [2]:
df = pd.read_csv("train_features_improved.csv")
labels_df = pd.read_csv("train_labels.csv")

X = np.array(df.iloc[:, 0:].values,  dtype=float)
Y = np.array(labels_df.iloc[:, 1:].values,  dtype=float)

print("X shape: ", X.shape)
print("Y shape: ", Y.shape)



X shape:  (18995, 514)
Y shape:  (18995, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18995 entries, 0 to 18994
Columns: 514 entries, pid to 0
dtypes: float64(477), int64(37)
memory usage: 74.5 MB


## The function that trains and rates models

In [3]:

def find_best_model(col, X, Y, X_test):
    classifiers = [
        # BayesianGaussianMixture(n_components=3, random_state=0),
        # Ridge(),
        # LogisticRegression(),
        # HistGradientBoostingClassifier(),
        # ExtraTreesClassifier(),
        # RandomForestClassifier(),
        # XGBRFClassifier(),
        # XGBClassifier(),
        # LGBMClassifier(),
        # CatBoostClassifier(logging_level='Silent'),
        AutoML(task="classification",time_budget=240,ensemble=False,auto_augment=False,verbose=False,estimator_list=['catboost']),
    ]

    regressors = [
        Ridge(),
        # HistGradientBoostingRegressor(),
        # ExtraTreesRegressor(),
        # RandomForestRegressor(),
        # XGBRFRegressor(),
        # XGBRegressor(),
        # LGBMRegressor(),
        # CatBoostRegressor(logging_level='Silent'),
    ]

    models = []

    #task 1 and 2
    if(col<11):    
        models = classifiers

    #task 3
    if(col>=11):
        models = regressors

    best_cv_score = 0
    best_model = None

    for model in models:
        start_time = time.time()
        model_type = get_type(model)
        print("training model ", model_type,"for col ", col)

        cv_score = 0

        skf = StratifiedKFold(n_splits=3,random_state=42,shuffle=True)
        split = skf.split(X, Y[:,col]) if col < 11 else skf.split(X, Y[:,0])
        for train, test in split:
            X_train, X_valid, Y_train, Y_valid = X[train], X[test], Y[train,col], Y[test,col]

            # scaler = StandardScaler()
            # X_train = scaler.fit_transform(X_train)
            # X_valid = scaler.transform(X_valid)
            # pca = PCA(n_components=500)
            # X_train = pca.fit_transform(X_train)
            # X_valid = pca.transform(X_valid)

            pred = []
            if model_type.count("Ridge") > 0:
                X_train = scaler.fit_transform(X_train)
                X_train = ny.fit_transform(X_train)
                model.fit(X_train, Y_train)
                X_valid = scaler.transform(X_valid)
                X_valid = ny.transform(X_valid)
                pred = model.predict(X_valid)
            else:
                model.fit(X_train, Y_train)
                if model_type.count("Regressor") > 0:
                    pred = model.predict(X_valid)
                else:
                    pred = model.predict_proba(X_valid)[:,1]

            if(col>=11):
                score = 0.5 + 0.5 * np.maximum(0, metrics.r2_score(Y_valid, pred))
            else:
                score = metrics.roc_auc_score(Y_valid, pred)
            cv_score += score
            
        cv_score /= 3

        print("score for",model_type,": " , cv_score,", time taken: ",int(time.time() - start_time),"s")
        if cv_score > best_cv_score:
            best_cv_score = cv_score
            best_model = model

    print("best model: ", get_type(best_model),"score: ",best_cv_score,"\n")

    # predictions = best_model.predict(X_test)
    predictions = np.zeros(5)
    return predictions,best_cv_score

## Writing results

In [4]:
X_test = pd.read_csv("test_features_improved.csv")
X_test = np.array(X_test.iloc[:, 0:].values,  dtype=float)

data = []
scores = []

for i in range(0,15):
# for i in range(0,1):
# for i in range(10,11):
# for i in range(11,15):
# for i in range(13,14):
    pred,best_score = find_best_model(i, X, Y, X_test)
    # pred,best_score = find_best_model(i, X[:2000,:], Y[:2000,:], X_test)

    pred = pred.reshape(pred.shape[0],1)
    data.append(pred)
    scores.append(best_score)





training model  AutoML for col  0


In [None]:
print(np.mean(scores[:10]))
print(np.mean(scores[10]))
print(np.mean(scores[11:]))

print(np.mean([np.mean(scores[:10]),np.mean(scores[10]),np.mean(scores[11:])]))


0.6234326488680421


IndexError: list index out of range

In [None]:
np_array = np.array(data).transpose()[0]

col_names = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2', 'LABEL_Bilirubin_direct', 'LABEL_EtCO2', 'LABEL_Sepsis', 'LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
print(len(col_names))
new_df = pd.DataFrame(np_array, columns=col_names)

output_csv = pd.read_csv("sample.csv")
output_csv.update(new_df)
output_csv.to_csv('submit.zip', index=False, float_format='%.3f', compression='zip')
output_csv.to_csv('submit.csv', index=False, float_format='%.3f')


15
