In [0]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score

In [0]:
# Load the labelled training set straight from the public repository.
TRAINING_DATA_URL = "https://raw.githubusercontent.com/widsrecife/dados/master/datathon2020/training_v2.csv"
df = pd.read_csv(TRAINING_DATA_URL)

In [0]:
# Show the dimensions of the loaded training frame as (rows, columns).
df.shape

(91713, 186)

In [0]:
# Split the label column off from the rest of the columns.
label_column = "hospital_death"

X = df.drop(columns=label_column)
y = df[label_column]

In [0]:
# Group the columns by type

all_columns = list(X.columns)

# Identifier columns are left out of every feature group built below.
columns_id = ['encounter_id', 'patient_id', 'hospital_id', 'icu_id']

columns = [name for name in all_columns if name not in columns_id]

# Split the remaining columns by dtype: object columns are treated as
# categorical, everything else as continuous.
feature_frame = X[columns]
continuous_features = list(feature_frame.select_dtypes(exclude='object').columns)
categorical_features = list(feature_frame.select_dtypes(include='object').columns)

# Columns whose name contains the "h1_" / "d1_" marker.
columns_first_hour = [name for name in columns if "h1_" in name]
columns_first_day = [name for name in columns if "d1_" in name]

# Everything that belongs to neither of the two time-window groups.
columns_not_time_related = [
    name for name in columns
    if not (name in columns_first_hour or name in columns_first_day)
]

# Within the non-time-related columns, separate the ones mentioning "apache".
columns_apache = [name for name in columns_not_time_related if "apache" in name]
columns_not_apache = [name for name in columns_not_time_related if "apache" not in name]

# Sanity check: the three time-based groups must account for every column once.
assert len(columns) == len(columns_not_time_related) + len(columns_first_hour) + len(columns_first_day)

In [0]:
# Hold out 40% of the rows so we can test on them later. The split is
# stratified on the label and uses a fixed seed.
split_options = {"test_size": 0.40, "random_state": 42, "stratify": y}

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Treinando o modelo

In [0]:
# Model with default parameters: no hyperparameter is passed to the constructor.
modelo = RandomForestClassifier()

In [0]:
# Train the model directly on the raw training frame.
# This attempt raises a ValueError (presumably because of the raw text columns
# and missing values inspected in the next section - confirm against the
# message printed below). The error is the point of the demonstration, so it is
# caught and shown instead of aborting a "Restart & Run All".
try:
    modelo.fit(X_train, y_train)
except ValueError as erro:
    print(f"ValueError: {erro}")

ValueError: ignored

## Explorando os dados

In [0]:
# List the dtype of every column in the training split.
X_train.dtypes

encounter_id                     int64
patient_id                       int64
hospital_id                      int64
age                            float64
bmi                            float64
                                ...   
leukemia                       float64
lymphoma                       float64
solid_tumor_with_metastasis    float64
apache_3j_bodysystem            object
apache_2_bodysystem             object
Length: 185, dtype: object

In [0]:
# Count the missing values in each column, largest counts first, and display
# only the columns that have at least one missing value.
null_counts = X_train.isna().sum()
missing_value_stats = null_counts.sort_values(ascending=False)
missing_value_stats.loc[missing_value_stats.ne(0)]

h1_bilirubin_min    50792
h1_bilirubin_max    50792
h1_lactate_max      50633
h1_lactate_min      50633
h1_albumin_min      50300
                    ...  
d1_sysbp_max           86
d1_heartrate_max       81
d1_heartrate_min       81
icu_admit_source       71
gender                 14
Length: 175, dtype: int64

## Criando nossa primeira pipeline
1. Selecionar as colunas que vamos utilizar no modelo
2. Preencher valores faltantes
3. Fazer o one-hot-encoding apenas nas colunas categóricas
4. Treinar o modelo usando o RandomForestClassifier

In [0]:
# Creating the steps

# 1. select the columns we are going to use
def selecionar_colunas(X):
    """Return only the model's feature columns from ``X``.

    The selection is the module-level ``continuous_features`` list followed by
    the module-level ``categorical_features`` list, in that order.

    Parameters
    ----------
    X : object indexable by a list of column names (a DataFrame in this notebook)

    Returns
    -------
    The result of indexing ``X`` with the combined list of feature names.
    """
    colunas_selecionadas = [*continuous_features, *categorical_features]
    return X[colunas_selecionadas]

step_1 = FunctionTransformer(selecionar_colunas, validate=False)

# 2. fill in missing values, using the "most_frequent" strategy for every column
step_2 = SimpleImputer(strategy="most_frequent")

# 3. one-hot-encode only the categorical columns
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
# The categorical columns have to be selected by position, not by column name.
# The comparison uses the builtin `object`: the `np.object` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
selected_dtypes = X_train[continuous_features + categorical_features].dtypes
categorical_features_indices = np.where(selected_dtypes == object)[0]

step_3 = ColumnTransformer([("one-hot-encoding", one_hot_encoder, categorical_features_indices)],
                           remainder="passthrough")

# 4. train the model
# Hyperparameters stay at their defaults; only the seed is pinned (same value as
# the train/test split) so the fitted forest and its scores are reproducible.
step_4 = RandomForestClassifier(random_state=42)

In [0]:
# Assembling the pipeline: the (name, step) pairs are listed in the order
# select columns -> impute -> one-hot-encode -> model.
pipeline_steps = [
    ("selecionar_colunas", step_1),
    ("preencher_valores_faltantes", step_2),
    ("one_hot_encoding", step_3),
    ("modelo", step_4),
]
primeira_pipeline = Pipeline(steps=pipeline_steps)

## Treinando o modelo usando a pipeline

In [0]:
# Fit the whole pipeline (preprocessing steps plus the model) on the training split.
primeira_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('selecionar_colunas',
                 FunctionTransformer(accept_sparse=False, check_inverse=True,
                                     func=<function selecionar_colunas at 0x7f4d525b4bf8>,
                                     inv_kw_args=None, inverse_func=None,
                                     kw_args=None, validate=False)),
                ('preencher_valores_faltantes',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='most_f...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_spli

## Avaliando o modelo com os dados que separamos

In [0]:
# Class-probability estimates for the held-out test split.
predicoes = primeira_pipeline.predict_proba(X_test)

In [0]:
# Display the predicted probabilities: one row per test sample, one column per class.
predicoes

array([[0.81, 0.19],
       [0.99, 0.01],
       [0.43, 0.57],
       ...,
       [0.99, 0.01],
       [0.87, 0.13],
       [0.59, 0.41]])

In [0]:
# ROC AUC on the held-out split, scored with the second probability column
# (the scores for class 1).
roc_auc_score(y_test, predicoes[:,1])

0.8847274451322751

## Gerando dados para mandar para a competição no Kaggle

In [0]:
# scikit-learn produces two columns, one for class 0 and one for class 1
primeira_pipeline.classes_

# We want the scores of class 1 (the probability of the patient dying)
unlabeled_data_url = "https://raw.githubusercontent.com/widsrecife/dados/master/datathon2020/unlabeled.csv"
df_kaggle_predicoes = pd.read_csv(unlabeled_data_url)
probabilidades_kaggle = primeira_pipeline.predict_proba(df_kaggle_predicoes)
predicoes_kaggle_scores = probabilidades_kaggle[:, 1]

# Kaggle needs the data with two columns: encounter_id and hospital_death
predicoes_kaggle_imput = pd.DataFrame({"hospital_death": predicoes_kaggle_scores}).assign(
    encounter_id=df_kaggle_predicoes["encounter_id"]  # ids are taken from the full unlabeled dataset
)
submission_columns = ["encounter_id", "hospital_death"]
predicoes_kaggle_imput[submission_columns].to_csv("primeira_predicao_random_forest_default.csv", index=False)