In [31]:
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn import (
    ensemble,
    preprocessing,
    tree
)

from sklearn.metrics import (
    auc,
    confusion_matrix,
    roc_auc_score,
    roc_curve
)

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold
)

from yellowbrick.classifier import (
    ConfusionMatrix,
    ROCAUC
)

from yellowbrick.model_selection import (
    LearningCurve,
)

In [32]:
# Download the Titanic spreadsheet once; `orig_df` keeps a handle on the frame
# as loaded, while `df` is the name the following cells rebind.
url = 'https://hbiostat.org/data/repo/titanic3.xls'
orig_df = pd.read_excel(url)
df = orig_df


In [33]:
# Percentage of missing cells in each row, shown for index labels 0 through 10.
df.isna().mean(axis="columns").loc[:10].mul(100)

0      7.142857
1      7.142857
2     14.285714
3      7.142857
4     14.285714
5      7.142857
6      7.142857
7     14.285714
8      7.142857
9     14.285714
10     7.142857
dtype: float64

In [34]:
# Flag every row that has at least one missing value, then peek at the
# `body` column for the first few of those rows.
mask = df.isna().any(axis="columns")
df.loc[mask, "body"].head()

0      NaN
1      NaN
2      NaN
3    135.0
4      NaN
Name: body, dtype: float64

In [35]:
# Frequency of each `sex` value; dropna=False would also list missing entries.
df["sex"].value_counts(dropna=False)

male      843
female    466
Name: sex, dtype: int64

In [36]:
# Frequency of each `embarked` value, with missing entries counted as NaN.
df["embarked"].value_counts(dropna=False)

S      914
C      270
Q      123
NaN      2
Name: embarked, dtype: int64

Criando atributos

In [37]:
# Keep a handle on the passenger-name column and show its first three entries.
name = df["name"]
name.iloc[:3]

0     Allen, Miss. Elisabeth Walton
1    Allison, Master. Hudson Trevor
2      Allison, Miss. Helen Loraine
Name: name, dtype: object

In [38]:
# Remove the columns that are not carried forward as model features.
df = df.drop(
    ["name", "ticket", "home.dest", "boat", "body", "cabin"],
    axis="columns",
)

In [39]:
# One-hot encode the remaining categorical columns (one indicator per level).
df = df.pipe(pd.get_dummies)

In [40]:
# `sex_male` is dropped so that only the `sex_female` indicator remains.
df = df.drop(columns=["sex_male"])

In [41]:
# Create dummy columns.
# NOTE(review): the column listing printed for this cell still contains
# embarked_C, so drop_first=True had no visible effect here - presumably
# because the earlier get_dummies call had already encoded every categorical
# column. Confirm whether this second call is still needed.
df = df.pipe(pd.get_dummies, drop_first=True)
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'sex_female',
       'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [44]:
# Separate the feature matrix from the `survived` target column.
X = df.drop(columns=["survived"])
y = df["survived"]

In [45]:

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X.columns

Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'sex_female', 'embarked_C',
       'embarked_Q', 'embarked_S'],
      dtype='object')

Imputação dos dados

In [46]:
from sklearn.experimental import (
    enable_iterative_imputer,
)
from sklearn import impute
# Columns handed to the iterative imputer in the next cell.
num_cols = ["pclass", "age", "sibsp", "parch", "fare", "sex_female"]

In [47]:
# Fit the iterative imputer on the training split only and reuse the fitted
# imputer on the test split, so no test-set statistics influence the fill.
imputer = impute.IterativeImputer()
imputed = imputer.fit_transform(X_train[num_cols])
# Build new frames with assign() instead of writing through `.loc`: the frames
# returned by train_test_split are slices of X, so in-place writes can raise
# SettingWithCopyWarning, and writing floats into int/bool columns in place is
# an incompatible-dtype assignment. assign() replaces each column outright and
# keeps the original row index.
X_train = X_train.assign(
    **{col: imputed[:, i] for i, col in enumerate(num_cols)}
)
imputed = imputer.transform(X_test[num_cols])
X_test = X_test.assign(
    **{col: imputed[:, i] for i, col in enumerate(num_cols)}
)

In [48]:
# Fill anything still missing with the per-column medians of the training
# split; the same training medians are applied to the test split.
meds = X_train.median()
X_train, X_test = X_train.fillna(meds), X_test.fillna(meds)

Normalizar os dados

In [49]:
# Standardize every feature. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
sca = preprocessing.StandardScaler()
# Take the column names from the frame itself rather than a hand-typed list,
# so labels can never drift out of sync with the data.
cols = list(X_train.columns)
# StandardScaler returns a bare array; pass the original index back in so the
# rebuilt frames stay label-aligned with y_train / y_test (a default
# RangeIndex would silently misalign any later pandas join or assign).
X_train = pd.DataFrame(
    sca.fit_transform(X_train), columns=cols, index=X_train.index
)
X_test = pd.DataFrame(
    sca.transform(X_test), columns=cols, index=X_test.index
)

Refatorando o código para criar funções

In [50]:
def tweak_titanic(df):
    """Return a model-ready copy of the raw Titanic frame.

    Drops the "name", "ticket", "home.dest", "boat", "body" and "cabin"
    columns, then one-hot encodes the remaining categorical columns with
    ``drop_first=True`` (the first level of each is omitted). The input frame
    is not modified.
    """
    unused = ["name", "ticket", "home.dest", "boat", "body", "cabin"]
    trimmed = df.drop(columns=unused)
    return pd.get_dummies(trimmed, drop_first=True)

def _overwrite_columns(frame, columns, values):
    """Return a copy of ``frame`` whose ``columns`` hold the matching columns of the 2-D array ``values``."""
    return frame.assign(
        **{col: values[:, i] for i, col in enumerate(columns)}
    )


def get_train_test_X_y(
    df, y_col, size=0.3, std_cols=None, num_cols=None, random_state=42
):
    """Split ``df`` into train/test sets, then impute and optionally standardize.

    Parameters
    ----------
    df : DataFrame
        Frame holding both the features and the target column.
    y_col : str
        Name of the target column; every other column is a feature.
    size : float, default 0.3
        Passed to ``train_test_split`` as ``test_size``.
    std_cols : sequence of str, optional
        Columns to standardize with ``StandardScaler``. Nothing is scaled
        when this is ``None`` or empty.
    num_cols : sequence of str, optional
        Columns to fill with ``IterativeImputer``. Defaults to
        ``["pclass", "age", "sibsp", "parch", "fare"]``.
    random_state : int, default 42
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Notes
    -----
    The imputer and the scaler are fitted on the training split only and then
    applied to the test split. Row indexes of the returned frames are
    preserved, so they stay aligned with ``y_train`` / ``y_test``.
    """
    if num_cols is None:
        num_cols = ["pclass", "age", "sibsp", "parch", "fare"]
    else:
        # A list is required for column selection; a tuple would be read as
        # a single (multi-level) key.
        num_cols = list(num_cols)

    y = df[y_col]
    X = df.drop(columns=y_col)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=size, random_state=random_state
    )

    if num_cols:
        fi = impute.IterativeImputer()
        X_train = _overwrite_columns(
            X_train, num_cols, fi.fit_transform(X_train[num_cols])
        )
        X_test = _overwrite_columns(
            X_test, num_cols, fi.transform(X_test[num_cols])
        )

    # Explicit emptiness check: plain truthiness raises for an Index/ndarray.
    if std_cols is not None and len(std_cols) > 0:
        std_cols = list(std_cols)
        std = preprocessing.StandardScaler()
        X_train = _overwrite_columns(
            X_train, std_cols, std.fit_transform(X_train[std_cols])
        )
        X_test = _overwrite_columns(
            X_test, std_cols, std.transform(X_test[std_cols])
        )

    return X_train, X_test, y_train, y_test

In [51]:
# Rebuild the train/test splits from `orig_df` with the refactored helpers;
# only the columns named in `std_cols` are standardized.
ti_df = orig_df.pipe(tweak_titanic)
std_cols = "pclass,age,sibsp,fare".split(",")
X_train, X_test, y_train, y_test = get_train_test_X_y(
    ti_df,
    "survived",
    std_cols=std_cols,
)

Testando famílias de algoritmos e comparando AUC e desvio-padrão usando validação cruzada K-fold