In [28]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import auc, confusion_matrix, roc_auc_score, roc_curve
from yellowbrick.classifier import ConfusionMatrix,ROCAUC
from yellowbrick.model_selection import LearningCurve
from ydata_profiling import ProfileReport

url = "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls"
df = pd.read_excel(url)
orig_df = df

profile = ProfileReport(df, title="Profiling Report")
df = df.drop(columns=["name","ticket","home.dest","boat","body","cabin",])

profile

Summarize dataset: 100%|██████████| 49/49 [00:03<00:00, 15.61it/s, Completed]                  
Generate report structure: 100%|██████████| 1/1 [00:04<00:00,  4.73s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.40it/s]




In [29]:
df

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,female,29.0000,0,0,211.3375,S
1,1,1,male,0.9167,1,2,151.5500,S
2,1,0,female,2.0000,1,2,151.5500,S
3,1,0,male,30.0000,1,2,151.5500,S
4,1,0,female,25.0000,1,2,151.5500,S
...,...,...,...,...,...,...,...,...
1304,3,0,female,14.5000,1,0,14.4542,C
1305,3,0,female,,1,0,14.4542,C
1306,3,0,male,26.5000,0,0,7.2250,C
1307,3,0,male,27.0000,0,0,7.2250,C


In [30]:
df

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,female,29.0000,0,0,211.3375,S
1,1,1,male,0.9167,1,2,151.5500,S
2,1,0,female,2.0000,1,2,151.5500,S
3,1,0,male,30.0000,1,2,151.5500,S
4,1,0,female,25.0000,1,2,151.5500,S
...,...,...,...,...,...,...,...,...
1304,3,0,female,14.5000,1,0,14.4542,C
1305,3,0,female,,1,0,14.4542,C
1306,3,0,male,26.5000,0,0,7.2250,C
1307,3,0,male,27.0000,0,0,7.2250,C


In [32]:
df = pd.get_dummies(df) # modify string entries

df

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,1,1,29.0000,0,0,211.3375,True,False,False,False,True
1,1,1,0.9167,1,2,151.5500,False,True,False,False,True
2,1,0,2.0000,1,2,151.5500,True,False,False,False,True
3,1,0,30.0000,1,2,151.5500,False,True,False,False,True
4,1,0,25.0000,1,2,151.5500,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,14.5000,1,0,14.4542,True,False,True,False,False
1305,3,0,,1,0,14.4542,True,False,True,False,False
1306,3,0,26.5000,0,0,7.2250,False,True,True,False,False
1307,3,0,27.0000,0,0,7.2250,False,True,True,False,False


In [35]:
df = df.drop(columns=["sex_male"])

KeyError: "['sex_male'] not found in axis"

In [36]:
# build dataset

y = df.survived
X = df.drop(columns="survived")

In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [41]:
profile = ProfileReport(X_train, title="Profiling Report")

profile

Summarize dataset: 100%|██████████| 35/35 [00:02<00:00, 15.65it/s, Completed]                  
Generate report structure: 100%|██████████| 1/1 [00:02<00:00,  2.25s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.45it/s]




In [45]:
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute

num_cols = [
"pclass",
"age",
"sibsp",
"parch",
"fare",
"sex_female",
]

X_train[num_cols]

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female
1214,3,,0,0,8.6625,False
677,3,26.0,0,0,7.8958,False
534,2,19.0,0,0,26.0000,True
1174,3,,8,2,69.5500,True
864,3,28.0,0,0,7.7750,True
...,...,...,...,...,...,...
1095,3,,0,0,7.6292,True
1130,3,18.0,0,0,7.7750,True
1294,3,28.5,0,0,16.1000,False
860,3,26.0,0,0,7.9250,True


In [52]:
imputer = impute.IterativeImputer()

imputed = imputer.fit_transform(
X_train[num_cols]
)

imputed

array([[ 3.        , 26.98448077,  0.        ,  0.        ,  8.6625    ,
         0.        ],
       [ 3.        , 26.        ,  0.        ,  0.        ,  7.8958    ,
         0.        ],
       [ 2.        , 19.        ,  0.        ,  0.        , 26.        ,
         1.        ],
       ...,
       [ 3.        , 28.5       ,  0.        ,  0.        , 16.1       ,
         0.        ],
       [ 3.        , 26.        ,  0.        ,  0.        ,  7.925     ,
         1.        ],
       [ 3.        , 28.        ,  0.        ,  0.        ,  7.8958    ,
         1.        ]])

In [60]:
X_train.loc[:, num_cols] = imputed # replace missing values

X_train

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,embarked_C,embarked_Q,embarked_S
1214,3,26.984481,0,0,8.6625,0.0,False,False,True
677,3,26.000000,0,0,7.8958,0.0,False,False,True
534,2,19.000000,0,0,26.0000,1.0,False,False,True
1174,3,0.437798,8,2,69.5500,1.0,False,False,True
864,3,28.000000,0,0,7.7750,1.0,False,False,True
...,...,...,...,...,...,...,...,...,...
1095,3,25.011612,0,0,7.6292,1.0,False,True,False
1130,3,18.000000,0,0,7.7750,1.0,False,False,True
1294,3,28.500000,0,0,16.1000,0.0,False,False,True
860,3,26.000000,0,0,7.9250,1.0,False,False,True
