In [1]:
import os

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import ensemble, preprocessing, tree
from sklearn.metrics import auc, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, StratifiedKFold

from yellowbrick.classifier import ConfusionMatrix, ROCAUC
from yellowbrick.model_selection import LearningCurve

In [3]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [4]:
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [5]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df_train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [7]:
### Para ver resumen interactivo

import pandas_profiling

#pandas_profiling.ProfileReport(df_train)

In [8]:
df_train.shape

(891, 12)

In [9]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [11]:
df_train.isnull().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

In [12]:
mask = df_train.isnull().any(axis=1)
df_train[mask].Age.head()

0    22.0
2    26.0
4    35.0
5     NaN
7     2.0
Name: Age, dtype: float64

In [13]:
df_train.Sex.value_counts(dropna=False)

male      577
female    314
Name: Sex, dtype: int64

In [14]:
df_train.Embarked.value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

In [15]:
df_train = df_train.drop(columns=['Name', 'Ticket', 'Cabin']) 

In [16]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [17]:
df_train = pd.get_dummies(df_train, drop_first=True)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,1,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,1
4,5,0,3,35.0,0,0,8.05,1,0,1


In [18]:
y = df_train.Survived
X = df_train.drop(columns='Survived')

In [20]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Impute values

In [21]:
from sklearn.experimental import enable_iterative_imputer
from sklearn import impute

In [22]:
num_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male']

In [23]:
imputer = impute.IterativeImputer()

In [24]:
imputed = imputer.fit_transform(X_train[num_cols])
X_train.loc[:, num_cols] = imputed

In [25]:
imputed = imputer.transform(X_val[num_cols])
X_val.loc[:, num_cols] = imputed

In [26]:
meds = X_train.median()
X_train = X_train.fillna(meds)
X_val = X_val.fillna(meds)

## Normalization

In [56]:
cols = "Pclass,Age,SibSp,Fare".split(",")
cols

['Pclass', 'Age', 'SibSp', 'Fare']

In [38]:
sca = preprocessing.StandardScaler()

In [39]:
X_train.shape

(712, 9)

In [40]:
X_train[:3]

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
331,332,1.0,45.5,0.0,0.0,28.5,1.0,0,1
733,734,2.0,23.0,0.0,0.0,13.0,1.0,0,1
382,383,3.0,32.0,0.0,0.0,7.925,1.0,0,1


In [41]:
X_val[:3]

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
709,710,3.0,23.557757,1.0,1.0,15.2458,1.0,0,0
439,440,2.0,31.0,0.0,0.0,10.5,1.0,0,1
840,841,3.0,20.0,0.0,0.0,7.925,1.0,0,1


In [58]:
X_train[cols]

Unnamed: 0,Pclass,Age,SibSp,Fare
331,1.0,45.500000,0.0,28.5000
733,2.0,23.000000,0.0,13.0000
382,3.0,32.000000,0.0,7.9250
704,3.0,26.000000,1.0,7.8542
813,3.0,6.000000,4.0,31.2750
...,...,...,...,...
106,3.0,21.000000,0.0,7.6500
270,1.0,41.440806,0.0,31.0000
860,3.0,41.000000,2.0,14.1083
435,1.0,14.000000,1.0,120.0000


In [59]:
X_train.loc[:, cols] = sca.fit_transform(X_train.loc[:,cols])
X_train.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
331,332,-1.614136,1.198754,-0.470722,0.0,-0.078684,1.0,0,1
733,734,-0.400551,-0.442672,-0.470722,0.0,-0.377145,1.0,0,1
382,383,0.813034,0.213898,-0.470722,0.0,-0.474867,1.0,0,1
704,705,0.813034,-0.223815,0.379923,0.0,-0.47623,1.0,0,1
813,814,0.813034,-1.682861,2.93186,2.0,-0.025249,0.0,0,1


In [60]:
X_val.loc[:, cols] = sca.fit_transform(X_val.loc[:,cols])
X_val.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
709,710,0.887423,-0.505923,0.820363,1.0,-0.392468,1.0,0,0
439,440,-0.255373,0.054234,-0.55202,0.0,-0.513112,1.0,0,1
840,841,0.887423,-0.773705,-0.55202,0.0,-0.578571,1.0,0,1
720,721,-0.255373,-1.827446,-0.55202,1.0,0.058863,0.0,0,1
39,40,0.887423,-1.225308,0.820363,0.0,-0.494257,0.0,0,0


## Model

In [61]:
from sklearn.dummy import DummyClassifier

In [62]:
bm = DummyClassifier()

In [63]:
bm.fit(X_train, y_train)

DummyClassifier()

In [64]:
bm.score(X_val, y_val)

0.5865921787709497

In [65]:
from sklearn import metrics

In [67]:
metrics.precision_score(y_val, bm.predict(X_val))

  _warn_prf(average, modifier, msg_start, len(result))


0.0