## Titanic survival

### Read data

In [1]:
import pandas as pd
from ceteris_paribus.datasets import DATASETS_DIR
import os
df = pd.read_csv(os.path.join(DATASETS_DIR, 'titanic_train.csv'))
import imgkit
import shutil

In [2]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
y = df['Survived']
x = df.drop(['Survived', 'PassengerId', 'Name', 'Cabin', 'Ticket'], inplace=False, axis=1)

In [4]:
x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [5]:
valid = x['Age'].isnull() | x['Embarked'].isnull()
x = x[-valid]
y = y[-valid]

In [6]:
x['Pclass'] = x['Pclass'].astype('float64')
x['SibSp'] = x['SibSp'].astype('float64')
x['Parch'] = x['Parch'].astype('float64')

In [7]:
x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3.0,male,22.0,1.0,0.0,7.25,S
1,1.0,female,38.0,1.0,0.0,71.2833,C
2,3.0,female,26.0,0.0,0.0,7.925,S
3,1.0,female,35.0,1.0,0.0,53.1,S
4,3.0,male,35.0,0.0,0.0,8.05,S


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [14]:
print(X_train)

Pclass     Sex   Age  SibSp  Parch     Fare Embarked
472     2.0  female  33.0    1.0    2.0  27.7500        S
432     2.0  female  42.0    1.0    0.0  26.0000        S
666     2.0    male  25.0    0.0    0.0  13.0000        S
30      1.0    male  40.0    0.0    0.0  27.7208        C
291     1.0  female  19.0    1.0    0.0  91.0792        C
..      ...     ...   ...    ...    ...      ...      ...
93      3.0    male  26.0    1.0    2.0  20.5750        S
135     2.0    male  23.0    0.0    0.0  15.0458        C
338     3.0    male  45.0    0.0    0.0   8.0500        S
549     2.0    male   8.0    1.0    1.0  36.7500        S
131     3.0    male  20.0    0.0    0.0   7.0500        S

[569 rows x 7 columns]


### Building the models

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_features = ['Embarked', 'Sex']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [10]:
from xgboost import XGBClassifier

In [11]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
xgb_clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', XGBClassifier())])

### Train the models

In [12]:
xgb_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  ['Pclass', 'Age', 'SibSp',
                                                   'Parch', 'Fare']),
                                          

### Evaluate the models

In [14]:
xgb_clf.classes_

array([0, 1])

In [13]:
from sklearn.metrics import accuracy_score
print("XGB {}".format(accuracy_score(y_test, xgb_clf.predict(X_test))))
print((X_test.iloc[[0]]).to_dict(orient='records')[0])
print(xgb_clf.predict_proba(X_test.iloc[[0]]))
#print(xgb_clf.predict_proba(X_test))
print(xgb_clf.predict_proba(X_test.head(2)))


XGB 0.8041958041958042
{'Pclass': 1.0, 'Sex': 'female', 'Age': 24.0, 'SibSp': 0.0, 'Parch': 0.0, 'Fare': 69.3, 'Embarked': 'C'}
[[0.02249211 0.9775079 ]]
[[0.02249211 0.9775079 ]
 [0.05318755 0.94681245]]


In [101]:
print(y)

0      0
1      1
2      1
3      1
4      0
      ..
885    0
886    0
887    1
889    1
890    0
Name: Survived, Length: 712, dtype: int64


### Explain the models

In [17]:
from ceteris_paribus.explainer import explain

explainer_xgb = explain(xgb_clf, data=x, y=y, label='XGBoost', predict_function=lambda X: xgb_clf.predict_proba(X)[::, 1])

##### Ernest James Crease

In [18]:
print(ernest)
print(label_ernest)

NameError: name 'ernest' is not defined

In [19]:
import warnings
import sklearn
warnings.filterwarnings("ignore", category=sklearn.exceptions.DataConversionWarning)
ernest = X_test.iloc[10]
label_ernest = y_test.iloc[10]
print("Referenced observation \n{}".format(ernest))
from ceteris_paribus.profiles import individual_variable_profile
cp_xgb = individual_variable_profile(explainer_xgb, ernest, label_ernest)

Referenced observation 
Pclass           3
Sex           male
Age             19
SibSp            0
Parch            0
Fare        8.1583
Embarked         S
Name: 67, dtype: object


In [17]:
from ceteris_paribus.plots.plots import plot_notebook, plot

In [18]:
plot(cp_xgb, selected_variables=["Age"], width=700, height=800, size=4)

##### Miss. Elizabeth Mussey Eustis

In [34]:
imgkit.from_file('_plot_files/plots0.html', 'out.jpg')

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


True

In [21]:
shutil.rmtree('_plot_files')