In [1]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install yellowbrick



In [2]:
# import libraries
import matplotlib.pyplot as plt
import pandas as pd
import xlrd
import yellowbrick
from sklearn import (ensemble, preprocessing, tree)
from sklearn.metrics import (auc, confusion_matrix, roc_auc_score, roc_curve)
from sklearn.model_selection import (train_test_split, StratifiedKFold)
from yellowbrick.classifier import (ConfusionMatrix, ROCAUC)
from yellowbrick.model_selection import (LearningCurve)

In [3]:
# Load Titanic data
url = (
    "https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3.xls")

In [4]:
df = pd.read_excel(url)
orig_df = df

In [5]:
df.dtypes

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [6]:
!{sys.executable} -m pip install pandas_profiling
import pandas_profiling



In [7]:
df.shape

(1309, 14)

In [8]:
# describe, just first two columns
df.describe().iloc[:, :2]

Unnamed: 0,pclass,survived
count,1309.0,1309.0
mean,2.294882,0.381971
std,0.837836,0.486055
min,1.0,0.0
25%,2.0,0.0
50%,3.0,0.0
75%,3.0,1.0
max,3.0,1.0


In [9]:
# identify missing data, 0 = no missing data, > 0 is the count of missing data
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [10]:
df.isnull().sum(axis=1).loc[:10]

0     1
1     1
2     2
3     1
4     2
5     1
6     1
7     2
8     1
9     2
10    1
dtype: int64

In [11]:
mask = df.isnull().any(axis=1)

In [12]:
mask.head() # rows

0    True
1    True
2    True
3    True
4    True
dtype: bool

In [13]:
df[mask].body.head()

0      NaN
1      NaN
2      NaN
3    135.0
4      NaN
Name: body, dtype: float64

In [14]:
df.sex.value_counts(dropna=False)

male      843
female    466
Name: sex, dtype: int64

In [15]:
df.embarked.value_counts(dropna=False)

S      914
C      270
Q      123
NaN      2
Name: embarked, dtype: int64

In [16]:
name = df.name
name.head(3)

0     Allen, Miss. Elisabeth Walton
1    Allison, Master. Hudson Trevor
2      Allison, Miss. Helen Loraine
Name: name, dtype: object

In [17]:
# p.24
df = pd.get_dummies(df)
df.columns

Index(['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare', 'body',
       'name_Abbing, Mr. Anthony', 'name_Abbott, Master. Eugene Joseph',
       'name_Abbott, Mr. Rossmore Edward',
       ...
       'home.dest_Wimbledon Park, London / Hayling Island, Hants',
       'home.dest_Windsor, England New York, NY', 'home.dest_Winnipeg, MB',
       'home.dest_Winnipeg, MN', 'home.dest_Woodford County, KY',
       'home.dest_Worcester, England', 'home.dest_Worcester, MA',
       'home.dest_Yoevil, England / Cottage Grove, OR',
       'home.dest_Youngstown, OH', 'home.dest_Zurich, Switzerland'],
      dtype='object', length=2841)

In [18]:
# remove perfectly correlated columns
df = df.drop(columns="sex_male")

In [19]:
y = df.survived
X = df.drop(columns="survived")

In [29]:
df.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,"name_Abbing, Mr. Anthony","name_Abbott, Master. Eugene Joseph","name_Abbott, Mr. Rossmore Edward",...,"home.dest_Wimbledon Park, London / Hayling Island, Hants","home.dest_Windsor, England New York, NY","home.dest_Winnipeg, MB","home.dest_Winnipeg, MN","home.dest_Woodford County, KY","home.dest_Worcester, England","home.dest_Worcester, MA","home.dest_Yoevil, England / Cottage Grove, OR","home.dest_Youngstown, OH","home.dest_Zurich, Switzerland"
0,1,1,29.0,0,0,211.3375,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0.9167,1,2,151.55,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,2.0,1,2,151.55,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,30.0,1,2,151.55,135.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,25.0,1,2,151.55,,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
df.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,"name_Abbing, Mr. Anthony","name_Abbott, Master. Eugene Joseph","name_Abbott, Mr. Rossmore Edward",...,"home.dest_Wimbledon Park, London / Hayling Island, Hants","home.dest_Windsor, England New York, NY","home.dest_Winnipeg, MB","home.dest_Winnipeg, MN","home.dest_Woodford County, KY","home.dest_Worcester, England","home.dest_Worcester, MA","home.dest_Yoevil, England / Cottage Grove, OR","home.dest_Youngstown, OH","home.dest_Zurich, Switzerland"
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0,1309.0,1309.0,1309.0,...,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917,0.000764,0.000764,0.000764,...,0.000764,0.000764,0.006112,0.000764,0.000764,0.000764,0.001528,0.000764,0.003056,0.002292
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922,0.027639,0.027639,0.027639,...,0.027639,0.027639,0.077967,0.027639,0.027639,0.027639,0.039073,0.027639,0.055216,0.047836
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [32]:
df.describe(include='all')

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,"name_Abbing, Mr. Anthony","name_Abbott, Master. Eugene Joseph","name_Abbott, Mr. Rossmore Edward",...,"home.dest_Wimbledon Park, London / Hayling Island, Hants","home.dest_Windsor, England New York, NY","home.dest_Winnipeg, MB","home.dest_Winnipeg, MN","home.dest_Woodford County, KY","home.dest_Worcester, England","home.dest_Worcester, MA","home.dest_Yoevil, England / Cottage Grove, OR","home.dest_Youngstown, OH","home.dest_Zurich, Switzerland"
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0,121.0,1309.0,1309.0,1309.0,...,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0,1309.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479,160.809917,0.000764,0.000764,0.000764,...,0.000764,0.000764,0.006112,0.000764,0.000764,0.000764,0.001528,0.000764,0.003056,0.002292
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668,97.696922,0.027639,0.027639,0.027639,...,0.027639,0.027639,0.077967,0.027639,0.027639,0.027639,0.039073,0.027639,0.055216,0.047836
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,72.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,0.0,28.0,0.0,0.0,14.4542,155.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,1.0,39.0,1.0,0.0,31.275,256.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3.0,1.0,80.0,8.0,9.0,512.3292,328.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [33]:
# Sample Data
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=42)

NameError: name 'model_selection' is not defined