# Titanic Pipeline Tutorial

This is all copied from:

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

which is an awesome source.

Originally I swapped the logistic regression for a random forest because it performed better, but the original example tunes the logistic regression's parameters with a grid search, so I am swapping it back.

Here, I will try some classifiers to see which performs best on the training data; whichever performs best will be the one I use on the test data.

First, I have to import the libraries.

In [23]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Seed NumPy's global RNG so anything that draws from it is repeatable across runs.
np.random.seed(0)

In [6]:
# Download the Titanic dataset from OpenML. as_frame=True yields pandas
# objects and return_X_y=True splits them into features (X) and target (y).
X, y = fetch_openml(
    "titanic",
    version=1,
    as_frame=True,
    return_X_y=True,
)

# Alternatively, fetch the whole dataset object and split its frame yourself:
# titanic = fetch_openml("titanic", version=1, as_frame=True)
# X = titanic.frame.drop('survived', axis=1)
# y = titanic.frame['survived']


In [7]:
# Show the feature table; the cell's last expression is rendered as its output.
X

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1.0,2.0,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3.0,"Zabour, Miss. Hileni",female,14.5000,1.0,0.0,2665,14.4542,,C,,328.0,
1305,3.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,,
1306,3.0,"Zakarian, Mr. Mapriededer",male,26.5000,0.0,0.0,2656,7.2250,,C,,304.0,
1307,3.0,"Zakarian, Mr. Ortin",male,27.0000,0.0,0.0,2670,7.2250,,C,,,


In [8]:
# Show the target column ("survived").
y

0       1
1       1
2       0
3       0
4       0
       ..
1304    0
1305    0
1306    0
1307    0
1308    0
Name: survived, Length: 1309, dtype: category
Categories (2, object): ['0', '1']

Unlike the previous example, `X` and `y` here are pandas objects (a DataFrame and a Series), not NumPy arrays. Let's see how that affects things.

In [9]:
# Numeric columns: fill missing values with the median, then standardise.
numeric_features = ["age", "fare"]
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: one-hot encode, ignoring categories unseen during fit.
categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route each column list through its own transformer. Columns not listed in
# either group are not passed on (ColumnTransformer's default remainder="drop").
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [10]:
# Chain preprocessing and the classifier into a single estimator, so the
# preprocessing is fitted together with the model on the training data.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)

# Hold out 20% of the rows for evaluation; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

clf.fit(X_train, y_train)
holdout_accuracy = clf.score(X_test, y_test)
print("model score: %.3f" % holdout_accuracy)

model score: 0.790


In [22]:
# Show the pipeline object as the cell output.
clf

In [11]:
# Show the training features (all original columns are still present here).
X_train

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
1118,3.0,"Peltomaki, Mr. Nikolai Johannes",male,25.0000,0.0,0.0,STON/O 2. 3101291,7.9250,,S,,,
44,1.0,"Burns, Miss. Elizabeth Margaret",female,41.0000,0.0,0.0,16966,134.5000,E40,C,3,,
1072,3.0,"O'Connell, Mr. Patrick D",male,,0.0,0.0,334912,7.7333,,Q,,,
1130,3.0,"Pettersson, Miss. Ellen Natalia",female,18.0000,0.0,0.0,347087,7.7750,,S,,,
574,2.0,"Turpin, Mr. William John Robert",male,29.0000,1.0,0.0,11668,21.0000,,S,,,"Plymouth, England"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,3.0,"Dean, Miss. Elizabeth Gladys 'Millvina'",female,0.1667,1.0,2.0,C.A. 2315,20.5750,,S,10,,"Devon, England Wichita, KS"
835,3.0,"Guest, Mr. Robert",male,,0.0,0.0,376563,8.0500,,S,,,
1216,3.0,"Smyth, Miss. Julia",female,,0.0,0.0,335432,7.7333,,Q,13,,
559,2.0,"Sincock, Miss. Maude",female,20.0000,0.0,0.0,C.A. 33112,36.7500,,S,11,,"Cornwall / Hancock, MI"


Now, try training with a subset of features.

In [12]:
# Keep only the five modelling columns. This rebinds X_train / X_test, so
# every later cell works with the reduced frames.
subset_feature = ["embarked", "sex", "pclass", "age", "fare"]
X_train = X_train[subset_feature]
X_test = X_test[subset_feature]

In [13]:
# Check dtypes and non-null counts of the reduced training frame.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1047 entries, 1118 to 684
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   embarked  1045 non-null   category
 1   sex       1047 non-null   category
 2   pclass    1047 non-null   float64 
 3   age       841 non-null    float64 
 4   fare      1046 non-null   float64 
dtypes: category(2), float64(3)
memory usage: 35.0 KB


In [14]:
# Select columns by dtype instead of by name. Because pclass is stored as
# float64, it now falls into the numeric branch (imputed and scaled) rather
# than being one-hot encoded as in the name-based preprocessor, so the score
# is not expected to match the earlier pipeline exactly.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category")),
    ]
)
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

# Last expression: show the fitted pipeline as the cell output.
clf

model score: 0.794


In [15]:
# Columns the dtype selector routes to the numeric transformer.
selector(dtype_exclude="category")(X_train)

['pclass', 'age', 'fare']

In [16]:
# Columns the dtype selector routes to the categorical transformer.
selector(dtype_include="category")(X_train)

['embarked', 'sex']

In [17]:
# Hyper-parameters to search. Each key is a "<step>__<param>" path through
# the nested pipeline: 2 imputation strategies x 4 values of C = 8 candidates.
param_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "classifier__C": [0.1, 1.0, 10, 100],
}

# Exhaustive search over the grid using 10-fold cross-validation.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)
grid_search

In [18]:
# Evaluate every parameter combination on the training data.
grid_search.fit(X_train, y_train)

# Report the winning combination on its own line under the heading.
print("Best params:", grid_search.best_params_, sep="\n")

Best params:
{'classifier__C': 0.1, 'preprocessor__num__imputer__strategy': 'mean'}


In [19]:
# Mean cross-validated score of the best parameter combination.
print(f"Internal CV score: {grid_search.best_score_:.3f}")

Internal CV score: 0.784


Inspect the top five grid-search results:

In [20]:
# Rank all candidates by mean cross-validated score, best first.
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False
)

# Only the score summary and the two searched parameters are of interest.
summary_columns = [
    "mean_test_score",
    "std_test_score",
    "param_preprocessor__num__imputer__strategy",
    "param_classifier__C",
]
cv_results[summary_columns].head(5)

Unnamed: 0,mean_test_score,std_test_score,param_preprocessor__num__imputer__strategy,param_classifier__C
0,0.784167,0.035824,mean,0.1
2,0.780366,0.032722,mean,1.0
1,0.780348,0.037245,median,0.1
4,0.779414,0.033105,mean,10.0
6,0.779414,0.033105,mean,100.0


In [21]:
# Score the grid search on the held-out test split.
grid_test_accuracy = grid_search.score(X_test, y_test)
print("best logistic regression from grid search: %.3f" % grid_test_accuracy)

best logistic regression from grid search: 0.794
