In [None]:
# Pandas
import pandas as pd

#Preprocessing 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

# Pipeline
from sklearn.preprocessing import FunctionTransformer # it makes functions compatible with scikit-learn pipelines
from sklearn.pipeline import Pipeline   # Sequentially apply a list of transformations
from sklearn.compose import ColumnTransformer # Applies in parallel transformations to columns

# Grid search 
from sklearn.model_selection import GridSearchCV

# pipeline visualization (the `display` option requires scikit-learn >= 0.23)
from sklearn import set_config
set_config(display='diagram')

In [None]:
# Show which scikit-learn version this kernel is running
import sklearn
sklearn.__version__

# The Titanic dataset

In [None]:
# Read the Titanic passenger table from the remote CSV file
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/titanic.csv'
titanic = pd.read_csv(filepath_or_buffer=url)
titanic

**Numerical features**:
- Age
- Fare

**Categorical features**:
- Sex
- Ticket
- Cabin
- Embarked (Port of Embarkation: C, Q, and S)

**Ordinal features**:
- Pclass (passenger class)
- SibSp (number of siblings / spouses aboard the Titanic)
- Parch (number of parents / children aboard the Titanic)

In [None]:
# Share of missing entries in each column, expressed as a percentage
titanic.isna().sum() * 100 / len(titanic)

In [None]:
# Predictors (feature matrix) and response (target vector)
feature_cols = [
    'Pclass', 'Name', 'Sex', 'Age',
    'SibSp', 'Parch', 'Fare', 'Embarked',
]
X = titanic.loc[:, feature_cols]
y = titanic['Survived']

In [None]:
# train/test split
# A fixed random_state keeps the split -- and therefore the scores computed
# from it -- reproducible across kernel restarts.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Creating new features

In [None]:
# Peek at the first 30 passenger names to see how they are formatted
X['Name'].head(30)

In [None]:
# Title feature: the text between the first comma and the following period
titles = titanic['Name'].map(
    lambda full_name: full_name.split(",")[1].split(".")[0].strip()
)
titles

In [None]:
# Bar chart of how often each title occurs
titles.value_counts().plot.bar(figsize=(12, 5))

In [None]:
# family size: siblings/spouses + parents/children + 1
family_size = titanic['SibSp'] + titanic['Parch'] + 1
family_size

In [None]:
def get_family_size(dataframe):
    """Return a copy of ``dataframe`` with an extra ``Family_size`` column.

    ``Family_size`` is computed as ``SibSp + Parch + 1`` (the ``+ 1``
    presumably counts the passenger themself). The input frame is not
    modified.
    """
    relatives_aboard = dataframe['SibSp'] + dataframe['Parch']
    return dataframe.assign(Family_size=relatives_aboard + 1)
def _parse_title(name):
    """Extract the title from one ``'Surname, Title. Given names'`` string.

    Returns NaN when ``name`` is not a string or contains no comma, so a
    malformed or missing name does not raise.
    """
    if not isinstance(name, str):
        return float('nan')
    pieces = name.split(",")
    if len(pieces) < 2:
        return float('nan')
    # Text after the first comma, cut at the first period, without padding.
    return pieces[1].split(".")[0].strip()
def get_title(dataframe):
    """Return a copy of ``dataframe`` with an extra ``Title`` column.

    The title is parsed from the ``Name`` column (e.g.
    ``'Braund, Mr. Owen Harris'`` -> ``'Mr'``). Names that cannot be parsed
    yield NaN instead of raising. The input frame is not modified.
    """
    return dataframe.assign(Title=dataframe.Name.apply(_parse_title))

In [None]:
# Wrap the feature-engineering helpers so they can be used as pipeline steps
family_size_processor = FunctionTransformer(func=get_family_size)
title_processor = FunctionTransformer(func=get_title)

# Preprocessing Features

In [None]:
# Numeric columns: fill gaps with the median, then standardize
numeric_features = ['Age', 'Fare']
numeric_processor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [None]:
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fitting are ignored at transform time.
categorical_features = ['Embarked', 'Sex', 'Title']
categorical_processor = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Ordinal columns: only fill gaps with the most frequent value
ordinal_features = ['Family_size', 'Pclass']
ordinal_processor = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

In [None]:
# Send each group of columns to its own processor. Columns not listed in any
# group (Name, SibSp, Parch) are dropped.
feature_processor = ColumnTransformer(
    [
        ('num', numeric_processor, numeric_features),
        ('cat', categorical_processor, categorical_features),
        ('ord', ordinal_processor, ordinal_features),
    ],
    remainder='drop',
)

# KNN Classifier

In [None]:
# k-nearest-neighbours classifier, created with its default settings
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()

In [None]:
# Full workflow: feature engineering -> preprocessing -> polynomial expansion -> classifier
pipe = Pipeline([
    ('get family_size', family_size_processor),
    ('get title', title_processor),
    ('preprocessor', feature_processor),
    ('polynomial', PolynomialFeatures(degree=2)),  # degree-2 combinations of the features
    ('clf', knn_clf),
])
pipe

# Grid Search for hyperparameter tuning

In [None]:
# Search space: k from 1 to 20, with both neighbour-weighting schemes
param_grid = dict(
    clf__n_neighbors=list(range(1, 21)),
    clf__weights=['uniform', 'distance'],
)

In [None]:
# 10-fold cross-validated grid search over the pipeline, scored by accuracy
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='accuracy', cv=10)
grid.fit(X_train, y_train)

In [None]:
# Mean cross-validated score for every parameter combination tried
pd.DataFrame(grid.cv_results_).loc[:, ['mean_test_score', 'params']]

In [None]:
# best hyper-parameters: the combination selected by the grid search
grid.best_params_

In [None]:
# best predictor: the estimator the grid search selected
best_pred = grid.best_estimator_

# Test the model

In [None]:
# Making predictions: Would I have survived the Titanic disaster?
# One-row frame with the same feature columns the model was trained on.
Javier = pd.DataFrame([{
    'Pclass': 2,
    'Name': 'Perez-Alvaro, Dr. Javier',
    'Sex': 'male',
    'Age': 34,
    'SibSp': 0,
    'Parch': 0,
    'Fare': 30,
    'Embarked': 'S',
}])
Javier

In [None]:
# Predict survival for the hypothetical passenger; the one-row DataFrame
# holding that passenger is named `Javier`.
best_pred.predict(Javier)

In [None]:
# Evaluate the tuned pipeline on the held-out test set. The fitted estimator
# is stored in `best_pred` (grid.best_estimator_).
from sklearn.metrics import accuracy_score
y_test_pred = best_pred.predict(X_test)
accuracy_score(y_test, y_test_pred)  # fraction of correct predictions (0 to 1)