# Setting and load data 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


###############################################################
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold


from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC



###############################################################
import pickle

In [2]:
df = pd.read_csv('datasets/train_next_steps.csv')
#df_test = pd.read_csv('datasets/test.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,0,3,male,22.0,1,0,7.25,S
1,1,1,1,female,38.0,1,0,71.2833,C
2,2,1,3,female,26.0,0,0,7.925,S
3,3,1,1,female,35.0,1,0,53.1,S
4,4,0,3,male,35.0,0,0,8.05,S


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  891 non-null    int64  
 1   Survived    891 non-null    int64  
 2   Pclass      891 non-null    int64  
 3   Sex         891 non-null    object 
 4   Age         714 non-null    float64
 5   SibSp       891 non-null    int64  
 6   Parch       891 non-null    int64  
 7   Fare        891 non-null    float64
 8   Embarked    889 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 62.8+ KB


# Data, label and splitting strategy

In [5]:
X = df.drop('Survived', axis = 1)
y = df['Survived']

In [6]:
cv = StratifiedKFold(n_splits=3, shuffle=False)

# Preprocessing with PIPELINE

In [7]:
def buil_processor(scaler_option='StandardScaler'):
    """More option for scaler"""
    numeric_features = ['Age', 'Fare']
    if scaler_option == 'MinMaxScaler': 
        numeric_steps =  [
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', MinMaxScaler())]
    else:
        numeric_steps =  [
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())]    
    numeric_transformer = Pipeline(steps=numeric_steps)
    ##Categorical columns
    categorical_features = ['Pclass', 'Sex', 'Embarked']
    categorical_steps =  [
        #('imputer', SimpleImputer(strategy= "most_frequent")),
        ('onehot', OneHotEncoder())
    ]
    categorical_transformer = Pipeline(steps = categorical_steps)
    #Combine
    preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)]
    )
    return preprocessor

# KNN and SVC

In [8]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf_pipe = Pipeline(steps=[('preprocessor', buil_processor()),
                      ('classifier', LogisticRegression())])

In [9]:
param_grid = [
    {'preprocessor': [buil_processor(), buil_processor('MinMaxScaler')], 'classifier': [KNeighborsClassifier()], 
    "classifier__n_neighbors":[2,3,4,5,6,7,8,9,10]},   
    {'preprocessor': [buil_processor(), buil_processor('MinMaxScaler')], 'classifier': [SVC()],
    'classifier__kernel': ['rbf', "sigmoid"], 
     'classifier__C': [0.1, 0.15, 0.2, 0.3, 0.5, 1, 2, 2.5, 3, 3.5, 4, 5, 6, 7, 8, 9, 10]}
]

In [10]:
model = GridSearchCV(clf_pipe, param_grid =param_grid,  cv= cv, scoring="accuracy") 
model.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['Age',
                                                                          'Fare']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('onehot',
                    

In [11]:
model.best_params_

{'classifier': SVC(C=1),
 'classifier__C': 1,
 'classifier__kernel': 'rbf',
 'preprocessor': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('imputer', SimpleImputer()),
                                                  ('scaler', StandardScaler())]),
                                  ['Age', 'Fare']),
                                 ('cat',
                                  Pipeline(steps=[('onehot', OneHotEncoder())]),
                                  ['Pclass', 'Sex', 'Embarked'])])}

In [12]:
model.best_score_

0.8226711560044894

# Save the best model

In [13]:
# Enregistrement
pickle.dump(model, open('saved_models/knn_svm.sav', 'wb'))