# Setting and load data 

In [10]:
## For Colab
from google.colab import drive
drive.mount("gdrive")
%cd '/content/gdrive/MyDrive/AI/Projects/EDA/1-Titanic'

Drive already mounted at gdrive; to attempt to forcibly remount, call drive.mount("gdrive", force_remount=True).
/content/gdrive/MyDrive/AI/Projects/EDA/1-Titanic


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


###############################################################
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold


from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

###############################################################
import pickle

In [12]:
df = pd.read_csv('datasets/train_next_steps.csv')
#df_test = pd.read_csv('datasets/test.csv')

In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,0,3,male,22.0,1,0,7.25,S
1,1,1,1,female,38.0,1,0,71.2833,C
2,2,1,3,female,26.0,0,0,7.925,S
3,3,1,1,female,35.0,1,0,53.1,S
4,4,0,3,male,35.0,0,0,8.05,S


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  891 non-null    int64  
 1   Survived    891 non-null    int64  
 2   Pclass      891 non-null    int64  
 3   Sex         891 non-null    object 
 4   Age         714 non-null    float64
 5   SibSp       891 non-null    int64  
 6   Parch       891 non-null    int64  
 7   Fare        891 non-null    float64
 8   Embarked    889 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 62.8+ KB


# Data, label and splitting strategy

In [15]:
X = df.drop('Survived', axis = 1)
y = df['Survived']

In [16]:
cv = StratifiedKFold(n_splits=3, shuffle=False)

# Preprocessing with PIPELINE

In [23]:
##Numerical columns
numeric_features = ['Age', 'Fare'] 
numeric_steps = [('imputer', SimpleImputer(strategy = 'mean'))]
numeric_transformer = Pipeline(steps=numeric_steps)
##Categorical columns
categorical_features = ['Pclass', 'Sex', 'Embarked']
categorical_steps =  [
    ('imputer', SimpleImputer(strategy= "most_frequent")),
    ('onehot', OneHotEncoder())
]
categorical_transformer = Pipeline(steps = categorical_steps)
#Combine
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)]
)

# Random Forest

In [24]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
forest_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('forest', RandomForestClassifier())])

In [25]:
param_grid = {'forest__n_estimators': [100, 200, 300, 400], 
              'forest__max_depth':[2,3,4,5,6]}


In [26]:
model_forest = GridSearchCV(forest_pipe, param_grid =param_grid,  cv= cv, scoring="accuracy") 
model_forest.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                              

In [28]:
model_forest.best_params_

{'forest__max_depth': 4, 'forest__n_estimators': 300}

In [29]:
model_forest.best_score_

0.8181818181818182

In [30]:
# Enregistrement
pickle.dump(model_forest, open('saved_models/forest.sav', 'wb'))

# Gradient Boosting

In [31]:
gb_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('gb', GradientBoostingClassifier())])

param_grid = {'gb__n_estimators': [100, 200, 300, 400], 
              'gb__max_depth':[2,3,4,5,6], 
              "gb__learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3]}

In [32]:
model_gb = GridSearchCV(gb_pipe, param_grid =param_grid,  cv= cv, scoring="accuracy") 
model_gb.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                              

In [33]:
model_gb.best_params_

{'gb__learning_rate': 0.01, 'gb__max_depth': 5, 'gb__n_estimators': 100}

In [34]:
model_gb.best_score_

0.819304152637486

In [36]:
pickle.dump(model_gb, open('saved_models/gradien_boosting.sav', 'wb'))

# AdaBoostClassifier

In [37]:
adaboot_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('adaboot', GradientBoostingClassifier())])

param_grid = {'adaboot__n_estimators': [100, 200, 300, 400], 
              'adaboot__max_depth':[2,3,4,5,6], 
              "adaboot__learning_rate": [0.001, 0.01, 0.1, 0.2, 0.3]}

In [38]:
model_adaboot = GridSearchCV(adaboot_pipe, param_grid =param_grid,  cv= cv, scoring="accuracy") 
model_adaboot.fit(X, y)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
             error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                              

In [39]:
model_adaboot.best_params_

{'adaboot__learning_rate': 0.01,
 'adaboot__max_depth': 5,
 'adaboot__n_estimators': 100}

In [40]:
model_adaboot.best_score_

0.8204264870931538

In [41]:
pickle.dump(model_adaboot, open('saved_models/ada_boosting.sav', 'wb'))