# Exemple entrainement classifier sur Titanic

In [34]:
#import des librairies publiques
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from category_encoders import OrdinalEncoder 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn import set_config
from joblib import dump 
sklearn.__version__

'0.24.2'

In [24]:
# import des jeux de donnees
# données numériques & categorielles
df = pd.read_csv('data_test/titanic.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,survived
0,0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO",1
1,1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON",1
2,2,1.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0
3,3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON",0
4,4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON",0


## Préparation des données

In [25]:
#On décide d'utiliser 7 variables pour prédire si un passager va survivre
target_var = ['survived']
x_var = ['pclass', 'sex', 'embarked', 'age', 'sibsp', 'parch', 'fare']
df = df[x_var+target_var].copy()
df['pclass'] = df['pclass'].astype('int')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1309 non-null   int64  
 1   sex       1309 non-null   object 
 2   embarked  1307 non-null   object 
 3   age       1046 non-null   float64
 4   sibsp     1309 non-null   float64
 5   parch     1309 non-null   float64
 6   fare      1308 non-null   float64
 7   survived  1309 non-null   int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 81.9+ KB


In [4]:
# Traitement des var manquantes

#Remplacement age manquant par mediane
df['age'] = df['age'].fillna(df['age'].median())

#On drop les lignes avec d'autres valeurs manquantes (qui sont tres peu nombreuses)
df = df.dropna(axis=0, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1306 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1306 non-null   int64  
 1   sex       1306 non-null   object 
 2   embarked  1306 non-null   object 
 3   age       1306 non-null   float64
 4   sibsp     1306 non-null   float64
 5   parch     1306 non-null   float64
 6   fare      1306 non-null   float64
 7   survived  1306 non-null   int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 91.8+ KB


In [15]:
#Creation X et y
X = df[x_var].copy()
y = df[target_var].values.reshape(-1)
X.head()
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1306 entries, 0 to 1308
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    1306 non-null   int64  
 1   sex       1306 non-null   object 
 2   embarked  1306 non-null   object 
 3   age       1306 non-null   float64
 4   sibsp     1306 non-null   float64
 5   parch     1306 non-null   float64
 6   fare      1306 non-null   float64
dtypes: float64(4), int64(1), object(2)
memory usage: 81.6+ KB


# Entrainement du modele

In [27]:
#separation train/test pour entrainer (et valider le modele)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
x_train.shape
y_train.shape

(1044,)

In [36]:
#Creation d'un encoder pour les vars categorielles
categorical_columns = ['sex', 'embarked']
encoder = OrdinalEncoder(cols=categorical_columns, return_df=True)
encoder

In [37]:
#Creation RandomForestClassifier
rf = RandomForestClassifier(random_state=0)
rf

In [38]:
#Creation et entrainement du pipeline
from sklearn.pipeline import make_pipeline
from sklearn import set_config
pipe = make_pipeline(encoder, rf)
set_config(display="diagram")
pipe.fit(x_train, y_train)



In [30]:
#Evaluation sur données de test
y_pred = pipe.predict(x_test)

In [39]:
#Calcul de certaines metriques
print("Accuracy: ",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy:  0.8015267175572519
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       157
           1       0.75      0.76      0.75       105

    accuracy                           0.80       262
   macro avg       0.79      0.79      0.79       262
weighted avg       0.80      0.80      0.80       262



## Sauvegarde du modele

In [40]:
#Sauvegarde du modele pipe
dump(pipe, 'pipe.joblib')

['pipe.joblib']