# Titanic - Machine Learning from Disaster
## (Random Forest)

In [1]:
# Import des modules

# Manipulation de donnée
import pandas as pd
import numpy as np

# Visualisation des données
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Lecture des différentes dataframe
train_data = pd.read_csv('train.csv')

test_data = pd.read_csv('test.csv')

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
print('Shape de la dataframe:', train_data.shape)
print('Type de données de la DataFrame:', train_data.dtypes)

Shape de la dataframe: (891, 12)
Type de données de la DataFrame: PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [5]:
# Répartition de la variable "Embarked"
embarked = train_data['Embarked'].value_counts()

embarked

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

Je vais transformer les variables Embarked et sexe en variable numérique pour pouvoir la création du modèle de Random Forest.

- **Pour Embarked**: C / Q / S --> 1 / 2 / 3

- **Pour Sex**: female / male --> 0 / 1

In [6]:
# Recodage de la variable Embarked

train_data['Embarked'] = train_data['Embarked'].replace(['C','Q','S'] , [1,2,3])

In [7]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,3.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,1.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,3.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,3.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,3.0


In [8]:
train_data['Sex'] = train_data['Sex'].replace(['female', 'male'], [0,1])

## Analyse univarié des variables

In [9]:
# Répartition de la variables Pclass
print('Répartition de la variable Pclass:', train_data['Pclass'].value_counts())

Répartition de la variable Pclass: Pclass
3    491
1    216
2    184
Name: count, dtype: int64


In [10]:
# Description de la variable Age
print('Description de la variable Age', train_data['Age'].describe())

Description de la variable Age count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64


In [11]:
list_var_cat = ['Survived', 'Sex', 'Embarked']

list_var_num = ['SibSp', 'Fare']

In [12]:
# Répartition des variables catégoriel

for var in list_var_cat:
    print(f'Répartion de la variable {var}:', train_data[var].value_counts())

Répartion de la variable Survived: Survived
0    549
1    342
Name: count, dtype: int64
Répartion de la variable Sex: Sex
1    577
0    314
Name: count, dtype: int64
Répartion de la variable Embarked: Embarked
3.0    644
1.0    168
2.0     77
Name: count, dtype: int64


In [13]:
# Description des variables numérique continue

for var in list_var_num:
    print(f'Description de la variable {var}', train_data[var].describe())

Description de la variable SibSp count    891.000000
mean       0.523008
std        1.102743
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        8.000000
Name: SibSp, dtype: float64
Description de la variable Fare count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64


In [14]:
# Vérification de la présence de valeur null

print('Tableau des valeurs manquantes:', train_data.isnull().sum())

Tableau des valeurs manquantes: PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [15]:
# Imputation par la médiane de chaque classe
train_data['Age'] = train_data.groupby('Pclass')['Age'].transform(
    lambda x: x.fillna(x.median())
)

In [17]:
# Recodage des valeur null en 1 
train_data['Embarked'] = train_data['Embarked'].fillna('1')

In [18]:
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [19]:
# Suppression de la variable Cabin car offre peu d'information

train_data = train_data.drop('Cabin', axis = 1)

In [20]:
train_data = train_data.copy()

Pour résumer, je viens de réaliser le nettoyage de la Dataframe afin d'exploiter au mieux la dataframe. J'ai supprimer la variables Cabin qui possédait trop de variable. Imputer la variable age pour ne pas perdre trop d'individu pour l'entraînement du modèle. Transformer certaines variables catégoriel en variable numérique pour une meilleur analyse.

## Test de corrélation

In [21]:
from scipy import stats
from scipy.stats import pearsonr, mstats

In [22]:
# Réalisation du test de corrélation (r de pearson)

colonne_name = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'SibSp', 'Parch']

for var in colonne_name:
    # Test de Pearson
    corr, pval = pearsonr(train_data[var], train_data['Survived'])

    results = []
    
    results.append({
        'Variable': var,
        'Corrélation': corr,
        'P-value': pval,
        'Significatif': 'Oui' if pval < 0.05 else 'Non'
        })
        
    print(f"{var:15} | r = {corr:7.4f} | p-value = {pval:.6f} | Sig: {'*' if pval < 0.05 else ''}")

Pclass          | r = -0.3385 | p-value = 0.000000 | Sig: *
Sex             | r = -0.5434 | p-value = 0.000000 | Sig: *
Age             | r = -0.0473 | p-value = 0.158735 | Sig: 
Fare            | r =  0.2573 | p-value = 0.000000 | Sig: *
Embarked        | r = -0.1742 | p-value = 0.000000 | Sig: *
SibSp           | r = -0.0353 | p-value = 0.292244 | Sig: 
Parch           | r =  0.0816 | p-value = 0.014799 | Sig: *


Le test de pearson montre que les variables Age et SibSp ne sont pas significative et qu'elle ne sont pas nécessaire dans le cadre de la construction du modèle. 

# Création du modèle de classification

Je compte utilisé un random forest qui se décrit comme  un algorithme qui crée de nombreux arbres de décision, chacun entraîné sur une partie aléatoire des données. Pour faire une prédiction, tous les arbres "votent" ensemble : la classe prédite par la majorité devient le résultat final. Cette approche combine la simplicité des arbres de décision avec la puissance de l'ensemble, ce qui rend le modèle robuste et moins sujet au surapprentissage.

## RandomForrest

In [23]:
from sklearn.ensemble import  RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

* y = variable à classifier

* x = feature

In [24]:
y = train_data['Survived']

features = ['Pclass', 'Sex','Parch', 'Fare', 'Embarked']

X = pd.get_dummies(train_data[features])

In [25]:
# transformation de X et y en valeur train et test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Définir les paramètres à tester
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

In [27]:
# Créer le modèle de base
rf = RandomForestClassifier(random_state=42)

# GridSearchCV teste toutes les combinaisons de paramètres
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',
    verbose=1
)

In [28]:
# Entraîner et chercher les meilleurs paramètres
grid_search.fit(X_train, y_train)

# Afficher les meilleurs paramètres trouvés
print(f"Meilleurs paramètres: {grid_search.best_params_}")
print(f"Meilleur score (CV): {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Meilleurs paramètres: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Meilleur score (CV): 0.8133


In [29]:
# Évaluer sur l'ensemble de test
y_pred = grid_search.predict(X_test)
print(f"\nAccuracy sur le test: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred):.4f}")

# Voir les résultats détaillés pour chaque combinaison
results_df = pd.DataFrame(grid_search.cv_results_)
print(results_df[['param_n_estimators', 'param_max_depth', 'mean_test_score', 'std_test_score']])


Accuracy sur le test: 0.8045
Precision: 0.7746
Recall: 0.7432
F1-score: 0.7586
    param_n_estimators param_max_depth  mean_test_score  std_test_score
0                   50               5         0.803359        0.038109
1                  100               5         0.809002        0.034227
2                  200               5         0.801970        0.036516
3                   50               5         0.807584        0.037937
4                  100               5         0.808992        0.035429
..                 ...             ...              ...             ...
211                100            None         0.806185        0.028255
212                200            None         0.801960        0.025693
213                 50            None         0.800561        0.028343
214                100            None         0.803368        0.024353
215                200            None         0.799153        0.029709

[216 rows x 4 columns]


## Modèle et création de la DataFrame 'submitted'

In [29]:
# Re-transformer les variables comme je l'ai fait au début
test_data['Sex'] = test_data['Sex'].replace(['female', 'male'],[0,1])

test_data['Embarked'] = test_data['Embarked'].replace(['C','Q','S'], [1,2,3])

In [30]:
test_data['Age'] = test_data.groupby('Pclass')['Age'].transform(
    lambda x: x.fillna(x.median())
)

test_data['Embarked'] = test_data['Embarked'].fillna(1)

In [31]:
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0000,,3
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,2
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,3
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,3
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",1,24.0,0,0,A.5. 3236,8.0500,,3
414,1306,1,"Oliva y Ocana, Dona. Fermina",0,39.0,0,0,PC 17758,108.9000,C105,1
415,1307,3,"Saether, Mr. Simon Sivertsen",1,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,3
416,1308,3,"Ware, Mr. Frederick",1,24.0,0,0,359309,8.0500,,3


In [32]:
train_data['Embarked'] = train_data['Embarked'].astype(int)
test_data['Embarked'] = test_data['Embarked'].astype(int)

In [36]:
# Test sur la nouvelle dataframe

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Fare", 'Embarked']
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(max_depth = 10, max_features= 'sqrt', min_samples_leaf= 1, min_samples_split= 2, n_estimators= 200, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
