In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('data/train.csv')
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


# PRE-PROCESSING

In [3]:
df=data.copy()

In [4]:
# featur ingenering 

# Extraire le groupe à partir de PassengerId
df['Group'] = df['PassengerId'].apply(lambda x: x.split('_')[0])

# Calculer la taille du groupe pour chaque passager
df['GroupSize'] = df.groupby('Group')['PassengerId'].transform('count')

# Créer une colonne indiquant si le passager est seul ou non
df['IsAlone'] = (df['GroupSize'] == 1).astype(int)


In [5]:
df=df.drop(['PassengerId','Name','Group','GroupSize'],axis=1)

In [6]:
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,IsAlone
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,1
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,0
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,0
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,1


In [7]:
list_col=list(df.columns)
list_col.remove('IsAlone')
list_col.insert(0,'IsAlone')

In [8]:
df=df[list_col]
df.head()

Unnamed: 0,IsAlone,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,1,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,1,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [9]:
df.shape

(8693, 13)

In [10]:
nan_count = df.isnull().sum()
nan_count

IsAlone           0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64

## Encodage

In [12]:
import numpy as np

def clean_dataframe(df):
    # Remplacer les valeurs infinies par NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Remplacer les NaN par la moyenne de la colonne
#     df.fillna(df.mean(), inplace=True)
    df = df.dropna(axis=0)

    return df


In [13]:
import pandas as pd

def apply_one_hot_encoding(df):
    # Appliquer l'encodage one-hot pour les colonnes de type 'object' (catégoriques)
    df_encoded = pd.get_dummies(df, drop_first=True)
    return df_encoded


In [14]:
def convert_booleans_to_integers(df):
    # Rechercher les colonnes de type booléen et les convertir en 1/0
    bool_columns = df.select_dtypes(include=['bool']).columns
    df[bool_columns] = df[bool_columns].astype(int)
    return df


In [15]:
from sklearn.preprocessing import MinMaxScaler

def apply_normalization(df):
    # Identifier les colonnes numériques (int et float)
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

    # Appliquer la normalisation MinMaxScaler
    scaler = MinMaxScaler()
    df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
    
    return df


In [16]:
# Étape 1 : Nettoyer les NaN et valeurs infinies
df_cleaned = clean_dataframe(df)

# Étape 2 : Appliquer l'encodage one-hot
df_encoded = apply_one_hot_encoding(df_cleaned)

# Étape 3 : Convertir les booléens en 1/0
df_converted = convert_booleans_to_integers(df_encoded)

# Étape 4 : Appliquer la normalisation
df_normalized = apply_normalization(df_converted)

# df_normalized est maintenant le dataframe final traité
print(df_normalized)


      IsAlone       Age  RoomService  FoodCourt  ShoppingMall       Spa  \
0         1.0  0.493671     0.000000   0.000000      0.000000  0.000000   
1         1.0  0.303797     0.007608   0.000302      0.001064  0.024500   
2         0.0  0.734177     0.003001   0.119948      0.000000  0.299670   
3         0.0  0.417722     0.000000   0.043035      0.015793  0.148563   
4         1.0  0.202532     0.021149   0.002348      0.006428  0.025214   
5         1.0  0.556962     0.000000   0.016201      0.000000  0.012986   
6         0.0  0.329114     0.002932   0.051622      0.000128  0.000000   
7         0.0  0.354430     0.000000   0.000000      0.000000  0.000000   
8         1.0  0.443038     0.000000   0.026331      0.000724  0.009639   
9         0.0  0.177215     0.000000   0.000000      0.000000  0.000000   
10        0.0  0.430380     0.000000   0.000000      0.007395  0.000000   
11        0.0  0.569620     0.002722   0.244692      0.025072  0.004909   
12        1.0  0.405063  

In [18]:
df=df_normalized
df.head()

Unnamed: 0,IsAlone,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,...,Cabin_G/999/S,Cabin_T/0/P,Cabin_T/1/P,Cabin_T/2/P,Cabin_T/2/S,Cabin_T/3/P,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_0.0234393404005,VIP_True
0,1.0,0.493671,0.0,0.0,0.0,0.0,0.0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,1.0,0.303797,0.007608,0.000302,0.001064,0.0245,0.001823,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.0,0.734177,0.003001,0.119948,0.0,0.29967,0.00203,0,1,0,...,0,0,0,0,0,0,0,1,0,1
3,0.0,0.417722,0.0,0.043035,0.015793,0.148563,0.007997,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,1.0,0.202532,0.021149,0.002348,0.006428,0.025214,8.3e-05,1,0,0,...,0,0,0,0,0,0,0,1,0,0


## TrainTest - Nettoyage - Encodage

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
trainset, testset = train_test_split(df,test_size=0.2,random_state=0)

In [21]:
trainset['Transported'].value_counts()

1    3502
0    3452
Name: Transported, dtype: int64

In [22]:
testset['Transported'].value_counts()

1    876
0    863
Name: Transported, dtype: int64

In [27]:
def x_y(df):
    x=df.drop(['Transported'],axis=1)
    y=df['Transported']
    return x,y


In [29]:
X_train,Y_train=x_y(trainset)
X_test,Y_test=x_y(testset)

In [31]:
X_train.shape

(6954, 6574)

In [32]:
trainset.shape

(6954, 6575)

# Modellisation

In [38]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA

  from numpy.core.umath_tests import inner1d


In [39]:
model_1 = RandomForestClassifier(random_state=0)

In [None]:
model_2 = make_pipeline(PolynomialFeatures(2), SelectKBest(f_classif, k=10),
                      RandomForestClassifier(random_state=0))