In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
data = pd.read_csv('data/train.csv')
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


# PRE-PROCESSING

In [26]:
df=data.copy()

In [27]:
# featur ingenering 

# Extraire le groupe à partir de PassengerId
df['Group'] = df['PassengerId'].apply(lambda x: x.split('_')[0])

# Calculer la taille du groupe pour chaque passager
df['GroupSize'] = df.groupby('Group')['PassengerId'].transform('count')

# Créer une colonne indiquant si le passager est seul ou non
df['IsAlone'] = (df['GroupSize'] == 1).astype(int)


In [28]:
df=df.drop(['PassengerId','Name','Group','GroupSize'],axis=1)

In [29]:
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,IsAlone
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,1
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,0
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,0
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,1


In [30]:
list_col=list(df.columns)
list_col.remove('IsAlone')
list_col.insert(0,'IsAlone')

In [31]:
df=df[list_col]
df.head()

Unnamed: 0,IsAlone,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,1,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,0,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,0,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,1,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [32]:
df.shape

(8693, 13)

In [33]:
nan_count = df.isnull().sum()
nan_count

IsAlone           0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64

## TrainTest - Nettoyage - Encodage

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
trainset, testset = train_test_split(df,test_size=0.2,random_state=0)

In [36]:
trainset['Transported'].value_counts()

True     3502
False    3452
Name: Transported, dtype: int64

In [37]:
testset['Transported'].value_counts()

True     876
False    863
Name: Transported, dtype: int64

## Encodage

In [14]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def preprocess_dataframe(df):
    # Copier le dataframe original pour ne pas modifier l'original
    df_processed = df.copy()

    # Appliquer l'encodage one-hot pour les colonnes de type object (catégoriques)
    df_processed = pd.get_dummies(df_processed, drop_first=True)

    # Normalisation des colonnes avec des valeurs numériques (floats et entiers)
    # Identifie les colonnes numériques (int et float)
    numeric_columns = df_processed.select_dtypes(include=['int64', 'float64']).columns

    # Normaliser les colonnes numériques à l'aide de MinMaxScaler (pour les amener entre 0 et 1)
    scaler = MinMaxScaler()
    df_processed[numeric_columns] = scaler.fit_transform(df_processed[numeric_columns])

    # Conversion des colonnes booléennes True/False en 1/0
    bool_columns = df_processed.select_dtypes(include=['bool']).columns
    df_processed[bool_columns] = df_processed[bool_columns].astype(int)

    return df_processed

# Exemple d'utilisation
# data = {
#     'Destination': ['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', 'TRAPPIST-1e'],
#     'Age': [18, 25, 40, 60],
#     'Distance': [0.5, 2.3, 1.1, 0.9],  # Variable numérique continue (float)
#     'Voyage_Possible': [True, False, True, True]  # Variable booléenne
# }

# df = pd.DataFrame(df)
df_processed = preprocess_dataframe(df)

print(df_processed)


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [18]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def preprocess_dataframe(df):
    # Copier le dataframe original pour ne pas modifier l'original
    df_processed = df.copy()

    # Appliquer l'encodage one-hot pour les colonnes de type object (catégoriques)
    df_processed = pd.get_dummies(df_processed, drop_first=True)

    # Remplacer les valeurs infinies par NaN (si elles existent)
    df_processed.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Gérer les valeurs NaN : ici, on choisit de remplacer les NaN par la moyenne de chaque colonne
    df_processed.fillna(df_processed.mean(), inplace=True)

    # Normalisation des colonnes avec des valeurs numériques (floats et entiers)
    # Identifie les colonnes numériques (int et float)
    numeric_columns = df_processed.select_dtypes(include=['int64', 'float64']).columns

    # Normaliser les colonnes numériques à l'aide de MinMaxScaler (pour les amener entre 0 et 1)
    scaler = MinMaxScaler()
    df_processed[numeric_columns] = scaler.fit_transform(df_processed[numeric_columns])

    # Conversion des colonnes booléennes True/False en 1/0
    bool_columns = df_processed.select_dtypes(include=['bool']).columns
    df_processed[bool_columns] = df_processed[bool_columns].astype(int)

    return df_processed

# Exemple d'utilisation
data = {
    'Destination': ['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', 'TRAPPIST-1e'],
    'Age': [18, 25, np.nan, 60],  # Il y a un NaN ici
    'Distance': [0.5, 2.3, np.inf, 0.9],  # Il y a une valeur infinie ici
    'Voyage_Possible': [True, False, True, True]  # Variable booléenne
}

df = pd.DataFrame(data)
df_processed = preprocess_dataframe(df)

print(df_processed)


        Age  Distance  Voyage_Possible  Destination_PSO J318.5-22  \
0  0.000000  0.000000                1                        0.0   
1  0.166667  1.000000                0                        1.0   
2  0.388889  0.407407                1                        0.0   
3  1.000000  0.222222                1                        0.0   

   Destination_TRAPPIST-1e  
0                      1.0  
1                      0.0  
2                      0.0  
3                      1.0  


In [19]:
import numpy as np

def clean_dataframe(df):
    # Remplacer les valeurs infinies par NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Remplacer les NaN par la moyenne de la colonne
    df.fillna(df.mean(), inplace=True)

    return df


In [20]:
import pandas as pd

def apply_one_hot_encoding(df):
    # Appliquer l'encodage one-hot pour les colonnes de type 'object' (catégoriques)
    df_encoded = pd.get_dummies(df, drop_first=True)
    return df_encoded


In [21]:
def convert_booleans_to_integers(df):
    # Rechercher les colonnes de type booléen et les convertir en 1/0
    bool_columns = df.select_dtypes(include=['bool']).columns
    df[bool_columns] = df[bool_columns].astype(int)
    return df


In [22]:
from sklearn.preprocessing import MinMaxScaler

def apply_normalization(df):
    # Identifier les colonnes numériques (int et float)
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

    # Appliquer la normalisation MinMaxScaler
    scaler = MinMaxScaler()
    df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
    
    return df


In [38]:
# Étape 1 : Nettoyer les NaN et valeurs infinies
df_cleaned = clean_dataframe(df)

# Étape 2 : Appliquer l'encodage one-hot
df_encoded = apply_one_hot_encoding(df_cleaned)

# Étape 3 : Convertir les booléens en 1/0
df_converted = convert_booleans_to_integers(df_encoded)

# Étape 4 : Appliquer la normalisation
df_normalized = apply_normalization(df_converted)

# df_normalized est maintenant le dataframe final traité
print(df_normalized)


      IsAlone       Age  RoomService  FoodCourt  ShoppingMall       Spa  \
0         1.0  0.493671     0.000000   0.000000      0.000000  0.000000   
1         1.0  0.303797     0.007608   0.000302      0.001064  0.024500   
2         0.0  0.734177     0.003001   0.119948      0.000000  0.299670   
3         0.0  0.417722     0.000000   0.043035      0.015793  0.148563   
4         1.0  0.202532     0.021149   0.002348      0.006428  0.025214   
5         1.0  0.556962     0.000000   0.016201      0.000000  0.012986   
6         0.0  0.329114     0.002932   0.051622      0.000128  0.000000   
7         0.0  0.354430     0.000000   0.000000      0.000000  0.000000   
8         1.0  0.443038     0.000000   0.026331      0.000724  0.009639   
9         0.0  0.177215     0.000000   0.000000      0.000000  0.000000   
10        0.0  0.430380     0.000000   0.000000      0.007395  0.000000   
11        0.0  0.569620     0.002722   0.244692      0.025072  0.004909   
12        1.0  0.405063  

In [39]:
df_normalized

Unnamed: 0,IsAlone,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,...,Cabin_G/999/S,Cabin_T/0/P,Cabin_T/1/P,Cabin_T/2/P,Cabin_T/2/S,Cabin_T/3/P,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_0.0234393404005,VIP_True
0,1.0,0.493671,0.000000,0.000000,0.000000,0.000000,0.000000,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,1.0,0.303797,0.007608,0.000302,0.001064,0.024500,0.001823,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.0,0.734177,0.003001,0.119948,0.000000,0.299670,0.002030,0,1,0,...,0,0,0,0,0,0,0,1,0,1
3,0.0,0.417722,0.000000,0.043035,0.015793,0.148563,0.007997,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,1.0,0.202532,0.021149,0.002348,0.006428,0.025214,0.000083,1,0,0,...,0,0,0,0,0,0,0,1,0,0
5,1.0,0.556962,0.000000,0.016201,0.000000,0.012986,0.000000,1,0,0,...,0,0,0,0,0,0,1,0,0,0
6,0.0,0.329114,0.002932,0.051622,0.000128,0.000000,0.000000,1,0,0,...,0,0,0,0,0,0,0,1,0,0
7,0.0,0.354430,0.000000,0.000000,0.000000,0.000000,0.012632,1,0,0,...,0,0,0,0,0,0,0,1,0,0
8,1.0,0.443038,0.000000,0.026331,0.000724,0.009639,0.000000,1,0,0,...,0,0,0,0,0,0,0,1,0,0
9,0.0,0.177215,0.000000,0.000000,0.000000,0.000000,0.000000,1,1,0,...,0,0,0,0,0,0,0,0,0,0
