In [1]:
import numpy as np
import pandas as pd

In [2]:
training = pd.read_csv('train.csv')

In [3]:
training.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
data_clean = training.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [5]:
data_clean.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [6]:
data_clean = pd.get_dummies(data_clean, columns=['Sex', 'Embarked'], drop_first=True)
data_clean.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,,True,False,True
1,1,1,38.0,1,0,71.2833,C85,False,False,False
2,1,3,26.0,0,0,7.925,,False,False,True
3,1,1,35.0,1,0,53.1,C123,False,False,True
4,0,3,35.0,0,0,8.05,,True,False,True


In [7]:
data_clean['Cabin'].value_counts

<bound method IndexOpsMixin.value_counts of 0       NaN
1       C85
2       NaN
3      C123
4       NaN
       ... 
886     NaN
887     B42
888     NaN
889    C148
890     NaN
Name: Cabin, Length: 891, dtype: object>

In [8]:
data_clean['Cabin'] = data_clean['Cabin'].fillna('U') # on remplace les valeurs
# nulles par des 'U'

data_clean['Cabin'] = data_clean['Cabin'].str[0] # on remplace les valuers 
# uniques de la colonne 'Cabin' par la première lettre des numéros de cabine

data_clean = pd.get_dummies(data_clean, columns=['Cabin'], drop_first=True)
data_clean.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
0,0,3,22.0,1,0,7.25,True,False,True,False,False,False,False,False,False,False,True
1,1,1,38.0,1,0,71.2833,False,False,False,False,True,False,False,False,False,False,False
2,1,3,26.0,0,0,7.925,False,False,True,False,False,False,False,False,False,False,True
3,1,1,35.0,1,0,53.1,False,False,True,False,True,False,False,False,False,False,False
4,0,3,35.0,0,0,8.05,True,False,True,False,False,False,False,False,False,False,True


In [9]:
data_clean['Age'].isnull().head()

0    False
1    False
2    False
3    False
4    False
Name: Age, dtype: bool

In [10]:
data_clean['Age'].fillna(data_clean['Age'].mean())
data_clean.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
0,0,3,22.0,1,0,7.25,True,False,True,False,False,False,False,False,False,False,True
1,1,1,38.0,1,0,71.2833,False,False,False,False,True,False,False,False,False,False,False
2,1,3,26.0,0,0,7.925,False,False,True,False,False,False,False,False,False,False,True
3,1,1,35.0,1,0,53.1,False,False,True,False,True,False,False,False,False,False,False
4,0,3,35.0,0,0,8.05,True,False,True,False,False,False,False,False,False,False,True


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [12]:
scaler = StandardScaler()
data_clean[['Age']] = scaler.fit_transform(data_clean[['Age']])
data_clean[['Fare']] = scaler.fit_transform(data_clean[['Fare']])
data_clean.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
0,0,3,-0.530377,1,0,-0.502445,True,False,True,False,False,False,False,False,False,False,True
1,1,1,0.571831,1,0,0.786845,False,False,False,False,True,False,False,False,False,False,False
2,1,3,-0.254825,0,0,-0.488854,False,False,True,False,False,False,False,False,False,False,True
3,1,1,0.365167,1,0,0.42073,False,False,True,False,True,False,False,False,False,False,False
4,0,3,0.365167,0,0,-0.486337,True,False,True,False,False,False,False,False,False,False,True


In [13]:
X = data_clean.drop('Survived', axis=1)
y = data_clean['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [15]:
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model = model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [16]:
y_pred = model.predict(X_test)

In [17]:
print(f'The accuracy score is of {accuracy_score(y_test, y_pred)*100:.2f}%')

The accuracy score is of 79.89%


In [18]:
print(y_pred)

[0 0 0 1 1 1 1 0 1 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0
 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 1 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 1 1 1
 0 0 0 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1
 0 1 1 0 0 1 1 1 0 0 1 1 1 0 0 1 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 1 1]


# __Test for submission__

In [19]:
data_test = pd.read_csv('test.csv')

In [20]:
data_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [25]:
def Titanic_prediction2(data):
    # We keep the Ids of the passenger for the submission table
    x = data['PassengerId']
    
    # We start working on the data:
    data_clean = data.drop(columns=['PassengerId', 'Name', 'Ticket'])
    data_clean = pd.get_dummies(data_clean, columns=['Sex', 'Embarked'], 
                                drop_first=True)
    
    # We work on the 'Cabin' column:
    data_clean['Cabin'] = data_clean['Cabin'].fillna('U')
    data_clean['Cabin'] = data_clean['Cabin'].str[0]
    data_clean = pd.get_dummies(data_clean, columns=['Cabin'], drop_first=True)
    
    # We work on the 'Age' column:
    data_clean['Age'].fillna(data_clean['Age'].mean(), inplace=True)

    # We normalize the 'Age' and 'Fare' columns:
    data_clean[['Age']] = scaler.fit_transform(data_clean[['Age']])
    data_clean[['Fare']] = scaler.fit_transform(data_clean[['Fare']])
    
    # We avoid features mismatch:    
    expected_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 
                         'Sex_male', 'Embarked_Q', 'Embarked_S', 
                         'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 
                         'Cabin_F', 'Cabin_G', 'Cabin_T', 'Cabin_U']

    # Ensure all expected features are in data_clean
    for feature in expected_features:
        if feature not in data_clean.columns:
            data_clean[feature] = 0  # Add missing features with value 0

    # Align the columns to expected features (in the same order)
    data_clean = data_clean[expected_features]
    
    # We use the model we trained:
    y = model.predict(data_clean)
    return pd.DataFrame({'PassengerId': x, 'Survived': y}) # We get the output for our submission

In [26]:
Titanic_prediction2(data_test).to_csv('Victor_SOTO_Titanic_Submission2.csv', index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_clean['Age'].fillna(data_clean['Age'].mean(), inplace=True)
