In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [44]:
titanic_train_data = pd.read_csv("train.csv")

In [45]:
titanic_train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [46]:
titanic_train_data = titanic_train_data.drop(columns=["Cabin", "Name", "Ticket"])

In [47]:
train_set, test_set = train_test_split(
    titanic_train_data,
    test_size=0.2,
    random_state=0,
    stratify=titanic_train_data["Sex"],
)

In [48]:
y_train = train_set["Survived"]
X_train = train_set.drop(columns=["Survived"])

In [49]:
colums_with_missing_values = X_train.columns[X_train.isnull().any()]

In [50]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

numerical_imputer = SimpleImputer(strategy="median")
object_imputer = SimpleImputer(strategy="most_frequent")
oneHotEncoder = OneHotEncoder(
    handle_unknown="ignore", sparse_output=False, drop="first"
)

In [51]:
def getLogPlusOne(fare):
    return np.log(fare + 1)

In [52]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

fare_transformer = Pipeline(
    steps=[
        (
            "log_plus_1",
            FunctionTransformer(
                getLogPlusOne, validate=True, feature_names_out="one-to-one"
            ),
        ),
        ("standard_scaler", StandardScaler()),
    ]
)

In [53]:
pipeline_numerical_imputer = Pipeline(
    steps=[("numerical_imputer", numerical_imputer), ("scaler", StandardScaler())]
)
pipeline_object_imputer = Pipeline(
    steps=[("object_imputer", object_imputer), ("one_hot_encode", oneHotEncoder)]
)
numerical_standard_pipeline = Pipeline(steps=[("scaler", StandardScaler())])
object_standard_pipeline = Pipeline(steps=[("one_hot_encoder", oneHotEncoder)])

In [54]:
X_train.columns

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [55]:
from sklearn.compose import ColumnTransformer

columnTransformer = ColumnTransformer(
    [
        ("fare_transformer", fare_transformer, ["Fare"]),
        ("numerical_with_missing_vals", pipeline_numerical_imputer, ["Age"]),
        ("object_with_missing_vals", pipeline_object_imputer, ["Embarked"]),
        ("standard_object", object_standard_pipeline, ["Sex"]),
        ("standard_numeric", numerical_standard_pipeline, ["SibSp", "Parch"]),
    ],
    remainder="passthrough",
)

In [56]:
columnTransformer = columnTransformer.fit(X_train)

In [57]:
X_train_transformed = columnTransformer.transform(X_train)

In [58]:
X_train_transformed_df = pd.DataFrame(
    X_train_transformed,
    index=X_train,
    columns=columnTransformer.get_feature_names_out(),
)

In [59]:
y_test = test_set["Survived"]
X_test = test_set.drop(columns=["Survived"])

In [60]:
X_test_transformed = columnTransformer.transform(X_test)

## Random Forest 🏆🏆

In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

randomForest = RandomForestClassifier(n_estimators=50, criterion="entropy", random_state=0)

In [62]:
randomForest.fit(X_train_transformed, y_train)
y_test_predict = randomForest.predict(X_test_transformed)

In [63]:
accuracies = cross_val_score(estimator= randomForest, X=X_train_transformed, y=y_train, cv= 10)
print(accuracies.mean())
print(accuracies.std())

0.8384585289514866
0.04048462185410346


In [64]:
print(confusion_matrix(y_test, y_test_predict, normalize='all'))
print(accuracy_score(y_test, y_test_predict, normalize=True))

[[0.50837989 0.11731844]
 [0.12290503 0.25139665]]
0.7597765363128491


## Cat Boost

In [None]:
from catboost import CatBoostClassifier
cat_classifier = CatBoostClassifier()
cat_classifier.fit(X_train_transformed, y_train)

Learning rate set to 0.008911
0:	learn: 0.6896101	total: 1.64ms	remaining: 1.64s
1:	learn: 0.6843821	total: 2.62ms	remaining: 1.3s
2:	learn: 0.6783118	total: 3.74ms	remaining: 1.24s
3:	learn: 0.6728875	total: 4.83ms	remaining: 1.2s
4:	learn: 0.6677360	total: 5.84ms	remaining: 1.16s
5:	learn: 0.6620517	total: 8.24ms	remaining: 1.36s
6:	learn: 0.6572754	total: 9.71ms	remaining: 1.38s
7:	learn: 0.6524914	total: 11.1ms	remaining: 1.37s
8:	learn: 0.6475718	total: 12.4ms	remaining: 1.36s
9:	learn: 0.6426821	total: 13.8ms	remaining: 1.37s
10:	learn: 0.6377577	total: 38.4ms	remaining: 3.46s
11:	learn: 0.6349919	total: 44.4ms	remaining: 3.66s
12:	learn: 0.6301541	total: 49.3ms	remaining: 3.75s
13:	learn: 0.6253363	total: 50.5ms	remaining: 3.56s
14:	learn: 0.6212741	total: 51.2ms	remaining: 3.36s
15:	learn: 0.6168816	total: 51.8ms	remaining: 3.19s
16:	learn: 0.6123746	total: 53ms	remaining: 3.06s
17:	learn: 0.6083492	total: 54.1ms	remaining: 2.95s
18:	learn: 0.6050447	total: 55.2ms	remaining: 2.

<catboost.core.CatBoostClassifier at 0x10646b6d0>

In [68]:
accuracies = cross_val_score(estimator= cat_classifier, X=X_train_transformed, y=y_train, cv= 10)
print(accuracies.mean())
print(accuracies.std())

Learning rate set to 0.008515
0:	learn: 0.6901082	total: 1.33ms	remaining: 1.33s
1:	learn: 0.6849125	total: 3.1ms	remaining: 1.55s
2:	learn: 0.6799459	total: 4.82ms	remaining: 1.6s
3:	learn: 0.6756555	total: 5.86ms	remaining: 1.46s
4:	learn: 0.6699872	total: 7.12ms	remaining: 1.42s
5:	learn: 0.6654858	total: 8.23ms	remaining: 1.36s
6:	learn: 0.6603664	total: 9.83ms	remaining: 1.39s
7:	learn: 0.6555419	total: 10.8ms	remaining: 1.34s
8:	learn: 0.6512754	total: 11.8ms	remaining: 1.3s
9:	learn: 0.6471244	total: 12.5ms	remaining: 1.23s
10:	learn: 0.6431889	total: 13.3ms	remaining: 1.2s
11:	learn: 0.6387736	total: 14.4ms	remaining: 1.19s
12:	learn: 0.6352507	total: 15.7ms	remaining: 1.2s
13:	learn: 0.6316074	total: 19.4ms	remaining: 1.36s
14:	learn: 0.6277395	total: 20.5ms	remaining: 1.34s
15:	learn: 0.6234970	total: 21.1ms	remaining: 1.3s
16:	learn: 0.6197664	total: 21.8ms	remaining: 1.26s
17:	learn: 0.6157649	total: 22.6ms	remaining: 1.23s
18:	learn: 0.6119192	total: 23.4ms	remaining: 1.21

In [71]:
y_test_predict = cat_classifier.predict(X_test_transformed)
print(confusion_matrix(y_test, y_test_predict, normalize='all'))
print(accuracy_score(y_test, y_test_predict, normalize=True))

[[0.51955307 0.10614525]
 [0.12849162 0.24581006]]
0.7653631284916201


## XGBoost

In [69]:
from xgboost import XGBClassifier
xg_classifier = XGBClassifier()
xg_classifier.fit(X_train_transformed, y_train)

In [70]:
accuracies = cross_val_score(estimator= xg_classifier, X=X_train_transformed, y=y_train, cv= 10)
print(accuracies.mean())
print(accuracies.std())

0.8230633802816902
0.034401136145953456


In [72]:
y_test_predict = xg_classifier.predict(X_test_transformed)
print(confusion_matrix(y_test, y_test_predict, normalize='all'))
print(accuracy_score(y_test, y_test_predict, normalize=True))

[[0.48044693 0.1452514 ]
 [0.10055866 0.27374302]]
0.7541899441340782
