In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
titanic_train_data = pd.read_csv("train.csv")

In [3]:
titanic_train_data = titanic_train_data.drop(columns=["Cabin", "Name", "Ticket"])

In [None]:
fare_cats_bins = list(range(0, 301, 50))
fare_cats_bins_with_inf = fare_cats_bins + [np.inf]
titanic_train_data["Fare_cat"] = pd.cut(
    titanic_train_data["Fare"],
    include_lowest=True,
    bins=pd.Series(fare_cats_bins_with_inf),
    labels=pd.Series(fare_cats_bins),
)
titanic_train_data["Fare_cat"] = titanic_train_data["Fare_cat"].astype(int)

In [5]:
train_set, test_set = train_test_split(
    titanic_train_data,
    test_size=0.2,
    random_state=0,
    stratify=titanic_train_data["Fare_cat"],
)

In [6]:
y_train = train_set["Survived"]
X_train = train_set.drop(columns=["Survived"])

In [7]:
colums_with_missing_values = X_train.columns[X_train.isnull().any()]

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

numerical_imputer = SimpleImputer(strategy="median")
object_imputer = SimpleImputer(strategy="most_frequent")
oneHotEncoder = OneHotEncoder(
    handle_unknown="ignore", sparse_output=False, drop="first"
)

In [9]:
def getLogPlusOne(fare):
    return np.log(fare + 1)

In [10]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

fare_transformer = Pipeline(
    steps=[
        (
            "log_plus_1",
            FunctionTransformer(
                getLogPlusOne, validate=True, feature_names_out="one-to-one"
            ),
        ),
        ("standard_scaler", StandardScaler()),
    ]
)

In [11]:
pipeline_numerical_imputer = Pipeline(
    steps=[("numerical_imputer", numerical_imputer), ("scaler", StandardScaler())]
)
pipeline_object_imputer = Pipeline(
    steps=[("object_imputer", object_imputer), ("one_hot_encode", oneHotEncoder)]
)
numerical_standard_pipeline = Pipeline(steps=[("scaler", StandardScaler())])
object_standard_pipeline = Pipeline(steps=[("one_hot_encoder", oneHotEncoder)])

In [None]:
from sklearn.compose import ColumnTransformer

columnTransformer = ColumnTransformer(
    [
        ("fare_transformer", fare_transformer, ["Fare"]),
        ("numerical_with_missing_vals", pipeline_numerical_imputer, ["Age"]),
        ("object_with_missing_vals", pipeline_object_imputer, ["Embarked"]),
        ("standard_object", object_standard_pipeline, ["Sex"]),
        (
            "standard_numeric",
            numerical_standard_pipeline,
            ["SibSp", "Parch", "Fare_cat"],
        ),
    ],
    remainder="passthrough",
)

In [13]:
columnTransformer = columnTransformer.fit(X_train)

In [14]:
X_train_transformed = columnTransformer.transform(X_train)

In [15]:
y_test = test_set["Survived"]
X_test = test_set.drop(columns=["Survived"])

In [16]:
X_test_transformed = columnTransformer.transform(X_test)

# Random Forest (winner)

In [None]:
from sklearn.ensemble import RandomForestClassifier

randomForest = RandomForestClassifier(
    n_estimators=50, criterion="entropy", random_state=0
)

In [18]:
randomForest.fit(X_train_transformed, y_train)
y_test_predict = randomForest.predict(X_test_transformed)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

print(confusion_matrix(y_test, y_test_predict, normalize="all"))
print(accuracy_score(y_test, y_test_predict, normalize=True))

[[0.57541899 0.06703911]
 [0.09497207 0.26256983]]
0.8379888268156425


Perform random search for hyper params

In [None]:
from scipy.stats import randint, uniform

param_distributions = {
    "n_estimators": randint(low=50, high=1000),  # Number of trees in the forest
    "max_features": [
        "sqrt",
        "log2",
        None,
    ],  # Number of features to consider at each split
    "max_depth": randint(low=10, high=110),  # Maximum number of levels in a tree
    "min_samples_split": randint(
        low=2, high=20
    ),  # Minimum number of samples required to split a node
    "min_samples_leaf": randint(
        low=1, high=10
    ),  # Minimum number of samples required at each leaf node
    "bootstrap": [
        True,
        False,
    ],  # Whether bootstrap samples are used when building trees
    "criterion": ["gini", "entropy"],  # The function to measure the quality of a split
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    estimator=randomForest,
    param_distributions=param_distributions,
    n_iter=100,  # Number of random combinations to try
    cv=5,  # 5-fold cross-validation
    verbose=2,  # Increase verbosity to see progress
    random_state=42,  # Set for reproducibility
    n_jobs=-1,  # Use all available CPU cores
)

In [None]:
# 4. Fit RandomizedSearchCV to your data
print("Performing randomized search...")
random_search.fit(X_train_transformed, y_train)

# 5. Print the best hyperparameters and the corresponding score
print("\nBest hyperparameters found:")
print(random_search.best_params_)

print("\nBest cross-validation score:")
print(random_search.best_score_)

# 6. Get the best estimator
best_random_forest = random_search.best_estimator_
print("\nBest Random Forest Classifier:")
print(best_random_forest)

# 7. Evaluate on the test set
y_pred = best_random_forest.predict(X_test_transformed)
from sklearn.metrics import accuracy_score, classification_report

print("\nAccuracy on test set:")
print(accuracy_score(y_test, y_pred))
print("\nClassification Report on test set:")
print(classification_report(y_test, y_pred))

Performing randomized search...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END bootstrap=True, criterion=entropy, max_depth=102, max_features=None, min_samples_leaf=8, min_samples_split=8, n_estimators=171; total time=   0.3s
[CV] END bootstrap=True, criterion=entropy, max_depth=102, max_features=None, min_samples_leaf=8, min_samples_split=8, n_estimators=171; total time=   0.3s
[CV] END bootstrap=True, criterion=entropy, max_depth=102, max_features=None, min_samples_leaf=8, min_samples_split=8, n_estimators=171; total time=   0.3s
[CV] END bootstrap=True, criterion=entropy, max_depth=102, max_features=None, min_samples_leaf=8, min_samples_split=8, n_estimators=171; total time=   0.3s
[CV] END bootstrap=True, criterion=entropy, max_depth=102, max_features=None, min_samples_leaf=8, min_samples_split=8, n_estimators=171; total time=   0.4s
[CV] END bootstrap=False, criterion=gini, max_depth=31, max_features=sqrt, min_samples_leaf=2, min_samples_split=13, n_estima

Preparing submission

In [30]:
titanic_test_data = pd.read_csv("test.csv")

In [31]:
titanic_test_data = titanic_test_data.drop(columns=["Cabin", "Name", "Ticket"])

In [33]:
colums_with_missing_values = titanic_test_data.columns[titanic_test_data.isnull().any()]
print(colums_with_missing_values)

Index(['Age', 'Fare'], dtype='object')


In [None]:
titanic_test_data["Fare"].fillna(value=0, inplace=True)

In [None]:
fare_cats_bins = list(range(0, 301, 50))
fare_cats_bins_with_inf = fare_cats_bins + [np.inf]
titanic_test_data["Fare_cat"] = pd.cut(
    titanic_test_data["Fare"],
    include_lowest=True,
    bins=pd.Series(fare_cats_bins_with_inf),
    labels=pd.Series(fare_cats_bins),
)
titanic_test_data["Fare_cat"] = titanic_test_data["Fare_cat"].astype(int)

In [36]:
titanic_test_data_transformed = columnTransformer.transform(titanic_test_data)

In [37]:
y_submission = best_random_forest.predict(titanic_test_data_transformed)

In [38]:
submission_df = pd.DataFrame(
    titanic_test_data["PassengerId"],
    columns=["PassengerId"],
    index=titanic_test_data.index,
)

In [39]:
submission_df['Survived'] = y_submission