# from titanic_pipelines_random_forest_vs_cat we know random forest is winner overall here we try to fine tune

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
titanic_train_data = pd.read_csv("train.csv")

In [6]:
titanic_train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
titanic_train_data = titanic_train_data.drop(columns=["Cabin", "Name", "Ticket"])

In [8]:
train_set, test_set = train_test_split(
    titanic_train_data,
    test_size=0.2,
    random_state=0,
    stratify=titanic_train_data["Sex"],
)

In [9]:
y_train = train_set["Survived"]
X_train = train_set.drop(columns=["Survived"])

In [10]:
colums_with_missing_values = X_train.columns[X_train.isnull().any()]

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

numerical_imputer = SimpleImputer(strategy="median")
object_imputer = SimpleImputer(strategy="most_frequent")
oneHotEncoder = OneHotEncoder(
    handle_unknown="ignore", sparse_output=False, drop="first"
)

In [12]:
def getLogPlusOne(fare):
    return np.log(fare + 1)

In [13]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

fare_transformer = Pipeline(
    steps=[
        (
            "log_plus_1",
            FunctionTransformer(
                getLogPlusOne, validate=True, feature_names_out="one-to-one"
            ),
        ),
        ("standard_scaler", StandardScaler()),
    ]
)

In [14]:
pipeline_numerical_imputer = Pipeline(
    steps=[("numerical_imputer", numerical_imputer), ("scaler", StandardScaler())]
)
pipeline_object_imputer = Pipeline(
    steps=[("object_imputer", object_imputer), ("one_hot_encode", oneHotEncoder)]
)
numerical_standard_pipeline = Pipeline(steps=[("scaler", StandardScaler())])
object_standard_pipeline = Pipeline(steps=[("one_hot_encoder", oneHotEncoder)])

In [15]:
X_train.columns

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [16]:
from sklearn.compose import ColumnTransformer

columnTransformer = ColumnTransformer(
    [
        ("fare_transformer", fare_transformer, ["Fare"]),
        ("numerical_with_missing_vals", pipeline_numerical_imputer, ["Age"]),
        ("object_with_missing_vals", pipeline_object_imputer, ["Embarked"]),
        ("standard_object", object_standard_pipeline, ["Sex"]),
        ("standard_numeric", numerical_standard_pipeline, ["SibSp", "Parch"]),
    ],
    remainder="passthrough",
)

In [17]:
columnTransformer = columnTransformer.fit(X_train)

In [18]:
X_train_transformed = columnTransformer.transform(X_train)

In [19]:
X_train_transformed_df = pd.DataFrame(
    X_train_transformed,
    index=X_train,
    columns=columnTransformer.get_feature_names_out(),
)

In [20]:
y_test = test_set["Survived"]
X_test = test_set.drop(columns=["Survived"])

In [21]:
X_test_transformed = columnTransformer.transform(X_test)

## Random Forest 🏆🏆

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

randomForest = RandomForestClassifier(criterion="entropy", random_state=0)

## grid search

In [27]:
from sklearn.model_selection import GridSearchCV

params = [
    {
        "n_estimators": list(range(10, 201, 50)),
        "max_features": ["sqrt", "log2", 0.1, 0.2, 0.3, 0.4, 0.5],
        "max_depth":list(range(10,50,10)),
        "min_samples_split":list(range(10,20,5)),
        "min_samples_leaf":list(range(2,10,2))
    }
]
grid_search = GridSearchCV(
    estimator=randomForest, param_grid=params, scoring="accuracy", cv=10, n_jobs=-1, verbose= 2
)
grid_search.fit(X=X_train_transformed, y=y_train)
best_accuracy = grid_search.best_score_
best_params = grid_search.best_params_

print("Best Accuracy: {:.2f} %".format(best_accuracy * 100))
print("Best Parameters:", best_params)

Fitting 10 folds for each of 896 candidates, totalling 8960 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=10; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=10; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=10; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=10; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=60; total time=   0.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=10; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=60; total time=   0.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimator

## Best estimator

In [28]:
randomForest = grid_search.best_estimator_

In [29]:
import joblib
filename = 'best_random_forest_model.joblib'

# 1. Save the model
joblib.dump(randomForest, filename)
print(f"Model saved successfully to {filename}")

Model saved successfully to best_random_forest_model.joblib


In [32]:
# randomForest.fit(X_train_transformed, y_train)
y_test_predict = randomForest.predict(X_test_transformed)

In [31]:
# accuracies = cross_val_score(
#     estimator=randomForest, X=X_train_transformed, y=y_train, cv=10
# )
# print(accuracies.mean())
# print(accuracies.std())

In [33]:
print(confusion_matrix(y_test, y_test_predict, normalize="all"))
print(accuracy_score(y_test, y_test_predict, normalize=True))

[[0.51955307 0.10614525]
 [0.12290503 0.25139665]]
0.770949720670391


## creating submission

In [34]:
titanic_test_data = pd.read_csv("test.csv")

In [35]:
titanic_test_data.drop(columns=["Cabin", "Name", "Ticket"])

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0000,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S
...,...,...,...,...,...,...,...,...
413,1305,3,male,,0,0,8.0500,S
414,1306,1,female,39.0,0,0,108.9000,C
415,1307,3,male,38.5,0,0,7.2500,S
416,1308,3,male,,0,0,8.0500,S


In [37]:
titanic_test_data["Fare"].fillna(value=0, inplace=True)
titanic_test_data_transformed = columnTransformer.transform(titanic_test_data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_test_data["Fare"].fillna(value=0, inplace=True)


In [38]:
y_submission = randomForest.predict(titanic_test_data_transformed)

In [39]:
submission_df = pd.DataFrame(
    titanic_test_data["PassengerId"],
    columns=["PassengerId"],
    index=titanic_test_data.index,
)

In [40]:
submission_df['Survived'] = y_submission