In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer


In [58]:
# Load the training and test tables, both indexed by PassengerId.
# Forward slashes keep the relative paths portable: a backslash is only a
# directory separator on Windows, whereas '/' is accepted on Windows, Linux
# and macOS alike.
data_train = pd.read_csv('data/train.csv', index_col='PassengerId')
X_test = pd.read_csv('data/test.csv', index_col='PassengerId')

# Split the training table into the target column and the feature columns.
y = data_train['Survived']
X = data_train.drop(columns='Survived')

In [68]:
# Column groups routed to each preprocessing branch.
numeric_cols = ['Age', 'Fare']
category_cols = ['Sex', 'Pclass']
family_cols = ['SibSp', 'Parch']

# Numeric columns are mean-imputed; the other two groups are one-hot encoded.
# Only the SibSp/Parch encoder is told to ignore categories unseen during fit.
mean_imputer = SimpleImputer(strategy='mean')
strict_encoder = OneHotEncoder(sparse_output=False)
lenient_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
    ('num', mean_imputer, numeric_cols),
    ('cat', strict_encoder, category_cols),
    ('cat2', lenient_encoder, family_cols),
])

# Preprocessing followed by a seeded random forest.
forest = RandomForestClassifier(random_state=0)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', forest)])

# Baseline: mean accuracy across 10 cross-validation folds.
cv_results = cross_val_score(pipeline, X, y, scoring='accuracy', cv=10)
cv_results.mean()

0.8104244694132335

In [66]:
# Candidate values for each tuned hyperparameter of the pipeline's
# 'classifier' step. max_features is not part of the search.
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [5, 10, 20],
    'classifier__min_samples_split': [2, 3, 5],
    'classifier__min_samples_leaf': [1, 2, 4],
}

# Exhaustive search over the grid, scored by accuracy with 10-fold CV
# on 4 parallel workers; fit() returns the fitted search object itself.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=4,
    verbose=4,
).fit(X, y)

# Report the winning combination and its mean cross-validated accuracy.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best parameters found: ", best_params)
print("Best cross-validation score: ", best_score)


Fitting 10 folds for each of 81 candidates, totalling 810 fits
Best parameters found:  {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Best cross-validation score:  0.8362297128589263


In [65]:
# Predict with the model that GridSearchCV already refit on the full training
# set using the winning hyperparameters (refit defaults to True), rather than
# copying those values by hand and pushing them into the shared `pipeline`.
# Mutating `pipeline` in place would silently change the baseline
# cross-validation cell if it were re-run, and a hand-copied dict goes stale
# whenever the search grid changes.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Two-column submission table: PassengerId (the index of X_test) and the
# predicted Survived label, written without the positional row index.
submission = pd.DataFrame({'PassengerId': X_test.index, 'Survived': y_pred})
submission.to_csv('my_submission.csv', index=False)