In [9]:
# importing the libraries
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


# load the titanic dataset from seaborn 
titanic_data= sns.load_dataset('titanic')

# select feature and target variable
X= titanic_data[['pclass', 'sex', 'age', 'parch', 'fare', 'embarked']]
y= titanic_data['survived']

# split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# create a list of models to evaluate
models= [
    ('Random Forest',RandomForestClassifier(random_state=42)),
    ('XGBoost',XGBClassifier(random_state=42)),
    ('Gradient Boosting',GradientBoostingClassifier(random_state=42))
    
    ]


best_model = None
best_accuracy = 0.0

# iterate over the models and evaluate their performance
for name, model in models:
    # create a pipeline
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ('model', model)
    ])
    
# perform cross-validation
score= cross_val_score(pipeline, X_train, y_train, cv=5)
# calculate mean accuracy
mean_accuracy = score.mean()

# fit the model pipline on the training data 
pipeline.fit(X_train, y_train)
# make predictions on the test data
y_pred = pipeline.predict(X_test)
# calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)


# print the performance metrics
print("model:", name)
print("cross-val Accuracy:", mean_accuracy)
print("test accuracy", accuracy)
print()

# check if the current model has the best accuracy
if mean_accuracy > best_accuracy:
    best_accuracy = mean_accuracy
    best_model = pipeline


# rewrite the bset model
print("Best Model:", best_model)




model: Gradient Boosting
cross-val Accuracy: 0.8118191667487442
test accuracy 0.7988826815642458

Best Model: Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore')),
                ('model', GradientBoostingClassifier(random_state=42))])
