# Titanic data pipeline

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load the Titanic dataset from disk.
titanic_df = pd.read_csv("data/titanic_dataset.csv")

# Target: whether the passenger survived. Features: the columns used to predict it.
y = titanic_df['Survived']
X = titanic_df[[
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]]

# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

# Column groups: each group gets its own preprocessing sub-pipeline below.
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform from failing
# on categories that were not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine both transformers, stating which columns each one applies to.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Encode the target as integer class labels. The encoder is fitted on the
# training labels only and then applied to both splits.
# NOTE: per the original author's remark, the target was already numeric in
# this dataset, so this step is mostly a formality.
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Support vector classifier; C and kernel are tuned by the grid search below.
svm_clf = SVC()

# Full model: preprocessing followed by the classifier, so the preprocessing
# is fitted as part of the pipeline rather than separately.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', svm_clf),
])

# Hyperparameter candidates. The 'classifier__' prefix routes each parameter
# to the pipeline step named 'classifier'.
param_grid = {
    # SVM regularization parameter
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    # Kernel types
    'classifier__kernel': ['linear', 'rbf'],
}

# Exhaustive search over the grid, scored by accuracy with 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

# Run the grid search on the training data.
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validation score.
print("Best Parameters: ", grid_search.best_params_)
print(f"Best Cross-Validation Score: {grid_search.best_score_:.2f}")

# Score the best estimator found by the search on the held-out test set.
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"Test Set Accuracy: {test_score:.2f}")

Best Parameters:  {'classifier__C': 1, 'classifier__kernel': 'rbf'}
Best Cross-Validation Score: 0.83
Test Set Accuracy: 0.82
