# Titanic Survivor Prediction Model

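This notebook trains a model to predict which Titanic passengers survived. Five scikit-learn classifiers (logistic regression, decision tree, random forest, SVM, and k-nearest neighbors) are combined into a single soft-voting ensemble (`VotingClassifier`).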

## 1. Import Libraries

In [9]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

## 2. Load Data

In [10]:
# Columns used by the model: the label to predict and the input features
target = 'Survived'
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]

# Read the Titanic training CSV into a DataFrame
df = pd.read_csv('../scikit-learn/data/titanic/train.csv')

# Split the frame into the feature matrix (X) and the label vector (y)
X, y = df[features], df[target]

# Preview the first rows of the feature matrix
print(X.head())

   Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0       3    male  22.0      1      0   7.2500        S
1       1  female  38.0      1      0  71.2833        C
2       3  female  26.0      0      0   7.9250        S
3       1  female  35.0      1      0  53.1000        S
4       3    male  35.0      0      0   8.0500        S


## 3. Preprocessing Pipeline

In [11]:
# Column groups: one-hot encoded columns vs. imputed-and-scaled columns
categorical_features = ['Pclass', 'Sex', 'Embarked']
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown='ignore' keeps transform from raising on
# categories that were not seen during fit
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: fill missing values with the column median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Route each column group through its own branch ('num' first, then 'cat')
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

## 4. Define Models

In [12]:
# Base learners. random_state is pinned on every estimator constructed with
# one; KNeighborsClassifier is created with its defaults.
clf1 = LogisticRegression(random_state=42)
clf2 = DecisionTreeClassifier(random_state=42)
clf3 = RandomForestClassifier(random_state=42)
# probability=True makes SVC expose predict_proba, which soft voting relies on
clf4 = SVC(random_state=42, probability=True)
clf5 = KNeighborsClassifier()

# Soft-voting ensemble: combines the base learners' predicted class
# probabilities rather than their hard labels
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('lr', clf1),
        ('dt', clf2),
        ('rf', clf3),
        ('svc', clf4),
        ('knn', clf5),
    ],
)

# End-to-end pipeline: preprocessing followed by the ensemble, so raw feature
# frames can be passed straight to fit/predict
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', voting_clf),
])

## 5. Train and Evaluate

In [13]:
# Hold out 20% of the rows for evaluation; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the full pipeline (preprocessing + ensemble) on the training rows, then
# predict labels for the held-out rows; fit() returns the fitted pipeline
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Voting Classifier Accuracy: {accuracy:.4f}")

Voting Classifier Accuracy: 0.8380


## 6. Save Model

In [14]:
# Single source of truth for the output file so the status message can never
# drift from the path that was actually written
MODEL_PATH = 'titanic_voting_model.pkl'

# Persist the fitted pipeline (preprocessing + ensemble) to disk.
# NOTE: joblib files are pickle-based; only load them from trusted sources,
# ideally with the same scikit-learn version that produced them.
joblib.dump(model_pipeline, MODEL_PATH)
print(f"Model saved as {MODEL_PATH}")

Model saved as titanic_voting_model.pkl
