In [362]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression

In [363]:
train_file_path = 'titanic/titanic_train.csv'
test_file_path = 'titanic/titanic_test.csv'

train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)


In [364]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [365]:
train_df.isna().sum()
test_df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [366]:
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

In [367]:
train_df['Title'] = train_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test_df['Title'] = test_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


In [368]:
title_mapping = {
    "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Col": 7,
    "Major": 7, "Mlle": 8, "Countess": 9, "Ms": 2, "Lady": 9, "Jonkheer": 9,
    "Don": 10, "Dona": 10, "Mme": 3, "Capt": 7, "Sir": 10
}

In [369]:
train_df['Title'] = train_df['Title'].map(title_mapping)
test_df['Title'] = test_df['Title'].map(title_mapping)

train_df['Title'].fillna(0, inplace=True)
test_df['Title'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Title'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Title'].fillna(0, inplace=True)


In [370]:
train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)
test_df['Age'].fillna(test_df['Age'].median(), inplace=True)
test_df['Fare'].fillna(test_df['Fare'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Age'].fillna(train_df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interme

In [371]:
train_df['FareBin'] = pd.qcut(train_df['Fare'], 4)
test_df['FareBin'] = pd.qcut(test_df['Fare'], 4)

In [372]:
train_df['AgeBin'] = pd.cut(train_df['Age'].astype(int), 5)
test_df['AgeBin'] = pd.cut(test_df['Age'].astype(int), 5)

In [373]:
from sklearn.preprocessing import LabelEncoder

In [374]:
label = LabelEncoder()
for dataset in [train_df, test_df]:
    dataset['AgeBin_Code'] = label.fit_transform(dataset['AgeBin'])
    dataset['FareBin_Code'] = label.fit_transform(dataset['FareBin'])


In [375]:
train_df = pd.get_dummies(train_df, columns=['Sex', 'Embarked'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Sex', 'Embarked'], drop_first=True)

train_df.drop(['FareBin', 'AgeBin'], axis=1, inplace=True)
test_df.drop(['FareBin', 'AgeBin'], axis=1, inplace=True)

In [376]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [377]:
features = ['Pclass', 'Age', 'Fare', 'FamilySize', 'Title', 'Sex_male', 'Embarked_Q', 'Embarked_S', 'AgeBin_Code', 'FareBin_Code']
X = train_df[features]
y = train_df['Survived']
X_test = test_df[features]

In [378]:
from sklearn.preprocessing import StandardScaler

In [379]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [380]:
models = {
    "RandomForest": RandomForestClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'LogisticRegression': LogisticRegression(),
    'LinearSVC': LinearSVC()
}

In [381]:
def evaluate_model(model):
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    return scores.mean()

In [382]:
for name, model in models.items():
    accuracy = evaluate_model(model)
    print(f'{name} Accuracy: {accuracy:.4f}')

RandomForest Accuracy: 0.8148
GradientBoosting Accuracy: 0.8305
KNN Accuracy: 0.8328
SVM Accuracy: 0.8272
LogisticRegression Accuracy: 0.8047
LinearSVC Accuracy: 0.8047


In [385]:
from sklearn.ensemble import VotingClassifier

ensemble_model = VotingClassifier(estimators=[
    ('rf', models['RandomForest']),
    ('gb', models['GradientBoosting']),
    ('knn', models['KNN']),
    ('log', models['LogisticRegression'])
     ],voting='soft')

ensemble_model.fit(X, y)

In [386]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
ensemble_model.fit(X_train, y_train)
y_pred = ensemble_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.4f}')

Validation Accuracy: 0.7765
