In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
combine = [train_df, test_df]

In [3]:
# Data Preprocessing
train_df = train_df.drop(['Ticket', 'Cabin', 'Name', 'PassengerId'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin', 'Name'], axis=1)
combine = [train_df, test_df]

In [4]:
# Encoding 'Sex'
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map({'female': 1, 'male': 0}).astype(int)

In [5]:
# Fill missing 'Embarked' with mode
freq_port = train_df.Embarked.dropna().mode()[0]
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

In [6]:
# Log Transformation for 'Fare'
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)
train_df['Fare'] = train_df['Fare'].map(lambda x: np.log(x) if x > 0 else 0)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)


In [7]:
# Advanced Imputation for 'Age'
imputer = IterativeImputer()
train_df['Age'] = imputer.fit_transform(train_df[['Age']])

In [8]:
# Scaling 'Age'
scaler = StandardScaler()
train_df['Age'] = scaler.fit_transform(train_df[['Age']])

In [9]:
# Feature Interaction: Age * Fare, Fare * Pclass
train_df['Age*Fare'] = train_df['Age'] * train_df['Fare']
train_df['Fare*Pclass'] = train_df['Fare'] * train_df['Pclass']

In [10]:
# Preparing Data for Modeling
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test = test_df.drop("PassengerId", axis=1).copy()

In [11]:
# Models Initialization
models = {
    'Logistic Regression': LogisticRegression(C=0.8, random_state=42),
    'Support Vector Machines': SVC(kernel='rbf', C=0.9, random_state=42),
    'Linear SVC': LinearSVC(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Perceptron': Perceptron(random_state=42),
    'SGD Classifier': SGDClassifier(random_state=42)
}

In [12]:
# Random Forest with Hyperparameter Tuning
param_dist = {
    'n_estimators': [100, 150],
    'max_depth': [4, 6],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, 
                                   n_iter=10, cv=3, scoring='accuracy', random_state=42)
random_search.fit(X_train, Y_train)
best_rf = random_search.best_estimator_

# Adding Random Forest (Tuned) to the models
models['Random Forest (Tuned)'] = best_rf

In [13]:
# Model Training and Accuracy Calculation
accuracies = {}
for name, model in models.items():
    model.fit(X_train, Y_train)
    accuracies[name] = round(model.score(X_train, Y_train) * 100, 2)



In [14]:
# Displaying the accuracies
accuracy_df = pd.DataFrame(list(accuracies.items()), columns=['Model', 'Accuracy (%)'])

In [16]:
# Display the final results
print("Complete Model Accuracies with 9 Models")
print(accuracy_df)

Complete Model Accuracies with 9 Models
                     Model  Accuracy (%)
0      Logistic Regression         79.80
1  Support Vector Machines         82.94
2               Linear SVC         80.13
3      K-Nearest Neighbors         84.85
4              Naive Bayes         78.45
5            Decision Tree         84.06
6               Perceptron         74.75
7           SGD Classifier         74.52
8    Random Forest (Tuned)         87.88
