In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are expected in the local "data/" directory (relative to this notebook)
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under that directory

import os
# Collect the full path of every file below 'data' (top-down walk order),
# then print one path per line.
data_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('data')
    for name in names
]
for path in data_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# EDA analysis on the Titanic training dataset.
# Build the path with os.path.join so the notebook also runs on Linux/macOS
# (and Kaggle): the literal "data\\train.csv" only resolves on Windows.
df = pd.read_csv(os.path.join("data", "train.csv"))
df.head()

In [None]:
# Basic EDA: print the column names, dtypes and non-null counts of df.
df.info()    # basic information

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()  # statistics summary

In [None]:
# Number of missing (NaN) entries in each column.
df.isnull().sum() # check missing values

In [None]:
# Absolute number of rows for each value of the target column 'Survived'.
df['Survived'].value_counts()

In [None]:
# Share of each 'Survived' class, expressed as a percentage of all rows.
df['Survived'].value_counts(normalize=True)*100  
# About 38% survived, so the dataset is imbalanced: class 0 has more weight than class 1.

In [None]:
import matplotlib.pyplot as plt

# Age distribution of the *observed* ages.
# The histogram is drawn before imputation on purpose: once the missing ages
# are replaced by the median, .dropna() removes nothing and the plot gets an
# artificial spike at the median value.
plt.hist(df['Age'].dropna(), bins=30)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

# Handling missing values: fill missing Age with the median and missing
# Embarked with the mode (most frequent value). Both operations are idempotent.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

In [None]:
# Fare distribution: 30-bin histogram of the non-missing fares.
fare_fig, fare_ax = plt.subplots()
fare_ax.hist(df['Fare'].dropna(), bins=30)
fare_ax.set_title('Fare distribution')
fare_ax.set_xlabel('Fare')
fare_ax.set_ylabel('Count')
plt.show()

In [None]:
# Survival rate per sex, as a percentage (mean of the 0/1 'Survived' column * 100).
df.groupby('Sex')['Survived'].mean()*100
# About 74% of all women survived, while only ~19% of all men survived.

In [None]:
# Share of each sex among all passengers, as a percentage.
df['Sex'].value_counts(normalize=True)*100
# About 65% of the travelers were men and ~35% were women.

In [None]:
# Survival rate per passenger class, as a fraction in [0, 1]
# (not multiplied by 100, unlike the per-sex rate above).
df.groupby('Pclass')['Survived'].mean()

In [None]:
 # survival % for kids <= 12 years
# Boolean mask: True for passengers aged 12 or younger.
# No fill_value here: `le(12, fill_value=False)` would replace a missing age
# with False (== 0) *before* comparing, so unknown ages would be counted as
# children. A plain comparison evaluates NaN <= 12 as False instead.
cond = df['Age'].le(12)
# Survival rate (fraction) for the non-children (False) and children (True) groups.
df.groupby(cond)['Survived'].mean()


In [None]:
#SVM Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Select features and target
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'

# Encode categorical variables as integers (in place on df).
# The guard makes this cell safe to re-run: calling .map() a second time on
# the already-encoded columns would map every value to NaN and break the fit.
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = df['Sex'].map(sex_codes)
if not pd.api.types.is_numeric_dtype(df['Embarked']):
    df['Embarked'] = df['Embarked'].map(embarked_codes)

# Define X and y
X = df[features]
y = df[target]

# Split into train (80%) and test (20%) sets; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features: fit the scaler on the training split only, then apply the
# same transformation to the test split so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train SVM model (RBF kernel)
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Evaluate SVM model on the held-out test split
y_pred = svm_model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


The SVM model reaches about 81.6% accuracy on the held-out test set (146 of 179 passengers classified correctly).
It correctly predicted that 93 passengers died (class 0).
It correctly predicted that 53 passengers survived (class 1).
It wrongly predicted that 12 passengers survived when they actually died.
It wrongly predicted that 21 passengers died when they actually survived.

In [None]:
# Logistic Regression baseline, trained on the standardized training features.
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression(max_iter=500).fit(X_train_scaled, y_train)

# Score the fitted model on the standardized test split.
y_pred_log = log_model.predict(X_test_scaled)
log_accuracy = accuracy_score(y_test, y_pred_log)
log_confusion = confusion_matrix(y_test, y_pred_log)
log_report = classification_report(y_test, y_pred_log)

print("===== Logistic Regression Results =====")
print("Accuracy:", log_accuracy)
print("\nConfusion Matrix:\n", log_confusion)
print("\nClassification Report:\n", log_report)

In [None]:
# Decision Tree classifier, trained on the unscaled features
# (tree splits do not depend on feature scale).
from sklearn.tree import DecisionTreeClassifier

dt_params = {'criterion': 'gini', 'max_depth': None, 'random_state': 42}
dt_model = DecisionTreeClassifier(**dt_params).fit(X_train, y_train)

# Score the fitted tree on the unscaled test split.
y_pred_dt = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_confusion = confusion_matrix(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt)

print("===== Decision Tree Results =====")
print("Accuracy:", dt_accuracy)
print("\nConfusion Matrix:\n", dt_confusion)
print("\nClassification Report:\n", dt_report)

In [None]:
# Neural network (scikit-learn MLPClassifier), trained on the standardized features.
from sklearn.neural_network import MLPClassifier

mlp_params = {
    'hidden_layer_sizes': (64, 32),   # two hidden layers: 64 -> 32 nodes
    'activation': 'relu',
    'solver': 'adam',
    'max_iter': 1000,
    'random_state': 42,
}
nn_model = MLPClassifier(**mlp_params).fit(X_train_scaled, y_train)

# Score the fitted network on the standardized test split.
y_pred_nn = nn_model.predict(X_test_scaled)
nn_accuracy = accuracy_score(y_test, y_pred_nn)
nn_confusion = confusion_matrix(y_test, y_pred_nn)
nn_report = classification_report(y_test, y_pred_nn)

print("===== Neural Network (MLP) Results =====")
print("Accuracy:", nn_accuracy)
print("\nConfusion Matrix:\n", nn_confusion)
print("\nClassification Report:\n", nn_report)

In [None]:
# Evaluating the algorithms with cross-validation to estimate
# how well each model will generalize to unseen data.

from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import make_pipeline

kfold = KFold(n_splits=5, shuffle=True, random_state=42)  # Cross-validation setup

# Scale-sensitive models are wrapped in a pipeline so the StandardScaler is
# re-fitted on the training part of every fold. Scaling the full X once before
# cross-validation would leak validation-fold statistics into training and
# make the scores optimistic. cross_val_score clones each estimator, so the
# already-fitted models are not modified here.
cv_estimators = {
    "SVM": make_pipeline(StandardScaler(), svm_model),
    "Logistic Regression": make_pipeline(StandardScaler(), log_model),
    "Decision Tree": dt_model,  # no scaling needed for trees
    "Neural Network": make_pipeline(StandardScaler(), nn_model),
}

# Compute cross-validation scores (one accuracy value per fold and model)
cv_results = {
    name: cross_val_score(estimator, X, y, cv=kfold, scoring='accuracy')
    for name, estimator in cv_estimators.items()
}

# Print results
print("\n======= Cross-Validation Results (5-Fold) =======")
for name, scores in cv_results.items():
    print(f"\n{name}")
    print(f"Scores: {scores}")
    print(f"Mean Accuracy: {scores.mean():.4f}")
    print(f"Standard Deviation: {scores.std():.4f}")

# Re-create the 80/20 hold-out split and refit every model on it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit on the training split only
X_test_scaled = scaler.transform(X_test)

print("\n======= Full Train/Test Evaluation =======")

# (label, estimator, uses_scaled_features)
holdout_models = [
    ("SVM", svm_model, True),
    ("Logistic Regression", log_model, True),
    ("Decision Tree", dt_model, False),
    ("Neural Network (MLP)", nn_model, True),
]
for label, estimator, uses_scaled in holdout_models:
    fit_X, eval_X = (X_train_scaled, X_test_scaled) if uses_scaled else (X_train, X_test)
    estimator.fit(fit_X, y_train)
    print(f"\n--- {label} ---")
    print("Accuracy:", accuracy_score(y_test, estimator.predict(eval_X)))

In [None]:
# SVM Hyperparameter Optimization (GridSearchCV)

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Features and target
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X, y = df[features], df['Survived']

# Hold-out split: 80% for the grid search, 20% kept back for the final test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Columns grouped by how they are preprocessed
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Numeric columns: median imputation followed by standardization.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Full pipeline: preprocessing + SVM. The step name "model" is what the
# "model__*" keys of the parameter grid refer to.
svm_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", SVC()),
])

# Hyperparameter grid: 5 x 5 x 3 = 75 candidate combinations
param_grid = dict(
    model__C=[0.1, 1, 10, 50, 100],
    model__gamma=["scale", "auto", 0.1, 0.01, 0.001],
    model__kernel=["rbf", "poly", "sigmoid"],
)

# Exhaustive grid search with 5-fold cross-validation, using all CPU cores
grid = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Fit the search on the training split only; the test split stays untouched
# until the final evaluation below.
grid.fit(X_train, y_train)

# RESULTS
print("\n===== Best Hyperparameters Found =====")
print(grid.best_params_)

# Per-fold validation scores of the best candidate. The fold count is read
# from the fitted search (grid.n_splits_) instead of hard-coding the
# 'split0'..'split4' keys, so this keeps working if `cv` is changed.
cv_scores = tuple(
    grid.cv_results_[f"split{fold}_test_score"][grid.best_index_]
    for fold in range(grid.n_splits_)
)

print(f"\n===== Cross-Validation Scores ({grid.n_splits_} folds) =====")
for i, score in enumerate(cv_scores, 1):
    print(f"Fold {i} Accuracy: {score:.4f}")

print("\n===== Mean Cross-Validation Accuracy =====")
print(f"Mean Accuracy: {grid.best_score_:.4f}")

# Evaluate the best pipeline on the held-out test data
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

print("\n===== Test Accuracy =====")
print(accuracy_score(y_test, y_pred))

print("\n===== Confusion Matrix =====")
print(confusion_matrix(y_test, y_pred))

print("\n===== Classification Report =====")
print(classification_report(y_test, y_pred))
