In [7]:
import pandas as pd
import numpy as np

In [8]:
# Dataset URL
URI = "https://raw.githubusercontent.com/softwareWCU/Data-Preprocessing-for-ML-using-Titanic-Dataset/main/titanic2.csv"
# Load dataset from GitHub
df = pd.read_csv(URI)

In [16]:
# Load dataset from GitHub
df = pd.read_csv(URI)

print("File loaded successfully from URL!\n")

print("First 8 rows:")
display(df.head(8))

print("\n Dataset shape:", df.shape)

print("\n❗ Missing values per column:")
print(df.isnull().sum())

File loaded successfully from URL!

First 8 rows:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",Female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",MALE,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S



 Dataset shape: (909, 12)

❗ Missing values per column:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            199
SibSp            0
Parch            0
Ticket           0
Fare            18
Cabin          701
Embarked         2
dtype: int64


In [10]:
# Cell 2: Data Cleaning
import re

# Standardize column names
df.columns = [col.strip() for col in df.columns]

# Normalize text columns
df['Sex'] = df['Sex'].astype(str).str.strip().str.lower()
df['Embarked'] = df['Embarked'].astype(str).str.strip().str.lower()
df['Embarked'] = df['Embarked'].replace({'cherbourg':'c', 'southampton':'s', 'queenstown':'q'})

# Make sure Pclass is numeric (1,2,3)
df['Pclass'] = df['Pclass'].astype(str).str.extract(r'(\d+)').astype(int)

# Fill missing values
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop noisy / unhelpful columns
if 'Cabin' in df.columns and 'Ticket' in df.columns:
    df.drop(['Cabin','Ticket'], axis=1, inplace=True)

print("Cleaning done. Sample rows after cleaning:")
display(df.head())


Cleaning done. Sample rows after cleaning:


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,s
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,c
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,s
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,s
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,s


In [11]:
import re

In [12]:

# Extract title from Name
def extract_title(name):
    match = re.search(r',\s*([^.]+)\.', str(name))
    return match.group(1).strip() if match else 'Other'

df['Title'] = df['Name'].apply(extract_title)

title_map = {
    'Mr':'Mr','Mrs':'Mrs','Miss':'Miss','Master':'Master',
    'Dr':'Rare','Rev':'Rare','Col':'Rare','Major':'Rare',
    'Mlle':'Miss','Mme':'Mrs','Lady':'Rare','Sir':'Rare',
    'Don':'Rare','Countess':'Rare','Jonkheer':'Rare','Capt':'Rare'
}
df['Title'] = df['Title'].map(lambda x: title_map.get(x, 'Rare'))

In [13]:
# Family features
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

In [14]:
# Age groups (engineered categorical feature)
def age_group(age):
    if age <= 5: return 'Baby'
    if age <= 12: return 'Child'
    if age <= 19: return 'Teen'
    if age <= 35: return 'YoungAdult'
    if age <= 55: return 'Adult'
    return 'Senior'
df['AgeGroup'] = df['Age'].apply(age_group)

In [15]:
# Fare quartile groups
df['FareGroup'] = pd.qcut(df['Fare'], 4, labels=['Low','Medium','High','VeryHigh'])

In [17]:
# Drop Name (we extracted Title)
df.drop('Name', axis=1, inplace=True)

print("Feature engineering completed. New columns added:")
print(['Title','FamilySize','IsAlone','AgeGroup','FareGroup'])
display(df.head())

Feature engineering completed. New columns added:
['Title', 'FamilySize', 'IsAlone', 'AgeGroup', 'FareGroup']


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,male,35.0,0,0,373450,8.05,,S


In [31]:
# Example: check existing columns
print("Columns before encoding:", df.columns.tolist())


Columns before encoding: ['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [32]:
# 1️⃣ Create Title if Name exists
if 'Name' in df.columns:
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)


In [33]:
# 2️⃣ Create AgeGroup if Age exists
if 'Age' in df.columns:
    df['AgeGroup'] = pd.cut(df['Age'],
                            bins=[0, 12, 20, 40, 60, 80],
                            labels=['Child', 'Teen', 'Adult', 'Middle', 'Senior'])

In [34]:
# 3️⃣ Create FareGroup if Fare exists
if 'Fare' in df.columns:
    df['FareGroup'] = pd.qcut(df['Fare'],
                              q=4,
                              labels=['Low', 'Medium', 'High', 'Very_High'])

In [36]:

# 4️⃣ Encode only existing categorical columns
categorical_cols = ['Sex', 'Embarked', 'Title', 'AgeGroup', 'FareGroup']
existing_cols = [col for col in categorical_cols if col in df.columns]

df = pd.get_dummies(df, columns=existing_cols, drop_first=True)

print("Encoding done. Columns now:")
print(df.columns.tolist())
print("\nShape after encoding:", df.shape)

Encoding done. Columns now:
['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Sex_Female', 'Sex_MALE', 'Sex_Male', 'Sex_female', 'Sex_male', 'Sex_male ', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Embarked_c', 'Embarked_c ', 'Embarked_q', 'Embarked_s', 'Embarked_southampton', 'AgeGroup_Teen', 'AgeGroup_Adult', 'AgeGroup_Middle', 'AgeGroup_Senior', 'FareGroup_Medium', 'FareGroup_High', 'FareGroup_Very_High']

Shape after encoding: (909, 30)


In [39]:
# Train/Test split + Scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [40]:
# Drop PassengerId and Survived for X
X = df.drop(['Survived', 'PassengerId'], axis=1)
y = df['Survived']

In [47]:
# Stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (727, 28) Test shape: (182, 28)


In [52]:
# Ensure all features are numeric
X = df.drop(['Survived', 'PassengerId'], axis=1)
X = X.apply(pd.to_numeric, errors='coerce')  # Convert all to numeric, force NaN for bad values
X.fillna(0, inplace=True)  # Replace NaN with 0

# Split and scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [53]:
# Clean Pclass
df['Pclass'] = df['Pclass'].astype(str).str.extract(r'(\d+)').astype(int)
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaling complete. Example (first row scaled):")
print(X_train_scaled[0])

Scaling complete. Example (first row scaled):
[-1.59713429  0.7169685  -0.46084844 -0.46682636 -0.43802868 -0.08703246
  0.         -0.05252257 -0.17665128 -0.06437116 -0.69546637  0.82635971
 -0.15933572 -0.46007246 -0.27467032  0.6976443  -0.03711348 -0.13493437
  0.         -0.06437116 -0.17246751 -0.37588201  1.17295226 -0.41086529
 -0.14514619 -0.58635123  1.824111   -0.56517187]


In [56]:
#  KNN + GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

param_grid = {
    'n_neighbors': list(range(3, 25)),   # widened range
    'weights': ['uniform', 'distance']
}

grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy', verbose=1)
grid.fit(X_train_scaled, y_train)

print("\nBest parameters:", grid.best_params_)
best_model = grid.best_estimator_

Fitting 5 folds for each of 44 candidates, totalling 220 fits

Best parameters: {'n_neighbors': 4, 'weights': 'uniform'}


In [57]:
# Predictions on test set
y_pred = best_model.predict(X_test_scaled)

In [58]:
# Metrics
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f"\nTest Accuracy: {acc:.4f}")
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Test Accuracy: 0.7198

Confusion Matrix:
 [[96 16]
 [35 35]]

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.86      0.79       112
           1       0.69      0.50      0.58        70

    accuracy                           0.72       182
   macro avg       0.71      0.68      0.68       182
weighted avg       0.71      0.72      0.71       182



In [60]:
# Visualizations
import plotly.express as px
import plotly.figure_factory as ff

In [61]:
# Survival rate by AgeGroup - but use original df before dummy expansion; create a temporary group
age_survival = df.filter(regex='AgeGroup_').copy()  # columns with AgeGroup dummies
# We need to reconstruct AgeGroup labels for visualization; easier to use original unencoded df snapshot:
# If you didn't keep a snapshot, create AgeGroup vs Survived from encoded columns:
# Simpler: reload a small copy from earlier by re-creating AgeGroup mapping from Age values in df (Age column still exists)
# Note: Age column still exists because we did not drop it.
age_vs_surv = (df[['Age','Survived']].copy())
age_vs_surv['AgeGroup'] = age_vs_surv['Age'].apply(lambda a: 'Baby' if a<=5 else ('Child' if a<=12 else ('Teen' if a<=19 else ('YoungAdult' if a<=35 else ('Adult' if a<=55 else 'Senior')))))
age_rate = age_vs_surv.groupby('AgeGroup')['Survived'].mean().reindex(['Baby','Child','Teen','YoungAdult','Adult','Senior']).reset_index()

fig1 = px.bar(age_rate, x='AgeGroup', y='Survived', title='Survival Rate by AgeGroup')
fig1.show()