# Titanic Survival with Custom Logistic Regression
Preprocess the Titanic dataset, scale features, train the NumPy-only logistic regression, and inspect the learned weights.


In [1]:
import os, sys
sys.path.append(os.path.abspath(".."))

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from src.model import LogisticRegression, StandardScaler


In [2]:
# Load dataset
data_path = os.path.join("..", "data", "titanic.csv")
df = pd.read_csv(data_path)
print(f"Loaded {df.shape[0]} rows with {df.shape[1]} columns")
df.head()


Loaded 1309 rows with 14 columns


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
# Basic cleaning: drop ID-like or high-missing columns, then handle missing values
work = df.copy()
drop_cols = ["name", "ticket", "cabin", "boat", "body", "home.dest"]
work = work.drop(columns=drop_cols, errors="ignore")

# Fill missing numeric values
for col in ["age", "fare"]:
    if col in work.columns:
        work[col] = work[col].fillna(work[col].median())

# Fill categorical and one-hot encode
if "embarked" in work.columns:
    work["embarked"] = work["embarked"].fillna(work["embarked"].mode()[0])
categorical_cols = [c for c in ["sex", "embarked"] if c in work.columns]
work = pd.get_dummies(work, columns=categorical_cols, drop_first=True)

# Drop any remaining rows with missing values
work = work.dropna()

print(work.dtypes)


pclass          int64
survived        int64
age           float64
sibsp           int64
parch           int64
fare          float64
sex_male         bool
embarked_Q       bool
embarked_S       bool
dtype: object


In [4]:
# Separate features and target
y = work["survived"].astype(float).values.reshape(-1, 1)
X_df = work.drop(columns=["survived"])
feature_names = X_df.columns
X = X_df.astype(float).values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

X_train.shape, X_test.shape


((1047, 8), (262, 8))

In [5]:
# Train model
model = LogisticRegression(
    n_features=X_train.shape[1],
    lr=0.05,
    reg_lambda=0.001,
)

epochs = 400
losses = []

for epoch in range(epochs):
    _, y_hat = model.forward(X_train)
    loss = model.loss(y_train, y_hat)
    dW, dB = model.backward(X_train, y_train, y_hat)
    model.update(dW, dB)
    losses.append(loss)
    if epoch % 50 == 0:
        print(f"Epoch {epoch:03d} | Loss: {loss:.4f}")


Epoch 000 | Loss: 1.1758
Epoch 050 | Loss: 0.8305
Epoch 100 | Loss: 0.6529
Epoch 150 | Loss: 0.5648
Epoch 200 | Loss: 0.5222
Epoch 250 | Loss: 0.5010
Epoch 300 | Loss: 0.4894
Epoch 350 | Loss: 0.4823


In [6]:
# Evaluate
_, train_probs = model.forward(X_train)
_, test_probs = model.forward(X_test)

train_acc = model.accuracy(y_train, train_probs)
test_acc = model.accuracy(y_test, test_probs)

print(f"Train accuracy: {train_acc:.3f}")
print(f"Test accuracy:  {test_acc:.3f}")


Train accuracy: 0.774
Test accuracy:  0.802


In [7]:
# Inspect learned weights (top positive and negative signals)
weights = model.weights.flatten()
ordered = np.argsort(weights)

print("Top positive signals:")
for idx in ordered[-5:][::-1]:
    print(f"  {feature_names[idx]}: {weights[idx]:.4f}")

print("Top negative signals:")
for idx in ordered[:5]:
    print(f"{feature_names[idx]}: {weights[idx]:.4f}")


Top positive signals:
  fare: 0.4395
  parch: -0.0961
  sibsp: -0.1760
  embarked_Q: -0.2085
  age: -0.2241
Top negative signals:
sex_male: -1.0903
pclass: -0.3331
embarked_S: -0.2345
age: -0.2241
embarked_Q: -0.2085
