In [1]:
import pickle

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Titanic passenger CSV hosted under the Stanford CS109 class archive.
# The file is fetched from the URL each time this cell runs, so network access is required.
data_url = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
df = pd.read_csv(filepath_or_buffer=data_url)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [3]:
# (row count, column count) of the loaded frame.
df.shape

(887, 8)

In [4]:
# Summary statistics for the numeric columns, transposed so each column is a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Survived,887.0,0.385569,0.487004,0.0,0.0,0.0,1.0,1.0
Pclass,887.0,2.305524,0.836662,1.0,2.0,3.0,3.0,3.0
Age,887.0,29.471443,14.121908,0.42,20.25,28.0,38.0,80.0
Siblings/Spouses Aboard,887.0,0.525366,1.104669,0.0,0.0,0.0,1.0,8.0
Parents/Children Aboard,887.0,0.383315,0.807466,0.0,0.0,0.0,0.0,6.0
Fare,887.0,32.30542,49.78204,0.0,7.925,14.4542,31.1375,512.3292


In [5]:
# Per-column dtypes of the loaded frame.
df.dtypes

Survived                     int64
Pclass                       int64
Name                        object
Sex                         object
Age                        float64
Siblings/Spouses Aboard      int64
Parents/Children Aboard      int64
Fare                       float64
dtype: object

In [6]:
# Count of missing values in each column.
df.isna().sum()

Survived                   0
Pclass                     0
Name                       0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64

In [7]:
# FamilySize = siblings/spouses + parents/children + 1 (the passenger themself).
siblings_spouses = df['Siblings/Spouses Aboard']
parents_children = df['Parents/Children Aboard']
df['FamilySize'] = siblings_spouses.add(parents_children).add(1)

# IsAlone is 1 when the passenger has no relatives aboard (FamilySize == 1), else 0.
df['IsAlone'] = df['FamilySize'].eq(1).astype(int)

In [8]:
# Column to predict, and the predictor columns handed to the models.
target = 'Survived'
features = ['Pclass', 'Sex', 'Age', 'Fare', 'FamilySize', 'IsAlone']

# Predictor frame and label series.
X, y = df[features], df[target]

In [9]:
# Numeric columns are standardised with StandardScaler.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; handle_unknown='ignore' means a category
# not seen during fit does not raise at transform time.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [10]:
# Column routing for the ColumnTransformer: which features are scaled and which
# are one-hot encoded.
numeric_features = ['Age', 'Fare', 'FamilySize']
categorical_features = ['Sex', 'Pclass', 'IsAlone']

# ColumnTransformer drops unlisted columns by default (remainder='drop'), so a
# feature missing from both lists would be silently ignored by every model, and a
# listed column absent from `features` would only fail later at fit time.
# Fail loudly here instead.
_routed_features = set(numeric_features) | set(categorical_features)
assert _routed_features == set(features), (
    "numeric_features/categorical_features are out of sync with `features`: "
    f"{sorted(_routed_features ^ set(features))}"
)

In [11]:
# Route each column group through its own sub-pipeline:
# 'num' -> numeric_transformer, 'cat' -> categorical_transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Candidate classifiers, keyed by the display name used in the printed report.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_leaf=2, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

best_model = None
# Start below any achievable accuracy so the first candidate is always recorded,
# even if it scores 0.0; otherwise best_model could stay None and the summary
# line at the end would raise AttributeError.
best_accuracy = float('-inf')

# NOTE(review): candidates are ranked on the held-out test split, so the winning
# score is an optimistic estimate. Consider cross-validating on the training
# split and keeping the test split for a single final evaluation.
for name, model in models.items():
    # Pipeline.fit fits its steps in place without copying them, so each pipeline
    # gets its own unfitted copy of the preprocessor. Sharing one instance would
    # let a later fit overwrite the state inside an already-stored best_model.
    clf_pipeline = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', model)
    ])

    # Train on the training split only.
    clf_pipeline.fit(X_train, y_train)

    # Score on the held-out split.
    y_pred = clf_pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f"{name} Accuracy: {acc:.4f}")

    # Strict '>' keeps the earliest candidate when accuracies tie.
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = clf_pipeline

print(f"\nBest model is: {type(best_model.named_steps['classifier']).__name__} with {best_accuracy:.4f} accuracy.")

Logistic Regression Accuracy: 0.7528
Decision Tree Accuracy: 0.7472
Random Forest Accuracy: 0.8146
Gradient Boosting Accuracy: 0.8371

Best model is: GradientBoostingClassifier with 0.8371 accuracy.


In [14]:
# Persist the selected fitted pipeline (preprocessor + classifier) to disk.
# Only unpickle this file from a trusted location: pickle.load can execute arbitrary code.
with open('titanic_model.pkl', mode='wb') as model_file:
    pickle.dump(best_model, model_file)