# Simple Titanic Model

In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
# Read the Titanic passenger table from the Data Science Dojo GitHub mirror and
# normalise casing in one pass: every column name is lower-cased, and so is
# every value in the `embarked` column (missing values stay missing).
df = (
    pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')
    .rename(columns=str.lower)
    .assign(embarked=lambda frame: frame['embarked'].str.lower())
)

## Preprocessing

In [3]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to one-hot encode. The `drop` list passed to the encoder
# is positional: its i-th entry is the category removed from the i-th column
# named here, so the two lists must be kept in the same order.
ohe_cols = ['embarked', 'sex']

# Drop the missing-value (NaN) category of `embarked` and the 'male' category
# of `sex`; request a dense array rather than a sparse matrix.
ohe = OneHotEncoder(sparse_output=False, drop=[np.nan, 'male'])
ohe_out = ohe.fit_transform(df[ohe_cols])

In [4]:
from sklearn.impute import SimpleImputer

# Fill missing ages with the mean age observed in the data. The fitted imputer
# is kept in `age_imputer` (instead of being discarded) so the same mean can be
# re-applied to new observations with `age_imputer.transform(...)`.
age_imputer = SimpleImputer(strategy='mean')
# fit_transform returns an (n_rows, 1) array; flatten it so a plain 1-D vector
# is assigned to the column rather than relying on pandas to squeeze it.
df['age'] = age_imputer.fit_transform(df[['age']]).ravel()

In [5]:
# Wrap the encoded array in a frame that shares df's index. pd.concat(axis=1)
# aligns rows by index label, so without `index=df.index` any non-default index
# on df would misalign the encoded rows and introduce NaNs.
ohe_df = pd.DataFrame(ohe_out, columns=ohe.get_feature_names_out(), index=df.index)

# Feature matrix: every column of df plus the encoded columns, minus the
# target, the identifier/free-text columns, and the raw categoricals that the
# one-hot columns replace.
X = pd.concat([df, ohe_df], axis=1).drop(
    columns=['survived', 'passengerid', 'name', 'ticket', 'cabin', 'sex', 'embarked']
)

# Target vector: the `survived` column.
y = df['survived']

## Fitting

In [6]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the whole feature matrix, with the
# solver's iteration cap set to 1000. `fit` returns the estimator itself, so
# the fitted model can be bound in one statement.
model = LogisticRegression(max_iter=1000).fit(X, y)
# Bare expression so the cell still displays the fitted estimator.
model


## Prediction

In [7]:
# Show the first feature row as a plain dict to see the key/value layout that a
# single observation must have; the hand-written `new_obs` below uses these
# same keys.
X.iloc[0].to_dict()

{'pclass': 3.0,
 'age': 22.0,
 'sibsp': 1.0,
 'parch': 0.0,
 'fare': 7.25,
 'embarked_c': 0.0,
 'embarked_q': 0.0,
 'embarked_s': 1.0,
 'sex_female': 0.0}

In [8]:
# Hand-built observation to score. Keys are the feature names of X, listed in
# the same order as X's columns.
new_obs = dict(
    pclass=1.0,
    age=29.0,
    sibsp=1.0,
    parch=0.0,
    fare=17.25,
    embarked_c=0.0,
    embarked_q=0.0,
    embarked_s=1.0,
    sex_female=0.0,
)

In [9]:
# Wrap the single observation in a one-row DataFrame whose columns are the
# dict keys, then take the first (and only) predicted label.
obs_frame = pd.DataFrame([new_obs])
model.predict(obs_frame)[0]

0

In [10]:
# Probability estimates for the same observation: a one-row array with one
# column per class.
obs_frame = pd.DataFrame([new_obs])
model.predict_proba(obs_frame)

array([[0.59831503, 0.40168497]])

## Export

In [11]:
# Display the feature columns, in order, that the model was fitted on.
X.columns

Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'embarked_c', 'embarked_q',
       'embarked_s', 'sex_female'],
      dtype='object')

In [12]:
# Persist the fitted classifier to `model.joblib` in the working directory.
# Only `model` is written here; the one-hot encoder is not part of this file.
joblib.dump(value=model, filename="model.joblib")

['model.joblib']