In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import joblib

# 1. Load Data
df = sns.load_dataset('titanic')

# 2. Data Cleaning (Crucial for your assignment!)
# Drop columns with too many missing values or irrelevant info
df = df.drop(columns=['deck', 'embark_town', 'alive', 'class', 'who', 'adult_male'])
# Fill missing ages with the average age
df['age'] = df['age'].fillna(df['age'].mean())
# Fill missing embarkation with most frequent value
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])
# Convert 'sex' and 'embarked' to numbers (Label Encoding)
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
df['embarked'] = df['embarked'].map({'S': 0, 'C': 1, 'Q': 2})

# 3. Model Training
X = df[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']]
y = df['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train both models
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
dt_model = DecisionTreeClassifier().fit(X_train, y_train)

# 4. Evaluation
print(f"LR Accuracy: {accuracy_score(y_test, lr_model.predict(X_test))}")
print(f"DT Accuracy: {accuracy_score(y_test, dt_model.predict(X_test))}")

# 5. Export
joblib.dump(lr_model, 'logistic_model.joblib')
joblib.dump(dt_model, 'decision_tree_model.joblib')

LR Accuracy: 0.7988826815642458
DT Accuracy: 0.7877094972067039


['decision_tree_model.joblib']