In [150]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score
)

df = pd.read_csv("Titanic Dataset.csv")
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [151]:
df.isna().mean() * 100 

pclass        0.000000
survived      0.000000
name          0.000000
sex           0.000000
age          20.091673
sibsp         0.000000
parch         0.000000
ticket        0.000000
fare          0.076394
cabin        77.463713
embarked      0.152788
boat         62.872422
body         90.756303
home.dest    43.086325
dtype: float64

In [152]:
pd.crosstab(df['sex'], df['age'].isna(), normalize='index') * 100
pd.crosstab(df['pclass'], df['age'].isna(), normalize='index') * 100

age,False,True
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,87.925697,12.074303
2,94.223827,5.776173
3,70.662906,29.337094


Baseline: 
- usuwamy kolumny z duza iloscia pustych wartosci: cabin, boat, body, home.dest
- uzupelniamy age srednia
- embarked najczestsza wartosc

In [153]:
to_drop = ['cabin', 'boat', 'body', 'home.dest']
df_base = df.drop(columns=to_drop)

num_cols = df_base.select_dtypes(include=['number']).columns
for col in num_cols:
    df_base[col].fillna(df_base[col].mean(), inplace=True)

cat_cols = df_base.select_dtypes(include=['object']).columns
for col in cat_cols:
    df_base[col].fillna(df_base[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_base[col].fillna(df_base[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_base[col].fillna(df_base[col].mode()[0], inplace=True)


In [154]:
X = pd.get_dummies(df_base.drop(columns=['survived', 'name', 'ticket']), drop_first=True)
y = df_base['survived']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy : {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall   : {rec:.3f}")
print(f"F1-score : {f1:.3f}")


Accuracy : 0.812
Precision: 0.771
Recall   : 0.720
F1-score : 0.745


Zaawansowany:

- cabin zamieniamy na puste/niepuste
- uzywamy tytulow do uzupelniania wieku mediana
- reszta jak wyzej

In [155]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

df = pd.read_csv("Titanic Dataset.csv")

df['hascabin'] = df['cabin'].notna().astype(int)
df["title"] = df["name"].str.extract(r",\s*([^\.]+)\.", expand=False).str.strip()

print(df["title"].value_counts().sort_values(ascending=False))

med = df.groupby("title")["age"].transform("median")

df["age"] = df["age"].fillna(med).fillna(df["age"].median())

title
Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Ms                2
Major             2
Mlle              2
Capt              1
Sir               1
Dona              1
Jonkheer          1
the Countess      1
Don               1
Mme               1
Lady              1
Name: count, dtype: int64


In [156]:
df["hascabin"] = df["cabin"].notna().astype(int)

to_drop = ["boat", "body", "home.dest", "cabin"]
df = df.drop(columns=[c for c in to_drop if c in df.columns])

num_cols = df.select_dtypes(include="number").columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

cat_cols = df.select_dtypes(include="object").columns
if len(cat_cols) > 0:
    modes = df[cat_cols].mode().iloc[0]
    df[cat_cols] = df[cat_cols].fillna(modes)


In [157]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

X = pd.get_dummies(df.drop(columns=["survived", "name", "ticket"]), drop_first=True)
y = df["survived"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred  = model.predict(X_test)

print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Precision:",round(precision_score(y_test, y_pred), 3))
print("Recall:", round(recall_score(y_test, y_pred), 3))
print("F1:",round(f1_score(y_test, y_pred), 3))

Accuracy: 0.832
Precision: 0.776
Recall: 0.787
F1: 0.781
