In [None]:
import pandas as pd
# Read the Titanic passenger table from a CSV path relative to the notebook's
# working directory, then preview the first rows as the cell output.
titanic_csv = "data/Titanic Dataset.csv"
df = pd.read_csv(titanic_csv)
df.head()

Collecting scipy
  Using cached scipy-1.16.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting matplotlib
  Using cached matplotlib-3.10.7-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting statsmodels
  Downloading statsmodels-0.14.5-cp313-cp313-win_amd64.whl.metadata (9.8 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.3-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Using cached fonttools-4.60.1-cp313-cp313-win_amd64.whl.metadata (114 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.9-cp313-cp313-win_amd64.whl.metadata (6.4 kB)
Collecting pillow>=8 (from matplotlib)
  Using cached pil

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# --- Feature engineering and missing-value imputation (mutates `df`) ---

# Binary flag: 1 when a cabin value is present, 0 when it is missing.
# Guarded so the cell can be re-run after `cabin` has been dropped below,
# consistent with the guarded drop further down.
if "cabin" in df.columns:
    df["hascabin"] = df["cabin"].notna().astype(int)

# Honorific parsed from the name: the text between the first comma and the
# following period, e.g. "Allen, Miss. Elisabeth Walton" -> "Miss".
df["title"] = df["name"].str.extract(r",\s*([^\.]+)\.", expand=False).str.strip()

# Per-row median age of the passengers sharing that row's title.
med = df.groupby("title")["age"].transform("median")

# Fill missing ages with the title-level median first; rows whose title group
# has no known age at all fall back to the overall median age.
df["age"] = df["age"].fillna(med).fillna(df["age"].median())

# Drop the listed columns; only the ones still present are dropped, so a
# re-run does not fail.
to_drop = ["boat", "body", "home.dest", "cabin"]
df = df.drop(columns=[c for c in to_drop if c in df.columns])

# Remaining numeric gaps are filled with the column mean.
num_cols = df.select_dtypes(include="number").columns
df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

# Remaining object-typed gaps are filled with each column's most frequent value.
cat_cols = df.select_dtypes(include="object").columns
if len(cat_cols) > 0:
    modes = df[cat_cols].mode().iloc[0]
    df[cat_cols] = df[cat_cols].fillna(modes)

In [2]:
from sklearn.preprocessing import OrdinalEncoder

# Target vector: the survival label.
y = df['survived']

# Feature matrix: everything except the target and the free-text name.
# "Unnamed: 0" is also removed when present: in the data preview it simply
# counts rows (0, 1, 2, ...), i.e. it looks like a saved row index, and a
# unique-per-row identifier must not be fed to the classifier as a category.
non_feature_cols = ['survived', 'name']
if 'Unnamed: 0' in df.columns:
    non_feature_cols.append('Unnamed: 0')
X = df.drop(columns=non_feature_cols)

# Object-typed columns get integer codes; the columns listed in `num_cols`
# are the ones the quantization functions will later discretize.
cat_cols = X.select_dtypes(include="object").columns
num_cols = ["age", "fare", "sibsp", "parch"]

if len(cat_cols) > 0:
    # OrdinalEncoder replaces every distinct string with an integer code.
    enc = OrdinalEncoder()
    X[cat_cols] = enc.fit_transform(X[cat_cols])

ModuleNotFoundError: No module named 'sklearn'

Podzielenie zakresu zmiennych na równe części:

In [None]:
def quantize_equal_width(X, num_cols, k):
    """Discretize the given columns into ``k`` equal-width bins.

    Each column named in ``num_cols`` is replaced by the index of the
    interval its value falls into, where the column's observed range is cut
    into ``k`` intervals of equal width by ``pd.cut``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is copied, not modified.
    num_cols : iterable
        Labels of the columns to discretize.
    k : int
        Number of bins, forwarded to ``pd.cut`` as ``bins``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the selected columns replaced by bin indices.
    """

    def _bin_codes(series):
        # labels=False yields integer bin indices instead of Interval labels.
        return pd.cut(series, bins=k, labels=False, include_lowest=True)

    binned = X.copy()
    for name in num_cols:
        binned[name] = _bin_codes(binned[name])
    return binned

Podzielenie obserwacji treningowych na równoliczne części:

In [None]:
def quantize_equal_freq(X, num_cols, k):
    """Discretize the given columns into (up to) ``k`` quantile-based bins.

    Each column named in ``num_cols`` is replaced by the index of the
    quantile bin its value falls into, as computed by ``pd.qcut``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is copied, not modified.
    num_cols : iterable
        Labels of the columns to discretize.
    k : int
        Requested number of quantile bins, forwarded to ``pd.qcut`` as ``q``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the selected columns replaced by bin indices.
    """
    # duplicates="drop" merges bins whose edges coincide, so a column with
    # many repeated values can end up with fewer than k bins.
    qcut_options = {"q": k, "labels": False, "duplicates": "drop"}

    result = X.copy()
    for name in num_cols:
        result[name] = pd.qcut(result[name], **qcut_options)
    return result

Klasteryzacja wartości każdej zmiennej:

In [None]:
from sklearn.cluster import KMeans

def quantize_kmeans(X, num_cols, k):
    """Discretize the given columns by 1-D k-means clustering.

    Each column named in ``num_cols`` is clustered on its own and replaced by
    the cluster label assigned to every value. Labels are arbitrary
    identifiers; they carry no ordering.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is copied, not modified.
    num_cols : iterable
        Labels of the columns to discretize.
    k : int
        Requested number of clusters per column. It is capped at the number
        of distinct values in the column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with the selected columns replaced by cluster labels.
    """
    X2 = X.copy()
    for col in num_cols:
        # KMeans expects a 2-D array, so the column becomes a single feature.
        values = X2[col].to_numpy().reshape(-1, 1)

        # A column cannot yield more non-empty clusters than it has distinct
        # values. Asking for more makes KMeans emit a ConvergenceWarning, or
        # raise when k exceeds the number of samples, so cap the request.
        n_clusters = max(1, min(k, X2[col].nunique()))

        km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        X2[col] = km.fit_predict(values)
    return X2

In [None]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def train(X, y, test_size=0.2, random_state=42):
    """Fit a CategoricalNB on a stratified split and return test accuracy.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix of non-negative integer category codes.
    y : array-like
        Class labels; also used to stratify the split.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed for the train/test split.

    Returns
    -------
    float
        Accuracy of the fitted model on the held-out rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y
    )

    # CategoricalNB sizes each feature's probability table from the largest
    # code seen during fit, so a larger code in the test rows would be out of
    # range. Instead of clipping such codes to the training maximum (which
    # silently relabels them as a different category), tell the model how
    # many categories each feature has across the whole matrix. Codes unseen
    # in training then receive the smoothed probability.
    n_categories = X.max(axis=0).astype(int).to_numpy() + 1

    model = CategoricalNB(min_categories=n_categories)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    return accuracy_score(y_test, preds)



In [None]:
# Quantization strategies under comparison, keyed by the label reported in
# the results table.
methods = {
    'equal_width': quantize_equal_width,
    'equal_freq': quantize_equal_freq,
    'kmeans': quantize_kmeans
}

# Bin / cluster counts tried for every strategy.
bin_counts = [2, 5, 10, 25]

# One record per (strategy, k) pair: quantize the numeric columns, train the
# classifier on the result, and store the held-out accuracy.
results = []
for method_name, method in methods.items():
    for k in bin_counts:
        X_quant = method(X, num_cols, k)
        acc = train(X_quant, y)
        record = {'method': method_name, 'k': k, 'accuracy': acc}
        results.append(record)

# Collect all records into a single table and print it.
results_df = pd.DataFrame(results)
print(results_df)
