## Init

In [48]:
import numpy as np
import polars as pl
import pandas as pd
import pyarrow as pa
import matplotlib.pyplot as plt

from abc import ABCMeta, abstractmethod
from dataclasses import dataclass
from sklearn.model_selection import train_test_split
import pandas as pd
import pyarrow as pa
from typing import Callable
import re

from utils.baseclass import Dataset
from utils.data import TitanicDataset
from utils.decorators import PolarsCompatibleTransformer
from utils.transformers import MakeNameFeatures, TransformColToCategorical, TransformStringColToNumeric, TransformColToBins, DropColumns, AddFamilyUnitID, AddSurvivalRate, CleanCabin, FillNull, AddRandomColumn, CleanHonorific, MakeNameFeatures, CleanAge

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier
import xgboost as xgb

from catboost import CatBoostClassifier

In [49]:
# Root folder holding the Kaggle Titanic CSVs, relative to the notebook.
DATAPATH = './data'

# Read the three competition files in one pass; unpacking consumes the map in
# order, so the frames are loaded train -> test -> sample submission.
train, test, submission_sample = map(
    pl.read_csv,
    (
        f'{DATAPATH}/train.csv',
        f'{DATAPATH}/test.csv',
        f'{DATAPATH}/gender_submission.csv',
    ),
)

In [50]:
%load_ext autoreload
%autoreload 2

pl.Config.set_tbl_formatting("UTF8_FULL_CONDENSED")
pl.Config.set_tbl_width_chars(175)
pl.Config.set_tbl_cols(12)

pd.options.display.max_columns = None
pd.options.display.max_rows = None

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
# display(train.head())
# display(train_transformed.X.head())

## Data Exploration

In [59]:
# Explore on a clone so the exploration cells work on their own frame object
# rather than on the `train` frame loaded above.
train_explore = train.clone()

In [63]:
# train_explore.head()
# Per-column summary; the saved output below shows nulls in Age (177),
# Cabin (687) and Embarked (2).
train_explore.describe()

describe,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
str,f64,f64,f64,str,str,f64,f64,f64,str,f64,str,str
"""count""",891.0,891.0,891.0,"""891""","""891""",891.0,891.0,891.0,"""891""",891.0,"""891""","""891"""
"""null_count""",0.0,0.0,0.0,"""0""","""0""",177.0,0.0,0.0,"""0""",0.0,"""687""","""2"""
"""mean""",446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
"""std""",257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
"""min""",1.0,0.0,1.0,"""Abbing, Mr. An…","""female""",0.42,0.0,0.0,"""110152""",0.0,"""A10""","""C"""
"""25%""",223.0,0.0,2.0,,,20.0,0.0,0.0,,7.8958,,
"""50%""",446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
"""75%""",669.0,1.0,3.0,,,38.0,1.0,0.0,,31.0,,
"""max""",891.0,1.0,3.0,"""van Melkebeke,…","""male""",80.0,8.0,6.0,"""WE/P 5735""",512.3292,"""T""","""S"""


In [234]:
'''
Name
Ticket
Cabin
'''
# Names look like "Last, Honorific. First ...": the text before ", " is the
# family name, and the first space-delimited token after it is the honorific.
train_explore.select(
    pl.all(),
    pl.col('Name')
    .str.split(by=', ')
    .apply(lambda parts: parts[0])
    .alias('LastName'),
    pl.col('Name')
    .str.split(by=', ')
    .apply(lambda parts: parts[1])
    .str.split(by=' ')
    .apply(lambda tokens: tokens[0])
    .alias('Honorific'),
)

# train_explore.select('Name', pl.col('Ticket')).sort('Ticket').head(5)

# train_explore.select('Cabin', pl.col('Ticket')).sort('Cabin').head(5)
# train_explore.groupby('Cabin').agg(pl.n_unique('PassengerId').alias('counts')).sort('counts', descending=True)
# train_explore.filter(pl.col('Cabin')=='C23 C25 C27')
# display(train_explore.groupby('Ticket').agg(pl.n_unique('Cabin').alias('cabincount')).filter(pl.col('cabincount') > 1).head(5))
# train_explore.filter(pl.col('Ticket') == '17421')

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,LastName,Honorific
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""","""Braund""","""Mr."""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""","""Cumings""","""Mrs."""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S""","""Heikkinen""","""Miss."""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S""","""Futrelle""","""Mrs."""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S""","""Allen""","""Mr."""
6,0,3,"""Moran, Mr. Jam…","""male""",,0,0,"""330877""",8.4583,,"""Q""","""Moran""","""Mr."""
7,0,1,"""McCarthy, Mr. …","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S""","""McCarthy""","""Mr."""
8,0,3,"""Palsson, Maste…","""male""",2.0,3,1,"""349909""",21.075,,"""S""","""Palsson""","""Master."""
9,1,3,"""Johnson, Mrs. …","""female""",27.0,0,2,"""347742""",11.1333,,"""S""","""Johnson""","""Mrs."""
10,1,2,"""Nasser, Mrs. N…","""female""",14.0,1,0,"""237736""",30.0708,,"""C""","""Nasser""","""Mrs."""


In [57]:
def get_features_to_outcome_effect(
    df: pl.DataFrame,
    group_cols: list[str],
    outcome_col: str,
    n_bins: int = 5,
) -> pl.DataFrame:
    """Return the mean of ``outcome_col`` for each combination of ``group_cols``.

    Grouping columns with a float dtype are replaced by quantile bins first, so
    the result has one row per bin instead of one row per distinct value.

    Args:
        df: Frame containing the grouping columns and the outcome column.
        group_cols: Columns to group by, in the order used for sorting.
        outcome_col: Column whose mean is reported per group.
        n_bins: Number of quantile bins for float grouping columns
            (defaults to 5, the previously hard-coded value).

    Returns:
        One row per group with the mean outcome, sorted ascending by
        ``group_cols``.
    """
    binned = df
    for col in group_cols:
        # Only Float32/Float64 columns are binned; other dtypes group as-is.
        if binned[col].dtype in (pl.Float32, pl.Float64):
            binned = binned.with_columns(pl.col(col).qcut(n_bins).alias(col))
    return (
        binned
        .groupby(group_cols)
        .agg(pl.mean(outcome_col))
        .sort(group_cols, descending=False)
    )

# get_features_to_outcome_effect(train_explore, ['Sex'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Pclass'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Age'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['SibSp'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Parch'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Fare'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Embarked'], 'Survived')

## Transform data

In [52]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.metrics import RocCurveDisplay

In [56]:
# Feature-engineering pipeline. Every estimator step at the bottom is commented
# out, so the last active step is FillNull and fit_transform is used below to
# get the transformed frame. Step order matters: later steps read columns that
# earlier steps appear to create.
PIPELINE = Pipeline([
    # Name-derived features. The saved output below shows LastName / Honorific
    # columns and no Name column; presumably this step does that — confirm in
    # utils.transformers.
    ('make_name_features', MakeNameFeatures()),
    
    # Earlier variant using numeric string encodings (kept for reference).
    # ('transform_sex', TransformStringColToNumeric('Sex', replace_original=False)),
    # ('transform_embarked', TransformStringColToNumeric('Embarked', replace_original=False)),
    # # ('clean_honorific', CleanHonorific()),
    # ('transform_honorific', TransformStringColToNumeric('Honorific', replace_original=False)),
    # ('transform_age', TransformColToBins('Age', replace_original=False, return_numeric=True)),
    # ('transform_fare', TransformColToBins('Fare', replace_original=False, return_numeric=True)),
    # ('clean_cabin', CleanCabin()),
    # ('transform_cabin', TransformStringColToNumeric('CabinFirstLetter', replace_original=False)),

    # Categorical encodings. With replace_original=False the saved output keeps
    # the source column next to a new `<col>_categorical` column.
    ('transform_sex', TransformColToCategorical('Sex', replace_original=False)),
    ('transform_embarked', TransformColToCategorical('Embarked', replace_original=False)),
    # ('clean_honorific', CleanHonorific()),
    ('transform_honorific', TransformColToCategorical('Honorific', replace_original=False)),
    
    # Age is cleaned before it is binned. The AgePred column in the saved output
    # presumably comes from CleanAge — TODO confirm.
    ('clean_age', CleanAge()),
    ('transform_age', TransformColToBins('Age', replace_original=False, return_numeric=True)),
    
    ('transform_fare', TransformColToBins('Fare', replace_original=False, return_numeric=True)),
    
    # 'CabinFirstLetter' is not a raw column, so CleanCabin presumably creates it
    # and must stay ahead of the encoding step that reads it.
    ('clean_cabin', CleanCabin()),
    ('transform_cabin', TransformColToCategorical('CabinFirstLetter', replace_original=False)),

    # Family-group survival-rate features (currently disabled).
    # ('add_familyid_lastname', AddFamilyUnitID(['LastName'])),
    # ('add_survivalrate_lastname', AddSurvivalRate(['LastName'])),
    # ('add_familyid_lastname_ticket', AddFamilyUnitID(['LastName', 'Ticket'])),
    # ('add_survivalrate_lastname_ticket', AddSurvivalRate(['LastName', 'Ticket'])),
    # ('add_familyid_lastname_cabin', AddFamilyUnitID(['LastName', 'Cabin'])),
    # ('add_survivalrate_lastname_cabin', AddSurvivalRate(['LastName', 'Cabin'])),
    
    # Column pruning (currently disabled, so PassengerId / Survived / string
    # columns remain in the output).
    # ('drop_unneeded_cols', DropColumns(
    #     cols_to_drop=['PassengerId', 'Survived'], #, 'Survived'],
    #     regex_to_drop='groupid_*',
    #     drop_strings = True
    # )),

    # NOTE(review): the saved describe() output further down still reports
    # missing values for Embarked and CabinFirstLetter, so confirm which
    # columns FillNull(-1) actually fills.
    ('fill_nulls', FillNull(-1)),
    
    # Candidate estimators (all disabled).
    # ('xgbc', XGBClassifier(enable_categorical=True, tree_method='approx'))
    # ('xgbc', XGBClassifier(random_state=123))
    # ('catboost', PipelineCompatiableCatBoostClassifier(random_seed=123))
])
# Wrap the raw frames in the project Dataset helper. retain_outcome_col=True
# keeps the outcome among the features: the saved output below still shows a
# `Survived` column in df_clean.
# NOTE(review): drop it (see the commented-out DropColumns step in PIPELINE)
# before enabling any estimator, otherwise the label leaks into the features.
train_transformed = TitanicDataset(train.clone(), 'Survived', retain_outcome_col=True, test_size=0.4)
# NOTE(review): the Kaggle test CSV presumably has no 'Survived' column —
# confirm TitanicDataset tolerates a missing outcome column.
test_transformed = TitanicDataset(test.clone(), 'Survived')

# Fit the transformers on the training split only, then preview the result.
df_clean = PIPELINE.fit_transform(X=train_transformed.xtrain, y=train_transformed.ytrain)
df_clean.head(3)



  return fit_method(estimator, *args, **kwargs)


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,LastName,Honorific,Sex_categorical,Embarked_categorical,Honorific_categorical,AgePred,Age_binned,Fare_binned,CabinFirstLetter,CabinFirstLetter_categorical
0,868,0,1,male,31.0,0,0,PC 17590,50.4958,A24,S,Roebling,Mr.,1,2,9,36.157333,5,8,A,0
1,846,0,3,male,42.0,0,0,C.A. 5547,7.55,,S,Abbing,Mr.,1,2,9,31.972833,8,0,,_NULL_VALUE_
2,290,1,3,female,22.0,0,0,370373,7.75,,Q,Connolly,Miss.,0,1,7,31.480381,2,1,,_NULL_VALUE_


In [57]:
# include='all' summarises the string columns (unique / top / freq) alongside
# the numeric ones.
df_clean.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,LastName,Honorific,Sex_categorical,Embarked_categorical,Honorific_categorical,AgePred,Age_binned,Fare_binned,CabinFirstLetter,CabinFirstLetter_categorical
count,534.0,534.0,534.0,534,534.0,534.0,534.0,534,534.0,534.0,532,534,534,534.0,534.0,534.0,534.0,534.0,534.0,125,534
unique,,,,2,,,,442,,100.0,3,433,13,2.0,4.0,13.0,,,,8,9
top,,,,male,,,,CA. 2343,,,S,Sage,Mr.,1.0,2.0,9.0,,,,C,_NULL_VALUE_
freq,,,,341,,,,6,,409.0,389,6,311,341.0,389.0,311.0,,,,36,409
mean,442.183521,0.393258,2.314607,,29.74497,0.539326,0.382022,,33.837624,,,,,,,,29.764207,4.44382,4.419476,,
std,254.596808,0.488931,0.843412,,13.341656,1.167633,0.798765,,53.58908,,,,,,,,10.124098,2.867236,2.897252,,
min,1.0,0.0,1.0,,0.42,0.0,0.0,,0.0,,,,,,,,5.7491,0.0,0.0,,
25%,223.25,0.0,2.0,,22.0,0.0,0.0,,7.925,,,,,,,,25.005898,2.0,2.0,,
50%,445.5,0.0,3.0,,29.640321,0.0,0.0,,14.4583,,,,,,,,30.091844,4.0,4.0,,
75%,643.0,1.0,3.0,,36.0,1.0,0.0,,31.275,,,,,,,,33.824189,7.0,7.0,,


In [58]:
# PIPELINE.fit(X=train_transformed.xtrain, y=train_transformed.ytrain);

In [59]:
# RocCurveDisplay.from_estimator(PIPELINE, train_transformed.xtest, train_transformed.ytest);
# print(accuracy_score(train_transformed.ytrain, PIPELINE.predict(train_transformed.xtrain)))
# print(accuracy_score(train_transformed.ytest, PIPELINE.predict(train_transformed.xtest)))

In [60]:
# from collections import Counter
# Counter(PIPELINE.predict(test_transformed.X))

In [55]:
# from datetime import datetime
# import os

# submission = pl.DataFrame({
#     'PassengerId': test_transformed.X['PassengerId'],
#     #'Survived': PIPELINE.predict(test_transformed.X),
#     'Survived': np.where(test_transformed.X['Sex'] == 'female', 1, 0)
# })
# filename = f'''./submission/submit_{datetime.now().strftime('%Y%m%d_%H%M')}.csv'''
# submission.write_csv(filename)
# os.system(f'''kaggle competitions submit -c titanic -f {filename} -m {filename}''')

## TODO

In [None]:
# Basic prediction: Predict men die, women live
# Basic prediction: Women live. Men who are "master" live. All else die

# Remove correlated features?
# Model tuning
# Stacking models
# Ensembling models