## Init

In [1]:
import numpy as np
import polars as pl
import pandas as pd
import pyarrow as pa

In [2]:
DATAPATH = './data'

# Read the three CSV files from DATAPATH in one grouped assignment:
# the training rows, the test rows, and the sample submission file.
train, test, submission_sample = (
    pl.read_csv(f'{DATAPATH}/train.csv'),
    pl.read_csv(f'{DATAPATH}/test.csv'),
    pl.read_csv(f'{DATAPATH}/gender_submission.csv'),
)

In [9]:
%load_ext autoreload
%autoreload 2

pl.Config.set_tbl_formatting("UTF8_FULL_CONDENSED")
pl.Config.set_tbl_width_chars(175)
pl.Config.set_tbl_cols(12)

pd.options.display.max_columns = None

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data Exploration

In [59]:
# Work on a clone so the exploration cells operate on `train_explore` rather than on `train` itself.
train_explore = train.clone()

In [63]:
# train_explore.head()
# Per-column summary statistics (count, null_count, mean, std, min, quartiles, max);
# as the last expression of the cell, the result is rendered as the cell output.
train_explore.describe()

describe,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
str,f64,f64,f64,str,str,f64,f64,f64,str,f64,str,str
"""count""",891.0,891.0,891.0,"""891""","""891""",891.0,891.0,891.0,"""891""",891.0,"""891""","""891"""
"""null_count""",0.0,0.0,0.0,"""0""","""0""",177.0,0.0,0.0,"""0""",0.0,"""687""","""2"""
"""mean""",446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
"""std""",257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
"""min""",1.0,0.0,1.0,"""Abbing, Mr. An…","""female""",0.42,0.0,0.0,"""110152""",0.0,"""A10""","""C"""
"""25%""",223.0,0.0,2.0,,,20.0,0.0,0.0,,7.8958,,
"""50%""",446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
"""75%""",669.0,1.0,3.0,,,38.0,1.0,0.0,,31.0,,
"""max""",891.0,1.0,3.0,"""van Melkebeke,…","""male""",80.0,8.0,6.0,"""WE/P 5735""",512.3292,"""T""","""S"""


In [234]:
# Free-text columns looked at in this cell: Name, Ticket, Cabin.
#
# Derive two columns from Name, which the displayed rows show in the form
# "<LastName>, <Honorific> <rest>":
#   LastName  - the text before the first ', '
#   Honorific - the first space-delimited token after that ', '
#
# Native string expressions are used instead of per-row Python lambdas via
# `.apply`: `split_exact(sep, 1)` yields a struct whose `field_0` / `field_1`
# are the first two pieces. A name without ', ' gives a null Honorific rather
# than failing inside a lambda that indexes `x[1]`.
(
    train_explore
    .select(
        '*',
        (
            pl.col('Name')
            .str.split_exact(', ', 1).struct.field('field_0')
            .alias('LastName')
        ),
        (
            pl.col('Name')
            .str.split_exact(', ', 1).struct.field('field_1')
            .str.split_exact(' ', 1).struct.field('field_0')
            .alias('Honorific')
        ),
    )
)

# train_explore.select('Name', pl.col('Ticket')).sort('Ticket').head(5)

# train_explore.select('Cabin', pl.col('Ticket')).sort('Cabin').head(5)
# train_explore.groupby('Cabin').agg(pl.n_unique('PassengerId').alias('counts')).sort('counts', descending=True)
# train_explore.filter(pl.col('Cabin')=='C23 C25 C27')
# display(train_explore.groupby('Ticket').agg(pl.n_unique('Cabin').alias('cabincount')).filter(pl.col('cabincount') > 1).head(5))
# train_explore.filter(pl.col('Ticket') == '17421')

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,LastName,Honorific
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,str,str
1,0,3,"""Braund, Mr. Ow…","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""","""Braund""","""Mr."""
2,1,1,"""Cumings, Mrs. …","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""","""Cumings""","""Mrs."""
3,1,3,"""Heikkinen, Mis…","""female""",26.0,0,0,"""STON/O2. 31012…",7.925,,"""S""","""Heikkinen""","""Miss."""
4,1,1,"""Futrelle, Mrs.…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S""","""Futrelle""","""Mrs."""
5,0,3,"""Allen, Mr. Wil…","""male""",35.0,0,0,"""373450""",8.05,,"""S""","""Allen""","""Mr."""
6,0,3,"""Moran, Mr. Jam…","""male""",,0,0,"""330877""",8.4583,,"""Q""","""Moran""","""Mr."""
7,0,1,"""McCarthy, Mr. …","""male""",54.0,0,0,"""17463""",51.8625,"""E46""","""S""","""McCarthy""","""Mr."""
8,0,3,"""Palsson, Maste…","""male""",2.0,3,1,"""349909""",21.075,,"""S""","""Palsson""","""Master."""
9,1,3,"""Johnson, Mrs. …","""female""",27.0,0,2,"""347742""",11.1333,,"""S""","""Johnson""","""Mrs."""
10,1,2,"""Nasser, Mrs. N…","""female""",14.0,1,0,"""237736""",30.0708,,"""C""","""Nasser""","""Mrs."""


In [57]:
def get_features_to_outcome_effect(df: pl.DataFrame, group_cols: list[str], outcome_col: str) -> pl.DataFrame:
    """Return the mean of `outcome_col` for every combination of `group_cols`.

    Any grouping column whose dtype is Float32 or Float64 is first replaced
    by its `qcut(5)` bins, so continuous features are grouped by quantile
    bucket instead of by raw value.

    Args:
        df: Frame containing `group_cols` and `outcome_col`.
        group_cols: Columns to group by; float columns are binned beforehand.
        outcome_col: Column whose mean is computed per group.

    Returns:
        One row per group with the mean of `outcome_col`, sorted ascending
        by `group_cols`.
    """
    for col in group_cols:
        if df[col].dtype in [pl.Float32, pl.Float64]:
            # Rebinds the local name only; the binned column keeps its original name.
            df = df.with_columns(pl.col(col).qcut(5).alias(col))
    return df.groupby(group_cols).agg(pl.mean(outcome_col)).sort(group_cols, descending=False)

# get_features_to_outcome_effect(train_explore, ['Sex'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Pclass'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Age'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['SibSp'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Parch'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Fare'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Embarked'], 'Survived')

## Transform data

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

In [5]:
# train_transformed.drop('Survived')

# Design note kept as a bare string literal; being the last expression of the
# cell, it is echoed back as the cell output.
'''
read in as polars
store dataset object as pandas
for each transformation, convert to polars, then convert back to pandas
'''

'\nread in as polars\nstore dataset object as pandas\nfor each transformation, convert to polars, then convert back to pandas\n'

In [71]:
from abc import ABCMeta, abstractmethod
from dataclasses import dataclass
from sklearn.model_selection import train_test_split
import pandas as pd
import pyarrow as pa
from typing import Callable
import re

from utils.baseclass import Dataset
from utils.data import TitanicDataset
from utils.decorators import PolarsCompatibleTransformer
from utils.transformers import MakeNameFeatures, TransformColToCategorical, TransformStringColToNumeric, TransformColToBins, DropColumns

from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier
import xgboost as xgb

@PolarsCompatibleTransformer
class FillNull(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that replaces null and NaN entries with one fixed value.

    The input is presumably a polars DataFrame, since `fill_null` / `fill_nan`
    are called on it and the class is wrapped by `PolarsCompatibleTransformer`
    (defined in utils.decorators) - TODO confirm what the decorator converts.
    """

    def __init__(self, null_value: float):
        # Stored under the constructor argument's own name.
        self.null_value = null_value

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return `X` with nulls, then NaNs, replaced by `self.null_value`."""
        replacement = self.null_value
        without_nulls = X.fill_null(replacement)
        return without_nulls.fill_nan(replacement)

# End-to-end pipeline: feature-engineering transformers followed by an XGBoost classifier.
# Steps run in the listed order, so later steps rely on columns that earlier steps produce.
# NOTE(review): AddFamilyUnitID and AddSurvivalRate are not among the names imported in the
# visible import lines of this cell - confirm where they are defined, otherwise a fresh
# kernel will raise NameError here.
# NOTE(review): the recorded run of this cell failed in fit() with
# "SchemaFieldNotFoundError: LastName" - verify which step expects LastName before it exists.
PIPELINE = Pipeline([
    # Presumably derives LastName / Honorific from Name (both are referenced by later steps) - TODO confirm.
    ('make_name_features', MakeNameFeatures()),
    # replace_original=False presumably keeps the source column next to the encoded one,
    # which would explain why the originals are dropped explicitly below - TODO confirm.
    ('transform_pclass', TransformStringColToNumeric('Pclass', replace_original=False)),
    ('transform_sex', TransformStringColToNumeric('Sex', replace_original=False)),
    ('transform_sibsp', TransformStringColToNumeric('SibSp', replace_original=False)),
    ('transform_parch', TransformStringColToNumeric('Parch', replace_original=False)),
    ('transform_embarked', TransformStringColToNumeric('Embarked', replace_original=False)),
    ('transform_honorific', TransformStringColToNumeric('Honorific', replace_original=False)),
    ('transform_age', TransformColToBins('Age', replace_original=False)),
    ('transform_fare', TransformColToBins('Fare', replace_original=False)),
    # Family-unit id / survival-rate steps for three groupings: last name alone,
    # last name + ticket, last name + cabin.
    ('add_familyid_lastname', AddFamilyUnitID(['LastName'])),
    ('add_survivalrate_lastname', AddSurvivalRate(['LastName'])),
    ('add_familyid_lastname_ticket', AddFamilyUnitID(['LastName', 'Ticket'])),
    ('add_survivalrate_lastname_ticket', AddSurvivalRate(['LastName', 'Ticket'])),
    ('add_familyid_lastname_cabin', AddFamilyUnitID(['LastName', 'Cabin'])),
    ('add_survivalrate_lastname_cabin', AddSurvivalRate(['LastName', 'Cabin'])),
    # NOTE(review): read as a regular expression, 'groupid_*' means "groupid" followed by
    # zero or more underscores; if a "groupid_<anything>" match is intended, confirm how
    # DropColumns applies the pattern.
    ('drop_unneeded_cols', DropColumns(
        cols_to_drop=['LastName', 'Cabin', 'Ticket', 'PassengerId', 'Pclass', 'Sex', 'Embarked', 'Honorific'],
        regex_to_drop='groupid_*'
    )),  
    # Remaining nulls / NaNs are replaced by the sentinel -1 before the classifier.
    ('fill_nulls', FillNull(-1)),
    ('xgbc', XGBClassifier(enable_categorical=True, tree_method='hist'))
])
# Wrap clones of the raw frames; the second argument names the target column.
train_transformed = TitanicDataset(train.clone(), 'Survived')
# NOTE(review): confirm that TitanicDataset copes with a frame that may lack the
# 'Survived' column - the test file's schema is not shown here.
test_transformed = TitanicDataset(test.clone(), 'Survived')

# Fit every step on the xtrain / ytrain attributes exposed by the training dataset wrapper.
PIPELINE.fit(X=train_transformed.xtrain, y=train_transformed.ytrain)

SchemaFieldNotFoundError: LastName

In [70]:
# preds = PIPELINE.predict_proba(train_transformed.xtest, train_transformed.ytest)
from sklearn.metrics import RocCurveDisplay
# Draw the ROC curve of PIPELINE on the xtest / ytest attributes of the training dataset wrapper.
# NOTE(review): the recorded run raised "inconsistent numbers of samples: [268, 1285]",
# so the two sides disagree on row count - verify whether a pipeline step changes the number of rows.
RocCurveDisplay.from_estimator(PIPELINE, train_transformed.xtest, train_transformed.ytest)
# Shapes of the held-out features and labels; these lines only run if the call above succeeds.
print(train_transformed.xtest.shape)
print(train_transformed.ytest.shape)

ValueError: Found input variables with inconsistent numbers of samples: [268, 1285]