## Init

In [2]:
import numpy as np
import polars as pl
import pandas as pd
import pyarrow as pa
import matplotlib.pyplot as plt

from abc import ABCMeta, abstractmethod
from dataclasses import dataclass
from sklearn.model_selection import train_test_split
import pandas as pd
import pyarrow as pa
from typing import Callable
import re
from datetime import datetime
import os

from utils.baseclass import Dataset
from utils.data import TitanicDataset, clean_test_to_fit_answer_sheet, submit_answer
from utils.decorators import PolarsCompatibleTransformer
from utils.transformers import MakeNameFeatures, TransformColToCategorical, TransformStringColToNumeric, TransformColToBins, DropColumns, AddFamilyUnitID, AddSurvivalRate, CleanCabin, FillNull, AddRandomColumn, CleanHonorific, MakeNameFeatures, CleanAge, PipelineCompatibleCatBoostClassifier, CleanFare

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier
import xgboost as xgb

from catboost import CatBoostClassifier

from collections import Counter

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.metrics import RocCurveDisplay

# Widen polars' table rendering so frames are shown without truncation.
# Each setter hands back the Config class, so the calls can be chained.
(
    pl.Config
    .set_tbl_rows(999)
    .set_tbl_cols(999)
    .set_tbl_width_chars(9999)
    .set_fmt_str_lengths(9999)
)

polars.config.Config

In [5]:
DATAPATH = './data'

# Answer sheet reduced to the two join keys plus the label, renamed to the
# capitalised headers used by the train/test frames.
answer_sheet = (
    pl.read_csv(f'{DATAPATH}/answer_sheet.csv')
    .select(
        pl.col('name').alias('Name'),
        pl.col('age').alias("Age"),
        pl.col('survived').alias("Survived"),
    )
)

train = pl.read_csv(f'{DATAPATH}/train.csv')

# Attach the known outcomes to the test rows; the inner join keeps only rows
# whose (Name, Age) pair is present in the answer sheet.
test = (
    clean_test_to_fit_answer_sheet(pl.read_csv(f'{DATAPATH}/test.csv'))
    .join(answer_sheet, on=['Name', 'Age'], how='inner')
)

submission_sample = pl.read_csv(f'{DATAPATH}/gender_submission.csv')

In [6]:
%load_ext autoreload
%autoreload 2

pl.Config.set_tbl_formatting("UTF8_FULL_CONDENSED")
pl.Config.set_tbl_width_chars(175)
pl.Config.set_tbl_cols(99)

pd.options.display.max_columns = None
pd.options.display.max_rows = None

## Data Exploration

In [None]:
# Exploration works on its own copy so the `train` frame loaded above is left untouched.
train_explore = train.clone()

In [None]:
# Summary statistics for every column; switch to the commented line to peek at raw rows instead.
# train_explore.head()
train_explore.describe()

In [None]:
# Free-text columns worth mining for features:
#   Name
#   Ticket
#   Cabin
#
# Derive LastName and Honorific from Name.
# Assumes names are formatted "<LastName>, <Honorific>. <given names>" - TODO confirm
# against the raw data.
(
    train_explore
    .select(
        '*',
        pl.col('Name').str.split(by=', ').apply(lambda x: x[0]).alias('LastName'),
        # Split on '. ' (literal period + space) rather than ' ': this drops the
        # trailing period ("Mr." -> "Mr"), keeps multi-word titles whole, and
        # matches the extraction used for `manual_preds` later in the notebook.
        pl.col('Name').str.split(by=', ').apply(lambda x: x[1]).str.split(by='. ').apply(lambda x: x[0]).alias('Honorific')
    )
)

# train_explore.select('Name', pl.col('Ticket')).sort('Ticket').head(5)

# train_explore.select('Cabin', pl.col('Ticket')).sort('Cabin').head(5)
# train_explore.groupby('Cabin').agg(pl.n_unique('PassengerId').alias('counts')).sort('counts', descending=True)
# train_explore.filter(pl.col('Cabin')=='C23 C25 C27')
# display(train_explore.groupby('Ticket').agg(pl.n_unique('Cabin').alias('cabincount')).filter(pl.col('cabincount') > 1).head(5))
# train_explore.filter(pl.col('Ticket') == '17421')

In [None]:
def get_features_to_outcome_effect(df: pl.DataFrame, group_cols: list[str], outcome_col: str) -> pl.DataFrame:
    """Return the mean of ``outcome_col`` for every combination of ``group_cols``.

    Grouping columns with a float dtype (``pl.Float32`` / ``pl.Float64``) are
    first replaced by their ``qcut(5)`` bins, so continuous features are grouped
    per bin instead of per distinct value.

    Args:
        df: Frame containing the grouping columns and the outcome column. Only
            the local name is rebound; the caller's frame is not modified.
        group_cols: Names of the columns to group by.
        outcome_col: Name of the column whose mean is computed per group.

    Returns:
        One row per group, holding the grouping columns and the mean outcome,
        sorted ascending by ``group_cols``.
    """
    for col in group_cols:
        # Bin continuous columns in place (same column name) before grouping.
        if df[col].dtype in [pl.Float32, pl.Float64]:
            df = (
                df
                .with_columns(pl.col(col).qcut(5).alias(col))
            )
    return df.groupby(group_cols).agg(pl.mean(outcome_col)).sort(group_cols, descending=False)

# get_features_to_outcome_effect(train_explore, ['Sex'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Pclass'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Age'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['SibSp'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Parch'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Fare'], 'Survived')
# get_features_to_outcome_effect(train_explore, ['Embarked'], 'Survived')

## Transform data

In [207]:
# Feature-engineering + model pipeline.
# Commented-out steps are alternative experiments kept as toggles; enable at
# most one variant of each step name.
PIPELINE = Pipeline([
    ('make_name_features', MakeNameFeatures()),

    # --- Variant A: encode string columns as numeric codes -------------------
    # ('transform_sex', TransformStringColToNumeric('Sex', replace_original=False)),
    # ('transform_embarked', TransformStringColToNumeric('Embarked', replace_original=False)),
    # ('clean_honorific', CleanHonorific()),
    # ('transform_honorific', TransformStringColToNumeric('Honorific', replace_original=False)),
    # ('clean_fare', CleanFare()),
    # ('transform_fare', TransformColToBins('Fare', replace_original=False, return_numeric=True)),
    # ('clean_age', CleanAge()),
    # ('transform_age', TransformColToBins('Age', replace_original=False, return_numeric=True)),
    # ('clean_cabin', CleanCabin()),
    # ('transform_cabin', TransformStringColToNumeric('CabinFirstLetter', replace_original=False)),

    # --- Variant B: encode string columns as categoricals --------------------
    # ('transform_sex', TransformColToCategorical('Sex', replace_original=False)),
    # ('transform_embarked', TransformColToCategorical('Embarked', replace_original=False)),
    # ('clean_honorific', CleanHonorific()),
    # ('transform_honorific', TransformColToCategorical('Honorific', replace_original=False)),

    # ('clean_fare', CleanFare()),
    # ('transform_fare', TransformColToBins('Fare', replace_original=False, return_numeric=True)),

    # ('clean_age', CleanAge()),
    # ('transform_age', TransformColToBins('Age', replace_original=False, return_numeric=True)),

    # ('clean_cabin', CleanCabin()),
    # ('transform_cabin', TransformColToCategorical('CabinFirstLetter', replace_original=False)),

    # --- Family / group survival features ------------------------------------
    # ('add_familyid_lastname', AddFamilyUnitID(['LastName'])),
    # ('add_survivalrate_lastname', AddSurvivalRate(['LastName'])),
    # ('add_familyid_ticket', AddFamilyUnitID(['Ticket'])),
    # ('add_survivalrate_ticket', AddSurvivalRate(['Ticket'])),
    # ('add_familyid_lastname_ticket', AddFamilyUnitID(['LastName', 'Ticket'])),
    # ('add_survivalrate_lastname_ticket', AddSurvivalRate(['LastName', 'Ticket'])),
    # ('add_familyid_lastname_cabin', AddFamilyUnitID(['LastName', 'Cabin'])),
    # ('add_survivalrate_lastname_cabin', AddSurvivalRate(['LastName', 'Cabin'])),

    ('drop_unneeded_cols', DropColumns(
        cols_to_drop=['PassengerId', 'Survived'], # + ['Pclass', 'Age', 'Parch', 'SibSp', 'Fare'],
        regex_to_drop='groupid_*',
        drop_strings = True
    )),

    # ('fill_nulls', FillNull(-1)),

    # Final estimator. One must be enabled: this cell (and the manual-prediction
    # cell further down) calls PIPELINE.predict, which a pipeline ending in a
    # plain transformer cannot provide.
    # ('xgbc', XGBClassifier(enable_categorical=True, tree_method='approx'))
    ('xgbc', XGBClassifier(random_state=123)),
    # ('catboost', PipelineCompatibleCatBoostClassifier(verbose=False, random_seed=123))
])
train_transformed = TitanicDataset(train.clone(), 'Survived', retain_outcome_col=True, test_size=0.4)
test_transformed = TitanicDataset(test.clone(), 'Survived', predict_only=True)

# To inspect the transformed frame, run only the transformer steps
# (the last pipeline step is the classifier):
# df_clean = PIPELINE[:-1].fit_transform(X=train_transformed.xtrain, y=train_transformed.ytrain)
# df_clean.head(3)
# df_clean.describe(include='all')

PIPELINE.fit(X=train_transformed.xtrain, y=train_transformed.ytrain);
# Accuracy on: the training split, the held-out split of train.csv, and the
# answer-sheet-labelled test set, in that order.
print(accuracy_score(train_transformed.ytrain, PIPELINE.predict(train_transformed.xtrain)))
print(accuracy_score(train_transformed.ytest, PIPELINE.predict(train_transformed.xtest)))
print(accuracy_score(test_transformed.y, PIPELINE.predict(test_transformed.X)))

  return fit_method(estimator, *args, **kwargs)
[32m2023-08-31 12:52:21.761[0m | [1mINFO    [0m | [36mutils.transformers[0m:[36mfit[0m:[36m216[0m - [1mString drop cols: ['Sex', 'Ticket', 'Cabin', 'Embarked', 'LastName', 'Honorific', 'CabinFirstLetter'][0m
[32m2023-08-31 12:52:21.762[0m | [1mINFO    [0m | [36mutils.transformers[0m:[36mfit[0m:[36m220[0m - [1mSpecified drop cols: ['PassengerId', 'Survived'][0m
[32m2023-08-31 12:52:21.763[0m | [1mINFO    [0m | [36mutils.transformers[0m:[36mfit[0m:[36m221[0m - [1mRegex drop cols: [][0m
[32m2023-08-31 12:52:21.764[0m | [1mINFO    [0m | [36mutils.transformers[0m:[36mfit[0m:[36m222[0m - [1mString drop cols: ['Sex', 'Ticket', 'Cabin', 'Embarked', 'LastName', 'Honorific', 'CabinFirstLetter'][0m
[32m2023-08-31 12:52:21.764[0m | [1mINFO    [0m | [36mutils.transformers[0m:[36mfit[0m:[36m223[0m - [1mDropping cols: ['Survived', 'LastName', 'Honorific', 'Sex', 'Cabin', 'CabinFirstLetter', 'Pas

0.9887640449438202
0.8207282913165266
0.7296650717703349


In [196]:
# Counter(df_clean['Honorific'])

In [197]:
# RocCurveDisplay.from_estimator(PIPELINE, train_transformed.xtest, train_transformed.ytest);
# print(accuracy_score(train_transformed.ytrain, PIPELINE.predict(train_transformed.xtrain)))
# print(accuracy_score(train_transformed.ytest, PIPELINE.predict(train_transformed.xtest)))
# print(accuracy_score(test_transformed.y, PIPELINE.predict(test_transformed.X)))

0.8801498127340824
0.8515406162464986
0.7679425837320574


In [81]:
# Hand-written rule on top of the test frame: females aged 18 or under and
# passengers titled "Master" are predicted to survive, everyone else is not.
# The model's own predictions are kept alongside in `model_preds`.
manual_preds = (
    pl.from_pandas(test_transformed.X)
    .with_columns(
        pl.Series(PIPELINE.predict(test_transformed.X)).alias('model_preds'),
        pl.col('Name').str.split(", ").apply(lambda x: x[0]).alias('LastName'),
        pl.col('Name').str.split(", ").apply(lambda x: x[1]).str.split('. ').apply(lambda x: x[0]).alias('Honorific'),
    )
    .with_columns(
        pl.when(
            (pl.col('Honorific') == 'Master')
            | ((pl.col('Sex') == 'female') & (pl.col("Age") <= 18))
        )
        .then(pl.lit(1))
        # Swap pl.lit(0) for pl.col('model_preds') to fall back on the model.
        .otherwise(pl.lit(0))
        .alias('Survived')
    )
)
# manual_preds.head()
# manual_preds.sort('Honorific', 'Sex').head(50)

In [82]:
from datetime import datetime
import os

PassengerId,Survived
i64,i32
892,0
893,0
894,0
895,0
896,0


100%|██████████| 2.77k/2.77k [00:03<00:00, 777B/s]


Successfully submitted to Titanic - Machine Learning from Disaster

0

## TODO

In [None]:
# Basic prediction: Predict men die, women live
# Basic prediction: Women live. Men who are "master" live. All else die

# Remove correlated features?
# Model tuning
# Stacking models
# Ensembling models