## 

In [88]:
from paths import DATA_DIR


In [186]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import  ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn import set_config

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
set_config(transform_output="default")



In [121]:
def extract_family_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["LastName"] = df["Name"].str.split().str[-1]
    df["FamilyCount"] = df.groupby("LastName")["Name"].transform("count")
    df["NameLength"] = df["Name"].str.len()
    return df
    
family_tf = FunctionTransformer(
    extract_family_features,
    validate=False
)

def cabin_deck(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["Cabin"] = df["Cabin"].fillna(df["Cabin"].mode()[0])
    cabin = df["Cabin"].str.split("/", expand=True)
    df["CabinDeck"] = cabin[0]
    df["CabinNum"] = pd.to_numeric(cabin[1], errors="coerce")
    df["CabinSide"] = cabin[2]
    df.drop(["Cabin"], axis=1)
    return df

def prep_catboost(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df = extract_family_features(df) 
    df = cabin_deck(df)   
    df = df.drop(columns=["Name", "Cabin"])

    for c in cat_boost_features:
        df[c] = df[c].fillna(df[c].mode()[0])

    for c in num_features:
        df[c] = df[c].fillna(df[c].median())

    return df

In [92]:
train = pd.read_csv(f"{DATA_DIR}/spaceship-titanic/train.csv")
test = pd.read_csv(f"{DATA_DIR}/spaceship-titanic/test.csv")
X = train.copy().drop(["PassengerId", "Transported"], axis=1)
y = train["Transported"].copy()
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.2, random_state=42)

col = train.columns
one_hot_features = ["HomePlanet", "CryoSleep", "VIP", "CabinDeck", "CabinSide"]
cat_ordinal_features = [
    "Destination",
    "LastName",        
    "CabinNum"
]

num_features = [
    "Age",
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
    "FamilyCount",     
    "NameLength",
]
cat_boost_features = one_hot_features + cat_ordinal_features


In [177]:
log_params = {'solver': 'liblinear', 'penalty': 'l2', 'C': 0.1, 'max_iter': 1500, 'class_weight': 'balanced', 'random_state': 42}
rand_forest_params = dict(
    n_jobs=6,
    n_estimators=800,
    min_samples_leaf=4,
    min_samples_split=8,
    max_depth=8,
    max_features=0.5,
    bootstrap=True,
    random_state=42
)
cat_boost_params = dict(
    iterations=600,
    verbose=False,
    depth=5,
    l2_leaf_reg=5,
    cat_features = cat_boost_features,
    bootstrap_type="Bayesian",
    bagging_temperature=0.5,
    random_strength=2,
    eval_metric="Accuracy",
    loss_function="Logloss",
    random_state=42, 
)

lgbm_tree_params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    num_leaves=60,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=5,
    random_state=42,
    n_jobs=6,
    min_data_in_leaf=30,
    metric="binary_error",
    verbose=-1
)


In [157]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='median'))
]) 

cat_ord_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=-1
    )),
])

cat_ohe_transforme = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

cat_imputer = SimpleImputer(strategy="most_frequent")

prep_1 = ColumnTransformer(transformers=[
    ("ohe features", cat_ohe_transforme, cat_ordinal_features),
    ("cat ordinal fetaures", cat_ord_transformer, cat_ordinal_features),
    ("num features", numeric_transformer, num_features)
    
])

prep_2 = ColumnTransformer(transformers=[
    ("ohe features", cat_ohe_transforme, cat_ordinal_features),
    ("cat ordinal fetaures", cat_ohe_transforme, cat_ordinal_features),
    ("num features", numeric_transformer, num_features)
])

In [194]:
m1 = Pipeline(steps=[
    ("name", family_tf),
    ("drop name", FunctionTransformer(lambda df: df.drop(columns=["Name"]))),
    ("Cabin features", FunctionTransformer(lambda df: cabin_deck(df))),
    ("prep", prep_1),
    ("model", RandomForestClassifier(**rand_forest_params))
])

In [195]:
m2 = Pipeline(steps=[
    ("prep", FunctionTransformer(prep_catboost)),
    ("model", CatBoostClassifier(**cat_boost_params))
])

In [196]:
m3 = Pipeline(steps=[
    ("name", family_tf),
    ("drop name", FunctionTransformer(lambda df: df.drop(columns=["Name"]))),
    ("Cabin features", FunctionTransformer(lambda df: cabin_deck(df))),
    ("prep", prep_2),
    ("model", LGBMClassifier(**lgbm_tree_params))
])

In [144]:
meta_model = LogisticRegression(**log_params)

In [179]:
train_sizes, train_scores, val_scores = learning_curve(
    m1, X_train, y_train, cv=5, train_sizes=np.linspace(0.1, 1.0, 10)
)

print(train_scores.mean(axis=1))
print(val_scores.mean(axis=1))

[0.88920863 0.88633094 0.86858513 0.86510791 0.85956835 0.85923261
 0.86166495 0.86618705 0.86522782 0.86115108]
[0.77675147 0.76639836 0.77962669 0.78826228 0.78711286 0.79229521
 0.80207195 0.79689788 0.79459571 0.79574017]


In [111]:
m1.fit(X_train, y_train)

0,1,2
,steps,"[('name', ...), ('drop name', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function ext...x78b7d67fbec0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,func,<function <la...x78b7d645ce00>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,func,<function <la...x78b7d645d800>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('ohe features', ...), ('cat ordinal fetaures', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,n_estimators,1000
,criterion,'gini'
,max_depth,
,min_samples_split,5
,min_samples_leaf,3
,min_weight_fraction_leaf,0.0
,max_features,0.5
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [123]:
m2.fit(X_train, y_train)

  df[c] = df[c].fillna(df[c].mode()[0])


0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function pre...x78b7d6d3e700>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,


In [112]:
m3.fit(X_train, y_train)

0,1,2
,steps,"[('name', ...), ('drop name', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function ext...x78b7d67fbec0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,func,<function <la...x78b7d6494e00>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,func,<function <la...x78b7d6494fe0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,transformers,"[('ohe features', ...), ('cat ordinal fetaures', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.03
,n_estimators,3000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [202]:
stack = StackingClassifier(
    estimators=[("rnd", m1), ("cat", m2), ('lgbm', m3)],
    final_estimator=meta_model,
    stack_method="predict_proba",
    cv=6,
    n_jobs=4,
    passthrough=False
)
# stack.fit(X_train, y_train)

In [198]:
pred_m1 = m1.predict_proba(X_val)[:, 1]
pred_m2 = m2.predict_proba(X_val)[:, 1]
pred_m3 = m3.predict_proba(X_val)[:, 1]
print(f"Rand Tree: {roc_auc_score(y_val, pred_m1)}")
print(f"Cat Boost: {roc_auc_score(y_val, pred_m2)}")
print(f"Light GBM: {roc_auc_score(y_val, pred_m3)}")

NotFittedError: Pipeline is not fitted yet.

In [200]:
stack_pred = stack.predict_proba(X_val)[:, 1]
stack_m1_pred = stack.named_estimators_["rnd"].predict_proba(X_val)[:, 1]
stack_m2_pred = stack.named_estimators_["cat"].predict_proba(X_val)[:, 1]
stack_m3_pred = stack.named_estimators_["lgbm"].predict_proba(X_val)[:, 1]

print(f"Stack roc_auc: {roc_auc_score(y_val, stack_pred):.5f}")
print(f"Rnd roc_auc: {roc_auc_score(y_val, stack_m1_pred):.5f}")
print(f"Cat roc_auc: {roc_auc_score(y_val, stack_m2_pred):.5f}")
print(f"Light roc_auc: {roc_auc_score(y_val, stack_m3_pred):.5f}")

  df[c] = df[c].fillna(df[c].mode()[0])
  df[c] = df[c].fillna(df[c].mode()[0])


Stack roc_auc: 0.87621
Rnd roc_auc: 0.84352
Cat roc_auc: 0.87808
Light roc_auc: 0.84640


In [184]:
stack.fit(X, y)
pred = m1.predict(test.copy().drop(["PassengerId"], axis=1))
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"].copy(),
    "Transported": pred 
})
submission.to_csv("submission.csv", index=False)
# !kaggle competitions submit -c spaceship-titanic -f submission.csv -m "Rand Tree" 


[LightGBM] [Info] Number of positive: 3492, number of negative: 3463
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029705 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1904
[LightGBM] [Info] Number of data points in the train set: 6955, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502085 -> initscore=0.008339
[LightGBM] [Info] Start training from score 0.008339
[LightGBM] [Info] Number of positive: 3284, number of negative: 3236
[LightGBM] [Info] Number of positive: 3283, number of negative: 3236
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002146 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1902
[LightGBM] [Info] Auto-choosing row-wise multi-th



[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020372 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1908
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495


In [201]:
stack_pred = stack.predict(X_val)
stack_m1_pred = stack.named_estimators_["rnd"].predict(X_val)
stack_m2_pred = stack.named_estimators_["cat"].predict(X_val)
stack_m3_pred = stack.named_estimators_["lgbm"].predict(X_val)

print(f"Stack roc_auc: {accuracy_score(y_val, stack_pred):.5f}")
print(f"Rnd roc_auc: {accuracy_score(y_val, stack_m1_pred):.5f}")
print(f"Cat roc_auc: {accuracy_score(y_val, stack_m2_pred):.5f}")
print(f"Light roc_auc: {accuracy_score(y_val, stack_m3_pred):.5f}")

  df[c] = df[c].fillna(df[c].mode()[0])
  df[c] = df[c].fillna(df[c].mode()[0])


Stack roc_auc: 0.79094
Rnd roc_auc: 0.78303
Cat roc_auc: 0.79339
Light roc_auc: 0.77930


In [234]:
# stack.fit(X, y)
pred = stack.named_estimators_["cat"].predict(test.drop(["PassengerId"], axis=1)).astype(bool)
submission = pd.DataFrame({
    "PassengerId" : test['PassengerId'].copy(),
    "Transported": pred
})
submission.to_csv("submission.csv", index=False)
!kaggle competitions submit -c spaceship-titanic -f submission.csv -m "Fourth try catboost"

  df[c] = df[c].fillna(df[c].mode()[0])


100%|██████████████████████████████████████| 56.3k/56.3k [00:01<00:00, 53.4kB/s]
Successfully submitted to Spaceship Titanic

In [223]:
x = pd.DataFrame(pred, columns=['pred'])

In [231]:
x["pred"] = x["pred"].apply(lambda x: return bool(x))

SyntaxError: invalid syntax (3775830963.py, line 1)