In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import FunctionTransformer

In [2]:
# TODO: improve this next time -- create a local directory for the datasets
# instead of reading them from remote URLs on every run.

def get_data(data_link):
    """Load a CSV source into a DataFrame.

    Parameters
    ----------
    data_link
        Anything ``pd.read_csv`` accepts as its first argument (the notebook
        passes HTTP URLs).

    Returns
    -------
    pandas.DataFrame
        The parsed table, read with ``pd.read_csv`` defaults.
    """
    return pd.read_csv(data_link)

# Fetch the two Titanic CSV files straight from the GitHub repository.
train_data = get_data(
    data_link='https://raw.githubusercontent.com/wyx-smrf/Windows-Misc/main/Titanic%20Project/train.csv'
)
test_data = get_data(
    data_link='https://raw.githubusercontent.com/wyx-smrf/Windows-Misc/main/Titanic%20Project/test.csv'
)

### Pipeline

In [3]:
# Split the labelled data into the feature table and the target column.
X = train_data.drop(columns=["Survived"])
y = train_data["Survived"]

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Pandas Pipeline 1: Delete irrelevant columns

In [5]:
# Example keyword arguments matching the signature of delete_cols().
# NOTE(review): not referenced by any cell visible here -- confirm before removing.
params = dict(
    dataframe=train_data,
    del_cols_aslist=['PassengerId', 'Name', 'Ticket', 'Cabin'],
)

def delete_cols(dataframe, del_cols_aslist):
    """Return a new DataFrame without the listed columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table; it is not modified.
    del_cols_aslist : list
        Column labels to remove. ``DataFrame.drop`` raises ``KeyError`` if a
        label is missing.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` with ``del_cols_aslist`` dropped.
    """
    # `columns=` already identifies the axis, so no separate `axis` is needed.
    return dataframe.drop(columns=del_cols_aslist)

### Pandas Pipeline 2: Add additional columns

In [6]:
# column_operation_params = {'dataframe': removecols_df, 
#                            'num_col_1': 'SibSp', 
#                            'num_col_2': 'Parch',
#                            'result_colname': 'Companion'}

def column_addition(dataframe, num_col_1, num_col_2, result_colname):
    """Return a copy of ``dataframe`` with the sum of two columns added.

    The input frame is left untouched, so re-running a cell that calls this
    function gives the same result and frames that are slices of another
    frame (e.g. the output of ``train_test_split``) are not written to.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table.
    num_col_1, num_col_2
        Labels of the two columns to add element-wise.
    result_colname
        Label of the column that receives the sum (overwritten if it exists).

    Returns
    -------
    pandas.DataFrame
        A new frame containing every original column plus ``result_colname``.
    """
    result = dataframe.copy()
    result[result_colname] = dataframe[num_col_1] + dataframe[num_col_2]
    return result

In [7]:
# column_operation_params = {'dataframe': removecols_df, 
#                            'num_col_1': 'Fare', 
#                            'num_col_2': 'Companion',
#                            'result_colname': 'Indiv_Price'}

def column_division(dataframe, num_col_1, num_col_2, result_colname):
    """Return a copy of ``dataframe`` with ``num_col_1 / num_col_2`` added.

    Wherever the quotient is infinite or NaN (for example a zero divisor),
    the numerator value is used instead.

    The cleaning is done on an explicit Series rather than through chained
    ``df[col].replace(..., inplace=True)`` calls: those operate on a
    temporary object and do not update the frame under pandas Copy-on-Write.
    The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table.
    num_col_1
        Label of the numerator column (also the fallback value).
    num_col_2
        Label of the divisor column.
    result_colname
        Label of the column that receives the quotient.

    Returns
    -------
    pandas.DataFrame
        A new frame containing every original column plus ``result_colname``.
    """
    numerator = dataframe[num_col_1]
    quotient = numerator / dataframe[num_col_2]

    # Treat +/-inf like a missing value, then fall back to the numerator.
    quotient = quotient.replace([np.inf, -np.inf], np.nan).fillna(numerator)

    result = dataframe.copy()
    result[result_colname] = quotient
    return result

### Pandas Pipeline 3: Impute Missing Values based on group statistics

In [8]:
# Age Conditional Imputing

# conditional_fillna_params = {'dataframe':       divcols_df,
#                              'cat_col_1':       'Pclass', 
#                              'cat_col_2':       'Sex', 
#                              'num_col':         'Age', 
#                              'impute_strategy': 'mean'}

def conditional_fillna(dataframe, cat_col_1, cat_col_2, num_col, impute_strategy):
    """Return a copy of ``dataframe`` with ``num_col`` imputed per group.

    Missing values in ``num_col`` are replaced by the ``impute_strategy``
    statistic of the rows sharing the same ``(cat_col_1, cat_col_2)`` pair;
    the column is then rounded and cast to ``int``.

    The fill is assigned explicitly instead of calling
    ``df[col].fillna(..., inplace=True)``: that chained form works on a
    temporary object and does not update the frame under pandas
    Copy-on-Write. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table.
    cat_col_1, cat_col_2
        Labels of the two grouping columns.
    num_col
        Label of the numeric column to impute.
    impute_strategy
        Aggregation passed to ``GroupBy.transform`` (e.g. ``'mean'``).

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``num_col`` is integer-typed with gaps filled.

    Raises
    ------
    pandas raises an error from ``astype(int)`` if a value is still missing
    after the fill (e.g. a group whose ``num_col`` is entirely NaN).
    """
    # Per-row group statistic, aligned on the original index.
    group_stat = (dataframe
                  .groupby([cat_col_1, cat_col_2])[num_col]
                  .transform(impute_strategy))

    # Fill missing values with the group statistic
    filled = dataframe[num_col].fillna(group_stat)

    result = dataframe.copy()
    # Convert the imputed numerical values into integers
    result[num_col] = filled.round(0).astype(int)
    return result

## Compiled Pandas Pipeline

In [9]:
# Arguments for each pandas preprocessing step.
del_cols_aslist = ['PassengerId', 'Name', 'Ticket', 'Cabin']
column_addition_params = dict(
    num_col_1='SibSp',
    num_col_2='Parch',
    result_colname='Companion',
)
column_division_params = dict(
    num_col_1='Fare',
    num_col_2='Companion',
    result_colname='Indiv_Price',
)
conditional_fillna_params = dict(
    cat_col_1='Pclass',
    cat_col_2='Sex',
    num_col='Age',
    impute_strategy='mean',
)

# Run the training features through the four steps in order. Despite its
# name, `pandas_pipeline` holds the resulting DataFrame.
pandas_pipeline = (
    X_train
    .pipe(delete_cols, del_cols_aslist)
    .pipe(column_addition, **column_addition_params)
    .pipe(column_division, **column_division_params)
    .pipe(conditional_fillna, **conditional_fillna_params)
)

In [10]:
# Preview the first five preprocessed training rows.
pandas_pipeline.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Companion,Indiv_Price
331,1,male,46,0,0,28.5,S,0,28.5
733,2,male,23,0,0,13.0,S,0,13.0
382,3,male,32,0,0,7.925,S,0,7.925
704,3,male,26,1,0,7.8542,S,1,7.8542
813,3,female,6,4,2,31.275,S,6,5.2125


In [11]:
def monkey_patch_get_signature_names_out():
    """Monkey patch some classes which did not handle get_feature_names_out()
       correctly in Scikit-Learn 1.0.*.

    Two classes are patched, each only when it has no
    ``get_feature_names_out`` attribute:

    * ``SimpleImputer`` receives ``StandardScaler.get_feature_names_out``.
    * ``FunctionTransformer`` gets a wrapped ``__init__`` that accepts an
      extra keyword-only ``feature_names_out`` argument and stores it on the
      instance, plus a ``get_feature_names_out`` method that uses it.

    The patches are assigned on the classes themselves and a message is
    printed for each patch applied. Returns ``None``.
    """
    from inspect import Signature, signature, Parameter
    import pandas as pd
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import make_pipeline, Pipeline
    from sklearn.preprocessing import FunctionTransformer, StandardScaler

    # Implementation reused as the fallback for both patched classes.
    # NOTE(review): this attribute lookup itself requires StandardScaler to
    # provide get_feature_names_out -- confirm for the installed version.
    default_get_feature_names_out = StandardScaler.get_feature_names_out

    if not hasattr(SimpleImputer, "get_feature_names_out"):
      print("Monkey-patching SimpleImputer.get_feature_names_out()")
      SimpleImputer.get_feature_names_out = default_get_feature_names_out

    if not hasattr(FunctionTransformer, "get_feature_names_out"):
        print("Monkey-patching FunctionTransformer.get_feature_names_out()")
        orig_init = FunctionTransformer.__init__
        orig_sig = signature(orig_init)

        # Replacement constructor: `args[0]` is the instance (self).
        def __init__(*args, feature_names_out=None, **kwargs):
            # bind() raises TypeError if the arguments do not fit the original
            # signature; its return value is intentionally unused.
            orig_sig.bind(*args, **kwargs)
            orig_init(*args, **kwargs)
            args[0].feature_names_out = feature_names_out

        # Advertise the original parameters plus the new keyword-only one.
        # presumably so signature-based introspection of the constructor sees
        # `feature_names_out` -- TODO confirm.
        __init__.__signature__ = Signature(
            list(signature(orig_init).parameters.values()) + [
                Parameter("feature_names_out", Parameter.KEYWORD_ONLY)])

        def get_feature_names_out(self, names=None):
            # A callable is invoked as feature_names_out(transformer, names).
            if callable(self.feature_names_out):
                return self.feature_names_out(self, names)
            # Any non-callable value other than "one-to-one" (including the
            # default None) fails this assertion.
            assert self.feature_names_out == "one-to-one"
            return default_get_feature_names_out(self, names)

        FunctionTransformer.__init__ = __init__
        FunctionTransformer.get_feature_names_out = get_feature_names_out

# Apply the patches once, before any pipeline below is built.
monkey_patch_get_signature_names_out()

Monkey-patching SimpleImputer.get_feature_names_out()
Monkey-patching FunctionTransformer.get_feature_names_out()


### Sklearn Pipeline

In [12]:
# Preprocessing pipelines for categorical and numeric columns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode with handle_unknown="ignore".
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Numeric columns: fill gaps with the median, then standardise.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Route each column to a pipeline by dtype; unmatched columns pass through.
preprocessing = ColumnTransformer(
    transformers=[
        ("numeric", num_pipeline, make_column_selector(dtype_include=np.number)),
        ("categorical", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder='passthrough',
)

---

In [13]:
# Apply the same pandas preprocessing steps to the held-out split.
# NOTE(review): conditional_fillna computes its group statistics from the
# frame it is given, so the held-out ages are imputed from held-out data --
# consider reusing the training-set statistics instead.
pandas_pipeline_test = (X_test
                        .pipe(delete_cols, del_cols_aslist)
                        .pipe(column_addition, **column_addition_params)
                        .pipe(column_division, **column_division_params)
                        .pipe(conditional_fillna, **conditional_fillna_params))

# Learn the imputation / scaling / encoding parameters from the training
# frame only, then apply them to the held-out frame. Calling fit_transform on
# the held-out frame would fit the preprocessor on test data (data leakage).
sk_test = pd.DataFrame(
    preprocessing.fit(pandas_pipeline).transform(pandas_pipeline_test))

# Training

In [14]:
from sklearn import set_config

# Use scikit-learn's 'diagram' display mode for estimators.
set_config(display="diagram")

In [15]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on top of the shared preprocessing step.
log_clf = make_pipeline(
    preprocessing,
    LogisticRegression(),
)
log_clf.fit(X=pandas_pipeline, y=y_train)

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on top of the shared preprocessing step. The seed makes the
# fitted tree (and the cross-validation score computed from this estimator)
# repeatable across runs, matching the seed used for the train/test split and
# the random forest.
tree_clf = make_pipeline(preprocessing, DecisionTreeClassifier(random_state=42))
tree_clf.fit(pandas_pipeline, y_train)

In [25]:
from xgboost import XGBClassifier

# Gradient-boosted trees (XGBoost) on top of the shared preprocessing step.
xgb_clf = make_pipeline(
    preprocessing,
    XGBClassifier(),
)
xgb_clf.fit(X=pandas_pipeline, y=y_train)

---

# Cross-Validation

In [17]:
from sklearn.model_selection import cross_val_score

# Mean 10-fold cross-validated accuracy of the logistic regression pipeline.
log_cv_a = cross_val_score(
    estimator=log_clf,
    X=pandas_pipeline,
    y=y_train,
    scoring="accuracy",
    cv=10,
).mean()

log_cv_a

0.7991001564945227

In [18]:
from sklearn.model_selection import cross_val_score

# Mean 10-fold cross-validated accuracy of the decision tree pipeline.
tree_cv_a = cross_val_score(
    estimator=tree_clf,
    X=pandas_pipeline,
    y=y_train,
    scoring="accuracy",
    cv=10,
).mean()

tree_cv_a

0.7668427230046948

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier scored with 5-fold cross-validated accuracy.
# NOTE(review): `forest_reg` is a classifier and `forest_rmses` holds accuracy
# scores, not RMSEs; the names are kept because a later cell reads them.
forest_reg = make_pipeline(
    preprocessing,
    RandomForestClassifier(random_state=42),
)
forest_rmses = cross_val_score(
    estimator=forest_reg,
    X=pandas_pipeline,
    y=y_train,
    scoring="accuracy",
    cv=5,
)

In [24]:
# Average accuracy across the five folds.
np.mean(forest_rmses)

0.7893725992317541

In [28]:
from sklearn.model_selection import cross_val_score

# Mean 5-fold cross-validated accuracy of the XGBoost pipeline.
xgb_cv_a = cross_val_score(
    estimator=xgb_clf,
    X=pandas_pipeline,
    y=y_train,
    scoring="accuracy",
    cv=5,
).mean()

xgb_cv_a

0.7893036540923865