In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from feature_engine.selection import DropFeatures
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer

from category_encoders.ordinal import OrdinalEncoder

import preprocessors as pp

import joblib

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

# Single seed reused wherever a random_state is required.
SEED = 42

In [2]:
# Load the raw dataset from the project's data directory (path is relative
# to the notebook's working directory).
df = pd.read_csv('../data/data.csv')
# Print (rows, columns) as a quick sanity check, then preview the first rows.
print(df.shape)
df.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Split data into train and test

In [3]:
# Hold out 20% of the rows for testing. The full frame (target included) is
# passed as X; 'Survived' is removed later by the pipeline's drop step.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['Survived'],
    test_size=0.2,
    random_state=SEED,
)

# Shapes of the four splits, in the order: X_train, y_train, X_test, y_test.
tuple(split.shape for split in (X_train, y_train, X_test, y_test))

((712, 12), (712,), (179, 12), (179,))

# Config

In [4]:
# Columns removed by the first pipeline step ('Survived' is the target).
DROP_VAR = ['Survived', 'PassengerId', 'Name', 'Cabin', 'Ticket']

# Columns whose missing values are imputed, grouped by strategy.
CAT_NA_WITH_MODE = ['Embarked']   # most frequent category
NUM_NA_WITH_MEDIAN = ['Age']      # median

# Bin labels and edges handed to pp.FareDiscretizer.
# NOTE(review): how the edges are treated (inclusive/exclusive, values outside
# the range) depends on pp.FareDiscretizer -- confirm against its source.
LABELS_FARE = ['Low', 'Medium', 'High', 'Very High']
BINS_FARE = [0, 7.9104, 14.4542, 31.0, 512.3292]

# Explicit integer codes for the ordinal encoder; Fare labels are numbered
# in the order they appear in LABELS_FARE (Low=0 ... Very High=3).
MAPPING_VAR = [
    dict(col='Sex', mapping=dict(male=1, female=0)),
    dict(col='Fare', mapping={label: code for code, label in enumerate(LABELS_FARE)}),
]

# One-hot encoding: columns to expand, and the dummy column to discard.
ONE_HOT_VAR = ['Embarked']
ONE_HOT_DROP = ['Embarked_C']



# Pipeline Feature Engineering

In [5]:
# Feature-engineering pipeline; steps run in the order listed.
fe_pipeline = Pipeline(steps=[
    # 1. Drop: remove the columns listed in DROP_VAR.
    (
        'drop_unnecessary',
        DropFeatures(features_to_drop=DROP_VAR),
    ),
    # 2. Impute categorical data with the most frequent category.
    (
        'frequent_imputer',
        CategoricalImputer(imputation_method='frequent', variables=CAT_NA_WITH_MODE),
    ),
    # 3. Impute numerical data with the median.
    (
        'median_imputer',
        MeanMedianImputer(imputation_method='median', variables=NUM_NA_WITH_MEDIAN),
    ),
    # 4. Cut: discretise the skewed fare values into labelled bins.
    (
        'cut_skewness',
        pp.FareDiscretizer(bins=BINS_FARE, labels=LABELS_FARE),
    ),
    # 5. Mapping: replace category labels with the integer codes in MAPPING_VAR.
    (
        'Map_categorical',
        OrdinalEncoder(mapping=MAPPING_VAR),
    ),
    # 6. Dummies: one-hot encode ONE_HOT_VAR, discarding ONE_HOT_DROP.
    (
        'Encode_categorical',
        pp.CustomOneHotEncoder(drop_cols=ONE_HOT_DROP, columns=ONE_HOT_VAR),
    ),
    # 7. Scaling: rescale the listed numeric column(s).
    (
        'Scale_features',
        pp.CustomScaler(columns=['Age']),
    ),
])

In [6]:
# Fit every pipeline step on the training split only; the test split is
# transformed later with the parameters learned here.
fe_pipeline.fit(X_train, y_train)

In [7]:
# Apply the fitted pipeline to both splits (train first, then test).
# NOTE(review): the raw frames are overwritten here, so re-running this cell
# feeds already-transformed data back into the pipeline -- presumably that
# fails; restart the kernel and run all cells instead.
X_train, X_test = (
    fe_pipeline.transform(X_train),
    fe_pipeline.transform(X_test),
)

In [8]:
# Report the configuration and the learned state of each fitted pipeline step.
for step_name, transformer in fe_pipeline.steps:
    print(f"Step: {step_name}")

    # Constructor-level parameters, when the step exposes them.
    if hasattr(transformer, 'get_params'):
        for key, value in transformer.get_params().items():
            print(f"  {key}: {value}")

    if hasattr(transformer, 'scaler'):
        # Steps wrapping a scaler object: show the statistics it learned.
        inner_scaler = transformer.scaler
        print("  Internal Scaler Parameters:")
        print(f"    Data minimum (data_min_): {inner_scaler.data_min_}")
        print(f"    Data maximum (data_max_): {inner_scaler.data_max_}")
        print(f"    Scale (scale_): {inner_scaler.scale_}")
        print(f"    Min (min_): {inner_scaler.min_}")
    elif hasattr(transformer, 'imputer_dict_'):
        # Imputers that store their learned fill values in imputer_dict_.
        print("  Imputation Dictionary:")
        print(f"    {transformer.imputer_dict_}")
    elif isinstance(transformer, (MeanMedianImputer, CategoricalImputer)):
        # Fallback for imputers without imputer_dict_: list their variables.
        print("  Internal Imputer Information:")
        if hasattr(transformer, 'variables_'):
            print(f"    Variables: {transformer.variables_}")

    # Blank separator between steps.
    print("\n")

Step: drop_unnecessary
  features_to_drop: ['Survived', 'PassengerId', 'Name', 'Cabin', 'Ticket']


Step: frequent_imputer
  fill_value: Missing
  ignore_format: False
  imputation_method: frequent
  return_object: False
  variables: ['Embarked']
  Imputation Dictionary:
    {'Embarked': 'S'}


Step: median_imputer
  imputation_method: median
  variables: ['Age']
  Imputation Dictionary:
    {'Age': 28.0}


Step: cut_skewness
  bins: [0, 7.9104, 14.4542, 31.0, 512.3292]
  column: Fare
  labels: ['Low', 'Medium', 'High', 'Very High']


Step: Map_categorical
  cols: ['Sex', 'Fare', 'Embarked']
  drop_invariant: False
  handle_missing: value
  handle_unknown: value
  mapping: [{'col': 'Sex', 'mapping': male      1
female    0
dtype: int64, 'data_type': dtype('O')}, {'col': 'Fare', 'mapping': Low          0
Medium       1
High         2
Very High    3
dtype: int64, 'data_type': dtype('O')}]
  return_df: True
  verbose: 0


Step: Encode_categorical
  columns: ['Embarked']
  drop_cols: ['Emb

In [9]:
# Previously saved splits used as the reference to compare against.
X_train_to_test, y_train_to_test = (
    pd.read_csv('../data/X_train.csv'),
    pd.read_csv('../data/y_train.csv'),
)

X_test_to_test, y_test_to_test = (
    pd.read_csv('../data/X_test.csv'),
    pd.read_csv('../data/y_test.csv'),
)


In [10]:
def compare_dataframes(df1, df2, tolerance=1e-5):
    """Check that two DataFrames hold the same data, column by column.

    Columns that are float/complex in both frames are compared element-wise
    with ``numpy.isclose`` using ``tolerance`` as the absolute tolerance
    (numpy's default relative tolerance still applies on top of it). NaN
    values in the same position count as equal, matching the behaviour of
    ``Series.equals`` used for all other columns.

    Float columns are compared by position. Other columns are compared with
    ``Series.equals``, so callers should align the indexes first (for example
    with ``reset_index(drop=True)``).

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare.
    tolerance : float, default 1e-5
        Absolute tolerance passed to ``numpy.isclose`` as ``atol``.

    Returns
    -------
    None
        Prints a confirmation message when the frames match.

    Raises
    ------
    AssertionError
        If the shapes, the column labels, or the values of any column differ.
    """
    # Raise explicitly instead of using `assert`, so the checks still run
    # when Python is started with optimisations (-O).
    if df1.shape != df2.shape:
        raise AssertionError("DataFrames do not have the same shape.")

    if not df1.columns.equals(df2.columns):
        raise AssertionError("DataFrames do not have the same columns.")

    for column in df1.columns:
        left, right = df1[column], df2[column]

        if left.dtype.kind in 'fc' and right.dtype.kind in 'fc':
            # equal_nan=True: a missing value present in both frames at the
            # same position is not a difference.
            close = np.isclose(left, right, atol=tolerance, equal_nan=True)
            if not close.all():
                diff = ~close
                error_message = (f"Column '{column}' values differ. \n"
                                 f"First differing elements:\n{left[diff]}\n{right[diff]}")
                raise AssertionError(error_message)
        else:
            # Non-float columns must match exactly (values, dtype and index).
            if not left.equals(right):
                diff_index = left != right
                error_message = (f"Column '{column}' values differ. \n"
                                 f"First differing elements:\n{left[diff_index]}\n{right[diff_index]}")
                raise AssertionError(error_message)

    print("DataFrames are equal within the given tolerance.")

In [11]:
# Compare each freshly produced split with its saved reference, in the order
# X_train, X_test, y_train, y_test. Indexes are reset so rows line up with the
# frames read from CSV; the targets are wrapped in a one-column 'Survived'
# frame because the references are loaded as DataFrames.
for produced, reference in (
    (X_train.reset_index(drop=True), X_train_to_test),
    (X_test.reset_index(drop=True), X_test_to_test),
    (y_train.reset_index(drop=True).to_frame(name='Survived'), y_train_to_test),
    (y_test.reset_index(drop=True).to_frame(name='Survived'), y_test_to_test),
):
    compare_dataframes(produced, reference, tolerance=1e-5)

DataFrames are equal within the given tolerance.
DataFrames are equal within the given tolerance.
DataFrames are equal within the given tolerance.
DataFrames are equal within the given tolerance.


In [12]:
# Preview the transformed training features (output of fe_pipeline.transform).
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
331,1,1,0.566474,0,0,2,0,1
733,2,1,0.28374,0,0,1,0,1
382,3,1,0.396833,0,0,1,0,1
704,3,1,0.321438,1,0,0,0,1
813,3,0,0.070118,4,2,3,0,1
