In [1]:
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from feature_engine.selection import DropFeatures
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer

from category_encoders.ordinal import OrdinalEncoder

import joblib

# Notebook-wide configuration.
# `seed` is the random state reused by the train/test split below.
seed = 42

# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

In [2]:
# Read the raw dataset, report its (rows, columns) shape, then preview the
# first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='../data/data.csv')
print(df.shape)
df.head(n=5)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Split data into train and test

In [3]:
# Hold out 20% of the rows for testing. The full frame (target included) is
# passed as X here; the 'Survived' column is removed from X in the next step.
target = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    df,
    target,
    test_size=0.2,
    random_state=seed,
)

# Display the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((712, 12), (712,), (179, 12), (179,))

# Drop unnecessary features

In [4]:
# Columns removed from the feature matrices: the target itself plus the
# identifier / free-text columns listed here.
drop_var = [
    'Survived',
    'PassengerId',
    'Name',
    'Cabin',
    'Ticket',
]
drop_features = DropFeatures(features_to_drop=drop_var)

# Fit on the training split, then apply the same transformer to the test split.
X_train = drop_features.fit_transform(X_train)
X_test = drop_features.transform(X_test)

print(drop_features.features_to_drop_)

['Survived', 'PassengerId', 'Name', 'Cabin', 'Ticket']


# Missing Data

In [5]:
# Categorical columns whose missing values are filled with the most frequent
# category, learned from the training split only.
cat_na_with_mode = ['Embarked']

imputer = CategoricalImputer(
    variables=cat_na_with_mode,
    imputation_method='frequent',
)
imputer.fit(X_train)

# Show the fill value learned for each column.
print(imputer.imputer_dict_)

# Apply the fitted imputer to both splits.
X_train, X_test = (imputer.transform(part) for part in (X_train, X_test))

{'Embarked': 'S'}


In [6]:
# Numeric columns whose missing values are filled with the median, learned
# from the training split only.
# NOTE: `imputer` is rebound here, so the categorical imputer fitted in the
# previous cell is no longer reachable under that name.
num_na_with_median = ['Age']

imputer = MeanMedianImputer(
    variables=num_na_with_median,
    imputation_method='median',
)
imputer.fit(X_train)

# Show the fill value learned for each column.
print(imputer.imputer_dict_)

# Apply the fitted imputer to both splits.
X_train, X_test = [imputer.transform(part) for part in (X_train, X_test)]

{'Age': 28.0}


# Discretize skewed data (Fare)

In [7]:
class FareDiscretizer(BaseEstimator, TransformerMixin):
    """Replace a numeric column with the labelled interval each value falls in.

    Parameters
    ----------
    bins : sequence of numbers
        Interval edges, forwarded to ``pd.cut``.
    labels : sequence
        One label per interval, forwarded to ``pd.cut``.
    column : str, default 'Fare'
        Name of the column to discretize.
    """

    def __init__(self, bins, labels, column='Fare'):
        self.column = column
        self.labels = labels
        self.bins = bins

    def fit(self, X, y=None):
        """Do nothing: the edges are supplied by the caller, not learned."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``column`` holds the interval labels.

        The lowest edge is treated as inclusive (``include_lowest=True``).
        """
        binned = pd.cut(
            X[self.column],
            bins=self.bins,
            labels=self.labels,
            include_lowest=True,
        )
        result = X.copy()
        result[self.column] = binned
        return result
    
# Fixed interval edges and their labels for the Fare column.
# NOTE(review): the edges are hard-coded rather than derived from X_train —
# confirm where they came from.
bins = [0, 7.9104, 14.4542, 31.0, 512.3292]
labels = ['Low', 'Medium', 'High', 'Very High']

discretizer = FareDiscretizer(bins, labels)

# Echo the configuration the transformer will use.
for caption, value in (
    ("Bins used in the discretizer:", discretizer.bins),
    ("Labels used in the discretizer:", discretizer.labels),
):
    print(caption, value)

# Fit on the training split, then bin both splits with the same edges.
discretizer.fit(X_train)
X_train = discretizer.transform(X_train)
X_test = discretizer.transform(X_test)

Bins used in the discretizer: [0, 7.9104, 14.4542, 31.0, 512.3292]
Labels used in the discretizer: ['Low', 'Medium', 'High', 'Very High']


# Apply mapping

In [8]:
# Explicit category -> integer codes for the two ordinal-encoded columns.
sex_codes = {'male': 1, 'female': 0}
fare_codes = {'Low': 0, 'Medium': 1, 'High': 2, 'Very High': 3}
mapping = [
    {'col': 'Sex', 'mapping': sex_codes},
    {'col': 'Fare', 'mapping': fare_codes},
]

encoder = OrdinalEncoder(mapping=mapping)

# Print the mapping held by the encoder for each column.
for entry in encoder.mapping:
    print(f"{entry['col']} mapped as: {entry['mapping']}")

# Fit on the training split, then encode the test split with the same codes.
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

Sex mapped as: male      1
female    0
dtype: int64
Fare mapped as: Low          0
Medium       1
High         2
Very High    3
dtype: int64


# One-hot encode Embarked (dummy variables)

In [9]:
class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode ``columns`` with ``pd.get_dummies`` and drop ``drop_cols``.

    The encoded column layout is recorded in ``fit`` and enforced in
    ``transform``, so every frame transformed by a fitted encoder has the same
    columns in the same order, even when it lacks a category seen during
    fitting or contains one that was not seen.

    Parameters
    ----------
    drop_cols : list of str
        Encoded column names to remove from the output (e.g. a reference
        level such as ``'Embarked_C'``).
    columns : list of str
        Columns of the input frame to one-hot encode.

    Attributes
    ----------
    columns_ : pandas.Index
        All column names produced by ``pd.get_dummies`` on the fitting data
        (before ``drop_cols`` is removed).
    """

    def __init__(self, drop_cols, columns):
        self.drop_cols = drop_cols
        self.columns = columns

    def fit(self, X, y=None):
        """Record the encoded column layout of ``X``; returns ``self``."""
        self.columns_ = pd.get_dummies(X, columns=self.columns).columns
        return self

    def transform(self, X):
        """Return the one-hot encoded frame without ``drop_cols``.

        When the encoder has been fitted, the result is aligned to the fitted
        layout: dummy columns missing from ``X`` are added filled with 0, and
        columns not seen during ``fit`` are discarded.
        """
        X_encoded = pd.get_dummies(X, columns=self.columns, dtype=int)

        fitted_columns = getattr(self, 'columns_', None)
        if fitted_columns is None:
            # Not fitted: keep the previous stateless behaviour and only
            # remove the requested columns that are actually present.
            present = [col for col in self.drop_cols if col in X_encoded.columns]
            return X_encoded.drop(columns=present)

        # Enforce the training-time layout so train and test always match.
        output_columns = [col for col in fitted_columns if col not in self.drop_cols]
        return X_encoded.reindex(columns=output_columns, fill_value=0)

# One-hot encode 'Embarked' and drop the 'Embarked_C' dummy column.
# NOTE: `encoder` is rebound here, replacing the ordinal encoder from the
# previous section.
encoder = CustomOneHotEncoder(
    columns=['Embarked'],
    drop_cols=['Embarked_C'],
).fit(X_train)

# Encode both splits with the encoder fitted on the training split.
X_train, X_test = (encoder.transform(part) for part in (X_train, X_test))

# Feature Scaling

In [10]:
class CustomScaler(BaseEstimator, TransformerMixin):
    """Min-max scale selected columns of a DataFrame, leaving the rest as is.

    Parameters
    ----------
    columns : list of str or None, default None
        Columns to scale. When ``None``, every column of the frame is scaled.

    Attributes
    ----------
    scaler : MinMaxScaler
        The wrapped scikit-learn scaler; its fitted statistics
        (``data_min_``, ``data_max_``, ``scale_``, ``min_``) are available
        after ``fit``.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.scaler = MinMaxScaler()

    def fit(self, X, y=None):
        """Fit the wrapped scaler on the selected columns; returns ``self``."""
        data = X if self.columns is None else X[self.columns]
        self.scaler.fit(data)
        return self

    def transform(self, X):
        """Return a scaled copy of ``X`` with its row index preserved."""
        X = X.copy()
        if self.columns is not None:
            X[self.columns] = self.scaler.transform(X[self.columns])
        else:
            # Pass the original index explicitly: building the frame from the
            # bare array would otherwise reset it to 0..n-1 and break the
            # row alignment with the target.
            X = pd.DataFrame(
                self.scaler.transform(X),
                columns=X.columns,
                index=X.index,
            )
        return X
    
# Scale 'Age' using statistics learned from the training split only.
scaler = CustomScaler(columns=['Age']).fit(X_train)

# Report the fitted statistics for each scaled column.
fitted = scaler.scaler
per_column = zip(
    scaler.columns,
    fitted.data_min_,
    fitted.data_max_,
    fitted.scale_,
    fitted.min_,
)
for col, data_min, data_max, scale, offset in per_column:
    print(f"\nColumn: {col}")
    print(f"  Minimum Value in {col} (data_min_): {data_min}")
    print(f"  Maximum Value in {col} (data_max_): {data_max}")
    print(f"  Scale for {col} (scale_): {scale}")
    print(f"  Min for {col} (min_): {offset}")

# Apply the fitted scaler to both splits.
X_train, X_test = (scaler.transform(part) for part in (X_train, X_test))


Column: Age
  Minimum Value in Age (data_min_): 0.42
  Maximum Value in Age (data_max_): 80.0
  Scale for Age (scale_): 0.012565971349585323
  Min for Age (min_): -0.005277707966825835


# Save

In [11]:
# Preview the first five rows of the fully preprocessed training features.
X_train.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
331,1,1,0.566474,0,0,2,0,1
733,2,1,0.28374,0,0,1,0,1
382,3,1,0.396833,0,0,1,0,1
704,3,1,0.321438,1,0,0,0,1
813,3,0,0.070118,4,2,3,0,1


In [12]:
# Write the processed features and the targets to disk, without the index.
exports = [
    (X_train, '../data/X_train.csv'),
    (X_test, '../data/X_test.csv'),
    (y_train, '../data/y_train.csv'),
    (y_test, '../data/y_test.csv'),
]
for frame, path in exports:
    frame.to_csv(path, index=False)

In [13]:
# Persist the fitted scaler for reuse outside this notebook.
# NOTE(review): only the scaler is saved; the imputers and encoders fitted
# above are not. Loading this file presumably requires the CustomScaler class
# to be importable in the loading process — confirm.
scaler_path = '../model/minmaxscaler.joblib'
joblib.dump(scaler, scaler_path)

['../model/minmaxscaler.joblib']