In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
from typing import List, Optional
from scipy import stats

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    FunctionTransformer,
    LabelEncoder,
    OneHotEncoder,
    StandardScaler,
)
from sklearn.pipeline import make_pipeline

# Introduction

We shall use X to present on how do we use python tools to process data

In [2]:
# Load Titanic dataset from seaborn
df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
df.shape

(891, 15)



# Data Cleaning

Lets check the dataset for missing values



In [4]:
# Checking for missing values
print(df.isnull().sum())

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


##  Handling Missing Values

### Dropping Missing Values

We could drop a column deck since it has so many empty values. We would create a custom proper transformer to do so



In [5]:
class DropColumnTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Drop specified columns
        X_transformed = X.drop(columns=self.columns, axis=1)
        return X_transformed

In [6]:
DropColumnTransformer(columns=["deck"]).fit_transform(df).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True




### Imputation Techniques (e.g., mean, median, mode)

We may also use a predefined transformers in sklearn



In [7]:
# Create a ColumnTransformer to specify imputation for specific columns
# In this example, we specify imputation for the 'chosen_column' using SimpleImputer with strategy='mean'
# and 'other_column' without imputation


class CustomImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy="mean", columns: Optional[List[str]] = None):
        self.strategy = strategy
        self.columns = columns if columns is not None else []
        self.imputer = SimpleImputer(strategy=self.strategy)

    def fit(self, X, y=None):
        self.imputer.fit(X[self.columns])
        return self

    def transform(self, X):
        X_transformed = X.copy()
        X_transformed[self.columns] = self.imputer.transform(X[self.columns])
        return X_transformed


CustomImputer(strategy="mean", columns=["age"]).fit_transform(df).age.value_counts()

age
29.699118    177
24.000000     30
22.000000     27
18.000000     26
28.000000     25
            ... 
36.500000      1
55.500000      1
0.920000       1
23.500000      1
74.000000      1
Name: count, Length: 89, dtype: int64

In [8]:
df["age"].value_counts()

age
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: count, Length: 88, dtype: int64


## Handling Duplicates


In [9]:
df.duplicated().any()

True

In [10]:
df[df.duplicated()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
47,1,3,female,,0,0,7.7500,Q,Third,woman,False,,Queenstown,yes,True
76,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True
77,0,3,male,,0,0,8.0500,S,Third,man,True,,Southampton,no,True
87,0,3,male,,0,0,8.0500,S,Third,man,True,,Southampton,no,True
95,0,3,male,,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
870,0,3,male,26.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
877,0,3,male,19.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
878,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True
884,0,3,male,25.0,0,0,7.0500,S,Third,man,True,,Southampton,no,True



### Identifying and Removing Duplicates


In [11]:
drop_duplicates = FunctionTransformer(lambda X: X.drop_duplicates(), validate=False)

f"Dataset at begining {len(df)} after drop {len(drop_duplicates.fit_transform(df))}"

'Dataset at begining 891 after drop 784'


## Handling Outliers





### Identifying Outliers



In [12]:
# Handling outliers


numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns
z_scores = stats.zscore(df[numerical_cols])
threshold = 3
outliers = (abs(z_scores) > threshold).any(axis=1)
df[outliers]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
13,0,3,male,39.0,1,5,31.2750,S,Third,man,True,,Southampton,no,False
16,0,3,male,2.0,4,1,29.1250,Q,Third,child,False,,Queenstown,no,False
25,1,3,female,38.0,1,5,31.3875,S,Third,woman,False,,Southampton,yes,False
27,0,1,male,19.0,3,2,263.0000,S,First,man,True,C,Southampton,no,False
50,0,3,male,7.0,4,1,39.6875,S,Third,child,False,,Southampton,no,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
846,0,3,male,,8,2,69.5500,S,Third,man,True,,Southampton,no,False
850,0,3,male,4.0,4,2,31.2750,S,Third,child,False,,Southampton,no,False
858,1,3,female,24.0,0,3,19.2583,C,Third,woman,False,,Cherbourg,yes,False
863,0,3,female,,8,2,69.5500,S,Third,woman,False,,Southampton,no,False


## Exercise 1 

Create a custom transformer to detect and remove outliers. Make threshold its parameter

# Pipeline construction

We can create a pipeline to process the data


In [13]:
data_cleaning = make_pipeline(
    DropColumnTransformer(columns=["deck"]),
    CustomImputer(strategy="mean", columns=["age"]),
    CustomImputer(strategy="most_frequent", columns=["embarked"]),
    FunctionTransformer(lambda X: X.drop_duplicates(), validate=False),
    # add your own outlier remover transformer step here
)

df_cleaned = data_cleaning.fit_transform(df)
df_cleaned.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


In [14]:
data_cleaning

In [15]:
df_cleaned.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 780 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     780 non-null    int64   
 1   pclass       780 non-null    int64   
 2   sex          780 non-null    object  
 3   age          780 non-null    float64 
 4   sibsp        780 non-null    int64   
 5   parch        780 non-null    int64   
 6   fare         780 non-null    float64 
 7   embarked     780 non-null    object  
 8   class        780 non-null    category
 9   who          780 non-null    object  
 10  adult_male   780 non-null    bool    
 11  embark_town  778 non-null    object  
 12  alive        780 non-null    object  
 13  alone        780 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 75.5+ KB


# Data Preprocessing

## Encoding Categorical Variables

Most commonly we need to encode an object typed columns



In [16]:
df_cleaned.select_dtypes(include=["object"]).head()

Unnamed: 0,sex,embarked,who,embark_town,alive
0,male,S,man,Southampton,no
1,female,C,woman,Cherbourg,yes
2,female,S,woman,Southampton,yes
3,female,S,woman,Southampton,yes
4,male,S,man,Southampton,no



### Label Encoding



In [17]:
class CustomLabelEncoder(BaseEstimator, TransformerMixin):

    def __init__(self, columns: List[str]) -> None:
        self.columns = columns
        self.encoders = {}

    def fit(self, X, y=None):
        for column in self.columns:
            self.encoders[column] = LabelEncoder()
            self.encoders[column].fit(X[column])
        return self

    def transform(self, X):
        X_transformed = X.copy()
        for column in self.columns:
            X_transformed[column] = self.encoders[column].transform(X[column])
        return X_transformed


CustomLabelEncoder(columns=["embarked", "embark_town"]).fit_transform(df_cleaned).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,2,Third,man,True,2,no,False
1,1,1,female,38.0,1,0,71.2833,0,First,woman,False,0,yes,False
2,1,3,female,26.0,0,0,7.925,2,Third,woman,False,2,yes,True
3,1,1,female,35.0,1,0,53.1,2,First,woman,False,2,yes,False
4,0,3,male,35.0,0,0,8.05,2,Third,man,True,2,no,True



### One-Hot Encoding

We shall use one hot for column with sex and who


In [18]:
from sklearn.preprocessing import OneHotEncoder


class CustomOneHotEncoder(BaseEstimator, TransformerMixin):

    def __init__(self, columns: List[str]) -> None:
        self.columns = columns
        self.encoders = {}

    def fit(self, X, y=None):
        for column in self.columns:
            self.encoders[column] = OneHotEncoder(sparse_output=False)
            self.encoders[column].fit(X[[column]])
        return self

    def transform(self, X):
        X_transformed = X.copy()
        for column in self.columns:
            encoded = pd.DataFrame(
                self.encoders[column].transform(X[[column]]),
                columns=self.encoders[column].get_feature_names_out([column]),
                index=X.index,
            )
            X_transformed = pd.concat(
                [X_transformed.drop(columns=column), encoded], axis=1
            )
        return X_transformed


CustomOneHotEncoder(columns=["sex", "who"]).fit_transform(df_cleaned)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,embarked,class,adult_male,embark_town,alive,alone,sex_female,sex_male,who_child,who_man,who_woman
0,0,3,22.000000,1,0,7.2500,S,Third,True,Southampton,no,False,0.0,1.0,0.0,1.0,0.0
1,1,1,38.000000,1,0,71.2833,C,First,False,Cherbourg,yes,False,1.0,0.0,0.0,0.0,1.0
2,1,3,26.000000,0,0,7.9250,S,Third,False,Southampton,yes,True,1.0,0.0,0.0,0.0,1.0
3,1,1,35.000000,1,0,53.1000,S,First,False,Southampton,yes,False,1.0,0.0,0.0,0.0,1.0
4,0,3,35.000000,0,0,8.0500,S,Third,True,Southampton,no,True,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,39.000000,0,5,29.1250,Q,Third,False,Queenstown,no,False,1.0,0.0,0.0,0.0,1.0
887,1,1,19.000000,0,0,30.0000,S,First,False,Southampton,yes,True,1.0,0.0,0.0,0.0,1.0
888,0,3,29.699118,1,2,23.4500,S,Third,False,Southampton,no,False,1.0,0.0,0.0,0.0,1.0
889,1,1,26.000000,0,0,30.0000,C,First,True,Cherbourg,yes,True,0.0,1.0,0.0,1.0,0.0


``



### Ordinal Encoding

### Exercise 2

Create your own custom Encoder that uses OrdinalEncoder instead of LabelEncoder. Allow it to pass an order as a parameter

for encoder relate to 

```python 
from sklearn.preprocessing import OrdinalEncoder
```

## Scaling Numerical Features



### Standardization



In [19]:
class CustomStandardScaler(BaseEstimator, TransformerMixin):

    def __init__(self, columns: List[str]) -> None:
        self.columns = columns
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns])
        return self

    def transform(self, X):
        X_transformed = X.copy()
        X_transformed[self.columns] = self.scaler.transform(X[self.columns])
        return X_transformed


CustomStandardScaler(columns=["fare", "age"]).fit_transform(df_cleaned).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,-0.569164,1,0,-0.528033,S,Third,man,True,Southampton,no,False
1,1,1,female,0.598711,1,0,0.697956,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,-0.277195,0,0,-0.515109,S,Third,woman,False,Southampton,yes,True
3,1,1,female,0.379735,1,0,0.349817,S,First,woman,False,Southampton,yes,False
4,0,3,male,0.379735,0,0,-0.512716,S,Third,man,True,Southampton,no,True





### Min-Max Scaling

### Exercise 3

Create a custom transformer with min-max scalling. Check
```python
from sklearn.preprocessing import MinMaxScaler
```

### Exercise 4 

Create a transformer to replace target. You may search for a thing called TargetEncoder.

# Preprocessing pipeline

we now can create a preprocessing pipeline. It requires to provide an initial split for data

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_cleaned.drop(columns=["alone", "alive"]),
    df_cleaned["alive"],
    test_size=0.2,
    random_state=42,
)

In [21]:
preprocessing_pipeline = make_pipeline(
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    CustomOneHotEncoder(columns=["sex", "who"]),
    CustomStandardScaler(columns=["fare", "age"]),
    ## add your own preprocessing steps here
)

In [22]:
y_train

789     no
722     no
141    yes
388     no
56     yes
      ... 
72      no
112     no
287     no
483    yes
108     no
Name: alive, Length: 624, dtype: object

In [23]:
preprocessing_pipeline.fit(X_train, y_train)

In [24]:
results = preprocessing_pipeline.transform(X_test)
results

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,embarked,class,adult_male,embark_town,sex_female,sex_male,who_child,who_man,who_woman
676,0,3,-0.405579,0,0,-0.508814,2,Third,True,2,0.0,1.0,0.0,1.0,0.0
667,0,3,-0.031987,0,0,-0.514001,2,Third,True,2,0.0,1.0,0.0,1.0,0.0
611,0,3,-0.031987,0,0,-0.527675,2,Third,True,2,0.0,1.0,0.0,1.0,0.0
728,0,2,-0.369651,1,0,-0.170247,2,Second,True,2,0.0,1.0,0.0,1.0,0.0
545,0,1,2.432768,0,0,-0.170247,2,First,True,2,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
382,0,3,0.133348,0,0,-0.511172,2,Third,True,2,0.0,1.0,0.0,1.0,0.0
82,1,3,-0.031987,0,0,-0.513765,1,Third,False,1,1.0,0.0,0.0,0.0,1.0
156,1,3,-1.016363,0,0,-0.514787,1,Third,False,1,1.0,0.0,0.0,0.0,1.0
362,0,3,1.067487,0,1,-0.388020,0,Third,False,0,1.0,0.0,0.0,0.0,1.0


In [25]:
y_test

676     no
667     no
611     no
728     no
545     no
      ... 
382     no
82     yes
156    yes
362     no
177     no
Name: alive, Length: 156, dtype: object

# Pass-Fail Exercise 

Complete the exercises presented in this notebook. Then copy this notebook to your student directory and create a Merge request with it. Please do not commit thios file.