In [90]:
import seaborn as sns
import pandas as pd
import numpy as np
from typing import List, Optional
from scipy import stats

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    FunctionTransformer,
    LabelEncoder,
    OneHotEncoder,
    StandardScaler,
)
from sklearn.pipeline import make_pipeline

# Introduction

We will use the Titanic dataset to demonstrate how to use Python tools (pandas and scikit-learn) to clean and preprocess data.

In [None]:
# Load seaborn's "titanic" example dataset and preview its first rows
df = sns.load_dataset("titanic")
df.head()

In [None]:
# Dimensions of the raw dataset as (n_rows, n_columns)
df.shape



# Data Cleaning

Let's check the dataset for missing values.



In [None]:
# Count missing values per column. The bare last expression is rendered by the
# notebook itself, so no print() is needed.
df.isna().sum()

##  Handling Missing Values

### Dropping Missing Values

We can drop the `deck` column, since most of its values are missing. To do so, we create a custom transformer.



In [94]:
class DropColumnTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns : list of str
        Labels of the columns to remove.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the estimator API."""
        return self

    def transform(self, X):
        """Return a new DataFrame equal to ``X`` without ``self.columns``."""
        # ``columns=`` already selects the column axis, so ``axis`` is not needed.
        return X.drop(columns=self.columns)

In [None]:
# Demo: drop the `deck` column and preview the result
DropColumnTransformer(columns=["deck"]).fit_transform(df).head()



### Imputation Techniques (e.g., mean, median, mode)

We can also build on the predefined transformers that scikit-learn provides, such as `SimpleImputer`.



In [None]:
# A custom transformer that imputes only the selected columns with
# SimpleImputer and leaves every other column untouched.


class CustomImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in selected DataFrame columns using ``SimpleImputer``.

    Parameters
    ----------
    strategy : str, default="mean"
        Passed to ``SimpleImputer(strategy=...)``.
    columns : list of str, optional
        Columns to impute. ``None`` means every column of the frame seen in
        ``fit``.

    Attributes
    ----------
    columns_ : list of str
        Columns actually imputed, resolved during ``fit``.
    imputer_ : SimpleImputer
        The imputer fitted on ``columns_``.
    """

    def __init__(self, strategy="mean", columns: Optional[List[str]] = None):
        # Store the arguments exactly as received. Rewriting ``None`` to ``[]``
        # here makes ``sklearn.base.clone`` fail its parameter identity check.
        self.strategy = strategy
        self.columns = columns

    def fit(self, X, y=None):
        """Fit a fresh ``SimpleImputer`` on the selected columns of ``X``."""
        self.columns_ = list(X.columns) if self.columns is None else list(self.columns)
        # Build the imputer here, not in __init__, so that a later
        # ``set_params(strategy=...)`` is honoured on the next fit.
        self.imputer_ = SimpleImputer(strategy=self.strategy)
        self.imputer_.fit(X[self.columns_])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns imputed."""
        X_transformed = X.copy()
        X_transformed[self.columns_] = self.imputer_.transform(X[self.columns_])
        return X_transformed


# Demo: value counts of `age` after filling its missing entries with the column mean
CustomImputer(strategy="mean", columns=["age"]).fit_transform(df).age.value_counts()

In [None]:
# Value counts of `age` in the untouched dataset
df["age"].value_counts()


## Handling Duplicates


In [None]:
# Does the dataset contain at least one fully duplicated row?
df.duplicated().any()

In [None]:
# Show the rows flagged as duplicates
df[df.duplicated()]


### Identifying and Removing Duplicates


In [None]:
# Wrap DataFrame.drop_duplicates as a transformer. Passing the unbound method
# instead of a lambda gives the same result and keeps the transformer picklable.
drop_duplicates = FunctionTransformer(pd.DataFrame.drop_duplicates, validate=False)

f"Dataset at begining {len(df)} after drop {len(drop_duplicates.fit_transform(df))}"


## Handling Outliers





### Identifying Outliers



In [None]:
# Flag rows whose absolute z-score exceeds the threshold in any numeric column.
numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns

# Columns with missing values (e.g. `age`) would get an all-NaN z-score under the
# default nan_policy="propagate", so their outliers would never be flagged.
# "omit" computes mean/std from the non-missing entries instead.
z_scores = stats.zscore(df[numerical_cols], nan_policy="omit")
threshold = 3
# NaN z-scores compare as False, so rows with missing values are not flagged for that column.
outliers = (np.abs(z_scores) > threshold).any(axis=1)
df[outliers]

## Exercise 1 

Create a custom transformer that detects and removes outliers. Make the z-score threshold a constructor parameter.

In [102]:
class OutlierRemoveTransformer(BaseEstimator, TransformerMixin):
    """Remove rows that contain a z-score outlier in any numeric column.

    Means and standard deviations are learned in ``fit`` and reused in
    ``transform``, so new data is judged against the fitted statistics.

    Because rows are removed, apply this to a full DataFrame before the target
    is split off; inside a supervised pipeline ``y`` would not be filtered.

    Parameters
    ----------
    threshold : float, default=3
        A row is dropped when ``|z| > threshold`` for at least one numeric column.

    Attributes
    ----------
    numerical_cols_ : pandas.Index
        Numeric columns found during ``fit``.
    means_, stds_ : pandas.Series
        Per-column mean and (sample) standard deviation from ``fit``.
    """

    def __init__(self, threshold=3):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Record the numeric columns of ``X`` and their mean / std."""
        # "number" covers every integer/float width, not only int64/float64.
        numeric = X.select_dtypes(include="number")
        self.numerical_cols_ = numeric.columns
        self.means_ = numeric.mean()
        self.stds_ = numeric.std()
        return self

    def transform(self, X):
        """Return the rows of ``X`` with no numeric value beyond the threshold."""
        z_scores = ((X[self.numerical_cols_] - self.means_) / self.stds_).abs()
        # Missing values give NaN z-scores, which compare as False (row is kept).
        is_outlier = (z_scores > self.threshold).any(axis=1)
        return X.loc[~is_outlier]

# Pipeline construction

We can create a pipeline to process the data


In [None]:
# Cleaning steps run in the order listed.
data_cleaning = make_pipeline(
    DropColumnTransformer(columns=["deck"]),
    CustomImputer(strategy="mean", columns=["age"]),
    # `embark_town` is label-encoded later alongside `embarked`, so impute both;
    # otherwise its missing values would reach the encoder.
    CustomImputer(strategy="most_frequent", columns=["embarked", "embark_town"]),
    # Unbound method instead of a lambda: same behaviour, but the pipeline stays picklable.
    FunctionTransformer(pd.DataFrame.drop_duplicates, validate=False),
    OutlierRemoveTransformer(threshold=3),
)

df_cleaned = data_cleaning.fit_transform(df)
df_cleaned.head()

In [None]:
# Display the pipeline object to inspect its steps
data_cleaning

In [None]:
# Per-column summary of the cleaned data
df_cleaned.info(verbose=True)

# Data Preprocessing

## Encoding Categorical Variables

Most commonly, we need to encode columns of `object` dtype.



In [None]:
# Preview only the object-typed columns of the cleaned data
df_cleaned.select_dtypes(include=["object"]).head()


### Label Encoding



In [None]:
class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    """Apply an independent ``LabelEncoder`` to each of the given columns.

    Parameters
    ----------
    columns : list of str
        Columns whose values are replaced by the encoder's output.
    """

    def __init__(self, columns: List[str]) -> None:
        self.columns = columns
        # Maps column name -> the LabelEncoder fitted for that column.
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per configured column."""
        for name in self.columns:
            encoder = LabelEncoder()
            encoder.fit(X[name])
            self.encoders[name] = encoder
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each configured column label-encoded."""
        encoded = X.copy()
        for name in self.columns:
            encoded[name] = self.encoders[name].transform(X[name])
        return encoded


# Demo: label-encode the two embarkation columns and preview the result
CustomLabelEncoder(columns=["embarked", "embark_town"]).fit_transform(df_cleaned).head()


### One-Hot Encoding

We will use one-hot encoding for the `sex` and `who` columns.


In [None]:
from sklearn.preprocessing import OneHotEncoder


class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode selected DataFrame columns, one ``OneHotEncoder`` per column.

    Each source column is removed and its indicator columns are appended at the
    end of the frame, in the order the columns were configured.

    Parameters
    ----------
    columns : list of str
        Columns to replace with their one-hot indicator columns.
    """

    def __init__(self, columns: List[str]) -> None:
        self.columns = columns
        # Maps column name -> the OneHotEncoder fitted for that column.
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit a dense ``OneHotEncoder`` for every configured column."""
        # Rebuild the mapping so a refit never keeps encoders from an earlier fit.
        self.encoders = {
            column: OneHotEncoder(sparse_output=False).fit(X[[column]])
            for column in self.columns
        }
        return self

    def transform(self, X):
        """Return a new DataFrame with the configured columns one-hot encoded."""
        encoded_frames = [
            pd.DataFrame(
                self.encoders[column].transform(X[[column]]),
                columns=self.encoders[column].get_feature_names_out([column]),
                index=X.index,  # keep row alignment with X for the concat below
            )
            for column in self.columns
        ]
        # Drop all source columns once and concatenate once, instead of
        # re-copying the whole frame for every encoded column.
        remaining = X.drop(columns=list(self.columns))
        return pd.concat([remaining, *encoded_frames], axis=1)


# Demo: one-hot encode `sex` and `who`; preview only the first rows, like the other demo cells
CustomOneHotEncoder(columns=["sex", "who"]).fit_transform(df_cleaned).head()

``



### Ordinal Encoding

### Exercise 2

Create your own custom encoder that uses `OrdinalEncoder` instead of `LabelEncoder`. Allow the category order to be passed in as a parameter.

For the encoder, refer to:

```python 
from sklearn.preprocessing import OrdinalEncoder
```


In [109]:
from sklearn.preprocessing import OrdinalEncoder

In [110]:
class CustomOrdinalEncoder(BaseEstimator, TransformerMixin):
    """Ordinal-encode selected DataFrame columns, optionally in an explicit order.

    Parameters
    ----------
    columns : list of str
        Columns to encode.
    order : dict, optional
        Maps a column name to the list of its categories in the desired order.
        Columns missing from the mapping use ``OrdinalEncoder``'s default
        category detection.
    """

    def __init__(self, columns: List[str], order: Optional[dict] = None) -> None:
        # Store the arguments exactly as received. Replacing ``None`` with a new
        # dict here makes ``sklearn.base.clone`` fail its parameter identity check.
        self.columns = columns
        self.order = order
        # Maps column name -> the OrdinalEncoder fitted for that column.
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit one ``OrdinalEncoder`` per configured column."""
        order = self.order if self.order is not None else {}
        self.encoders = {}
        for column in self.columns:
            if column in order:
                encoder = OrdinalEncoder(categories=[order[column]])
            else:
                encoder = OrdinalEncoder()
            self.encoders[column] = encoder.fit(X[[column]])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each configured column ordinal-encoded."""
        X_transformed = X.copy()
        for column in self.columns:
            encoded = self.encoders[column].transform(X[[column]])
            # The encoder returns a single-column 2-D result; flatten it so a
            # plain 1-D column is assigned.
            X_transformed[column] = np.asarray(encoded).ravel()
        return X_transformed



## Scaling Numerical Features



### Standardization



In [None]:
class CustomStandardScaler(BaseEstimator, TransformerMixin):
    """Apply ``StandardScaler`` to the given columns, leaving the rest untouched.

    Parameters
    ----------
    columns : list of str
        Columns to scale.
    """

    def __init__(self, columns: List[str]) -> None:
        self.columns = columns
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the wrapped scaler on the configured columns of ``X``."""
        selected = X[self.columns]
        self.scaler.fit(selected)
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose configured columns are scaled."""
        scaled_values = self.scaler.transform(X[self.columns])
        result = X.copy()
        result[self.columns] = scaled_values
        return result


# Demo: scale `fare` and `age` with the wrapped StandardScaler and preview the result
CustomStandardScaler(columns=["fare", "age"]).fit_transform(df_cleaned).head()




### Min-Max Scaling

### Exercise 3

Create a custom transformer that applies min-max scaling. See:
```python
from sklearn.preprocessing import MinMaxScaler
```

In [112]:
from sklearn.preprocessing import MinMaxScaler

In [113]:
class CustomMinMaxScaler(BaseEstimator, TransformerMixin):
    """Apply ``MinMaxScaler`` to the given columns, leaving the rest untouched.

    Parameters
    ----------
    columns : list of str
        Columns to scale.
    """

    def __init__(self, columns: List[str]) -> None:
        self.columns = columns
        self.scaler = MinMaxScaler()

    def fit(self, X, y=None):
        """Fit the wrapped scaler on the configured columns of ``X``."""
        selected = X[self.columns]
        self.scaler.fit(selected)
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose configured columns are scaled."""
        scaled_values = self.scaler.transform(X[self.columns])
        result = X.copy()
        result[self.columns] = scaled_values
        return result

### Exercise 4 

Create a transformer that performs target encoding, i.e. replaces each category with a value derived from the target. For reference, look up scikit-learn's `TargetEncoder`.

# Preprocessing pipeline

We can now create a preprocessing pipeline. It requires splitting the data into training and test sets first, so that the pipeline is fitted on the training data only.

In [114]:
from sklearn.model_selection import train_test_split

# `survived` is the numeric counterpart of the target `alive`; leaving it among
# the features would leak the label, so it is dropped together with the target.
X_train, X_test, y_train, y_test = train_test_split(
    df_cleaned.drop(columns=["survived", "alone", "alive"]),
    df_cleaned["alive"],
    test_size=0.2,
    random_state=42,
)

In [115]:
# Feature preprocessing; steps run in the order listed.
preprocessing_pipeline = make_pipeline(
    # Label-encode the embarkation columns
    CustomLabelEncoder(columns=["embarked", "embark_town"]),
    # Replace `sex` and `who` with one-hot indicator columns
    CustomOneHotEncoder(columns=["sex", "who"]),
    # Standardize the continuous features
    CustomStandardScaler(columns=["fare", "age"]),
    # Encode passenger class with an explicit First < Second < Third order
    CustomOrdinalEncoder(
        columns=["class"],
        order={"class": ["First", "Second", "Third"]},
    ),
    # Min-max scale the family-size counts
    CustomMinMaxScaler(columns=["sibsp", "parch"]),
)

In [None]:
# Training labels (the `alive` column of the cleaned data)
y_train

In [None]:
# Fit every preprocessing step on the training split only
preprocessing_pipeline.fit(X_train, y_train)

In [None]:
# Apply the fitted preprocessing to the held-out test split
results = preprocessing_pipeline.transform(X_test)
results

In [None]:
# Test labels (the `alive` column of the cleaned data)
y_test

# Pass-Fail Exercise 

Complete the exercises presented in this notebook. Then copy this notebook to your student directory and create a merge request with it. Please do not commit this file.