# Re-create your own _One Hot Encoder_ 

In [2]:
import pandas as pd
import seaborn as sns


## (1) The Titanic Dataset

In [3]:
# Loading 100% of the dataset.
# Choose 0.5 to load only 50% of the rows randomly

data = sns.load_dataset('titanic').sample(frac = 1)
data.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
294,0,3,male,24.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
248,1,1,male,37.0,1,1,52.5542,S,First,man,True,D,Southampton,yes,False
189,0,3,male,36.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
471,0,3,male,38.0,0,0,8.6625,S,Third,man,True,,Southampton,no,True
573,1,3,female,,0,0,7.75,Q,Third,woman,False,,Queenstown,yes,True


In [4]:
from sklearn.model_selection import train_test_split

X = data.drop(columns = ['survived', 'alive', 'who', 'adult_male', 'pclass'])
y = data['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)


In [5]:
X_train


Unnamed: 0,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
736,female,48.0,1,3,34.3750,S,Third,,Southampton,False
24,female,8.0,3,1,21.0750,S,Third,,Southampton,False
11,female,58.0,0,0,26.5500,S,First,C,Southampton,True
708,female,22.0,0,0,151.5500,S,First,,Southampton,True
676,male,24.5,0,0,8.0500,S,Third,,Southampton,True
...,...,...,...,...,...,...,...,...,...,...
225,male,22.0,0,0,9.3500,S,Third,,Southampton,True
42,male,,0,0,7.8958,C,Third,,Cherbourg,True
544,male,50.0,1,0,106.4250,C,First,C,Cherbourg,False
551,male,27.0,0,0,26.0000,S,Second,,Southampton,True


## (2) A first pipeline

‚ùì Create a basic Pipeline which ***encodes categorical features*** and ***scales numerical features*** ‚ùì

üí° Use [`make_pipeline`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html) and [`make_column_transformer`](https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_transformer.html)

In [6]:
num_features = ['age','fare','sibsp','parch']
cat_features = ['embarked','class','embark_town']
binary_features = ['sex', 'alone']


In [13]:
# YOUR CODE HERE
# Create a base pipeline with a numeric transformer and a categorical transformer
# Use make_column_transformer to combine the two transformers

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

num_transformer = make_pipeline(SimpleImputer(strategy="median"),
                                StandardScaler())
cat_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

preprocessor = make_column_transformer(
    (num_transformer, num_features),
    (cat_transformer, cat_features),
    (cat_transformer, binary_features)
)


<details>
    <summary>üë©üèª‚Äçüè´ <i>Pipeline</i> vs. <i>make_pipeline</i></summary>

* When you create a Pipeline with `Pipeline()`, you have to:
    - specify all the ***sequential steps of the pipeline*** in a list
    - each step is a tuple with:
        - "name_of_the_step"
        - official Scikit-Learn name of the step
    
```python
Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
```
  
* When you create a Pipeline with `make_pipeline()`,
    - you don't have give a name to each step
    - you can simply chain all the steps together using their official Scikit-Learn name
    - the names of the steps are automatically induced by `make_pipeline`
    
```python
make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)
```
    
</details>

<details>
    <summary>üë©üèª‚Äçüè´ <i>ColumnTransformer</i> vs. <i>make_column_transformer</i></summary>

* When you create a ColumnTransformer with `ColumnTransformer()`, you have to:
    - specify all the ***parallel steps of the columns' transformer*** in a list
    - each step is a tuple with:
        - "name_of_the_transformer"
        - the transformer
        - the columns which will be impacted by the transformer
    
```python
ColumnTransformer([
    ('num_transformer', num_transformer, num_features),
    ('cat_transformer', cat_transformer, cat_features)
])
```
  
* When you create a ColumnTransformer with `make_column_transformer()`,
    - you don't have give a name to each parallel step
    - each step is a tuple with:
        - the transformer
        - the columns which will be impacted by the transformer
    
```python
make_column_transformer(
    (num_transformer, num_features),
    (cat_transformer, cat_features)
)
```
    
</details>

‚ùì Chain this preprocessing pipeline with a classifier and optimize it ‚ùì

In [18]:
# YOUR CODE HERE
# Chain the preprocessing pipeline with a classifier and optimize the hyperparameters
# of the classifier using GridSearchCV

from sklearn.neighbors import KNeighborsClassifier as KNNClassifier
from sklearn.model_selection import GridSearchCV

pipeline = make_pipeline(preprocessor, KNNClassifier())

pipeline.get_params()

param_grid = {
    "kneighborsclassifier__n_neighbors": [3, 5, 7, 9, 11],
    "kneighborsclassifier__weights": ["uniform", "distance"],
    "kneighborsclassifier__metric": ["euclidean", "manhattan"],
    #"simpleimputer__strategy": ["mean", "median", "most_frequent"],
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


‚ùì What are the best params and the best score ‚ùì

In [19]:
# YOUR CODE HERE
grid_search.best_params_


{'kneighborsclassifier__metric': 'manhattan',
 'kneighborsclassifier__n_neighbors': 9,
 'kneighborsclassifier__weights': 'uniform'}

## (3) How could we design a Custom Encoder to keep track of the columns' names?

In [20]:
# By default, OneHotEncoder works with Numpy and loses track of columns' names...
ohe = OneHotEncoder(sparse_output=False)
ohe.fit_transform(X_train[['sex']])


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]])

In [21]:
# ... however, we can access the one-hot-encoded names as follows
ohe.get_feature_names_out()


array(['sex_female', 'sex_male'], dtype=object)

‚ùì Try to create your own OneHotEncoder so that it preserves the columns names ‚ùì

In [28]:
# YOUR CODE HERE
# Create custom OneHotEncoder that keeps track of column names
# Use the code above as a starting point

from sklearn.base import BaseEstimator, TransformerMixin

class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ohe = OneHotEncoder(sparse=False)
        self.columns = None

    def fit(self, X, y=None):
        self.ohe.fit(X)
        self.columns = self.ohe.get_feature_names_out()
        print("Column names: ", self.columns)

        return self

    def transform(self, X, y=None):
        X = self.ohe.transform(X)
        return pd.DataFrame(X, columns=self.columns)


custom_ohe = CustomOneHotEncoder()

cat_transformer1 = make_pipeline(
    #SimpleImputer(strategy="most_frequent"),
    custom_ohe
)


preprocessor1 = make_column_transformer(
    (num_transformer, num_features),
    (cat_transformer1, cat_features),
    (cat_transformer1, binary_features)
)

pipeline = make_pipeline(preprocessor1, KNNClassifier(n_neighbors=9, weights="uniform", metric="manhattan"))

pipeline.fit(X_train, y_train)

pipeline.predict(X_test)

pipeline.score(X_test, y_test)


Column names:  ['embarked_C' 'embarked_Q' 'embarked_S' 'embarked_nan' 'class_First'
 'class_Second' 'class_Third' 'embark_town_Cherbourg'
 'embark_town_Queenstown' 'embark_town_Southampton' 'embark_town_nan']
Column names:  ['sex_female' 'sex_male' 'alone_False' 'alone_True']




0.7873134328358209

üèÅ If you want to build a very advanced pipeline, feel free to explore the Optional Challenge dealing the `cars dataset` !

üíæ Don't forget to git add/commit/push your notebook.

üëè Congratulations, you are now a master at Pipeline and ColumnTransformer.