<a href="https://colab.research.google.com/github/tigureis/Notes_and_exercises_on_Data_Structuring/blob/main/Pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn import datasets, set_config
from sklearn.feature_selection import mutual_info_classif
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, OrdinalEncoder

import plotly.express as px
import missingno as msno

In [2]:
# Import the Titanic passenger data, using PassengerId as the row index
df = pd.read_csv(
    'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv',
    index_col='PassengerId',
)
# Data enrichment: capture the text between ", " and the next "." in Name
# (e.g. "Mr", "Mrs", "Miss"). The pattern is a raw string so that `\.` is
# passed to the regex engine as-is instead of being an invalid string escape.
df['personal_title'] = df.Name.str.extract(r'.+?, (.*?)\.', expand=False)
# Drop the free-text columns, then discard every row that still has a missing value
df = df.drop(columns=['Name', 'Ticket', 'Cabin']).dropna()
# Split the data into features and target
df_features = df.drop(columns='Survived')
target = df['Survived']

df_features.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,personal_title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,male,22.0,1,0,7.25,S,Mr
2,1,female,38.0,1,0,71.2833,C,Mrs
3,3,female,26.0,0,0,7.925,S,Miss
4,1,female,35.0,1,0,53.1,S,Mrs
5,3,male,35.0,0,0,8.05,S,Mr


Select numerical and categorical features

In [3]:
# Partition the feature columns by dtype: numeric columns vs. everything else
numeric_frame = df_features.select_dtypes(include='number')
non_numeric_frame = df_features.select_dtypes(exclude='number')

numerical_features = numeric_frame.columns
categorical_features = non_numeric_frame.columns

# Report which columns ended up in each group
print(f'Numerical features: {numerical_features}')
print(f'Categorical features: {categorical_features}')

Numerical features: Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')
Categorical features: Index(['Sex', 'Embarked', 'personal_title'], dtype='object')


# Applying [column transformer](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html)

In [4]:
# Each (transformer, columns) pair tells the ColumnTransformer which
# transformer to apply to which subset of columns.
numeric_step = (StandardScaler(), numerical_features)
categorical_step = (OrdinalEncoder(), categorical_features)

preprocessing_pipeline = make_column_transformer(numeric_step, categorical_step)

# Fit on the feature frame; the fitted transformer is the cell's displayed output
preprocessing_pipeline.fit(df_features)

Create a DataFrame from the `ColumnTransformer` output to visualize the result more easily; this step is not needed in the pipeline itself.

In [5]:
# Wrap the transformed array in a DataFrame labelled with the
# transformer-prefixed output column names, purely for inspection.
transformed_values = preprocessing_pipeline.transform(df_features)
transformed_columns = preprocessing_pipeline.get_feature_names_out()

pd.DataFrame(transformed_values, columns=transformed_columns)

Unnamed: 0,standardscaler__Pclass,standardscaler__Age,standardscaler__SibSp,standardscaler__Parch,standardscaler__Fare,ordinalencoder__Sex,ordinalencoder__Embarked,ordinalencoder__personal_title
0,0.908600,-0.527669,0.522511,-0.506787,-0.516380,1.0,2.0,11.0
1,-1.482983,0.577094,0.522511,-0.506787,0.694046,0.0,0.0,12.0
2,0.908600,-0.251478,-0.552714,-0.506787,-0.503620,0.0,2.0,8.0
3,-1.482983,0.369951,0.522511,-0.506787,0.350326,0.0,2.0,12.0
4,0.908600,0.369951,-0.552714,-0.506787,-0.501257,1.0,2.0,11.0
...,...,...,...,...,...,...,...,...
707,0.908600,0.646142,-0.552714,5.350885,-0.102875,0.0,1.0,12.0
708,-0.287191,-0.182430,-0.552714,-0.506787,-0.407687,1.0,2.0,14.0
709,-1.482983,-0.734812,-0.552714,-0.506787,-0.086335,0.0,2.0,8.0
710,-1.482983,-0.251478,-0.552714,-0.506787,-0.086335,1.0,0.0,11.0


In [14]:
# Display the numerical column labels that feed the numeric transformer
numerical_features

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

# [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

Applying `make_pipeline` inside a `make_column_transformer` function

In [15]:
# Numeric columns go through a two-step sub-pipeline (power transform, then
# scaling); categorical columns are ordinal-encoded.
# Note: PowerTransformer already standardizes its output by default, so the
# StandardScaler here mainly illustrates how steps are chained.
numeric_pipeline = make_pipeline(PowerTransformer(), StandardScaler())

preprocessing_pipeline2 = make_column_transformer(
    (numeric_pipeline, numerical_features),
    (OrdinalEncoder(), categorical_features),
)

# Fit on the feature frame; the fitted transformer is the cell's displayed output
preprocessing_pipeline2.fit(df_features)

Visualizing results

In [16]:
# Wrap the second transformer's output in a DataFrame labelled with its
# transformer-prefixed output column names, purely for inspection.
pipeline2_values = preprocessing_pipeline2.transform(df_features)
pipeline2_columns = preprocessing_pipeline2.get_feature_names_out()

pd.DataFrame(pipeline2_values, columns=pipeline2_columns)


Unnamed: 0,pipeline__Pclass,pipeline__Age,pipeline__SibSp,pipeline__Parch,pipeline__Fare,ordinalencoder__Sex,ordinalencoder__Embarked,ordinalencoder__personal_title
0,0.940654,-0.469528,1.290366,-0.609138,-0.997650,1.0,2.0,11.0
1,-1.409998,0.609720,1.290366,-0.609138,1.287757,0.0,0.0,12.0
2,0.940654,-0.186239,-0.716478,-0.609138,-0.901073,0.0,2.0,8.0
3,-1.409998,0.417070,1.290366,-0.609138,1.024276,0.0,2.0,12.0
4,0.940654,0.417070,-0.716478,-0.609138,-0.884112,1.0,2.0,11.0
...,...,...,...,...,...,...,...,...
707,0.940654,0.673133,-0.716478,1.785665,0.455301,0.0,1.0,12.0
708,-0.430593,-0.117045,-0.716478,-0.609138,-0.369789,1.0,2.0,14.0
709,-1.409998,-0.689838,-0.716478,-0.609138,0.484313,0.0,2.0,8.0
710,-1.409998,-0.186239,-0.716478,-0.609138,0.484313,1.0,0.0,11.0
