<a href="https://colab.research.google.com/github/uday-routhu/week4/blob/master/Column_Transformer_Core.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Column Transformer (Core):

* Author: Udayakumar Routhu

Create a Column Transformer

* Define 3 tuples (one for each pipeline that includes the name, the pipeline object, and the list of columns to apply it to.)
* Create one column transformer object that includes the 3 preprocessing pipelines you created in the previous assignment.
* Fit the column transformer object to the training data.
* Save the transformed training data as X_train_processed
* Display the .head() of X_train_processed
* Save the transformed testing data as X_ test_processed
* Display the .head() of the X_test_processed

In [158]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns',100)
import missingno
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
# Set pandas as the default output for sklearn
from sklearn import set_config
set_config(transform_output='pandas')

In [159]:
# Mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [179]:
# Load Ames Prep for ML
fpath = "/content/drive/MyDrive/CodingDojo/02-MachineLearning/Week05/Data/cereal-kaggle-crawford-modified - sheet 1.csv"
df = pd.read_csv(fpath)
use_cols = ['mfr', 'type', 'calories', 'protein', 'fat', 'fiber', 'sugars', 'shelf','rating']
df = df[use_cols]
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   mfr       75 non-null     object 
 1   type      77 non-null     object 
 2   calories  72 non-null     float64
 3   protein   77 non-null     int64  
 4   fat       70 non-null     float64
 5   fiber     71 non-null     float64
 6   sugars    71 non-null     float64
 7   shelf     75 non-null     object 
 8   rating    77 non-null     float64
dtypes: float64(5), int64(1), object(3)
memory usage: 5.5+ KB


Unnamed: 0,mfr,type,calories,protein,fat,fiber,sugars,shelf,rating
0,N,C,,4,1.0,10.0,6.0,top,68.402973
1,Q,C,120.0,3,5.0,2.0,8.0,top,33.983679
2,K,C,70.0,4,1.0,9.0,5.0,top,59.425505
3,K,C,50.0,4,0.0,14.0,0.0,top,93.704912
4,R,C,,2,2.0,1.0,8.0,,34.384843


* Format for ML and Train Test Split

In [180]:
# Define features and target
X = df.drop(columns='rating')
y = df['rating']
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [181]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57 entries, 30 to 51
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   mfr       55 non-null     object 
 1   type      57 non-null     object 
 2   calories  57 non-null     float64
 3   protein   57 non-null     int64  
 4   fat       52 non-null     float64
 5   fiber     52 non-null     float64
 6   sugars    52 non-null     float64
 7   shelf     57 non-null     object 
dtypes: float64(4), int64(1), object(3)
memory usage: 4.0+ KB


### Define groups of features using lists:



In [163]:
numeric_cols = X_train.select_dtypes('number').columns
numeric_cols

Index(['calories', 'protein', 'fat', 'fiber', 'sugars'], dtype='object')

In [164]:
ordinal_cols = ['shelf']
ordinal_cols

['shelf']

In [165]:
categorical_cols = X_train.select_dtypes('object').drop(columns='shelf').columns
categorical_cols

Index(['mfr', 'type'], dtype='object')

#Define 3 tuples (one for each pipeline that includes the name, the pipeline object, and the list of columns to apply it to.)

# Numeric Pipeline


  * Our numeric pipeline will:
    * Impute missing values with the median.
    * Scale the data.

In [182]:
# PREPROCESSING PIPELINE FOR NUMERIC DATA
# Save list of column names
num_cols = X_train.select_dtypes("number").columns
print("Numeric Columns:", numeric_cols)
# instantiate preprocessors
impute_median = SimpleImputer(strategy='median')
scaler = StandardScaler()
# Make a numeric preprocessing pipeline
num_pipe = make_pipeline(impute_median, scaler)
num_pipe

Numeric Columns: Index(['calories', 'protein', 'fat', 'fiber', 'sugars'], dtype='object')


In [183]:
# Making a numeric tuple for ColumnTransformer
num_tuple = ('numeric', num_pipe, numeric_cols)
num_tuple

('numeric',
 Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                 ('standardscaler', StandardScaler())]),
 Index(['calories', 'protein', 'fat', 'fiber', 'sugars'], dtype='object'))

In [184]:
df['shelf'].unique()

array(['top', nan, 'bottom', 'middle'], dtype=object)

#Ordinal Pipeline


 * Our ordinal pipeline will:
   * Impute missing values with "NA".
   * Encode the features ordinally.
   * Scale the encoded features.

In [185]:
# PREPROCESSING PIPELINE FOR ORDINAL DATA
# Save list of column names
print("Ordinal Columns:", ordinal_cols)
# Create imputer for ordinal data
impute_na_ord = SimpleImputer(strategy='constant', fill_value='NA')
## Making the OrdinalEncoder
# Specifying order of categories for our  Ordinal Qual/Cond Columms
qual_cond_order = ['top', 'NA', 'bottom', 'middle']
# Making the list of order lists for OrdinalEncoder
ordinal_category_orders = [qual_cond_order]
ord_encoder = OrdinalEncoder(categories=ordinal_category_orders)
# Making a final scaler to scale category #'s
scaler_ord = StandardScaler()
## Making an ord_pipe
ord_pipe = make_pipeline(impute_na_ord, ord_encoder, scaler_ord)
ord_pipe

Ordinal Columns: ['shelf']


In [186]:
# Making an ordinal_tuple for ColumnTransformer
ord_tuple = ('ordinal', ord_pipe, ordinal_cols)
ord_tuple
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57 entries, 30 to 51
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   mfr       55 non-null     object 
 1   type      57 non-null     object 
 2   calories  57 non-null     float64
 3   protein   57 non-null     int64  
 4   fat       52 non-null     float64
 5   fiber     52 non-null     float64
 6   sugars    52 non-null     float64
 7   shelf     57 non-null     object 
dtypes: float64(4), int64(1), object(3)
memory usage: 4.0+ KB


#Categorical Pipeline

* Our categorical pipeline will:
 * Impute missing values with "NA".
 * One-Hot-Encode the categories for each feature.

In [187]:
# PREPROCESSING PIPELINE FOR ONE-HOT-ENCODED DATA
# Save list of column names"
ohe_cols = X_train.select_dtypes("object").drop(columns=ordinal_cols).columns
print("OneHotEncoder Columns:", ohe_cols)
# Instantiate the individual preprocessors
impute_na = SimpleImputer(strategy='constant', fill_value = "NA")
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Make pipeline with imputer and encoder
ohe_pipe = make_pipeline(impute_na, ohe_encoder)
ohe_pipe

OneHotEncoder Columns: Index(['mfr', 'type'], dtype='object')


In [188]:
# Making a ohe_tuple for ColumnTransformer
ohe_tuple = ('categorical', ohe_pipe, ohe_cols)
ohe_tuple

('categorical',
 Pipeline(steps=[('simpleimputer',
                  SimpleImputer(fill_value='NA', strategy='constant')),
                 ('onehotencoder',
                  OneHotEncoder(handle_unknown='ignore', sparse_output=False))]),
 Index(['mfr', 'type'], dtype='object'))

#Create one column transformer object that includes the 3 preprocessing pipelines you created in the previous assignment.

In [173]:
# Instantiate with verbose_feature_names_out=False
col_transformer = ColumnTransformer([num_tuple, ord_tuple, ohe_tuple],verbose_feature_names_out=False)

#Fit the column transformer object to the training data.

In [189]:
col_transformer.fit(X_train)

#Save the transformed training data as X_train_processed

In [190]:
# Transform the training data
X_train_processed = col_transformer.transform(X_train)

#Display the .head() of X_train_processed

In [191]:
X_train_processed.head()

Unnamed: 0,calories,protein,fat,fiber,sugars,shelf,mfr_A,mfr_G,mfr_K,mfr_N,mfr_NA,mfr_P,mfr_Q,mfr_R,type_C,type_H
30,-0.319703,-0.524507,-1.01092,-0.849221,1.999871,0.522859,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
40,0.172812,-0.524507,0.036761,-0.849221,-0.7861,1.307147,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
39,1.650358,0.354813,0.036761,0.002241,0.606886,-1.045718,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16,-0.319703,-0.524507,-1.01092,-0.42349,-1.018264,0.522859,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
65,-0.812218,0.354813,-1.01092,0.427972,-1.482593,0.522859,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


#Save the transformed testing data as X_ test_processed

In [192]:
# Transform the testing data
X_test_processed = col_transformer.transform(X_test)

#Display the .head() of the X_test_processed

In [193]:
X_test_processed.head()

Unnamed: 0,calories,protein,fat,fiber,sugars,shelf,mfr_A,mfr_G,mfr_K,mfr_N,mfr_NA,mfr_P,mfr_Q,mfr_R,type_C,type_H
4,0.172812,-0.524507,1.084442,-0.42349,0.374721,-0.261429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
35,0.172812,-1.403826,1.084442,-0.42349,1.071214,-0.261429,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
10,0.172812,-1.403826,1.084442,-0.849221,1.303379,1.307147,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
0,0.172812,1.234133,0.036761,3.408087,-0.089607,-1.045718,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
45,0.172812,1.234133,2.132122,0.427972,1.071214,-1.045718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
