In [1]:
import pandas as pd

In [2]:
# Build a small example table with nominal, boolean, ordinal and numeric columns;
# city, boolean and quantitative_column each contain one missing value
X = pd.DataFrame(data={
    'city': ['tokyo', None, 'london', 'seattle', 'san francisco', 'tokyo'],
    'boolean': ['yes', 'no', None, 'no', 'no', 'yes'],
    'ordinal_column': ['somewhat like', 'like', 'somewhat like', 'like', 'somewhat like', 'dislike'],
    'quantitative_column': [1, 11, -0.5, 10, None, 20],
})
X

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,,no,like,11.0
2,london,,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,
5,tokyo,yes,dislike,20.0


In [3]:
# Count the missing entries in each column
X.isna().sum()

city                   1
boolean                1
ordinal_column         0
quantitative_column    1
dtype: int64

In [4]:
# Let's impute some values. SimpleImputer has a most_frequent strategy (it accepts string categories too), but we will build our own column-wise imputers below
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [5]:
# Most common category: value_counts() sorts by count in descending order,
# so the first index label is the most frequent value
X['city'].value_counts().index[0]

'tokyo'

In [6]:
# Fill empty slots with most common category
# (fillna returns a new Series here; X itself is left untouched)
X['city'].fillna(X['city'].value_counts().index[0])

0            tokyo
1            tokyo
2           london
3          seattle
4    san francisco
5            tokyo
Name: city, dtype: object

## Custom category imputer

1. Pipelines allow us to sequentially apply a list of transforms and a final estimator
2. Intermediate steps of the pipeline must be transforms, meaning they must implement <b>fit</b> and <b>transform</b> methods
3. The final estimator only needs to implement <b>fit</b>

In [7]:
from sklearn.base import TransformerMixin

In [8]:
class CustomCategoryImputer(TransformerMixin):
    """Fill missing values in selected columns with each column's most frequent value.

    Parameters
    ----------
    cols : list of str, optional
        Names of the columns to impute. ``None`` (the default) imputes nothing.

    Attributes
    ----------
    fill_values_ : dict
        Maps each column name to the fill value learned by ``fit``
        (``None`` when the column had no non-null value).
    """

    def __init__(self, cols=None):
        self.cols = cols

    @staticmethod
    def _most_frequent(series):
        """Return the most frequent non-null value of ``series``, or None if it has none."""
        counts = series.value_counts()
        return counts.index[0] if len(counts) else None

    def _columns(self):
        """Return the configured columns as an iterable (empty when ``cols`` is None)."""
        return [] if self.cols is None else self.cols

    def fit(self, df=None, *_):
        """Learn the most frequent value of every configured column.

        Parameters
        ----------
        df : pandas.DataFrame, optional
            Training data. When omitted, nothing is learned and ``transform``
            falls back to the most frequent value of the frame it receives.
        *_ : ignored
            Accepted for compatibility with the scikit-learn ``fit(X, y)`` call.

        Returns
        -------
        self
        """
        self.fill_values_ = {}
        if df is not None:
            for col in self._columns():
                self.fill_values_[col] = self._most_frequent(df[col])
        return self

    def transform(self, df):
        """Return a copy of ``df`` with missing values in ``cols`` filled in.

        Columns whose fill value was learned by ``fit`` use that value; otherwise
        the most frequent value of ``df`` itself is used. A column with no
        non-null value to learn from is left unchanged.
        """
        X = df.copy()
        learned = getattr(self, 'fill_values_', {})
        for col in self._columns():
            if col in learned:
                fill_value = learned[col]
            else:
                fill_value = self._most_frequent(X[col])
            if fill_value is not None:
                # Assign the result back: fillna(inplace=True) on X[col] is chained
                # assignment, which does not update X under pandas Copy-on-Write
                X[col] = X[col].fillna(fill_value)
        return X

In [9]:
# Impute the two nominal columns with their most frequent values
cci = CustomCategoryImputer(cols=['city', 'boolean'])

# Returns an imputed copy; the other columns are passed through unchanged
cci.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,tokyo,no,like,11.0
2,london,no,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,
5,tokyo,yes,dislike,20.0


## Custom quantitative imputer

In [10]:
# Let's make an imputer that can apply a strategy to select columns by name
class CustomQuantitativeImputer(TransformerMixin):
    """Impute selected numeric columns, one ``SimpleImputer`` per column.

    Parameters
    ----------
    cols : list of str, optional
        Names of the columns to impute. ``None`` (the default) imputes nothing.
    strategy : str, default 'mean'
        Strategy passed to ``SimpleImputer``.

    Attributes
    ----------
    imputers_ : dict
        Maps each column name to the ``SimpleImputer`` fitted on it by ``fit``.
    """

    def __init__(self, cols=None, strategy='mean'):
        self.cols = cols
        self.strategy = strategy

    def _columns(self):
        """Return the configured columns as an iterable (empty when ``cols`` is None)."""
        return [] if self.cols is None else self.cols

    def fit(self, df=None, *_):
        """Fit one imputer per configured column.

        Parameters
        ----------
        df : pandas.DataFrame, optional
            Training data. When omitted, nothing is learned and ``transform``
            fits the imputers on the frame it receives.
        *_ : ignored
            Accepted for compatibility with the scikit-learn ``fit(X, y)`` call.

        Returns
        -------
        self
        """
        self.imputers_ = {}
        if df is not None:
            for col in self._columns():
                self.imputers_[col] = SimpleImputer(strategy=self.strategy).fit(df[[col]])
        return self

    def transform(self, df):
        """Return a copy of ``df`` with missing values in ``cols`` imputed.

        Statistics learned by ``fit`` are reused, so new data is filled with the
        values computed from the training data. Columns without a fitted imputer
        are imputed from ``df`` itself.
        """
        X = df.copy()
        fitted = getattr(self, 'imputers_', {})
        for col in self._columns():
            column_imputer = fitted.get(col)
            if column_imputer is None:
                column_imputer = SimpleImputer(strategy=self.strategy).fit(X[[col]])
            # SimpleImputer works on 2-D input, hence the double brackets
            X[col] = column_imputer.transform(X[[col]])
        return X

In [11]:
# Fill missing values in quantitative_column with the column mean
cqi = CustomQuantitativeImputer(cols=['quantitative_column'], strategy='mean')

In [12]:
# The single missing quantitative value is replaced by the mean of the other five (8.3)
cqi.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,,no,like,11.0
2,london,,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,8.3
5,tokyo,yes,dislike,20.0


In [13]:
# Chain both imputers so a single call fills the numeric and the categorical gaps
imputer = Pipeline(steps=[
    ('quant', cqi),
    ('category', cci),
])

# Run the whole chain on the raw data
imputer.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,tokyo,no,like,11.0
2,london,no,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,8.3
5,tokyo,yes,dislike,20.0


## Custom dummifier for nominal level

To transform our categorical data into dummy variables:
1. Utilize pandas to automatically find the categorical variables and dummy code them
2. Create our own custom transformer using dummy variables to work in a pipeline

<b>Dummy variables</b> take the value zero or one to indicate the absence or presence of a category. They are proxy variables, or numerical stand-ins, for qualitative data. It's important to be aware of and avoid the <b>dummy variable trap</b>. The dummy variable trap is when you have independent variables that are multicollinear, or highly correlated. Simply put, these variables can be predicted from each other. For example, when every category of a column gets its own dummy column, any one of those columns can be derived from the others, so one of them is redundant.

In [14]:
# With no `columns` argument, get_dummies will automatically find the categorical (string) columns
# prefix_sep is the separator between the prefix (column name) and the cell value
pd.get_dummies(X, prefix_sep='__')

Unnamed: 0,quantitative_column,city__london,city__san francisco,city__seattle,city__tokyo,boolean__no,boolean__yes,ordinal_column__dislike,ordinal_column__like,ordinal_column__somewhat like
0,1.0,0,0,0,1,0,1,0,0,1
1,11.0,0,0,0,0,1,0,0,1,0
2,-0.5,1,0,0,0,0,0,0,0,1
3,10.0,0,0,1,0,1,0,0,1,0
4,,0,1,0,0,1,0,0,0,1
5,20.0,0,0,0,1,0,1,1,0,0


In [15]:
# ordinal_column has a meaningful order, so leave it alone and dummify only the nominal columns
pd.get_dummies(
    X,
    prefix_sep='__',              # separator between the prefix (column name) and cell value
    columns=['city', 'boolean'],  # which columns to dummify
)

Unnamed: 0,ordinal_column,quantitative_column,city__london,city__san francisco,city__seattle,city__tokyo,boolean__no,boolean__yes
0,somewhat like,1.0,0,0,0,1,0,1
1,like,11.0,0,0,0,0,1,0
2,somewhat like,-0.5,1,0,0,0,0,0
3,like,10.0,0,0,1,0,1,0
4,somewhat like,,0,1,0,0,1,0
5,dislike,20.0,0,0,0,1,0,1


In [16]:
class CustomDummifier(TransformerMixin):
    """Dummy-encode selected columns with ``pd.get_dummies``.

    Parameters
    ----------
    cols : list of str, optional
        Names of the columns to encode. ``None`` (the default) lets
        ``pd.get_dummies`` pick the columns itself.

    Attributes
    ----------
    categories_ : dict
        Maps each column in ``cols`` to the categories seen by ``fit``.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, X=None, *_):
        """Remember the categories of every configured column.

        Parameters
        ----------
        X : pandas.DataFrame, optional
            Training data. When omitted (or when ``cols`` is None), nothing is
            learned and ``transform`` encodes whatever categories it finds.
        *_ : ignored
            Accepted for compatibility with the scikit-learn ``fit(X, y)`` call.

        Returns
        -------
        self
        """
        self.categories_ = {}
        if X is not None and pd.api.types.is_list_like(self.cols):
            for col in self.cols:
                # pd.Categorical derives categories the same way get_dummies does
                # for a plain column, so the fitted frame encodes identically
                self.categories_[col] = pd.Categorical(X[col]).categories
        return self

    def transform(self, X):
        """Return ``X`` with the configured columns replaced by dummy columns.

        When categories were learned by ``fit``, every transformed frame gets the
        same dummy columns in the same order: categories absent from ``X`` yield
        all-zero columns, and values not seen during ``fit`` are encoded as all
        zeros (like missing values).
        """
        learned = getattr(self, 'categories_', {})
        if learned:
            X = X.copy()  # do not modify the caller's frame
            for col, categories in learned.items():
                X[col] = pd.Categorical(X[col], categories=categories)
        return pd.get_dummies(X, columns=self.cols)

In [17]:
# Dummy-encode only the nominal columns; ordinal_column and quantitative_column pass through
cd = CustomDummifier(cols=['city', 'boolean'])

cd.fit_transform(X)

Unnamed: 0,ordinal_column,quantitative_column,city_london,city_san francisco,city_seattle,city_tokyo,boolean_no,boolean_yes
0,somewhat like,1.0,0,0,0,1,0,1
1,like,11.0,0,0,0,0,1,0
2,somewhat like,-0.5,1,0,0,0,0,0
3,like,10.0,0,0,1,0,1,0
4,somewhat like,,0,1,0,0,1,0
5,dislike,20.0,0,0,0,1,0,1


## Custom encoder for ordinal level

At the ordinal level, since the order of the data carries meaning, it does not make sense to use dummy variables. To maintain the order, we use a <b>label encoder</b>, so that each label in our ordinal data has a numerical value associated with it.

In [18]:
# ordinal_column holds strings, but we still want to use it: encode each label
# by its rank -- 0 for dislike, 1 for somewhat like, and 2 for like
ordering = ['dislike', 'somewhat like', 'like']

# Show the raw labels, then the position of each label within `ordering`
print(X['ordinal_column'])
print(X['ordinal_column'].map(ordering.index))

0    somewhat like
1             like
2    somewhat like
3             like
4    somewhat like
5          dislike
Name: ordinal_column, dtype: object
0    1
1    2
2    1
3    2
4    1
5    0
Name: ordinal_column, dtype: int64


In [19]:
class CustomEncoder(TransformerMixin):
    """Replace the labels of an ordinal column with their position in ``ordering``.

    Parameters
    ----------
    col : str
        Name of the column to encode.
    ordering : list, optional
        Labels from lowest to highest; a label is encoded as its index in this
        list. Must be provided before ``transform`` is called.
    """

    def __init__(self, col, ordering=None):
        self.ordering = ordering
        self.col = col

    def transform(self, df):
        """Return a copy of ``df`` with ``col`` encoded as integer ranks.

        Missing values stay missing (the column then becomes float).

        Raises
        ------
        ValueError
            If ``ordering`` was not provided, or if the column contains a
            non-missing label that is not in ``ordering``.
        """
        if self.ordering is None:
            raise ValueError("CustomEncoder needs an `ordering` list to encode %r" % (self.col,))

        # Build the label -> rank lookup once; setdefault keeps the first position
        # of a repeated label, matching list.index
        rank_of = {}
        for position, label in enumerate(self.ordering):
            rank_of.setdefault(label, position)

        X = df.copy()
        labels = X[self.col]
        unknown = labels[labels.notna() & ~labels.isin(list(rank_of))]
        if len(unknown):
            raise ValueError(
                "column %r contains labels missing from ordering: %s"
                % (self.col, sorted(set(map(str, unknown))))
            )
        X[self.col] = labels.map(rank_of)
        return X

    def fit(self, *_):
        """Nothing to learn; present so the encoder can be used as a pipeline step."""
        return self

In [20]:
# Encode ordinal_column by each label's position in `ordering` (dislike=0, somewhat like=1, like=2)
ce = CustomEncoder(col='ordinal_column', ordering = ['dislike', 'somewhat like', 'like'])

ce.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,1,1.0
1,,no,2,11.0
2,london,,1,-0.5
3,seattle,no,2,10.0
4,san francisco,no,1,
5,tokyo,yes,0,20.0


## Custom cutter for bucketing continuous features into categories

In [21]:
# Split the column's range into 3 bins; by default each category is named after its bin interval
pd.cut(X['quantitative_column'], bins=3)

0     (-0.52, 6.333]
1    (6.333, 13.167]
2     (-0.52, 6.333]
3    (6.333, 13.167]
4                NaN
5     (13.167, 20.0]
Name: quantitative_column, dtype: category
Categories (3, interval[float64, right]): [(-0.52, 6.333] < (6.333, 13.167] < (13.167, 20.0]]

In [22]:
# Use no labels: labels=False returns the integer position of each bin instead
# (the missing value stays NaN, which is why the result is float)
pd.cut(X['quantitative_column'], bins=3, labels=False)

0    0.0
1    1.0
2    0.0
3    1.0
4    NaN
5    2.0
Name: quantitative_column, dtype: float64

In [23]:
# Use pre-made labels: one name per bin, listed from the lowest bin to the highest
group_names = ['Low', 'Okay', 'Good']
pd.cut(X['quantitative_column'], bins=3, labels=group_names)

0     Low
1    Okay
2     Low
3    Okay
4     NaN
5    Good
Name: quantitative_column, dtype: category
Categories (3, object): ['Low' < 'Okay' < 'Good']

In [24]:
class CustomCutter(TransformerMixin):
    """Bucket a numeric column into bins with ``pd.cut``.

    Parameters
    ----------
    col : str
        Name of the column to bucket.
    bins : int, sequence of scalars, or IntervalIndex
        Passed to ``pd.cut``. An integer asks for that many equal-width bins.
    labels : array-like or False, default False
        Passed to ``pd.cut``. ``False`` returns the integer bin index.

    Attributes
    ----------
    bin_edges_ : numpy.ndarray or None
        Bin edges learned by ``fit`` when ``bins`` is an integer, else None.
    """

    def __init__(self, col, bins, labels=False):
        self.labels = labels
        self.bins = bins
        self.col = col

    def fit(self, df=None, *_):
        """Learn the bin edges when ``bins`` is given as a number of bins.

        Parameters
        ----------
        df : pandas.DataFrame, optional
            Training data. When omitted, nothing is learned and ``transform``
            passes ``bins`` to ``pd.cut`` unchanged.
        *_ : ignored
            Accepted for compatibility with the scikit-learn ``fit(X, y)`` call.

        Returns
        -------
        self
        """
        self.bin_edges_ = None
        if df is not None and pd.api.types.is_integer(self.bins):
            # Only the edges are needed here, so skip label handling
            _, self.bin_edges_ = pd.cut(df[self.col], bins=self.bins, labels=False, retbins=True)
        return self

    def transform(self, df):
        """Return a copy of ``df`` with ``col`` replaced by its bin.

        Edges learned by ``fit`` are reused so the same value always lands in the
        same bucket; values outside the learned range become NaN. Without learned
        edges, ``bins`` is handed to ``pd.cut`` as is.
        """
        X = df.copy()
        learned_edges = getattr(self, 'bin_edges_', None)
        bins = self.bins if learned_edges is None else learned_edges
        X[self.col] = pd.cut(X[self.col], bins=bins, labels=self.labels)
        return X

In [25]:
# Bucket quantitative_column into 3 bins; labels defaults to False, so we get bin indices
cc = CustomCutter(col='quantitative_column', bins=3)

# Note that the output is an ordinal column, so there is no need to dummify it
cc.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,0.0
1,,no,like,1.0
2,london,,somewhat like,0.0
3,seattle,no,like,1.0
4,san francisco,no,somewhat like,
5,tokyo,yes,dislike,2.0


## Pipeline

1. <b>Imputer</b>: fill in missing values for both categorical and numerical data using fillna() or SimpleImputer()
2. <b>Dummifier</b>: nominal data
3. <b>Encoder</b>: ordinal data
4. <b>Cutter</b>: bucket the quantitative data

In [26]:
# Full preprocessing chain, applied in order:
#   imputer -> fill missing values (quantitative first, then categorical)
#   dummify -> dummy encoding for city and boolean
#   encode  -> label encoding for ordinal_column
#   cut     -> bucket quantitative_column into ordinal-level bins
pipe = Pipeline(steps=[
    ('imputer', imputer),
    ('dummify', cd),
    ('encode', ce),
    ('cut', cc),
])

In [27]:
# Take a look at the original data
# (the transformers above returned copies, so X still contains its missing values)
X

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somewhat like,1.0
1,,no,like,11.0
2,london,,somewhat like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,somewhat like,
5,tokyo,yes,dislike,20.0


In [28]:
# Fit every step of the pipeline on the original data
pipe.fit(X)

In [29]:
# Apply all four steps in order: impute, dummify, encode, cut
pipe.transform(X)

Unnamed: 0,ordinal_column,quantitative_column,city_london,city_san francisco,city_seattle,city_tokyo,boolean_no,boolean_yes
0,1,0,0,0,0,1,0,1
1,2,1,0,0,0,1,1,0
2,1,0,1,0,0,0,1,0
3,2,1,0,0,1,0,1,0
4,1,1,0,1,0,0,1,0
5,0,2,0,0,0,1,0,1
