# Imputing Values

In [1]:
import pandas as pd

In [2]:
X=pd.DataFrame({'city':['tokyo',None,'london','seattle','san francisco','tokyo'],'boolean':['yes','no',None,'no','no','yes'],'ordinal_column':['somthing like','like','something like','like','something like','dislike'],'quantitative_column':[1,11,-.5,10,None,20]})

Description of each column:
1. boolean: This column is represented by binary categorical data (yes/no), and is at the nominal level
* city: This column is represented by categorical data, also at the nominal level
* ordinal_column: As you may have guessed by the column name, this column is represented by ordinal data, at the ordinal level
* quantitative_column: This column is represented by integers at the ratio level

In [3]:
print(X)

            city boolean  ordinal_column  quantitative_column
0          tokyo     yes   somthing like                  1.0
1           None      no            like                 11.0
2         london    None  something like                 -0.5
3        seattle      no            like                 10.0
4  san francisco      no  something like                  NaN
5          tokyo     yes         dislike                 20.0


In [4]:
# take a look at the missing values
X.isnull().sum()

city                   1
boolean                1
ordinal_column         0
quantitative_column    1
dtype: int64

In [5]:
# find out what the most common category is in our city column
X['city'].value_counts().index[0]

'tokyo'

In [6]:
# fill empty slots with most common category
X['city'].fillna(X['city'].value_counts().index[0])

0            tokyo
1            tokyo
2           london
3          seattle
4    san francisco
5            tokyo
Name: city, dtype: object

In [7]:
# TransformerMixin base class to create our own custom categorical imputer
from sklearn.base import TransformerMixin

# instantiate custom class with __init__,fit, and transform methods
class CustomCategoryImputer(TransformerMixin):
    def __init__(self,cols=None):
        self.cols=cols
    
    def transform(self,df):
        X=df.copy()
        for col in self.cols:
            X[col].fillna(X[col].value_counts().index[0],inplace=True)
            return X
            
    def fit(self,*_):
        return self

In [8]:
# Implement our custom categorical imputer on our categorical columns
cci = CustomCategoryImputer(cols=['city','boolean'])

In [9]:
cci.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somthing like,1.0
1,tokyo,no,like,11.0
2,london,,something like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,something like,
5,tokyo,yes,dislike,20.0


In [10]:
# Let's make an imputer that can apply a strategy to select columns by name
from sklearn.preprocessing import Imputer
class CustomQuantitativeImputer(TransformerMixin):
    def __init__(self, cols=None, strategy='mean'):
        self.cols = cols
        self.strategy = strategy
    def transform(self,df):
        X=df.copy()
        impute = Imputer(strategy=self.strategy)
        for col in self.cols:
            X[col] = impute.fit_transform(X[[col]])
            return X
    def fit(self,*_):
        return self

In [11]:
cqi = CustomQuantitativeImputer(cols=['quantitative_column'], strategy='mean')
cqi.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somthing like,1.0
1,,no,like,11.0
2,london,,something like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,something like,8.3
5,tokyo,yes,dislike,20.0


In [12]:
# import Pipeline from sklearn
from sklearn.pipeline import Pipeline

In [13]:
imputer = Pipeline([('quant',cqi),('category',cci)])

# take a look at what dataset looks like after pipeline tranformations
imputer.fit_transform(X)

Unnamed: 0,city,boolean,ordinal_column,quantitative_column
0,tokyo,yes,somthing like,1.0
1,tokyo,no,like,11.0
2,london,,something like,-0.5
3,seattle,no,like,10.0
4,san francisco,no,something like,8.3
5,tokyo,yes,dislike,20.0


# Encode Categorical Variables

[Link](https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781787287600/4/ch04lvl1sec33/encoding-categorical-variables)

In [14]:
pd.get_dummies(X, columns=['city','boolean'], # which coulmns to dummify
               prefix_sep='__' # the separator between the prefix (column name) and cell value
              )

Unnamed: 0,ordinal_column,quantitative_column,city__london,city__san francisco,city__seattle,city__tokyo,boolean__no,boolean__yes
0,somthing like,1.0,0,0,0,1,0,1
1,like,11.0,0,0,0,0,1,0
2,something like,-0.5,1,0,0,0,0,0
3,like,10.0,0,0,1,0,1,0
4,something like,,0,1,0,0,1,0
5,dislike,20.0,0,0,0,1,0,1
