# Reference: https://towardsdatascience.com/preprocessing-with-sklearn-a-complete-and-comprehensive-guide-670cb98fcfb9

In [2]:
import numpy as np
import pandas as pd
import sklearn
sklearn.__version__

'0.22.2.post1'

#Missing values

In [16]:
# Toy frame: 5 rows x 3 features with missing entries and a 999 value that a
# later cell converts to NaN. ``np.nan`` is used instead of the ``np.NaN``
# alias, which NumPy 2.0 removed.
X = pd.DataFrame(
    np.array([
        [5, 7, 8],
        [np.nan, np.nan, np.nan],
        [-5, 0, 25],
        [999, 1, -1],
        [np.nan, 0, np.nan],
    ]),
    columns=['f1', 'f2', 'f3'],  # feature 1, feature 2, feature 3
)
X

Unnamed: 0,f1,f2,f3
0,5.0,7.0,8.0
1,,,
2,-5.0,0.0,25.0
3,999.0,1.0,-1.0
4,,0.0,


###MissingIndicator

In [17]:
# Discard rows with no observed value at all (thresh=1 keeps any row that has
# at least one non-NaN entry), then renumber the surviving rows from 0.
X.dropna(axis='index', thresh=1, inplace=True)
X.reset_index(drop=True, inplace=True)
X

Unnamed: 0,f1,f2,f3
0,5.0,7.0,8.0
1,-5.0,0.0,25.0
2,999.0,1.0,-1.0
3,,0.0,


In [18]:
from sklearn.impute import MissingIndicator

# Treat the 999 entry as missing by turning it into a real NaN first.
# ``np.nan`` replaces the ``np.NaN`` alias, which NumPy 2.0 removed.
X.replace({999.0: np.nan}, inplace=True)

# Keep the estimator and its boolean mask under separate names instead of
# rebinding ``indicator`` from estimator to array to DataFrame.
missing_flagger = MissingIndicator(missing_values=np.nan)
missing_mask = missing_flagger.fit_transform(X)

# Only f1 and f3 contain NaN after the replace, hence the two flag columns.
indicator = pd.DataFrame(missing_mask, columns=['m1', 'm3'])
indicator

Unnamed: 0,m1,m3
0,False,False
1,False,False
2,True,False
3,True,True


##Imputing values

In [21]:
# Fill every missing entry with the mean of its own column.
column_means = X.mean()
X.fillna(value=column_means, inplace=True)
X

Unnamed: 0,f1,f2,f3
0,5.0,7.0,8.0
1,-5.0,0.0,25.0
2,0.0,1.0,-1.0
3,0.0,0.0,10.666667


In [19]:
from sklearn.impute import SimpleImputer

# Same mean imputation as the pandas fillna cell, done with scikit-learn:
# learn the per-column means, then substitute them for the NaN entries.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
imp.fit(X)
imp.transform(X)

array([[ 5.        ,  7.        ,  8.        ],
       [-5.        ,  0.        , 25.        ],
       [ 0.        ,  1.        , -1.        ],
       [ 0.        ,  0.        , 10.66666667]])

#Polynomial features

In [25]:
from sklearn.preprocessing import PolynomialFeatures

# degree=2 with interaction_only=True: judging by the recorded output, each
# row becomes [1, f1, f2, f3, f1*f2, f1*f3, f2*f3] -- seven columns, with no
# squared terms.
poly = PolynomialFeatures(degree=2, interaction_only=True)
interaction_terms = poly.fit_transform(X)

# Show the input frame next to the expanded matrix for comparison.
X, interaction_terms

(    f1   f2         f3
 0  5.0  7.0   8.000000
 1 -5.0  0.0  25.000000
 2  0.0  1.0  -1.000000
 3  0.0  0.0  10.666667,
 array([[   1.        ,    5.        ,    7.        ,    8.        ,
           35.        ,   40.        ,   56.        ],
        [   1.        ,   -5.        ,    0.        ,   25.        ,
           -0.        , -125.        ,    0.        ],
        [   1.        ,    0.        ,    1.        ,   -1.        ,
            0.        ,   -0.        ,   -1.        ],
        [   1.        ,    0.        ,    0.        ,   10.66666667,
            0.        ,    0.        ,    0.        ]]))

#Categorical features

In [31]:
# Toy categorical frame. Because the data mixes strings with a float NaN,
# NumPy builds a string array, so the missing edu_level is stored as the text
# 'nan' rather than as a real missing value. ``np.nan`` replaces the
# ``np.NaN`` alias, which NumPy 2.0 removed.
X = pd.DataFrame(
    np.array([
        ['M', 'O-', 'medium'],
        ['M', 'O-', 'high'],
        ['F', 'O+', 'high'],
        ['F', 'AB', 'low'],
        ['F', 'B+', np.nan],
    ]),
    columns=['sex', 'blood_type', 'edu_level'],
)
X

Unnamed: 0,sex,blood_type,edu_level
0,M,O-,medium
1,M,O-,high
2,F,O+,high
3,F,AB,low
4,F,B+,


##OrdinalEncoder sklearn

In [32]:
from sklearn.preprocessing import OrdinalEncoder

# Swap the edu_level labels for numeric codes. The recorded output shows the
# codes follow alphabetical order (high=0, low=1, medium=2, 'nan'=3), not the
# natural low < medium < high ranking.
encoder = OrdinalEncoder()
edu_labels = X['edu_level'].to_numpy().reshape(-1, 1)  # encoder needs 2-D input
X['edu_level'] = encoder.fit_transform(edu_labels)
X

Unnamed: 0,sex,blood_type,edu_level
0,M,O-,2.0
1,M,O-,0.0
2,F,O+,0.0
3,F,AB,1.0
4,F,B+,3.0


In [39]:
from sklearn.preprocessing import OneHotEncoder

# ``np.int`` was only an alias of the builtin ``int`` and was removed in
# NumPy 1.24, so the builtin is used directly. The encoder returns a sparse
# matrix by default, so the explicit ``sparse=True`` (a keyword renamed in
# later scikit-learn releases) is dropped; ``.toarray()`` densifies the result.
onehot = OneHotEncoder(dtype=int)
onehot.fit_transform(X[['sex', 'blood_type']]).toarray()

array([[0, 1, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 1],
       [1, 0, 0, 0, 1, 0],
       [1, 0, 1, 0, 0, 0],
       [1, 0, 0, 1, 0, 0]])

In [40]:
# One-hot encode the nominal columns and label the result with the categories
# the encoder actually learned (one array per input column), so the labels
# cannot drift out of sync with the encoded data. For this frame that yields
# F, M, AB, B+, O+, O-.
encoded = onehot.fit_transform(X[['sex', 'blood_type']]).toarray()
nominals = pd.DataFrame(encoded, columns=np.concatenate(onehot.categories_))

# Carry the ordinal-encoded education level alongside the one-hot columns.
nominals['edu_level'] = X.edu_level
nominals

Unnamed: 0,F,M,AB,B+,O+,O-,edu_level
0,0,1,0,0,0,1,2.0
1,0,1,0,0,0,1,0.0
2,1,0,0,0,1,0,0.0
3,1,0,1,0,0,0,1.0
4,1,0,0,1,0,0,3.0


#Numerical features

#Feature scaling

In [52]:
# Small all-integer frame (5 rows x 3 features) used by the scaling examples.
feature_rows = np.array([
    [5, 7, 8],
    [2, -3, 20],
    [-5, 0, 25],
    [12, 1, -1],
    [20, 0, -1],
])
X = pd.DataFrame(feature_rows, columns=['f1', 'f2', 'f3'])  # feature 1..3
X

Unnamed: 0,f1,f2,f3
0,5,7,8
1,2,-3,20
2,-5,0,25
3,12,1,-1
4,20,0,-1


##Standardization (x_scaled = (x - u) / s, where u is the mean and s the standard deviation)

In [53]:
from sklearn.preprocessing import StandardScaler

# Standardize f3: subtract its mean and divide by its standard deviation.
# The scaler expects a 2-D array, so f3 is taken as a single-column matrix.
f3_column = X[['f3']].to_numpy()
scaler = StandardScaler()
scaler.fit(f3_column)
scaler.transform(f3_column)

array([[-0.20590443],
       [ 0.91721066],
       [ 1.38517529],
       [-1.04824076],
       [-1.04824076]])

##MinMax Scaler (x_scaled = (x - min(x)) / (max(x) - min(x)), then mapped onto feature_range)

In [49]:
from sklearn.preprocessing import MinMaxScaler

# Rescale f2 linearly so that its minimum maps to -3 and its maximum to 3.
# NOTE(review): the output recorded below contains a NaN although the f2
# column built in the feature-scaling cell has none, and this cell's
# execution counter (49) is lower than that cell's (52) -- the output appears
# to come from an earlier run on different data; re-run to refresh it.
f2_column = X[['f2']].to_numpy()
scaler = MinMaxScaler(feature_range=(-3, 3))
scaler.fit_transform(f2_column)

array([[ 3.        ],
       [        nan],
       [-3.        ],
       [-2.14285714],
       [-3.        ]])

##MaxAbs Scaler(x_scaled = x / max(abs(x)))

In [54]:
from sklearn.preprocessing import MaxAbsScaler

# Divide f3 by its largest absolute value so every entry lands in [-1, 1].
f3_column = X[['f3']].to_numpy()
scaler = MaxAbsScaler()
scaler.fit_transform(f3_column)

array([[ 0.32],
       [ 0.8 ],
       [ 1.  ],
       [-0.04],
       [-0.04]])

##Robust Scaler

In [55]:
from sklearn.preprocessing import RobustScaler

# quantile_range is expressed in percentiles (0-100), not fractions. The
# former (0.1, 0.9) selected the 0.1th-0.9th percentile band, which has zero
# width on this column, so the data was only centred on the median (8) and
# never scaled. (10.0, 90.0) gives the intended 10th-90th percentile spread:
# 23 - (-1) = 24 for f3.
robust = RobustScaler(quantile_range=(10.0, 90.0))
robust.fit_transform(X.f3.values.reshape(-1, 1))

array([[ 0.        ],
       [ 0.5       ],
       [ 0.70833333],
       [-0.375     ],
       [-0.375     ]])