In [9]:
import pandas as pd
import numpy as np
import os


from sklearn.model_selection import train_test_split

from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

In [10]:
# get the current working directory
current_dir= os.getcwd()
current_dir


'/home/vish/Documents/A_Drive/Feature_Scaling/Practice'

In [11]:
work_dir= '/home/vish/Documents/A_Drive/Feature_Scaling/data'

In [12]:
# Read the csv file
df = pd.read_csv(work_dir + '/income_evaluation.csv', na_values=' ?')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [13]:
df.shape

(32561, 15)

In [14]:
# Find all missing values
df.isnull().sum()

age                   0
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week       0
 native-country     583
 income               0
dtype: int64

In [15]:
# to remove the extra space from variables names
df.columns= df.columns.str.strip()
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [16]:
# for the tesing:
# Randomly pull 40 rows from Age and Infuse/Replace missing values into this.

np.random.seed(15)

#randomly pull 40 index
r= np.random.choice(df.index, size= 40,replace= False)
print(r)

# replace the 40 values with NAN
df.loc[r,'age']= np.nan

[10125 11478  4224  6592 21910 25737 32017 13595 16559 12752 13786 18301
 21099 12408 28467 12247 27397 26561 29923 14741 25613 26132 17718 15920
 17821 30011  8622  9580 30129 14653 26693 29771 23620  9708   807  5212
  2463 14875  4138   767]


In [17]:
# Infuse missing values into the  hours-per-week
np.random.seed(25)

s= np.random.choice(df.index,size= 40, replace= True)
print(s)
df.loc[s,'hours-per-week']= np.nan

[29828 24894  6618 26767 32061  2934 24831  1175 23468 26546 21245  1160
  7324 26500 18265 19709  2975  8692 31173  6785 28839 16131 15448 31671
  7427 13140 15149 30851 23297  6550 21407 20784  5679 31945 24208 30188
  2157  4658  9173 12836]


In [18]:
df.isnull().sum()

age                 40
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week      40
native-country     583
income               0
dtype: int64

In [19]:
df.shape

(32561, 15)

In [20]:
# Train test Split

X_train, X_test, y_train, y_test= train_test_split(df.drop('income',axis= 1),
                                                   df.income, test_size= 0.2,
                                                   random_state= 0)

In [21]:
num_col= [col for col in X_train.columns if (X_train[col].dtype !='O')]
num_col

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

In [22]:
char_col= [col for col in X_train.columns if X_train[col].dtype =='O']
char_col

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

# Unscalable approach

In [23]:
ct1= ColumnTransformer([
    ('si_num', SimpleImputer(strategy= 'median', add_indicator= True), num_col),
    ('rob_num', RobustScaler(), num_col)
], remainder= 'drop')

In [24]:
pd.DataFrame(ct1.fit_transform(X_train))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,36.0,174308.0,7.0,0.0,0.0,40.0,0.0,0.0,-0.052632,-0.030971,-1.000000,0.0,0.0,0.0
1,35.0,198202.0,9.0,0.0,0.0,54.0,0.0,0.0,-0.105263,0.169550,-0.333333,0.0,0.0,2.8
2,38.0,52963.0,13.0,0.0,0.0,50.0,0.0,0.0,0.052632,-1.049314,1.000000,0.0,0.0,2.0
3,50.0,138270.0,9.0,0.0,0.0,40.0,0.0,0.0,0.684211,-0.333407,-0.333333,0.0,0.0,0.0
4,68.0,116903.0,11.0,0.0,2149.0,40.0,0.0,0.0,1.631579,-0.512721,0.333333,0.0,2149.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,43.0,33331.0,14.0,0.0,0.0,40.0,0.0,0.0,0.315789,-1.214069,1.333333,0.0,0.0,0.0
26044,44.0,98466.0,6.0,0.0,0.0,35.0,0.0,0.0,0.368421,-0.667447,-1.333333,0.0,0.0,-1.0
26045,23.0,45317.0,10.0,0.0,0.0,40.0,0.0,0.0,-0.736842,-1.113480,0.000000,0.0,0.0,0.0
26046,45.0,215862.0,16.0,7688.0,0.0,45.0,0.0,0.0,0.421053,0.317755,2.000000,7688.0,0.0,1.0


# We cannot use the same columns name in the every steps in column transfromation, it will lead to transform the 2nd step to the value  generated by the 1st step

# Another process

In [25]:
# Always use to different variables in Column transfortion in every Step
# here we use SimpleImputer for num_col and then to the char_col

In [26]:
# Inpute the missing values
ct2= ColumnTransformer([
    ('si_num', SimpleImputer(strategy='median', add_indicator= True), num_col),
    ('si_cat', SimpleImputer(strategy='constant', fill_value= 'missing', add_indicator= True), char_col)
], remainder= 'drop' )

In [27]:
pd.DataFrame(ct2.fit_transform(X_train))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,36,174308,7,0,0,40,0,0,Private,11th,Divorced,Transport-moving,Not-in-family,White,Male,United-States,False,False,False
1,35,198202,9,0,0,54,0,0,Private,HS-grad,Never-married,Exec-managerial,Not-in-family,White,Female,United-States,False,False,False
2,38,52963,13,0,0,50,0,0,Private,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Female,United-States,False,False,False
3,50,138270,9,0,0,40,0,0,Private,HS-grad,Married-civ-spouse,Sales,Wife,Black,Female,United-States,False,False,False
4,68,116903,11,0,2149,40,0,0,Self-emp-not-inc,Assoc-voc,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,43,33331,14,0,0,40,0,0,Local-gov,Masters,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,False,False,False
26044,44,98466,6,0,0,35,0,0,Private,10th,Never-married,Farming-fishing,Unmarried,White,Male,United-States,False,False,False
26045,23,45317,10,0,0,40,0,0,Private,Some-college,Separated,Sales,Own-child,White,Female,United-States,False,False,False
26046,45,215862,16,7688,0,45,0,0,Local-gov,Doctorate,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,False,False,False


In [28]:
list(range(6))

[0, 1, 2, 3, 4, 5]

In [29]:
list(range(8,18))

[8, 9, 10, 11, 12, 13, 14, 15, 16, 17]

In [30]:
# RobustScaler for numerical and Onehot  encoding for Character
ct3= ColumnTransformer([
        ('rob_num', RobustScaler(),list(range(6) )),
    ('ohe_cat', OneHotEncoder(sparse= False, handle_unknown='ignore'), list(range(8,18)))
], remainder= 'drop')

In [31]:
# use the input of ct2 into ct3
xf1= ct2.fit_transform(X_train)

In [32]:
pd.DataFrame(ct3.fit_transform(xf1))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,102,103,104,105,106,107,108,109,110,111
0,-0.052632,-0.030971,-1.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,-0.105263,0.169550,-0.333333,0.0,0.0,2.8,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.052632,-1.049314,1.000000,0.0,0.0,2.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,0.684211,-0.333407,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,1.631579,-0.512721,0.333333,0.0,2149.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26043,0.315789,-1.214069,1.333333,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
26044,0.368421,-0.667447,-1.333333,0.0,0.0,-1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
26045,-0.736842,-1.113480,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
26046,0.421053,0.317755,2.000000,7688.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


# Scalable Approach

In [33]:
#create a seperate Pipe for numerical and characte variables
# 1. numerical imputing
# 2. RobustScaler
pp_num= Pipeline([('num_imp',SimpleImputer(strategy= 'median', add_indicator= False)),
                  ('rob_scl', RobustScaler())
                 ])

# 1. character imputer
# 2. Onehot  encoding

pp_char= Pipeline([('char_imp', SimpleImputer(strategy='constant', add_indicator= False, fill_value='missing')),
                 ('Ohte', OneHotEncoder(sparse= False, handle_unknown='ignore'))
                  ])

                  

            

In [34]:
# getting missing column indicator

from sklearn.impute import MissingIndicator
mi= MissingIndicator()

In [35]:
ct= ColumnTransformer([
    ('misind', mi, X_train.columns),
    ('pipe1',pp_num, num_col),
    ('pipe2', pp_char, char_col)
], remainder= 'drop')

In [36]:
ct

ColumnTransformer(transformers=[('misind', MissingIndicator(),
                                 Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')),
                                ('pipe1',
                                 Pipeline(steps=[('num_imp',
                                                  SimpleImputer(strategy='median')),
                                                 ('rob_scl', RobustScaler())]),
                                 ['age', 'fnlwgt', 'education-num',
                                  'capital-gain', 'capital-loss',
                                  'hours-per-week']),
                                ('pipe2',
                                 Pipeline(steps=[('char_imp',
                                                  SimpleImputer(fill_value='missing',
                     

In [37]:
xt=ct.fit_transform(X_train)

In [38]:
pd.DataFrame(xt).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,103,104,105,106,107,108,109,110,111,112
0,0.0,0.0,0.0,0.0,0.0,-0.052632,-0.030971,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,-0.105263,0.16955,-0.333333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.052632,-1.049314,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.684211,-0.333407,-0.333333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.631579,-0.512721,0.333333,0.0,2149.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [39]:
# now transform the Test data, we dont have to fit the test data we only transform it to avoid  data leakeage

In [40]:
xtest= ct.transform(X_test)

In [41]:
pd.DataFrame(xtest).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,103,104,105,106,107,108,109,110,111,112
0,0.0,0.0,0.0,0.0,0.0,-0.526316,-0.007381,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,-0.526316,0.32295,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,-0.631579,0.656806,0.666667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.473684,-0.254772,-2.333333,0.0,1902.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.421053,-0.043442,-1.0,0.0,2824.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# Use  the model


In [42]:
final_pipe= Pipeline([
    ('ct_step', ct),
    ('model', RandomForestClassifier())
])

In [43]:
final_pipe.fit(X_train,y_train)

Pipeline(steps=[('ct_step',
                 ColumnTransformer(transformers=[('misind', MissingIndicator(),
                                                  Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')),
                                                 ('pipe1',
                                                  Pipeline(steps=[('num_imp',
                                                                   SimpleImputer(strategy='...
                                                   'education-num',
                                                   'capital-gain',
                                                   'capital-loss',
                                                   'hours-per-week']),
                                                 ('pipe2',
                                         

In [44]:
# Predict the test data
y_pred= final_pipe.predict(X_test)

In [45]:
# Check the accquracy
accuracy_score(y_test, y_pred)

0.8513741747274681

# To enter into the internal  of final  pipe line

named_steps :-   It is for Pipeline name see the attribue


named_transformers:- it is for the  ColumnTransformers attribute to see variable deatail

In [46]:
final_pipe.named_steps

{'ct_step': ColumnTransformer(transformers=[('misind', MissingIndicator(),
                                  Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
       dtype='object')),
                                 ('pipe1',
                                  Pipeline(steps=[('num_imp',
                                                   SimpleImputer(strategy='median')),
                                                  ('rob_scl', RobustScaler())]),
                                  ['age', 'fnlwgt', 'education-num',
                                   'capital-gain', 'capital-loss',
                                   'hours-per-week']),
                                 ('pipe2',
                                  Pipeline(steps=[('char_imp',
                                                   SimpleImputer(fill_value='missi

In [47]:
final_pipe.named_steps['ct_step']

ColumnTransformer(transformers=[('misind', MissingIndicator(),
                                 Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')),
                                ('pipe1',
                                 Pipeline(steps=[('num_imp',
                                                  SimpleImputer(strategy='median')),
                                                 ('rob_scl', RobustScaler())]),
                                 ['age', 'fnlwgt', 'education-num',
                                  'capital-gain', 'capital-loss',
                                  'hours-per-week']),
                                ('pipe2',
                                 Pipeline(steps=[('char_imp',
                                                  SimpleImputer(fill_value='missing',
                     

In [48]:
final_pipe.named_steps['ct_step'].named_transformers_

{'misind': MissingIndicator(),
 'pipe1': Pipeline(steps=[('num_imp', SimpleImputer(strategy='median')),
                 ('rob_scl', RobustScaler())]),
 'pipe2': Pipeline(steps=[('char_imp',
                  SimpleImputer(fill_value='missing', strategy='constant')),
                 ('Ohte', OneHotEncoder(handle_unknown='ignore', sparse=False))])}

In [49]:
final_pipe.named_steps['ct_step'].named_transformers_['pipe2']

Pipeline(steps=[('char_imp',
                 SimpleImputer(fill_value='missing', strategy='constant')),
                ('Ohte', OneHotEncoder(handle_unknown='ignore', sparse=False))])

In [50]:
final_pipe.named_steps['ct_step'].named_transformers_['pipe2'].named_steps['Ohte']

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [51]:
# Get all the  features name after One  hot encoding

final_pipe.named_steps['ct_step'].named_transformers_['pipe2'].named_steps['Ohte'].get_feature_names()

array(['x0_ Federal-gov', 'x0_ Local-gov', 'x0_ Never-worked',
       'x0_ Private', 'x0_ Self-emp-inc', 'x0_ Self-emp-not-inc',
       'x0_ State-gov', 'x0_ Without-pay', 'x0_missing', 'x1_ 10th',
       'x1_ 11th', 'x1_ 12th', 'x1_ 1st-4th', 'x1_ 5th-6th',
       'x1_ 7th-8th', 'x1_ 9th', 'x1_ Assoc-acdm', 'x1_ Assoc-voc',
       'x1_ Bachelors', 'x1_ Doctorate', 'x1_ HS-grad', 'x1_ Masters',
       'x1_ Preschool', 'x1_ Prof-school', 'x1_ Some-college',
       'x2_ Divorced', 'x2_ Married-AF-spouse', 'x2_ Married-civ-spouse',
       'x2_ Married-spouse-absent', 'x2_ Never-married', 'x2_ Separated',
       'x2_ Widowed', 'x3_ Adm-clerical', 'x3_ Armed-Forces',
       'x3_ Craft-repair', 'x3_ Exec-managerial', 'x3_ Farming-fishing',
       'x3_ Handlers-cleaners', 'x3_ Machine-op-inspct',
       'x3_ Other-service', 'x3_ Priv-house-serv', 'x3_ Prof-specialty',
       'x3_ Protective-serv', 'x3_ Sales', 'x3_ Tech-support',
       'x3_ Transport-moving', 'x3_missing', 'x4_ Husband',
   