In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans

In [2]:
#load the adult dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='../datasets/adult.csv')

In [3]:
#preview the loaded DataFrame (bare last expression -> rich display)
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [5]:
#convert question mark symbols '?' to NaN so the imputers can treat them as missing
#(rebind the result instead of using inplace=True, so the frame displayed by
#earlier cells is not silently mutated)
df = df.replace('?', np.nan)

In [6]:
#list each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  31978 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
#encode the string labels of the target column as integer class ids
le = LabelEncoder()
df['income'] = le.fit_transform(df['income'])

In [8]:
#numerical branch: fill missing values with the column mean, then standardize
numerical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)),
    ('scaler', StandardScaler()),
])

#categorical branch: fill missing values with the most frequent value,
#then one-hot encode (categories unseen during fit are ignored)
categorical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent', missing_values=np.nan)),
    ('one_hot', OneHotEncoder(handle_unknown='ignore')),
])

#route every column to one of the branches according to its dtype
numerical_columns = make_column_selector(dtype_include=['int', 'float'])
categorical_columns = make_column_selector(dtype_include=['object'])
preprocessor = ColumnTransformer(transformers=[
    ('numerical', numerical_pipe, numerical_columns),
    ('categorical', categorical_pipe, categorical_columns),
])

#main pipeline: preprocessing followed by a k-nearest-neighbors classifier
pipe = Pipeline(steps=[
    ('column_transformer', preprocessor),
    ('model', KNeighborsClassifier()),
])

In [9]:
#separate the target column from the feature columns
y = df['income']
X = df.drop(columns='income')

#hold out 33% of the rows for testing (fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

#train the pipeline on the training split, then predict the held-out rows
predictions = pipe.fit(X_train, y_train).predict(X_test)

#accuracy of the pipeline on the test split
accuracy_score(y_true=y_test, y_pred=predictions)

0.8322166387493021

In [10]:
#display the structure of the pipeline
pipe

Pipeline(steps=[('column_transformer',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f918ee69210>),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('one_hot',
                                                                   OneHotEncoder(handle_unknown='ignore'

In [51]:
%%time

#defining the hyperparameter space for searching
parameters = {
    'column_transformer__numerical__imputer__strategy': ['mean', 'median'],
    'column_transformer__numerical__scaler': [StandardScaler(), MinMaxScaler()],
    'model__n_neighbors': [3, 6, 10, 15],
    'model__weights': ['uniform', 'distance'],
    'model__leaf_size': [30, 40]
}

#defining a scorer and a GridSearchCV instance
my_scorer = make_scorer(accuracy_score, greater_is_better=True)
search = GridSearchCV(pipe, parameters, cv=3, scoring=my_scorer, n_jobs=-1, verbose=1)

#search for the best hiperparameter combination within our defined hyperparameter space
search.fit(X_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits
CPU times: user 8min 53s, sys: 1min 20s, total: 10min 13s
Wall time: 10min 13s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('column_transformer',
                                        ColumnTransformer(transformers=[('numerical',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         <sklearn.compose._column_transformer.make_column_selector object at 0x7fa8536e6450>),
                                                                        ('categorical',
                                                                         Pipeline(steps=[('imputer',
                                                                 

In [59]:
#change pipeline parameters to the best combination found by the grid search
pipe.set_params(**search.best_params_)

#set_params only changes hyperparameters, it does not re-learn anything:
#without a new fit the preprocessing steps would still be the ones learned
#with the old settings. Refit on the training data so that every step of the
#pipeline reflects the selected parameters before it is evaluated.
pipe.fit(X_train, y_train)

#making predictions
predictions = pipe.predict(X_test)

#checking accuracy
accuracy_score(y_test, predictions)

0.8408710217755444

In [60]:
#display the pipeline after the best parameters were applied
pipe

Pipeline(steps=[('column_transformer',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fa8536e6450>),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('one_hot',
                                                                   OneHotEncoder(handle_unknown='ignore'