* Importar librerias
* Leer dataset
* Preparar column transformer para categorical y para numerical
* Crear pipeline
* Hacer fine tuning sobre el pipeline

Dataset link Kaggle --> https://www.kaggle.com/uciml/adult-census-income
License -> CC0: Public Domain

In [165]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans

In [166]:
df = pd.read_csv('../datasets/adult.csv')

In [167]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [168]:
for col in df.columns:
    df.loc[df[col] == '?', col] = np.nan

In [169]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32561 non-null  float64
 1   workclass       30725 non-null  object 
 2   fnlwgt          32561 non-null  float64
 3   education       32561 non-null  object 
 4   education.num   32561 non-null  float64
 5   marital.status  32561 non-null  object 
 6   occupation      30718 non-null  object 
 7   relationship    32561 non-null  object 
 8   race            32561 non-null  object 
 9   sex             32561 non-null  object 
 10  capital.gain    32561 non-null  float64
 11  capital.loss    32561 non-null  float64
 12  hours.per.week  32561 non-null  float64
 13  native.country  31978 non-null  object 
 14  income          32561 non-null  object 
dtypes: float64(6), object(9)
memory usage: 3.7+ MB


In [137]:
le = LabelEncoder()
df.income = le.fit_transform(df.income)

X = df.drop('income', axis=1)
y = df.income

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [160]:
numerical_pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
    ('cluster', KMeans(n_clusters=6))
])

categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('one_hot', OneHotEncoder())
])

preprocessor = ColumnTransformer([
    ('numerical', numerical_pipe, make_column_selector(dtype_include=['int', 'float'])),
    ('categorical', categorical_pipe, make_column_selector(dtype_include=['object'])),
])

pipe = Pipeline([
    ('column_transformer', preprocessor),
    ('model', LogisticRegression(max_iter=1000))
])

In [161]:
pipe.fit(X_train, y_train)
predictions = pipe.predict(X_test)

In [162]:
accuracy_score(y_test, predictions)

0.8512004466778336

In [163]:
pipe

Pipeline(steps=[('column_transformer',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler()),
                                                                  ('cluster',
                                                                   KMeans(n_clusters=6))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f241cbcfa10>),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
              