In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
data = pd.read_csv('income_evaluation.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df=data.copy()

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlwgt          32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [5]:
df.isnull().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [7]:
## Pipeline

numeric_columns=df.select_dtypes(include="number").columns
numeric_columns


Index(['age', ' fnlwgt', ' education-num', ' capital-gain', ' capital-loss',
       ' hours-per-week'],
      dtype='object')

In [8]:
ord_cat_columns = df[[' occupation', ' income']].columns

ohe_cat_columns = df[[' race', ' sex', ' native-country', ' relationship', ' workclass',' marital-status']].columns
print(ord_cat_columns)
print()
print(ohe_cat_columns)

Index([' occupation', ' income'], dtype='object')

Index([' race', ' sex', ' native-country', ' relationship', ' workclass',
       ' marital-status'],
      dtype='object')


In [9]:
numerical_Pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scl', StandardScaler())
])


ord_cat_Pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ord', OrdinalEncoder())
])


ohe_cat_Pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])



preprocessor = ColumnTransformer(
    transformers=[
        ('ohe_categorical', ohe_cat_Pipeline, ohe_cat_columns),
        ('numerical', numerical_Pipeline, numeric_columns),
        ('ordinal', ord_cat_Pipeline, ord_cat_columns)
    ])

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

pipe.fit(df)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('ohe_categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Index([' race', ' sex', ' native-country', ' relationship', ' workclass',
       ' marital-status'],
      dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scl',
                                                    

In [13]:
import pipename as pn

columns=pn.get_feature_names(preprocessor)
columns

['ohe__x0_ Amer-Indian-Eskimo',
 'ohe__x0_ Asian-Pac-Islander',
 'ohe__x0_ Black',
 'ohe__x0_ Other',
 'ohe__x0_ White',
 'ohe__x1_ Female',
 'ohe__x1_ Male',
 'ohe__x2_ ?',
 'ohe__x2_ Cambodia',
 'ohe__x2_ Canada',
 'ohe__x2_ China',
 'ohe__x2_ Columbia',
 'ohe__x2_ Cuba',
 'ohe__x2_ Dominican-Republic',
 'ohe__x2_ Ecuador',
 'ohe__x2_ El-Salvador',
 'ohe__x2_ England',
 'ohe__x2_ France',
 'ohe__x2_ Germany',
 'ohe__x2_ Greece',
 'ohe__x2_ Guatemala',
 'ohe__x2_ Haiti',
 'ohe__x2_ Holand-Netherlands',
 'ohe__x2_ Honduras',
 'ohe__x2_ Hong',
 'ohe__x2_ Hungary',
 'ohe__x2_ India',
 'ohe__x2_ Iran',
 'ohe__x2_ Ireland',
 'ohe__x2_ Italy',
 'ohe__x2_ Jamaica',
 'ohe__x2_ Japan',
 'ohe__x2_ Laos',
 'ohe__x2_ Mexico',
 'ohe__x2_ Nicaragua',
 'ohe__x2_ Outlying-US(Guam-USVI-etc)',
 'ohe__x2_ Peru',
 'ohe__x2_ Philippines',
 'ohe__x2_ Poland',
 'ohe__x2_ Portugal',
 'ohe__x2_ Puerto-Rico',
 'ohe__x2_ Scotland',
 'ohe__x2_ South',
 'ohe__x2_ Taiwan',
 'ohe__x2_ Thailand',
 'ohe__x2_ Trinadad

In [30]:
dataset = pipe.transform(df)

In [31]:
dataset.toarray()

array([[ 0.        ,  0.        ,  0.        , ..., -0.03542945,
         1.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ..., -2.22215312,
         4.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ..., -0.03542945,
         6.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.03542945,
         1.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ..., -1.65522476,
         1.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ..., -0.03542945,
         4.        ,  1.        ]])

In [32]:
dataframe=pd.DataFrame(dataset.toarray(), columns = columns)

dataframe

Unnamed: 0,ohe__x0_ Amer-Indian-Eskimo,ohe__x0_ Asian-Pac-Islander,ohe__x0_ Black,ohe__x0_ Other,ohe__x0_ White,ohe__x1_ Female,ohe__x1_ Male,ohe__x2_ ?,ohe__x2_ Cambodia,ohe__x2_ Canada,...,ohe__x5_ Separated,ohe__x5_ Widowed,numerical__age,numerical__ fnlwgt,numerical__ education-num,numerical__ capital-gain,numerical__ capital-loss,numerical__ hours-per-week,ordinal__ occupation,ordinal__ income
0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153,4.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429,6.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429,6.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429,10.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409,13.0,0.0
32557,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429,7.0,1.0
32558,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429,1.0,0.0
32559,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225,1.0,0.0


In [33]:
X=dataframe.iloc[:,:-1]
y=dataframe.iloc[:,-1]

X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.23,random_state=42)

from xgboost import XGBClassifier

xgbt = XGBClassifier(max_depth = 2,
             learning_rate = 0.2,
             objective  = "multi:softmax",
             num_class = 2,
             booster = "gbtree",
             n_estimarors = 10,
             random_state = 123)

xgbt.fit(X_train, y_train)

xgbt_pred = xgbt.predict(X_test)

from sklearn.metrics import accuracy_score

accuracy_score(y_test, xgbt_pred)

Parameters: { "n_estimarors" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




0.8683578104138852

In [31]:
xgbt.score(X_train, y_train)

0.8670176698177177

In [32]:
xgbt.score(X_test, y_test)

0.8683578104138852