In [2]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import pandas as pd 
import numpy as np

In [3]:
income = pd.read_csv('income.csv')
income.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
income.rename(columns={'income': 'class'}, inplace=True)

In [5]:
income.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
class             object
dtype: object

In [6]:
for cat in [ 'workclass', 'education', 'marital-status', 'occupation','relationship','race','sex','native-country']:
    print("Number of levels in category '{0}': \b {1:2.2f} ".format(cat, income[cat].unique().size))

Number of levels in category 'workclass':  9.00 
Number of levels in category 'education':  16.00 
Number of levels in category 'marital-status':  7.00 
Number of levels in category 'occupation':  15.00 
Number of levels in category 'relationship':  6.00 
Number of levels in category 'race':  5.00 
Number of levels in category 'sex':  2.00 
Number of levels in category 'native-country':  41.00 


In [7]:
for cat in ['relationship', 'race','sex']:
    print("Levels for catgeory '{0}': {1}".format(cat, income[cat].unique()))

Levels for catgeory 'relationship': [' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']
Levels for catgeory 'race': [' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other']
Levels for catgeory 'sex': [' Male' ' Female']


In [17]:
income['sex'] = income['sex'].map({' Male':0,' Female':1})
income['race'] =income['race'].map({' White':0,' Black':1,' Asian-Pac-Islander':2,' Amer-Indian-Eskimo':3,' Other':4})
income['relationship'] =income['relationship'].map({' Not-in-family':0,' Husband':1,' Wife':2,' Own-child':3,' Unmarried':4,' Other-relative':5})

In [10]:
income = income.fillna(-999)
pd.isnull(income).any()

age               False
workclass         False
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation        False
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country    False
class             False
dtype: bool

In [35]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
workclass_trans = mlb.fit_transform([{str(val)} for val in income['workclass'].values])
education_trans = mlb.fit_transform([{str(val)} for val in income['education'].values])
marital_trans = mlb.fit_transform([{str(val)} for val in income['marital-status'].values])
occupation_trans = mlb.fit_transform([{str(val)} for val in income['occupation'].values])
native_trans = mlb.fit_transform([{str(val)} for val in income['native-country'].values])

 

In [49]:
income_new = income.drop(['workclass','education','marital-status','occupation','native-country'], axis=1)
#assert (len(income['native-country'].unique()) == len(mlb.classes_)), "Not Equal" 

In [50]:
#income_new.values
income_new = np.hstack((income_new.values ,workclass_trans, education_trans,marital_trans, occupation_trans, native_trans))
income_new

array([[39, 77516, 13, ..., 1, 0, 0],
       [50, 83311, 13, ..., 1, 0, 0],
       [38, 215646, 9, ..., 1, 0, 0],
       ..., 
       [24, 390867, 14, ..., 1, 0, 0],
       [31, 101697, 11, ..., 1, 0, 0],
       [36, 279721, 9, ..., 1, 0, 0]], dtype=object)

In [52]:
income_new[0].size

98

In [53]:
income_class = income['class'].values

In [54]:
income_class

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', -999], dtype=object)