In [1]:
import numpy as np
import pandas as pd
train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None)
test_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test' , skiprows = 1, header = None)


In [2]:
col_labels = ['age', 'workclass', 'fnlwgt', 'education','education_num','marital_status', 'occupation', 'relationship', 'race' , 'sex', 'capital_gain', 'capital_loss' , 'hours_per_week' , 'native_country', 'wage_class']

In [3]:
train_set.columns = col_labels
test_set.columns = col_labels

In [4]:
train_set.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
wage_class        object
dtype: object

In [5]:
train_set.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [6]:
train_set.rename(columns = {'wage_class':'income'}, inplace = True)
test_set.rename(columns = {'wage_class':'income'}, inplace = True)

In [7]:
def clean_income(x):
    if '<=50K' in x:
        return 0
    return 1

In [8]:
train_set['income']=train_set['income'].apply(clean_income)


In [9]:
test_set['income']=test_set['income'].apply(clean_income)

In [10]:
test_set['income'].value_counts()

0    12435
1     3846
Name: income, dtype: int64

# Normalization

In [11]:
cols = ['age', 'education_num', 'capital_gain','capital_loss','hours_per_week' ]
for col in cols:
    # find minimum and maximum of that column
    minimum = min(train_set[col])
    maximum = max(train_set[col])
    train_set[col] = (train_set[col] - minimum) / (maximum - minimum)

In [12]:
train_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.30137,State-gov,77516,Bachelors,0.8,Never-married,Adm-clerical,Not-in-family,White,Male,0.02174,0.0,0.397959,United-States,0
1,0.452055,Self-emp-not-inc,83311,Bachelors,0.8,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,0.122449,United-States,0
2,0.287671,Private,215646,HS-grad,0.533333,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,0.397959,United-States,0
3,0.493151,Private,234721,11th,0.4,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,0.397959,United-States,0
4,0.150685,Private,338409,Bachelors,0.8,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,0.397959,Cuba,0


In [13]:
cols = ['age', 'education_num', 'capital_gain','capital_loss','hours_per_week', ]
for col in cols:
    # find minimum and maximum of that column
    minimum = min(test_set[col])
    maximum = max(test_set[col])
    test_set[col] = (test_set[col] - minimum) / (maximum - minimum)

In [14]:
test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.109589,Private,226802,11th,0.4,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,0.397959,United-States,0
1,0.287671,Private,89814,HS-grad,0.533333,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,0.5,United-States,0
2,0.150685,Local-gov,336951,Assoc-acdm,0.733333,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,0.397959,United-States,1
3,0.369863,Private,160323,Some-college,0.6,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0.076881,0.0,0.397959,United-States,1
4,0.013699,?,103497,Some-college,0.6,Never-married,?,Own-child,White,Female,0.0,0.0,0.295918,United-States,0


In [15]:
#Or use this technique for normalization

"""from sklearn.preprocessing import MinMaxScaler
scaler= MinMaxScaler()
numerical = ['age', 'education_num', 'capital_gain','capital_loss','hours_per_week']
minmax_transform= pd.DataFrame(train_set)
minmax_transform[numerical]= scaler.fit_transform(train_set[numerical])
minmax_transform.head()"""

"from sklearn.preprocessing import MinMaxScaler\nscaler= MinMaxScaler()\nnumerical = ['age', 'education_num', 'capital_gain','capital_loss','hours_per_week']\nminmax_transform= pd.DataFrame(train_set)\nminmax_transform[numerical]= scaler.fit_transform(train_set[numerical])\nminmax_transform.head()"

# Applying Label Encoding

In [16]:
train_set.head()
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for col in train_set.columns:
    if train_set[col].dtypes == 'object':
        train_set[col]= le.fit_transform(train_set[col])

In [17]:
train_set.head()
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for col in test_set.columns:
    if test_set[col].dtypes == 'object':
        test_set[col]= le.fit_transform(test_set[col])

In [18]:
train_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.30137,7,77516,9,0.8,4,1,1,4,1,0.02174,0.0,0.397959,39,0
1,0.452055,6,83311,9,0.8,2,4,0,4,1,0.0,0.0,0.122449,39,0
2,0.287671,4,215646,11,0.533333,0,6,1,4,1,0.0,0.0,0.397959,39,0
3,0.493151,4,234721,1,0.4,2,6,0,2,1,0.0,0.0,0.397959,39,0
4,0.150685,4,338409,9,0.8,2,10,5,2,0,0.0,0.0,0.397959,5,0


In [19]:
test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0.109589,4,226802,1,0.4,4,7,3,2,1,0.0,0.0,0.397959,38,0
1,0.287671,4,89814,11,0.533333,2,5,0,4,1,0.0,0.0,0.5,38,0
2,0.150685,2,336951,7,0.733333,2,11,0,4,1,0.0,0.0,0.397959,38,1
3,0.369863,4,160323,15,0.6,2,7,0,2,1,0.076881,0.0,0.397959,38,1
4,0.013699,0,103497,15,0.6,4,0,3,4,0,0.0,0.0,0.295918,38,0
