In [24]:
import pandas as pd 
import numpy as np
from sklearn import preprocessing

attrs = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country','target'] 


data = pd.read_csv("dataset/adult/adult.test", names=attrs, skipinitialspace=True) 


In [25]:
data = data.replace(to_replace ='?', 
                 value =np.NaN)

In [26]:
data.isna().sum()

age                 0
workclass         964
fnlwgt              1
education           1
education_num       1
marital_status      1
occupation        967
relationship        1
race                1
sex                 1
capital_gain        1
capital_loss        1
hours_per_week      1
native_country    275
target              1
dtype: int64

In [27]:
data.dropna(inplace=True)

In [28]:
data.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
target            0
dtype: int64

In [29]:
data['target'].value_counts()

<=50K.    11360
>50K.      3700
Name: target, dtype: int64

In [30]:
for column in data.columns:
    print("Column:", column, " -->", data[column].unique())

Column: age  --> ['25' '38' '28' '44' '34' '63' '24' '55' '65' '36' '26' '48' '43' '20'
 '37' '45' '22' '23' '54' '32' '46' '56' '17' '29' '39' '52' '18' '21'
 '42' '33' '30' '47' '41' '19' '69' '50' '31' '59' '49' '58' '40' '27'
 '57' '61' '51' '73' '53' '80' '62' '35' '72' '64' '68' '66' '60' '67'
 '71' '70' '90' '77' '81' '74' '78' '82' '75' '85' '76' '89' '83' '79'
 '88' '87' '84']
Column: workclass  --> ['Private' 'Local-gov' 'Self-emp-not-inc' 'Federal-gov' 'State-gov'
 'Self-emp-inc' 'Without-pay']
Column: fnlwgt  --> [226802.  89814. 336951. ... 350977. 349230.  83891.]
Column: education  --> ['11th' 'HS-grad' 'Assoc-acdm' 'Some-college' '10th' 'Prof-school'
 '7th-8th' 'Bachelors' 'Masters' '5th-6th' 'Assoc-voc' '9th' 'Doctorate'
 '12th' '1st-4th' 'Preschool']
Column: education_num  --> [ 7.  9. 12. 10.  6. 15.  4. 13. 14.  3. 11.  5. 16.  8.  2.  1.]
Column: marital_status  --> ['Never-married' 'Married-civ-spouse' 'Widowed' 'Separated' 'Divorced'
 'Married-spouse-absent' 'Mar

In [8]:
data.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
target            object
dtype: object

In [9]:
g = data.columns.to_series().groupby(data.dtypes).groups

In [10]:
data['native_country'].value_counts(dropna=False)

United-States                 27504
Mexico                          610
Philippines                     188
Germany                         128
Puerto-Rico                     109
Canada                          107
India                           100
El-Salvador                     100
Cuba                             92
England                          86
Jamaica                          80
South                            71
China                            68
Italy                            68
Dominican-Republic               67
Vietnam                          64
Guatemala                        63
Japan                            59
Columbia                         56
Poland                           56
Iran                             42
Taiwan                           42
Haiti                            42
Portugal                         34
Nicaragua                        33
Peru                             30
Greece                           29
France                      

In [11]:
target_mapping = {'<=50K': -1, '>50K': 1}
data.replace({"target": target_mapping}, inplace=True)
data['native_country'] = data['native_country'].apply(lambda x: 'USA' if x in ['United-States'] else 'NON-USA')
data['education'] = data['education'].apply(lambda x: 'pre-middle-school' if x in ["Preschool", "1st-4th", "5th-6th", "7th-8th"] else 'high-school')

data['native_country'].value_counts(dropna=False)

USA        27504
NON-USA     2658
Name: native_country, dtype: int64

In [12]:
data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,target
0,39,State-gov,77516,high-school,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,USA,-1
1,50,Self-emp-not-inc,83311,high-school,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,USA,-1
2,38,Private,215646,high-school,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,USA,-1
3,53,Private,234721,high-school,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,USA,-1
4,28,Private,338409,high-school,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,NON-USA,-1


In [13]:
encoded_object_df = pd.DataFrame()

for column in ['workclass', 'sex', 'education', 'marital_status', 'occupation','relationship', 'race', 'native_country']:
    encoded_object_df = pd.concat([encoded_object_df,pd.get_dummies(data[column], prefix=column, drop_first=True)] ,axis=1)

encoded_object_df.head(5)

Unnamed: 0,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,sex_Male,education_pre-middle-school,marital_status_Married-AF-spouse,marital_status_Married-civ-spouse,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,native_country_USA
0,0,0,0,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,1
1,0,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,1
2,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,1
3,0,1,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,1
4,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,1,0,0,0


In [14]:
min_max_scaler = preprocessing.MinMaxScaler()

cols_to_scale = ['age','fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'capital_loss', 'target']

encoded_int_df = data[cols_to_scale]

encoded_int_df[cols_to_scale] = min_max_scaler.fit_transform(encoded_int_df[cols_to_scale])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [15]:
encoded_int_df.head(5)

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,capital_loss.1,target
0,0.30137,0.043338,0.8,0.02174,0.0,0.0,0.0
1,0.452055,0.047277,0.8,0.0,0.0,0.0,0.0
2,0.287671,0.137244,0.533333,0.0,0.0,0.0,0.0
3,0.493151,0.150212,0.4,0.0,0.0,0.0,0.0
4,0.150685,0.220703,0.8,0.0,0.0,0.0,0.0


In [16]:
final_df = pd.concat([encoded_object_df, encoded_int_df], axis=1)

In [17]:
final_df.shape

(30162, 44)

In [18]:
final_df.columns

Index(['workclass_Local-gov', 'workclass_Private', 'workclass_Self-emp-inc',
       'workclass_Self-emp-not-inc', 'workclass_State-gov',
       'workclass_Without-pay', 'sex_Male', 'education_pre-middle-school',
       'marital_status_Married-AF-spouse', 'marital_status_Married-civ-spouse',
       'marital_status_Married-spouse-absent', 'marital_status_Never-married',
       'marital_status_Separated', 'marital_status_Widowed',
       'occupation_Armed-Forces', 'occupation_Craft-repair',
       'occupation_Exec-managerial', 'occupation_Farming-fishing',
       'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct',
       'occupation_Other-service', 'occupation_Priv-house-serv',
       'occupation_Prof-specialty', 'occupation_Protective-serv',
       'occupation_Sales', 'occupation_Tech-support',
       'occupation_Transport-moving', 'relationship_Not-in-family',
       'relationship_Other-relative', 'relationship_Own-child',
       'relationship_Unmarried', 'relationship_Wife'

In [19]:
final_df['target'].value_counts()

0.0    22654
1.0     7508
Name: target, dtype: int64

In [20]:
y = np.where(final_df['target'] == 0, -1, 1)

In [21]:
X_control = final_df['sex_Male']

In [22]:
X = final_df.drop(['target','sex_Male'], axis=1)

In [23]:
X.shape

(30162, 42)