# DropNa Test Data 

Drop all instances containing missing values

In [1]:
import pandas as pd, numpy as np

In [2]:
# Column labels for the header-less census CSV, in file order.
# The final entry ('50k') is the income label column.
col_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education_num',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    '50k',
]

In [3]:
# Load the raw test split; the file has no header row, so labels come from col_names.
df_test_raw_w_nans = pd.read_csv(
    'census-income.test.csv',
    header=None,
    names=col_names,
)

In [4]:
# Cells equal to ' ?' (note the leading space) become NaN so that dropna()
# can find them. Rebinding instead of inplace=True avoids mutating the frame
# that the loading cell produced.
df_test_raw_w_nans = df_test_raw_w_nans.replace(' ?', np.nan)

In [5]:
# Keep only rows with no missing value in any column; the original row index is retained.
df_test_raw = df_test_raw_w_nans.dropna(axis=0, how='any')

# Missing Values 

In [6]:
# Rows removed by dropna() = rows before cleaning minus rows after cleaning.
print(
    'test data set contains {} rows with missing values'.format(
        df_test_raw_w_nans.shape[0] - df_test_raw.shape[0]
    )
)

test data set contains 1221 rows with missing values


In [7]:
# Share of test rows that contain at least one missing value.
# The counts are derived from the frames instead of being typed in by hand,
# and the message now describes what is actually being measured.
n_rows_total = len(df_test_raw_w_nans)
n_rows_missing = n_rows_total - len(df_test_raw)
print('rows with missing values accounted for {}% of the test data set'.format(n_rows_missing / n_rows_total * 100))

negative instances accounted for 7.499539340335361% of our times


# Unbalanced Data

In [8]:
# One-hot encode the un-cleaned frame and keep its last dummy column.
# Per the recorded output, that column is '50k_ >50K.'.
target_w_nans = pd.get_dummies(data=df_test_raw_w_nans).iloc[:, -1]

# Class balance before rows with missing values are dropped.
target_w_nans.value_counts(sort=True)

0    12435
1     3846
Name: 50k_ >50K., dtype: int64

In [9]:
# Same count as reported earlier: rows lost to dropna().
print(
    'test data set contains {} rows with missing values'.format(
        df_test_raw_w_nans.shape[0] - df_test_raw.shape[0]
    )
)

test data set contains 1221 rows with missing values


In [10]:
# Preview only the first rows instead of rendering the whole frame.
# The gaps in the index (e.g. 4 and 6) are rows removed by dropna().
df_test_raw.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,50k
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K.
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K.
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K.
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K.
10,65,Private,184454,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,6418,0,40,United-States,>50K.
11,36,Federal-gov,212465,Bachelors,13,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,<=50K.


# Encode Target Column 

In [11]:
# One-hot encode the cleaned frame and keep its last dummy column as the binary target.
# Per the recorded output, that column is '50k_ >50K.'.
test_target = pd.get_dummies(data=df_test_raw).iloc[:, -1]

In [12]:
# First five encoded labels, still carrying the original row index.
test_target.head(n=5)

0    0
1    0
2    1
3    1
5    0
Name: 50k_ >50K., dtype: uint8

# Unbalanced Data

In [13]:
# Class balance after rows with missing values were dropped.
test_target.value_counts(sort=True)

0    11360
1     3700
Name: 50k_ >50K., dtype: int64

# Continuous Columns 

age, education_num, fnlwgt, capital_gain, capital_loss, hours_per_week

Note: `education_num` is continuous but is not carried into the feature set built below.

In [14]:
# Feature matrix: every column except the last one (the income label).
test_features_raw = df_test_raw.drop(columns=df_test_raw.columns[-1])

In [15]:
# Number of raw feature columns.
test_features_raw.shape[1]

14

In [16]:
# Numeric features carried into the final frame.
# NOTE(review): education_num is listed as continuous in the heading above but
# is not selected here - confirm that the omission is intentional.
df_continuous = test_features_raw[
    ['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week']
]

# Categorical Columns as 0s and 1s (No NAs)

In [17]:
# Rebuild the feature matrix (all columns but the trailing label column);
# this repeats the assignment made in the continuous-columns section.
test_features_raw = df_test_raw.drop(columns=df_test_raw.columns[-1])

In [18]:
# Peek at the first five feature rows.
test_features_raw.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States


### workclass

In [19]:
# One indicator column per workclass category.
# The former mid-cell `.head()` was never displayed (only the last expression
# of a cell is rendered), so it has been removed.
workclass = pd.get_dummies(test_features_raw.workclass)

# Number of workclass categories present in the test data.
len(workclass.columns)

7

In [20]:
# Per-column check that no indicator column contains a missing value.
workclass.isna().any(axis=0)

 Federal-gov         False
 Local-gov           False
 Private             False
 Self-emp-inc        False
 Self-emp-not-inc    False
 State-gov           False
 Without-pay         False
dtype: bool

### education 

In [21]:
# One indicator column per education level.
# The former mid-cell `.head()` was never displayed, so it has been removed.
# NOTE(review): `education` is not merged into the final frame in the cells
# visible in this notebook - confirm that this is intended.
education = pd.get_dummies(test_features_raw.education)

# Number of education levels present in the test data.
len(education.columns)

16

### marital_status

In [22]:
# One indicator column per marital status.
# The former mid-cell `.head()` was never displayed, so it has been removed.
marital_status = pd.get_dummies(test_features_raw.marital_status)

# Number of marital-status categories present in the test data.
len(marital_status.columns)

7

### occupation 

In [23]:
# One indicator column per occupation.
# The former mid-cell `.head()` was never displayed, so it has been removed.
occupation = pd.get_dummies(test_features_raw.occupation)

# Number of occupation categories present in the test data.
len(occupation.columns)

14

### relationship 

In [24]:
# One indicator column per relationship value.
# The former mid-cell `.head()` was never displayed, so it has been removed.
relationship = pd.get_dummies(test_features_raw.relationship)

# Number of relationship categories present in the test data.
len(relationship.columns)

6

### race

In [25]:
# One indicator column per race value.
# The column is selected by name (it sits at position 8 of col_names) to match
# the other encoding cells and to stay correct if the column order changes.
# The former mid-cell `.head()` was never displayed, so it has been removed.
race = pd.get_dummies(test_features_raw.race)

# Number of race categories present in the test data.
len(race.columns)

5

### sex

In [26]:
# Indicator columns for sex. The column is selected by name (it sits at
# position 9 of col_names) to match the other encoding cells.
sex = pd.get_dummies(test_features_raw.sex)
sex.head()  # two indicator columns are produced: Female and Male

Unnamed: 0,Female,Male
0,0,1
1,0,1
2,0,1
3,0,1
5,0,1


### native_country

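One fewer column than the train data: the ` Holand-Netherlands` category does not occur in the test set, so an all-zero indicator column is inserted for it below.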

In [27]:
# One indicator column per native country.
# The former mid-cell `.head()` was never displayed, so it has been removed.
native_country = pd.get_dummies(test_features_raw.native_country)

# Number of countries present in the test data.
len(native_country.columns)

40

In [28]:
# All-zero indicator for the country category that is absent from the test data.
# It is built on native_country's own index: that index has gaps left by
# dropna(), and a default 0..n-1 index would misalign on insert() and yield NaN
# for every row label >= n.
missing_country = pd.Series(0, index=native_country.index)

In [29]:
# Add the ' Holand-Netherlands' indicator at position 14.
# DataFrame.insert raises if the column already exists, so the guard keeps this
# cell safe to re-run.
if ' Holand-Netherlands' not in native_country.columns:
    native_country.insert(loc=14, column=' Holand-Netherlands', value=missing_country)

In [30]:
# Force every value of the column at position 14 (the one just inserted) to zero.
native_country.iloc[:, 14] = 0

In [31]:
# Column count after the extra country indicator was added.
native_country.shape[1]

41

# Put dataset together

In [32]:
# Attach the sex indicators to the continuous features, matching rows on the index.
df1 = pd.merge(df_continuous, sex, how='inner', left_index=True, right_index=True)

In [33]:
# Attach the race indicators, matching rows on the index.
df2 = pd.merge(df1, race, how='inner', left_index=True, right_index=True)

In [34]:
# Attach the relationship indicators, matching rows on the index.
df3 = pd.merge(df2, relationship, how='inner', left_index=True, right_index=True)

In [35]:
# Attach the marital-status indicators, matching rows on the index.
df4 = pd.merge(df3, marital_status, how='inner', left_index=True, right_index=True)

In [36]:
# Attach the native-country indicators, matching rows on the index.
df5 = pd.merge(df4, native_country, how='inner', left_index=True, right_index=True)

In [37]:
# Attach the workclass indicators, matching rows on the index.
df6 = pd.merge(df5, workclass, how='inner', left_index=True, right_index=True)

In [38]:
# Attach the occupation indicators; this completes the assembled feature frame.
df = pd.merge(df6, occupation, how='inner', left_index=True, right_index=True)

In [39]:
# Number of feature columns in the assembled frame.
df.shape[1]

87

### Insert Target Variable

In [40]:
# Append the encoded target as the last column. Column assignment aligns
# test_target on the row index exactly as insert() did, but needs no hard-coded
# position and does not raise when the cell is re-run.
df['>50k'] = test_target

In [41]:
# Column count once the target column has been added.
df.shape[1]

88

In [42]:
# The dummy column labels carry the leading space of the raw category values;
# remove surrounding whitespace from every label.
df.columns = df.columns.str.strip()

In [43]:
# First five rows of the assembled frame, before normalisation.
df.head(n=5)

Unnamed: 0,age,fnlwgt,capital_gain,capital_loss,hours_per_week,Female,Male,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,...,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving,>50k
0,25,226802,0,0,40,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,0
1,38,89814,0,0,50,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28,336951,0,0,40,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,44,160323,7688,0,40,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,1
5,34,198693,0,0,30,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [44]:
# Display the column labels of the assembled frame.
df.columns

Index(['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week',
       'Female', 'Male', 'Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black',
       'Other', 'White', 'Husband', 'Not-in-family', 'Other-relative',
       'Own-child', 'Unmarried', 'Wife', 'Divorced', 'Married-AF-spouse',
       'Married-civ-spouse', 'Married-spouse-absent', 'Never-married',
       'Separated', 'Widowed', 'Cambodia', 'Canada', 'China', 'Columbia',
       'Cuba', 'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England',
       'France', 'Germany', 'Greece', 'Guatemala', 'Haiti',
       'Holand-Netherlands', 'Honduras', 'Hong', 'Hungary', 'India', 'Iran',
       'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico', 'Nicaragua',
       'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines', 'Poland',
       'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan', 'Thailand',
       'Trinadad&Tobago', 'United-States', 'Vietnam', 'Yugoslavia',
       'Federal-gov', 'Local-gov', 'Private', 'Self-em

# Normalize continuous variables 

In [45]:
# Standardise age: subtract a fixed centre, then divide by a fixed scale.
# NOTE(review): the constants are presumably the training-set mean/std - confirm.
df['age'] = df['age'].sub(38.437901995888865).div(13.134664776856338)

In [46]:
# Standardise fnlwgt: subtract a fixed centre, then divide by a fixed scale.
# NOTE(review): the constants are presumably the training-set mean/std - confirm.
df['fnlwgt'] = df['fnlwgt'].sub(189793.83393011073).div(105652.97152851959)

In [47]:
# Standardise capital_gain: subtract a fixed centre, then divide by a fixed scale.
# NOTE(review): the constants are presumably the training-set mean/std - confirm.
df['capital_gain'] = df['capital_gain'].sub(1092.0078575691268).div(7406.346496681988)

In [48]:
# Standardise capital_loss: subtract a fixed centre, then divide by a fixed scale.
# NOTE(review): the constants are presumably the training-set mean/std - confirm.
df['capital_loss'] = df['capital_loss'].sub(88.37248856176646).div(404.2983704862744)

In [49]:
# Standardise hours_per_week: subtract a fixed centre, then divide by a fixed scale.
# NOTE(review): the constants are presumably the training-set mean/std - confirm.
df['hours_per_week'] = df['hours_per_week'].sub(40.93123798156621).div(11.979984229273281)

In [50]:
# First five rows after the continuous columns were standardised.
df.head(n=5)

Unnamed: 0,age,fnlwgt,capital_gain,capital_loss,hours_per_week,Female,Male,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,...,Handlers-cleaners,Machine-op-inspct,Other-service,Priv-house-serv,Prof-specialty,Protective-serv,Sales,Tech-support,Transport-moving,>50k
0,-1.023087,0.35028,-0.147442,-0.218582,-0.077733,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,0
1,-0.033339,-0.946304,-0.147442,-0.218582,0.756993,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.794684,1.392835,-0.147442,-0.218582,-0.077733,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,0.423467,-0.27894,0.890586,-0.218582,-0.077733,0,1,0,0,1,...,0,1,0,0,0,0,0,0,0,1
5,-0.337877,0.08423,-0.147442,-0.218582,-0.912458,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [51]:
# Persist the processed test set; the row index is written as the first CSV column.
df.to_csv(path_or_buf='test_dropna.csv', index=True)