In [2]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn
import re

# Importing data

In [5]:
!pip install ucimlrepo



In [6]:
from ucimlrepo import fetch_ucirepo

In [7]:
# Download the dataset registered under id 2 in the UCI ML Repository
# (requires network access).
raw_dataset = fetch_ucirepo(id=2)

In [8]:
# Inspect the type of the object returned by fetch_ucirepo
print(type(raw_dataset))

<class 'ucimlrepo.dotdict.dotdict'>


This object is an instance of `dotdict`, a custom dictionary-like structure from the `ucimlrepo` library. In addition to the usual key-based access of a normal dictionary, its values can be accessed with dot notation (e.g. `raw_dataset.data.features`).

In [10]:
# Pull the feature table ("X") and the target column ("y") out of the
# fetched dataset object.
X, y = raw_dataset.data.features, raw_dataset.data.targets

In [11]:
# Confirm the container type of the features and of the target, one per line
print(type(X), type(y), sep="\n")

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


Both are pandas DataFrames.

In [12]:
# Preview the first rows of the feature table
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [14]:
# Summary statistics for every feature column; include="all" covers the
# non-numeric columns as well as the numeric ones
X.describe(include="all")

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
count,48842.0,47879,48842.0,48842,48842.0,48842,47876,48842,48842,48842,48842.0,48842.0,48842.0,48568
unique,,9,,16,,7,15,6,5,2,,,,42
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States
freq,,33906,,15784,,22379,6172,19716,41762,32650,,,,43832
mean,38.643585,,189664.1,,10.078089,,,,,,1079.067626,87.502314,40.422382,
std,13.71051,,105604.0,,2.570973,,,,,,7452.019058,403.004552,12.391444,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,
25%,28.0,,117550.5,,9.0,,,,,,0.0,0.0,40.0,
50%,37.0,,178144.5,,10.0,,,,,,0.0,0.0,40.0,
75%,48.0,,237642.0,,12.0,,,,,,0.0,0.0,45.0,


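We can see that there are 48842 entries in the training dataset. There are 14 feature columns that can be used for training.  
The `count` row reveals which columns have missing values: `workclass` (47879), `occupation` (47876) and `native-country` (48568) have fewer non-null entries than the 48842 rows; all other columns are complete.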

In [15]:
# Preview the first rows of the target column
y.head()

Unnamed: 0,income
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K


In [16]:
# Summary statistics for the target column
y.describe(include="all")

Unnamed: 0,income
count,48842
unique,4
top,<=50K
freq,24720


As with the features, the target column has 48842 entries, and it contains 4 unique values.

In [21]:
# Merge X (features) and y (target) column-wise into a single DataFrame.
# pd.concat(axis=1) aligns rows on the index using an outer join, so a mismatch
# between the two indexes would silently add NaN-padded rows instead of
# failing; verify the alignment before merging.
assert X.index.equals(y.index), "features and targets must share the same index"
dataframe = pd.concat([X, y], axis=1)

In [22]:
# Preview the merged DataFrame (feature columns followed by the target)
dataframe.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [23]:
# Sanity check: number of rows in the merged dataset
len(dataframe)

48842

The merge succeeded: the combined DataFrame still has 48842 entries.

In [26]:
# List the distinct values that occur in the target column
dataframe["income"].unique()

array(['<=50K', '>50K', '<=50K.', '>50K.'], dtype=object)

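These are the four unique target values. Note that `<=50K.` and `>50K.` differ from `<=50K` and `>50K` only by a trailing period, so they most likely denote the same two income classes and should be normalized before modelling.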

In [27]:
# Save the merged dataset to a CSV file in the current working directory;
# index=False leaves the row index out of the file.
# NOTE(review): the income column is written as-is, i.e. still with the four
# label variants ('<=50K', '>50K', '<=50K.', '>50K.') — confirm that the
# labels are normalized wherever this file is consumed.
dataframe.to_csv("dataframe.csv", index=False)