In [23]:
# We will be working on the adult dataset
# The dataset is acquired from the following link
# http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
# You can download this file and place it in the current folder

# To make sense of the data, we will be using Pandas
import pandas as pd

#import the pretty html display feature of IPython
from IPython.display import display, HTML

In [24]:
# First thing we do is that we load the data from that file
# As we know that the data in that file is CSV, we do the following

# We got the column names from the following link
# http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names
# We will save them into an array called cols
cols = ['age',
'workclass',
'fnlwgt',
'education',
'education-num',
'marital-status',
'occupation',
'relationship',
'race',
'sex',
'capital-gain',
'capital-loss',
'hours-per-week',
'native-country',
'income']

# We then read that csv file by passing the cols array
# an telling that the header does not exist in that file
data = pd.read_csv('adult.data', header=None, names=cols)

In [25]:
#The type of the read file is something called as DataFrame
type(data)

pandas.core.frame.DataFrame

In [37]:
# We can have a look at the pretty output on the page this way
HTML(display(data.sample(5)))

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
9316,47,Private,161558,10th,6,Married-spouse-absent,Transport-moving,Not-in-family,Black,Male,0,0,45,United-States,<=50K
20436,47,?,186805,HS-grad,9,Married-civ-spouse,?,Not-in-family,White,Female,0,0,35,United-States,<=50K
12050,54,Private,188136,Bachelors,13,Divorced,Sales,Not-in-family,White,Female,0,1408,38,United-States,<=50K
9543,27,Private,244315,HS-grad,9,Divorced,Craft-repair,Other-relative,Other,Male,0,0,40,United-States,<=50K
3813,46,Private,201217,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,<=50K


<IPython.core.display.HTML object>

In [33]:
# We can list 10 random age of the people from the dataset
data.age.sample(10)

1342     47
28920    38
1425     46
18400    27
23532    24
30037    30
19902    17
20812    20
9610     37
26106    48
Name: age, dtype: int64

In [36]:
# Let us now list a sample of all people who are above 30yrs
data[data.age > 30].sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
22052,52,Private,191529,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1740,60,United-States,<=50K
9722,52,Private,329733,Some-college,10,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,40,United-States,<=50K
30232,34,Private,112212,HS-grad,9,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,1485,40,United-States,<=50K
5789,61,Private,248448,7th-8th,4,Divorced,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
28036,31,Self-emp-not-inc,144949,Bachelors,13,Divorced,Craft-repair,Not-in-family,White,Male,0,0,60,United-States,<=50K


In [46]:
# Let us list a sample of all people who are Females
data[data.sex.str.contains('Female')].sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
15081,20,Private,374116,HS-grad,9,Never-married,Prof-specialty,Other-relative,White,Female,0,0,40,United-States,<=50K
29210,24,Private,233280,Assoc-acdm,12,Never-married,Sales,Own-child,White,Female,0,0,37,United-States,<=50K
22117,39,Private,255027,Assoc-voc,11,Divorced,Exec-managerial,Unmarried,Black,Female,0,0,40,United-States,<=50K
3950,29,?,315026,HS-grad,9,Divorced,?,Unmarried,White,Female,0,0,40,United-States,<=50K
7954,23,Private,176486,Assoc-voc,11,Never-married,Adm-clerical,Own-child,White,Female,0,0,36,United-States,<=50K


In [44]:
# Percentagnse of males in the dataset
len(data[data.sex.str.contains('Male')]) / len(data) * 100

66.92054912318419

In [49]:
# Percentage of people who are married
len(data[data['marital-status'].str.contains('Married')]) / len(data) * 100

47.34805442093302

In [74]:
# Percentage of males who are married
married_males = len(data[data.sex.str.contains('Male') & data['marital-status'].str.contains('Married')]) / len(data[data.sex.str.contains('Male')])       * 100

In [75]:
# Percentage of females who are married
married_females = len(data[data.sex.str.contains('Female') & data['marital-status'].str.contains('Married')])       / len(data[data.sex.str.contains('Female')])       * 100

In [80]:
# 62% Males and 17% Females? Thats a really odd number

# If we sum them
total = married_males + married_females

In [82]:
100 - total
# These % people are Unmarried or divorced

20.439676439834344

In [91]:
# Lets now check the race of those who have annual income > 50K
income_more_than_50 = data[data.income.str.contains('>50K')]
len(income_more_than_50[income_more_than_50.race.str.contains('White')]) / len(income_more_than_50) * 100
# Below is the percentage of peo

90.76648386685372