In [1]:
import pandas as pd

col_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship',
             'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'makes-more-50K']
df = pd.read_csv("data/adult.data", names=col_names, skipinitialspace=True)

df.replace({'makes-more-50K': {'<=50K': False, '>50K': True}}, inplace=True)

How many men and women are represented in the dataset?

In [2]:
df['sex'].value_counts()

Male      21790
Female    10771
Name: sex, dtype: int64

What is the average age of women?

In [3]:
df[df['sex'] == 'Female']['age'].mean()

36.85823043357163

What is the percentage of German people?

In [4]:
df[df['native-country'] == 'Germany'].count() / df.count()

age               0.004207
workclass         0.004207
fnlwgt            0.004207
education         0.004207
education-num     0.004207
marital-status    0.004207
occupation        0.004207
relationship      0.004207
race              0.004207
sex               0.004207
capital-gain      0.004207
capital-loss      0.004207
hours-per-week    0.004207
native-country    0.004207
makes-more-50K    0.004207
dtype: float64

What are the mean and standard deviation of age for those who earn more than 50K per year (salary feature) and those who earn less than 50K per year?

In [5]:
df.groupby(['makes-more-50K'])['age'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
makes-more-50K,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,24720.0,36.783738,14.020088,17.0,25.0,34.0,46.0,90.0
True,7841.0,44.249841,10.519028,19.0,36.0,44.0,51.0,90.0


Is it true that people who earn more than 50K have at least high school education? (education – Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters or Doctorate feature)

In [6]:
df[df['makes-more-50K'] == True]['education'].value_counts()

Bachelors       2221
HS-grad         1675
Some-college    1387
Masters          959
Prof-school      423
Assoc-voc        361
Doctorate        306
Assoc-acdm       265
10th              62
11th              60
7th-8th           40
12th              33
9th               27
5th-6th           16
1st-4th            6
Name: education, dtype: int64

Display age statistics for each race (race feature) and each gender (sex feature). Use groupby() and describe(). Find the maximum age of men of Amer-Indian-Eskimo race.

In [7]:
df.groupby(['race', 'sex'])['age'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
race,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Amer-Indian-Eskimo,Female,119.0,37.117647,13.114991,17.0,27.0,36.0,46.0,80.0
Amer-Indian-Eskimo,Male,192.0,37.208333,12.049563,17.0,28.0,35.0,45.0,82.0
Asian-Pac-Islander,Female,346.0,35.089595,12.300845,17.0,25.0,33.0,43.75,75.0
Asian-Pac-Islander,Male,693.0,39.073593,12.883944,18.0,29.0,37.0,46.0,90.0
Black,Female,1555.0,37.854019,12.637197,17.0,28.0,37.0,46.0,90.0
Black,Male,1569.0,37.6826,12.882612,17.0,27.0,36.0,46.0,90.0
Other,Female,109.0,31.678899,11.631599,17.0,23.0,29.0,39.0,74.0
Other,Male,162.0,34.654321,11.355531,17.0,26.0,32.0,42.0,77.0
White,Female,8642.0,36.811618,14.329093,17.0,25.0,35.0,46.0,90.0
White,Male,19174.0,39.652498,13.436029,17.0,29.0,38.0,49.0,90.0


Among whom is the proportion of those who earn a lot (>50K) greater: married or single men (marital-status feature)? Consider as married those who have a marital-status starting with Married (Married-civ-spouse, Married-spouse-absent or Married-AF-spouse), the rest are considered bachelors.

In [18]:
married = df[df['marital-status'].apply(lambda status: status.startswith('Married')) == True]
married['makes-more-50K'].value_counts(normalize=True)

False    0.56308
True     0.43692
Name: makes-more-50K, dtype: float64

In [19]:
single = df[df['marital-status'].apply(lambda status: status.startswith('Married')) == False]
single['makes-more-50K'].value_counts(normalize=True)

False    0.935546
True     0.064454
Name: makes-more-50K, dtype: float64

What is the maximum number of hours a person works per week (hours-per-week feature)? How many people work such a number of hours, and what is the percentage of those who earn a lot (>50K) among them?

In [20]:
df['hours-per-week'].max()

99

In [24]:
df[df['hours-per-week'] == df['hours-per-week'].max()]['makes-more-50K'].describe()

count        85
unique        2
top       False
freq         60
Name: makes-more-50K, dtype: object

Count the average time of work (hours-per-week) for those who earn a little and a lot (salary) for each country (native-country). What will these be for Japan?

In [36]:
df.groupby(['makes-more-50K', 'native-country'])['hours-per-week'].mean()


makes-more-50K  native-country 
False           ?                  40.164760
                Cambodia           41.416667
                Canada             37.914634
                China              37.381818
                Columbia           38.684211
                                     ...    
True            Thailand           58.333333
                Trinadad&Tobago    40.000000
                United-States      45.505369
                Vietnam            39.200000
                Yugoslavia         49.500000
Name: hours-per-week, Length: 82, dtype: float64