In [82]:
import pandas as pd
import numpy as np

# Render floats with two decimal places in displayed tables.
pd.set_option("display.precision", 2)

col_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship',
             'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'makes-more-50K']
# Passing names= makes pandas read the file as header-less; skipinitialspace
# drops the spaces after each delimiter so values compare equal to plain
# strings such as 'Male' or '>50K'.
df = pd.read_csv("data/adult.data", names=col_names, skipinitialspace=True)

# Turn the income label into a boolean flag. An explicit map (instead of
# DataFrame.replace(..., inplace=True)) avoids in-place mutation and does not
# rely on replace() silently downcasting an object column to bool.
income_flags = {'<=50K': False, '>50K': True}
df['makes-more-50K'] = df['makes-more-50K'].map(income_flags)

# map() yields NaN for any label missing from income_flags; fail loudly here
# rather than letting later `== True` / `== False` filters drop rows silently.
assert df['makes-more-50K'].notna().all(), "unexpected value in 'makes-more-50K'"

How many men and women are represented in the dataset?

In [83]:
# Number of rows for each value of the 'sex' column.
sex_counts = df['sex'].value_counts()
sex_counts

Male      21790
Female    10771
Name: sex, dtype: int64

What is the average age of women?

In [84]:
# Pick the 'age' values of the rows whose 'sex' is 'Female' with a single
# .loc lookup, then average them.
is_female = df['sex'] == 'Female'
df.loc[is_female, 'age'].mean()

36.85823043357163

What is the percentage of German people?

In [85]:
# Share of rows whose native country is Germany. The value is a fraction of
# all rows; multiply by 100 to read it as a percentage.
is_german = df['native-country'] == 'Germany'
is_german.sum() / len(df)

0.004207487485028101

What are the mean and standard deviation of age for those who earn more than 50K per year (salary feature) and those who earn less than 50K per year?

In [86]:
# Split the ages by the boolean income flag, then report mean and standard
# deviation for each group.
earns_more = df['makes-more-50K']
rich = df.loc[earns_more == True, 'age']
poor = df.loc[earns_more == False, 'age']

rich_mean, rich_std = rich.mean(), rich.std()
poor_mean, poor_std = poor.mean(), poor.std()
print(f"Rich: {rich_mean} +- {rich_std}, poor: {poor_mean} +- {poor_std}")

Rich: 44.24984058155847 +- 10.519027719851826, poor: 36.78373786407767 +- 14.02008849082488


Is it true that everyone who earns more than 50K has at least a high school education? (education feature: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters or Doctorate)

In [87]:
# Distinct education levels among the rows flagged as earning more than 50K.
# Any level below high school in the result answers the question with "no".
high_earner_education = df.loc[df['makes-more-50K'] == True, 'education']
high_earner_education.unique()

array(['HS-grad', 'Masters', 'Bachelors', 'Some-college', 'Assoc-voc',
       'Doctorate', 'Prof-school', 'Assoc-acdm', '7th-8th', '12th',
       '10th', '11th', '9th', '5th-6th', '1st-4th'], dtype=object)

Display age statistics for each race (race feature) and each gender (sex feature). Use groupby() and describe(). Find the maximum age of men of Amer-Indian-Eskimo race.

In [88]:
# Summary statistics of 'age' for every (race, sex) combination; the 'max'
# column of the (Amer-Indian-Eskimo, Male) row gives the requested maximum.
age_by_race_and_sex = df.groupby(['race', 'sex'])['age']
age_by_race_and_sex.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
race,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Amer-Indian-Eskimo,Female,119.0,37.12,13.11,17.0,27.0,36.0,46.0,80.0
Amer-Indian-Eskimo,Male,192.0,37.21,12.05,17.0,28.0,35.0,45.0,82.0
Asian-Pac-Islander,Female,346.0,35.09,12.3,17.0,25.0,33.0,43.75,75.0
Asian-Pac-Islander,Male,693.0,39.07,12.88,18.0,29.0,37.0,46.0,90.0
Black,Female,1555.0,37.85,12.64,17.0,28.0,37.0,46.0,90.0
Black,Male,1569.0,37.68,12.88,17.0,27.0,36.0,46.0,90.0
Other,Female,109.0,31.68,11.63,17.0,23.0,29.0,39.0,74.0
Other,Male,162.0,34.65,11.36,17.0,26.0,32.0,42.0,77.0
White,Female,8642.0,36.81,14.33,17.0,25.0,35.0,46.0,90.0
White,Male,19174.0,39.65,13.44,17.0,29.0,38.0,49.0,90.0


Among whom is the proportion of those who earn a lot (>50K) greater: married or single men (marital-status feature)? Consider as married those who have a marital-status starting with Married (Married-civ-spouse, Married-spouse-absent or Married-AF-spouse), the rest are considered bachelors.

In [89]:
# Boolean row masks: men, and anyone whose marital-status starts with 'Married'.
males = df['sex'].eq('Male')
married = df['marital-status'].str.startswith('Married')

# Income flag of married men, shown as proportions of that group.
married_men_income = df.loc[males & married, 'makes-more-50K']
print("Married:")
married_men_income.value_counts(normalize=True)

Married:


False    0.56
True     0.44
Name: makes-more-50K, dtype: float64

In [90]:
# Per the task statement, everyone whose marital-status does not start with
# 'Married' counts as single. Deriving the mask as that complement (instead of
# enumerating the remaining statuses by hand) keeps it consistent with the
# married definition and cannot silently skip a status missing from a list.
# na=False keeps the mask strictly boolean so it can be negated safely.
single = ~df['marital-status'].str.startswith('Married', na=False)

# `males` is the sex == 'Male' mask built in the married-men cell.
df.loc[males & single, 'makes-more-50K'].value_counts(normalize=True)

False    0.92
True     0.08
Name: makes-more-50K, dtype: float64

What is the maximum number of hours a person works per week (hours-per-week feature)? How many people work such a number of hours, and what is the percentage of those who earn a lot (>50K) among them?

In [91]:
# Largest value found in the 'hours-per-week' column.
weekly_hours = df['hours-per-week']
max_load = weekly_hours.max()
print(f"Max load = {max_load} hours/week")

Max load = 99 hours/week


In [92]:
# Rows whose weekly hours equal max_load (computed in the previous cell).
works_max_hours = df['hours-per-week'].eq(max_load)
workaholics = df[works_max_hours]
print(f"Workaholics: {len(workaholics)}")

# Proportion of those rows on each side of the 50K income flag.
print("Make more than 50K:")
workaholics['makes-more-50K'].value_counts(normalize=True)

Workaholics: 85
Make more than 50K:


False    0.71
True     0.29
Name: makes-more-50K, dtype: float64

Compute the average working time (hours-per-week) of those who earn a little and those who earn a lot (salary) for each country (native-country). What are these values for Japan?

In [95]:
# Mean weekly hours for each (native-country, income flag) pair; the figures
# for Japan are the 'Japan' row of this table.
# The aggregation is given as the string 'mean' rather than np.mean: recent
# pandas versions emit a FutureWarning when a NumPy callable is passed here,
# while the string form computes the same group means without the warning.
pd.crosstab(
    df['native-country'],
    df['makes-more-50K'],
    values=df['hours-per-week'],
    aggfunc='mean',
)


makes-more-50K,False,True
native-country,Unnamed: 1_level_1,Unnamed: 2_level_1
?,40.16,45.55
Cambodia,41.42,40.0
Canada,37.91,45.64
China,37.38,38.9
Columbia,38.68,50.0
Cuba,37.99,42.44
Dominican-Republic,42.34,47.0
Ecuador,38.04,48.75
El-Salvador,36.03,45.0
England,40.48,44.53
