In [1]:
import numpy as np
import pandas as pd
# Show up to 100 columns when a DataFrame is displayed. The full option key is
# spelled out: 'display.max.columns' only worked because pandas treats the key
# as a regex pattern, and that shorthand can break if similarly named options
# are added in a later pandas version.
pd.set_option('display.max_columns', 100)
# to draw pictures in jupyter notebook
%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns

In [29]:
# Read adult.data.csv into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../../ML_datasets/adult.data.csv')
data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# How many rows fall under each value of the `sex` column?
sex_column = data['sex']
sex_column.value_counts()

Male      21790
Female    10771
Name: sex, dtype: int64

In [5]:
# Mean age over the rows whose `sex` is 'Female'.
is_female = data['sex'].eq('Female')
females = data[is_female]
females['age'].mean()

36.85823043357163

In [11]:
# Share of rows whose `native-country` is 'Germany', relative to all rows
# that have a non-missing `native-country` value.
native_country = data['native-country']
all_citizens = native_country.count()
german_citizens = float(native_country.eq('Germany').sum())
german_citizens / all_citizens

0.004207487485028101

In [13]:
# Mean and standard deviation of age (each rounded to a whole number),
# first for people with salary <=50K, then for people with salary >50K.
group1 = data.loc[data['salary'].eq('<=50K'), 'age']
group2 = data.loc[data['salary'].eq('>50K'), 'age']
for ages in (group1, group2):
    print(round(ages.mean()), round(ages.std()))

37 14
44 11


In [14]:
# Is it true that everyone earning >50K has at least a high-school education?
# List the distinct education levels found among the >50K rows to check.
earns_over_50k = data['salary'].eq('>50K')
data.loc[earns_over_50k, 'education'].unique()
# Answer: no -- levels below 'HS-grad' (e.g. '7th-8th', '9th') appear in the result.

array(['HS-grad', 'Masters', 'Bachelors', 'Some-college', 'Assoc-voc',
       'Doctorate', 'Prof-school', 'Assoc-acdm', '7th-8th', '12th',
       '10th', '11th', '9th', '5th-6th', '1st-4th'], dtype=object)

In [17]:
# Summary statistics of age for every (race, sex) combination.
# The maximum age of Amer-Indian-Eskimo men is the `max` row of that group's summary.
age_by_race_sex = data.groupby(['race', 'sex'])['age']

for (race_name, sex_name), ages in age_by_race_sex:
    print('Race:', race_name, '| Sex:', sex_name)
    print(ages.describe())

Race: Amer-Indian-Eskimo | Sex: Female
count    119.000000
mean      37.117647
std       13.114991
min       17.000000
25%       27.000000
50%       36.000000
75%       46.000000
max       80.000000
Name: age, dtype: float64
Race: Amer-Indian-Eskimo | Sex: Male
count    192.000000
mean      37.208333
std       12.049563
min       17.000000
25%       28.000000
50%       35.000000
75%       45.000000
max       82.000000
Name: age, dtype: float64
Race: Asian-Pac-Islander | Sex: Female
count    346.000000
mean      35.089595
std       12.300845
min       17.000000
25%       25.000000
50%       33.000000
75%       43.750000
max       75.000000
Name: age, dtype: float64
Race: Asian-Pac-Islander | Sex: Male
count    693.000000
mean      39.073593
std       12.883944
min       18.000000
25%       29.000000
50%       37.000000
75%       46.000000
max       90.000000
Name: age, dtype: float64
Race: Black | Sex: Female
count    1555.000000
mean       37.854019
std        12.637197
min        17.0

In [21]:
# Among whom is the share of >50K earners greater: married or single men?
# This cell covers married men: every marital-status value that starts with 'Married'.
is_married_man = data['marital-status'].str.startswith('Married') & data['sex'].eq('Male')

# Denominator: married men with a recorded salary.
all_married_men = data.loc[is_married_man, 'salary'].count()
# Numerator: the subset of those men whose salary is >50K.
married_men = data.loc[is_married_man & data['salary'].eq('>50K'), 'salary'].count()

print(married_men / all_married_men)

0.4405139945351156


In [22]:
# Share of >50K earners among men who are not married, for comparison with
# the married-men ratio printed by the previous cell.
# `arr` lists the marital-status values treated as "not married" here.
# (The former bare `data['marital-status'].unique()` call was dropped: it was not
# the last expression of the cell, so its result was silently discarded.)
arr = ['Never-married', 'Separated', 'Divorced', 'Widowed']

# Build the row filter once and reuse it for both counts.
is_notmarried_man = data['marital-status'].isin(arr) & (data['sex'] == 'Male')

# Denominator: not-married men with a recorded salary.
all_notmarried_men = data.loc[is_notmarried_man, 'salary'].count()
# Numerator: the subset of those men whose salary is >50K.
notmarried_men = data.loc[is_notmarried_man & (data['salary'] == '>50K'), 'salary'].count()

print(notmarried_men / all_notmarried_men)
# Conclusion: the share of >50K earners is greater among married men
# (about 0.44 for married men versus about 0.08 here).

0.08449509031397745


In [25]:
# Find the maximum number of hours a person works per week.
# How many people work that many hours, and what percentage of them earn >50K?
max_hours = data['hours-per-week'].max()

# Row filter for the people working the maximum number of hours; reused below.
works_max_hours = data['hours-per-week'] == max_hours
people_max_hours = data[works_max_hours].shape[0]
rich_people = data[works_max_hours & (data['salary'] == '>50K')].shape[0]

print('Max hours-per-week -', max_hours)
print('Num of people working like that -', people_max_hours)
# Whole-number percentage, rounded down. Exact integer arithmetic is used because
# int(100 * (a / b)) can come out one too low when the float product falls just
# short of an integer (e.g. int(100 * (29 / 100)) == 28).
print('Percentage of rich -', 100 * rich_people // people_max_hours)

Max hours-per-week - 99
Num of people working like that - 85
Percentage of rich - 29


In [30]:
# Average hours-per-week (rounded to a whole number) for each combination of
# native country and salary bracket. The 'Japan' lines answer the Japan question.
hours_by_group = data.groupby(['native-country', 'salary'])['hours-per-week']

for (country, salary), hours in hours_by_group:
    print(country, salary, round(hours.mean()))

? <=50K 40
? >50K 46
Cambodia <=50K 41
Cambodia >50K 40
Canada <=50K 38
Canada >50K 46
China <=50K 37
China >50K 39
Columbia <=50K 39
Columbia >50K 50
Cuba <=50K 38
Cuba >50K 42
Dominican-Republic <=50K 42
Dominican-Republic >50K 47
Ecuador <=50K 38
Ecuador >50K 49
El-Salvador <=50K 36
El-Salvador >50K 45
England <=50K 40
England >50K 45
France <=50K 41
France >50K 51
Germany <=50K 39
Germany >50K 45
Greece <=50K 42
Greece >50K 51
Guatemala <=50K 39
Guatemala >50K 37
Haiti <=50K 36
Haiti >50K 43
Holand-Netherlands <=50K 40
Honduras <=50K 34
Honduras >50K 60
Hong <=50K 39
Hong >50K 45
Hungary <=50K 31
Hungary >50K 50
India <=50K 38
India >50K 46
Iran <=50K 41
Iran >50K 48
Ireland <=50K 41
Ireland >50K 48
Italy <=50K 40
Italy >50K 45
Jamaica <=50K 38
Jamaica >50K 41
Japan <=50K 41
Japan >50K 48
Laos <=50K 40
Laos >50K 40
Mexico <=50K 40
Mexico >50K 47
Nicaragua <=50K 36
Nicaragua >50K 38
Outlying-US(Guam-USVI-etc) <=50K 42
Peru <=50K 35
Peru >50K 40
Philippines <=50K 38
Philippines >50K 