# Demographic Data Analyzer
In this challenge you must analyze demographic data using Pandas. You are given a dataset of demographic data that was extracted from the 1994 Census database.

[link to freeCodeCamp project](https://www.freecodecamp.org/learn/data-analysis-with-python/data-analysis-with-python-projects/demographic-data-analyzer)

[link to Replit](https://replit.com/@tasyadew/boilerplate-demographic-data-analyzer#demographic_data_analyzer.py)

## demographic_data_analyzer.py

Read the csv file

In [2]:
import pandas as pd


file_path = 'adult.data.csv'
data = pd.read_csv(file_path)
df = pd.DataFrame(data)

print(df.head())


   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country salary  
0          2174             0              40  United-States  <=50K  
1             0             0             

In [3]:
# How many of each race are represented in this dataset? This should be a Pandas series with race names as the index labels.
race_count = df['race'].value_counts()

print(race_count)

race
White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: count, dtype: int64


In [87]:
# What is the average age of men?
average_age_men = df['age'][df['sex'] == 'Male'].mean()

print(average_age_men)

39.43354749885268


In [25]:
# What is the percentage of people who have a Bachelor's degree?
bachelors_total = (df['education'] == 'Bachelors').sum() 
all_total = len(df['education'])
percentage_bachelors = (bachelors_total / all_total) * 100

print('bacelors total:', bachelors_total)
print('all total:', all_total)
print('percentage:', percentage_bachelors)

bacelors total: 5355
all total: 32561
percentage: 16.44605509658794


In [43]:
# What percentage of people with advanced education (`Bachelors`, `Masters`, or `Doctorate`) make more than 50K?
# What percentage of people without advanced education make more than 50K?
# with and without `Bachelors`, `Masters`, or `Doctorate`

list = ['Bachelors', 'Masters', 'Doctorate' ]

higher_education = df['education'].isin(list)
lower_education = ~higher_education

# percentage with salary >50K
higher_education_rich = (df[higher_education]['salary'] == '>50K').mean() * 100
lower_education_rich = (df[lower_education]['salary'] == '>50K').mean() * 100

# all vs >50k
print('higher education:', higher_education.value_counts(), (df[higher_education]['salary'] == '>50K').value_counts())
print('lower education:', lower_education.value_counts(), (df[lower_education]['salary'] == '>50K').value_counts())

print('higher education rich:', higher_education_rich)
print('lower education rich:', lower_education_rich)    

higher education: education
False    25070
True      7491
Name: count, dtype: int64 salary
False    4005
True     3486
Name: count, dtype: int64
lower education: education
True     25070
False     7491
Name: count, dtype: int64 salary
False    20715
True      4355
Name: count, dtype: int64
higher education rich: 46.535843011613935
lower education rich: 17.3713601914639
46.535843011613935
17.3713601914639


In [45]:
# What is the minimum number of hours a person works per week (hours-per-week feature)?
min_work_hours = df['hours-per-week'].min()

print(min_work_hours)

1


In [97]:
# What percentage of the people who work the minimum number of hours per week have a salary of >50K?

#calculation here

# minimum hours == 1
# need to find how many people work 1 hour per week

min_hours_workers = df[(df['hours-per-week'] == min_work_hours)]['salary'].value_counts().sum()

# so, 20 people only work for 1 hour per week
# now, how to find which of those people make >50k?

min_hours_rich = df[(df['hours-per-week'] == min_work_hours) & (df['salary'] == '>50K')]['salary'].value_counts().sum()

# so, 2 of those 20 people make >50k

#this is for output
rich_percentage = min_hours_rich / min_hours_workers * 100

print('min_hours_workers:', min_hours_workers)
print('min_hours_rich:', min_hours_rich)
print('rich_percentage:', rich_percentage)

min_hours_workers: 20
min_hours_rich: 2
rich_percentage: 10.0


In [106]:
# What country has the highest percentage of people that earn >50K?

# the percentage is by country... need to seperate by country so it will not affect top earner with less population
group_country_rich = df.groupby(['native-country', 'salary']).size().reset_index(name='count')

group_country_pivot = group_country_rich.pivot(index='native-country', columns='salary', values='count')

group_country_pivot['percentage_rich'] = (group_country_pivot['>50K'] / (group_country_pivot['>50K'] + group_country_pivot['<=50K'])) * 100


# output here
highest_earning_country = group_country_pivot['percentage_rich'].idxmax()
highest_earning_country_percentage = group_country_pivot['percentage_rich'].max()

print(group_country_pivot['percentage_rich'])
print('highest_earning_country:', highest_earning_country)
print('highest_earning_country_percentage:', highest_earning_country_percentage)


native-country
?                             25.042882
Cambodia                      36.842105
Canada                        32.231405
China                         26.666667
Columbia                       3.389831
Cuba                          26.315789
Dominican-Republic             2.857143
Ecuador                       14.285714
El-Salvador                    8.490566
England                       33.333333
France                        41.379310
Germany                       32.116788
Greece                        27.586207
Guatemala                      4.687500
Haiti                          9.090909
Holand-Netherlands                  NaN
Honduras                       7.692308
Hong                          30.000000
Hungary                       23.076923
India                         40.000000
Iran                          41.860465
Ireland                       20.833333
Italy                         34.246575
Jamaica                       12.345679
Japan                    

In [84]:
# Identify the most popular occupation for those who earn >50K in India.

# native-country: india
# salary: >50k 
# top occupation

india_rich = df[(df['native-country'] == 'India') & (df['salary'] == '>50K')]['occupation'].value_counts()
top_IN_occupation = india_rich.idxmax()

print('india_rich:', india_rich)
print('top_IN_occupation:', top_IN_occupation)

india_rich: occupation
Prof-specialty      25
Exec-managerial      8
Other-service        2
Tech-support         2
Transport-moving     1
Sales                1
Adm-clerical         1
Name: count, dtype: int64
top_IN_occupation: Prof-specialty
