In [None]:
# The star import supplies the names used unqualified throughout this notebook
# (Table, make_array, are, PercentFormatter).
from datascience import *
import numpy as np

# Render figures inline and apply the 'fivethirtyeight' matplotlib style sheet.
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Week 4: Lecture 5 Census and Charts

Creating a new table:

In [None]:
# Build an array of names, then add it as the 'Rodents' column of a new, empty table.
rodents_array = make_array('Gophers', 'Marmots', 'Capybaras')
Table().with_column('Rodents', rodents_array)

**Question:** Why doesn't this work?

In [None]:
# Deliberately failing example for the question above: this cell is expected to
# raise an error when run. Compare it carefully with the previous cell.
Table.with_column('Rodents', rodents_array)

## Discussion Question: NBA Salaries

In [None]:
# Load the salary data, rename the column at index 3 to 'SALARY',
# and discard the 'TEAM' column, all in one chained expression.
nba = (
    Table.read_table('data/nba_salaries.csv')
    .relabeled(3, 'SALARY')
    .drop('TEAM')
)
# Preview the first five rows.
nba.show(5)

**Question 1:** Create an **array** containing the names of all point guards (`PG`) making more than $15mil/year

In [None]:
# ...

**Question 2:** After evaluating the two expressions

`nba.drop('POSITION')`

`nba.num_columns`

in order, what is the result of the second expression?

In [None]:
# ...

## Census: Population Trends ##

In [None]:
# As of August 2021, this census file is online here.
# It is fetched over HTTPS (the same scheme and host as the documentation link
# below) so the download is protected in transit.
data = 'https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2019/nc-est2019-agesex-res.csv'

# A description of the table appears online.
# https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2019/nc-est2019-agesex-res.pdf

# Read the CSV at the URL into a Table and display it.
full_census_table = Table.read_table(data)
full_census_table

[A description of the table appears online.](https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2019/nc-est2019-agesex-res.pdf)

The Census dataset contains estimates of the US population in each category of sex and age in the years 2010 through 2019. The Census is decennial: it takes place every 10 years. The most recent Census was held in 2020 and the one before that in 2010. The Census Bureau also estimates population changes each year. As explained in the Bureau’s description of its methodology, it “adds [the estimated changes] to the last decennial census to produce updated population estimates every year.”



In [None]:
# Narrow the full table to sex, age, and the 2014 / 2019 population estimates.
partial_census_table = full_census_table.select(
    'SEX',
    'AGE',
    'POPESTIMATE2014',
    'POPESTIMATE2019',
)
# Display the rows ordered by SEX code, largest code first.
partial_census_table.sort('SEX', descending=True)

**Question:** if the above table had another column `POPESTIMATE2023`, can you think of anyone who would be represented in the first row?

In [None]:
# Give the two estimate columns short, year-only labels.
us_pop = (
    partial_census_table
    .relabeled('POPESTIMATE2014', '2014')
    .relabeled('POPESTIMATE2019', '2019')
)
us_pop

**Question:** if we only keep rows where `SEX == 0`, what information does the table contain?

In [None]:
# Filter to the rows with SEX code 0, then remove the SEX column,
# which is constant after the filter.
us_pop_by_age = (
    us_pop
    .where('SEX', are.equal_to(0))
    .drop('SEX')
)
us_pop_by_age

In [None]:
# Focus on people 98 or older?
# (The questions after this cell examine why these particular bounds and this
# predicate were chosen.)
us_pop_by_age.where('AGE', are.between(98, 101))

**Question 1:** why not use `are.between(98, 100)`?

**Question 2:** why not use `are.above(97)`?

**Question 3:** why does the `AGE == 100` row have the largest populations?

In [None]:
# Population change from 2014 to 2019: the element-wise difference between
# the two estimate columns (2019 minus 2014).
estimates_2014 = us_pop_by_age.column('2014')
estimates_2019 = us_pop_by_age.column('2019')
change = estimates_2019 - estimates_2014
change

In [None]:
# Calculate the population change from 2014 to 2019 relative to the 2014
# population. The result is a proportion (e.g. 0.03), not a value already
# multiplied by 100.
change_percent = change / us_pop_by_age.column('2014')

In [None]:
# Attach the two change arrays to the table, one new column at a time.
us_pop_change = (
    us_pop_by_age
    .with_column('Change', change)
    .with_column('Percent Change', change_percent)
)
# Display the 'Percent Change' column using PercentFormatter.
us_pop_change.set_format('Percent Change', PercentFormatter)

In [None]:
# Show only the row(s) whose AGE code is 999.
us_pop_change.where('AGE', are.equal_to(999))

**Question:** what information does the above table contain?

In [None]:
# Find which ages had the greatest growth from 2014 to 2019.
# Note: rows are ordered by the absolute 'Change' column, not by
# 'Percent Change'; read the percent values off the top rows.
us_pop_change.where(
    'AGE', are.below(999)
).sort('Change', descending=True)

**Discussion Question:** Take a look at the top few rows. While the percent change is about 3% for the overall population, it jumps to well over 20% for the people in their late sixties and early seventies. What could explain this large increase? 

In [None]:
# ...

## Sex Ratios

The Census form asks respondents to provide the sex of each household member by checking one of two boxes labeled Male and Female. The SEX column contains numeric codes: 1 for male, 2 for female, and 0 for the total.



In [None]:
# Construct table of total males + females in 2019:
# drop the 2014 estimates, then keep only the rows with AGE code 999.
us_pop_2019 = us_pop.drop('2014')
all_ages = us_pop_2019.where('AGE', are.equal_to(999))
all_ages

In [None]:
# Calculate proportion of population with each sex.
# Assumes the first row of all_ages is the SEX == 0 (combined) row, so its
# 2019 value is the overall total used as the denominator.
# The scalar is named total_pop_2019 rather than pop_2019 because pop_2019 is
# bound to a Table later in this notebook; sharing the name would make this
# cell fail if it were re-run after that later cell.
total_pop_2019 = all_ages.column('2019').item(0)
proportion_array = all_ages.column('2019') / total_pop_2019

# View this information in the table
all_ages.with_column(
    'Proportion', proportion_array
).set_format('Proportion', PercentFormatter)

How does this proportion vary for different age groups?

In [None]:
# Keep only the rows for AGE == 0.
infants = us_pop_2019.where('AGE', are.equal_to(0))
infants

In [None]:
# Divide every infant count by the first row's 2019 value, then show the
# result as a percent-formatted 'Proportion' column.
infants_2019 = infants.column('2019').item(0)
infant_proportions = infants.column('2019') / infants_2019
infants.with_column('Proportion', infant_proportions).set_format(
    'Proportion', PercentFormatter
)

**Goal:** create a table with the female to male population ratio for each age group

In [None]:
# Step 1: create table of female populations for each age group
# Rows with SEX code 2, excluding the AGE == 999 row.
females_all_rows = us_pop_2019.where('SEX', are.equal_to(2))
females = females_all_rows.where('AGE', are.not_equal_to(999))
females

In [None]:
# Step 2: create table of male populations for each age group
# Rows with SEX code 1, excluding the AGE == 999 row.
males_all_rows = us_pop_2019.where('SEX', are.equal_to(1))
males = males_all_rows.where('AGE', are.not_equal_to(999))
males

In [None]:
# Step 3: create table with F:M population ratio
# Element-wise division of the 2019 female counts by the 2019 male counts.
# NOTE(review): assumes females and males list ages in the same row order,
# since the AGE column is taken from the female table only.
female_counts = females.column('2019')
male_counts = males.column('2019')
ratio_array = female_counts / male_counts
ratios = Table().with_columns(
    'AGE', females.column('AGE'),
    '2019 F:M RATIO', ratio_array,
)
ratios.show()

**Question:** create a table of F:M population ratios for people older than 75.

In [None]:
# ...

## Line Plots ##

In [None]:
# Line plot of the F:M ratio with 'AGE' on the horizontal axis.
ratios.plot('AGE')

In [None]:
# Redisplay us_pop (SEX, AGE, and the 2014 / 2019 estimates) before filtering it.
us_pop

In [None]:
# Remove the age totals (rows coded AGE == 999) by keeping AGE values below 999.
no_999 = us_pop.where('AGE', are.below(999))
# Display with the highest ages first.
no_999.sort('AGE', descending = True)

In [None]:
# Keep only the combined rows (SEX code 0), written with an explicit
# are.equal_to predicate, then drop the SEX column.
total_pop_by_year = (
    no_999
    .where('SEX', are.equal_to(0))
    .drop('SEX')
)
total_pop_by_year

In [None]:
# Let's plot it!
# Line plot of the '2014' column with 'AGE' on the horizontal axis.
total_pop_by_year.plot('AGE', '2014')

This plot is hard to interpret without labels, so let's give it a title.

In [None]:
# Same plot as before, now with a title.
total_pop_by_year.plot('AGE', '2014')
# The trailing semicolon suppresses the text repr of the title object.
plots.title('US Population');

**Question:** create a plot for total population by age for the year 2019.

In [None]:
# ...

**Bonus question:** create a plot that shows both 2014 and 2019. Add a title!

In [None]:
# ...

## Males and Females in 2019 ##

In [None]:
# Split the per-age rows into a male table (SEX code 1) and a female table
# (SEX code 2), dropping the SEX column from each.
males = (
    no_999
    .where('SEX', are.equal_to(1))
    .drop('SEX')
)
females = (
    no_999
    .where('SEX', are.equal_to(2))
    .drop('SEX')
)

In [None]:
# Side-by-side 2019 counts of males and females for each age.
# NOTE(review): assumes males and females list ages in the same row order,
# since only the male table's AGE column is used — confirm.
pop_2019 = Table().with_columns(
    'Age', males.column('AGE'),
    'Males', males.column('2019'),
    'Females', females.column('2019')
)
pop_2019

In [None]:
# Line plot of the other columns (Males, Females) with 'Age' on the horizontal axis.
pop_2019.plot('Age')

In [None]:
# Percent female at each age: females / (males + females), scaled to 0-100.
male_2019 = pop_2019.column('Males')
female_2019 = pop_2019.column('Females')
total = male_2019 + female_2019
pct_female = female_2019 / total * 100
pct_female

In [None]:
# Round it to 3 decimal places so that it's easier to read
pct_female = np.round(pct_female, 3)
pct_female

In [None]:
# Add female percent to our table
# (pop_2019 is rebound to the table that includes the new 'Percent female' column)
pop_2019 = pop_2019.with_column('Percent female', pct_female)
pop_2019

In [None]:
# Line plot of 'Percent female' with 'Age' on the horizontal axis (default y-axis limits).
pop_2019.plot('Age', 'Percent female')

In [None]:
# ^^ Look at the y-axis! Trend is not as dramatic as you might think
# Redraw the same plot with the y-axis fixed to the full 0-100 range.
pop_2019.plot('Age', 'Percent female')
plots.ylim(0, 100);

## Scatter Plots ##

In [None]:
# Actors and their highest grossing movies
# Loaded from a CSV file relative to the notebook's working directory.
actors = Table.read_table('data/actors.csv')
actors

In [None]:
# Scatter plot of the 'Number of Movies' and 'Total Gross' columns (x-axis column listed first).
actors.scatter('Number of Movies', 'Total Gross')

In [None]:
# Scatter plot of the 'Number of Movies' and 'Average per Movie' columns (x-axis column listed first).
actors.scatter('Number of Movies', 'Average per Movie')

In [None]:
# Can we identify the outlier?
# Keep only the rows whose 'Average per Movie' value is above 400.
actors.where('Average per Movie', are.above(400))