In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Census ##

The Census form asks respondents to provide the sex of each household member by checking one of two boxes labeled Male and Female. The SEX column contains numeric codes: 1 for male, 2 for female, and 0 for the total.


In [None]:
# Address of the census population-estimate CSV (age and sex breakdown);
# noted as being online at this location as of August 2021.
data = 'http://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2019/nc-est2019-agesex-res.csv'

# A description of the table's columns appears online:
# https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2019/nc-est2019-agesex-res.pdf

# Read the CSV into a Table.
full_census_table = Table.read_table(data)
# Optional ways to explore the SEX column (left disabled):
#full_census_table.sort('SEX', descending = True)
#full_census_table.group('SEX')

# Display the loaded table.
full_census_table


In [None]:
# Narrow the full table to the sex code, the age, and the 2014 / 2019 estimates
partial_census_table = full_census_table.select(
    'SEX',
    'AGE',
    'POPESTIMATE2014',
    'POPESTIMATE2019',
)
partial_census_table

In [None]:
# Shorten the two estimate column labels to just the year
us_pop = (
    partial_census_table
    .relabeled('POPESTIMATE2014', '2014')
    .relabeled('POPESTIMATE2019', '2019')
)
us_pop

In [None]:
# Order the rows by age, smallest first (ascending is the default, spelled out here)
us_pop.sort('AGE', descending=False)

In [None]:
# Sort by age again, this time from largest to smallest
us_pop.sort('AGE', descending=True)

# first row: male + female combined, all ages, population estimates for 2014 and 2019
# (318 million in 2014, 328 million in 2019)

# second row ....



# third row ...

## Sex Ratios

In [None]:
# Focus on the 2019 population estimate and pull out the all-ages rows:
# males and females combined, males only, and females only.
us_pop_2019 = us_pop.drop('2014')
all_ages = us_pop_2019.where('AGE', 999)  # AGE code 999 = all ages from 0 to 100+
all_ages

# first row .... total population, male + female: 328 million est. in 2019

# second row...

# third row....

In [None]:
# Add a proportion column to view the share of males and females
# in the total population.

# Extract the '2019' column as an array and grab its first element (position 0),
# which the notes above identify as the combined male + female total.
# The scalar is named total_pop_2019 (not pop_2019) because a later cell builds a
# Table called pop_2019; sharing the name would make this cell fail when re-run.
total_pop_2019 = all_ages.column('2019').item(0)

# Divide every row's 2019 estimate by the total and display it as a percentage.
all_ages.with_column(
    'Percentage', all_ages.column('2019') / total_pop_2019
).set_format('Percentage', PercentFormatter)

# estimated in 2019, males make up 49.25% of pop
# females make up 50.75%

In [None]:
# Back to the us_pop_2019 table: pick out the infants
# (less than 1 year old), i.e. the rows where AGE is 0.
infants = us_pop_2019.where('AGE', 0)
infants
# 3.7 million infants est. in 2019

In [None]:
# Teens in the 2019 table: ages 13 through 19, both endpoints included
teens = us_pop_2019.where(
    'AGE',
    are.between_or_equal_to(13, 19),
)
teens.sort('AGE', descending=True)
# 4.3 million 19 year olds, male + female

In [None]:
# Add a proportion column giving the share of male and female infants
# among all infants.
# Denominator: first entry of the infants' '2019' column.
infants_2019 = infants.column('2019').item(0)
(
    infants
    .with_column('Proportion', infants.column('2019') / infants_2019)
    .set_format('Proportion', PercentFormatter)
)
# at the beginning of life, estimated in 2019,
# we had more male infants born

In [None]:
# Every female row of us_pop_2019 (SEX code 2) ...
females_all_rows = us_pop_2019.where('SEX', 2)
# ... then drop the AGE 999 totals row, keeping the age groups 0 to 100+
females = females_all_rows.where(
    'AGE',
    are.not_equal_to(999),
)
females

In [None]:
# Every male row of us_pop_2019 (SEX code 1) ...
males_all_rows = us_pop_2019.where('SEX', 1)
# ... then drop the AGE 999 totals row, keeping the age groups 0 to 100+
males = males_all_rows.where(
    'AGE',
    are.not_equal_to(999),
)
males

In [None]:
# instead of proportion, what if we want to measure the 
# ratio of female to male?

In [None]:
# Ages listed in the males table, as an array
# (presumably shown to compare its order with the females table before dividing the columns)
males.column('AGE')

In [None]:
# Ages listed in the females table, as an array
# (presumably shown to compare its order with the males table before dividing the columns)
females.column('AGE')

In [None]:
# Female-to-male ratio for each age: the two '2019' columns are divided
# element by element, so row i of females is paired with row i of males.
ratios = (
    Table()
    .with_column('AGE', females.column('AGE'))
    .with_column('2019 F:M RATIO', females.column('2019') / males.column('2019'))
)
ratios

# Earlier we saw that a little more than 50% of the infants estimated for 2019
# were male babies,

# so an infant ratio of 0.95 (less than 1) makes sense:
# male babies slightly outnumbered female babies.



In [None]:
# Show every row of the ratio table for ages above 75
ratios.where('AGE', are.above(75)).show()
# what do you notice?

In [None]:
# Male rows for ages 92, 93, and 99
males.where('AGE', are.contained_in(make_array(92, 93, 99)))

In [None]:
# Female rows for the same ages: 92, 93, and 99
females.where('AGE', are.contained_in(make_array(92, 93, 99)))

In [None]:
# Line plot of the F:M ratio with AGE on the horizontal axis
ratios.plot('AGE')


In [None]:
# in a sentence or two, describe the relationship between age 
# and the ratio of Female:Male in the 2019 population estimate



## Line Plots ##

In [None]:
# Redisplay the table with both the 2014 and 2019 estimates
us_pop

In [None]:
# Drop the all-ages total rows (AGE code 999), keeping only the individual ages
no_999 = us_pop.where(
    'AGE',
    are.below(999),
)
# View the result from the oldest age down
no_999.sort('AGE', descending=True)

In [None]:
# Keep only the combined rows (SEX code 0); the SEX column is then
# the same in every row, so drop it.
everyone = (
    no_999
    .where('SEX', are.equal_to(0))
    .drop('SEX')
)

In [None]:
# Display the combined (male + female) table, one row per age
everyone

In [None]:
# Line plot of the 2014 estimate against age
everyone.plot('AGE', '2014')
# 1,000,000 -- NOTE(review): presumably a note about the scale of the y-axis; confirm

In [None]:
# ^^ That plot should be labeled! Here are 3 ways to label it:

In [None]:
# Labeling option 1: just add a comment naming the plot
# US Population

everyone.plot('AGE', '2014')

In [None]:
# Labeling option 2: print a line of text saying what the plot is
everyone.plot('AGE', '2014')
print('US Population')  # <--- Print out what it is

In [None]:
# Labeling option 3: give the plot a title through matplotlib
everyone.plot('AGE', '2014')
plots.title('US Population');    # <--- OPTIONAL; not needed for DS1

In [None]:
# Age distribution for two different years: with only the x-axis column named,
# every other column ('2014' and '2019') is drawn as its own line
everyone.plot('AGE')

In [None]:
# Display the table behind the plot above
everyone

## Males and Females in 2019 ##

In [None]:
# Compare male and female counts per age.
# Note: this rebinds `males` and `females` to tables built from no_999
# (both years kept, SEX column removed).
males = (
    no_999
    .where('SEX', are.equal_to(1))
    .drop('SEX')
)
females = (
    no_999
    .where('SEX', are.equal_to(2))
    .drop('SEX')
)

In [None]:
# Side-by-side 2019 counts: one row per age, one column per sex.
# The ages are taken from the males table.
pop_2019 = (
    Table()
    .with_column('Age', males.column('AGE'))
    .with_column('Males', males.column('2019'))
    .with_column('Females', females.column('2019'))
)
pop_2019

In [None]:
# Plot every other column of pop_2019 as a line against Age
pop_2019.plot('Age')

# What do you notice in this graph? How do the male and female populations
# compare as age increases?



In [None]:
# Zoom in on the ages above 90
older_than_90 = pop_2019.where('Age', are.above(90))

# Use .show() to display the table: a bare `older_than_90` in the middle of a
# cell is evaluated and thrown away (only a cell's last expression is
# displayed), so the table never appeared.
older_than_90.show()
older_than_90.plot('Age')
# Optional follow-up (left disabled): F:M ratio for these ages
#ratios_older_90 = older_than_90.with_columns(
#    '2019 F:M RATIO', older_than_90.column('Females')/older_than_90.column('Males')
#)
#ratios_older_90
#ratios_older_90.plot('Age')

In [None]:
# Percent female at each age: females divided by (males + females), times 100
total = pop_2019.column('Males') + pop_2019.column('Females')
pct_female = 100 * (pop_2019.column('Females') / total)
pct_female

In [None]:
# Keep three decimal places so the values are easier to read
pct_female = np.round(pct_female, decimals=3)
pct_female

In [None]:
# Add the rounded female percentages to the table as a 'Percent female' column
pop_2019 = pop_2019.with_column('Percent female', pct_female)
pop_2019

In [None]:
# Line plot of percent female against age
pop_2019.plot('Age', 'Percent female')

In [None]:
# ^^ Look at the y-axis! The trend is not as dramatic as you might think.
# Redraw the same plot with the y-axis running from 0 to 100.
pop_2019.plot('Age', 'Percent female')
plots.ylim(0, 100);  # Optional for DS 1

## Scatter Plots ##

In [None]:
# Actors and their highest grossing movies, read from a local CSV
actors = Table.read_table('data/actors.csv')
# Display with the actors who have the most movies first
actors.sort('Number of Movies', descending = True)

In [None]:
# Actors whose 'Total Gross' value is above 4500
actors.where('Total Gross', are.above(4500))

In [None]:
# Scatter plot: 'Number of Movies' on the x-axis, 'Total Gross' on the y-axis
actors.scatter('Number of Movies', 'Total Gross')

In [None]:
# Same pair of columns drawn two ways; the scatter version is left disabled
#actors.scatter('Number of Movies', 'Average per Movie')
actors.plot('Number of Movies', 'Average per Movie') #line plot

In [None]:
# Actors whose 'Average per Movie' value is above 400
actors.where('Average per Movie', are.above(400))

In [None]:
# Look for Anthony Hopkins by searching the 'Actor' column for 'Anthony'
# (no Anthony Hopkins in here :( )
actors.where('Actor', are.containing('Anthony'))

In [None]:
# Print the built-in documentation for Table.where
help(Table.where)

## Bar Charts ##

In [None]:
# Highest grossing movies as of 2017, read from a local CSV
top_movies = Table.read_table('data/top_movies_2017.csv')
top_movies

In [None]:
# Take the first ten rows (row indices 0 through 9) of top_movies
top10_adjusted = top_movies.take(np.arange(0, 10))
top10_adjusted

In [None]:
# Preview: adjusted gross divided by one million, rounded to 3 decimal places
np.round(top10_adjusted.column('Gross (Adjusted)') / 1000000, 3)

In [None]:
# Express the adjusted gross in millions of dollars for readability:
#   1. extract the 'Gross (Adjusted)' column as an array
#   2. divide by one million
#   3. round to the third decimal place
millions = np.round(
    top10_adjusted.column('Gross (Adjusted)') / 1000000,
    decimals=3,
)
# Add the array back to the table as a 'Millions' column
top10_adjusted = top10_adjusted.with_column('Millions', millions)
top10_adjusted

In [None]:
# A line plot doesn't make sense here: don't do this!
# It is drawn only for comparison with the scatter plot of the same columns.
top10_adjusted.plot('Year', 'Millions') # line plot
top10_adjusted.scatter('Year', 'Millions')

In [None]:
# Horizontal bar chart: one bar per title, bar length from the 'Millions' column
top10_adjusted.barh('Title', 'Millions')

In [None]:
# Age of each movie in years: 2022 minus its release year
Age = 2022 - top10_adjusted.column('Year')

# Add the ages to the table as an 'Age' column and display the result.
# A bare `Age` used to sit in the middle of this cell, where it is evaluated
# and discarded, so the cell showed nothing; ending on the table makes the
# new column visible.
top10_adjusted = top10_adjusted.with_column('Age', Age)
top10_adjusted

In [None]:
# Generate the chart shown in the slides:
# bar chart of age (# years since release) for the 10 highest grossing movies (non-adjusted)
# NOTE(review): the table plotted below is top10_adjusted -- confirm whether the
# slide chart is meant to use the adjusted or the non-adjusted top 10.

# How do you get the age column? It's not in the original data.
# Age = 2022 - year value
# Extract Year as an array first. HINT: use .column




# Below is our goal: a horizontal bar chart of 'Age' for each title
top10_adjusted.barh('Title', 'Age')