In [None]:
# `datascience` table library; the wildcard import supplies the names
# used throughout this notebook (Table, are, make_array).
from datascience import *
import numpy as np

# Draw matplotlib figures inline in the notebook and use the
# 'fivethirtyeight' style sheet for every plot below.
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Helpers for showing images in a cell's output.
from IPython.display import display, Image

# Week 4: Lecture 6 Visualization

## Line Graphs ##

Let's first re-load the census data from last lecture:

In [None]:
# Download data from census.gov and import it into a table.
# The file is fetched over HTTPS (not plain HTTP) so the download is
# encrypted and cannot be silently altered in transit.
data_url = 'https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2019/nc-est2019-agesex-res.csv'
full_census_table = Table.read_table(data_url)

# Select population estimates for 2019
census_table_2019 = full_census_table.select('SEX', 'AGE', 'POPESTIMATE2019')

# Re-name the 2019 column to something a little simpler
census_table_2019 = census_table_2019.relabeled('POPESTIMATE2019', '2019')

# Remove age code 999 (keep only rows whose AGE is below 999). Why?
no_999 = census_table_2019.where('AGE', are.below(999))
no_999

Now we'll create a table with separate columns for the male and female populations.

In [None]:
# First, pull the 2019 estimates for each sex out of the table as arrays
# (SEX code 1 holds the male rows, SEX code 2 the female rows)
male_rows = no_999.where('SEX', are.equal_to(1))
female_rows = no_999.where('SEX', are.equal_to(2))
male_pop_array = male_rows.column('2019')
female_pop_array = female_rows.column('2019')

In [None]:
# Next, assemble a fresh table one column at a time:
# ages 0 through 100, then the male and female population arrays
age_array = np.arange(101)
pop_2019 = (
    Table()
    .with_column('Age', age_array)
    .with_column('Male Pop', male_pop_array)
    .with_column('Female Pop', female_pop_array)
)
pop_2019

Use the `plot` method to create a line graph of the populations by age.

In [None]:
# Line graph: the 'Age' argument specifies that the 'Age' column forms the x-axis.
# All other columns in the table are plotted as lines on the y-axis.
pop_2019.plot('Age')

Now we'll try to plot how the percentage of the population that is female changes by age:

In [None]:
# Show the table again to recall which columns we have to work with
pop_2019

In [None]:
# Percent female at each age = 100 * (female count) / (male count + female count)
female_counts = pop_2019.column('Female Pop')
male_counts = pop_2019.column('Male Pop')
total_pop = male_counts + female_counts
pct_female_array = 100 * female_counts / total_pop

# Attach the result to pop_2019 as a new 'Percent Female' column
pop_2019 = pop_2019.with_column('Percent Female', pct_female_array)
pop_2019

In [None]:
# With only the x-axis column given, every other column in pop_2019
# is drawn on one shared y-axis.
pop_2019.plot('Age')

Uh oh! Since populations are several orders of magnitude larger than percentages, we shouldn't plot these on the same y-axis.

In [None]:
# Give .plot a second argument, 'Percent Female',
# so that only the 'Percent Female' column is plotted on the y-axis
# (with 'Age' still on the x-axis).
pop_2019.plot('Age', 'Percent Female')

## Scatter Plots ##

In [None]:
# Actors and their highest grossing movies
# (read from a CSV at a path relative to the working directory)
actors = Table.read_table('data/actors.csv')
actors

Is there an association between the `Number of Movies` an actor / actress has been in, and the `Total Gross` of their movies? Use the `scatter` method to visually check if these variables are correlated.

In [None]:
# Scatter plot with one point per row of `actors`:
# 1st argument puts 'Number of Movies' on the x-axis
# 2nd argument puts 'Total Gross' on the y-axis
actors.scatter('Number of Movies', 'Total Gross')

In [None]:
# Same x-axis as before, but now with 'Average per Movie' on the y-axis
actors.scatter('Number of Movies', 'Average per Movie')

Can we identify the outlier?

In [None]:
# Keep only the rows whose 'Average per Movie' is above 400 to isolate the outlier
# NOTE(review): the units of this column are not shown here — presumably millions of dollars; confirm
actors.where('Average per Movie', are.above(400))

In [None]:
# Show the image located at this URL in the cell's output
display(Image(url='https://upload.wikimedia.org/wikipedia/en/5/5c/C-3PO_droid.png'))

## Bar Charts ##

In [None]:
# Highest grossing movies as of 2017
# (read from a CSV at a path relative to the working directory)
top_movies = Table.read_table('data/top_movies_2017.csv')
top_movies 

In [None]:
# Express the adjusted gross in millions of dollars, rounded to
# 3 decimal places, so the numbers are easier to read
adjusted_gross = top_movies.column('Gross (Adjusted)')
millions = np.round(adjusted_gross / 1000000, 3)
top_movies = top_movies.with_columns('Millions', millions)
top_movies

In [None]:
# Rank every movie by 'Gross (Adjusted)', largest first,
# then keep just the first 10 rows of that ranking
ranked_by_adjusted = top_movies.sort('Gross (Adjusted)', descending=True)
top10_adjusted = ranked_by_adjusted.take(np.arange(10))
top10_adjusted

Make a bar chart with the `barh` method:

In [None]:
# Horizontal bar chart: one bar per 'Title', with bar length taken from 'Millions'
top10_adjusted.barh('Title', 'Millions')

In [None]:
# Same idea as the adjusted top 10, but ranked by the 'Gross' column instead
ranked_by_gross = top_movies.sort('Gross', descending=True)
top10_not_adjusted = ranked_by_gross.take(np.arange(10))
top10_not_adjusted

**Discussion question:** generate the chart shown in the slides!
I.e., create a bar chart showing the age (# years since release) for the 10 highest grossing movies (non-adjusted).

In [None]:
# ...

## Categorical Distributions

In [None]:
# Look at the table again before grouping it by studio
top_movies

We can count how many times each studio appears in the table using the `group` method:

In [None]:
# Group the rows by 'Studio': the result has one row per studio,
# with the number of movies for that studio in a 'count' column
studio_distribution = top_movies.group('Studio')
studio_distribution

In [None]:
# Order the studios from the largest count to the smallest
studio_distribution.sort('count', descending=True)

In [None]:
# Add up the per-studio counts (the total number of movies across all studios)
studio_distribution.column('count').sum()

## Bar Charts

We can use the `barh` method to visualize the distribution of a categorical variable:

In [None]:
# One horizontal bar per studio; the remaining column ('count') sets the bar lengths
studio_distribution.barh('Studio')

In [None]:
# Sort by count first so the bars are drawn in order of how often each studio appears
studio_distribution.sort('count', descending=True).barh('Studio')

## Numerical Distributions

In [None]:
# Look at the table again before adding a numerical 'Age' column
top_movies

In [None]:
# Age of each movie in years, measured from the fixed reference year 2023
release_years = top_movies.column('Year')
ages = 2023 - release_years
top_movies = top_movies.with_columns('Age', ages)

In [None]:
# Confirm that the new 'Age' column is now part of the table
top_movies

In [None]:
# Sort the rows by 'Age' (no descending=True here, so smallest ages come first)
top_movies.sort('Age')

Can we use a bar chart to visualize the distribution of a *numerical* variable?

In [None]:
# Create bar chart for the distribution of gross revenue.
# Grouping by 'Gross' makes one row (and so one bar) per distinct gross value.
top_movies.group('Gross').barh('Gross')

## Binning ##

In [None]:
# Look at the table again before binning the 'Age' column
top_movies

In [None]:
# Pull out the "Age" column and report its extremes as a (smallest, largest) pair
ages = top_movies.column('Age')
(ages.min(), ages.max())

In [None]:
# Bin edges for 'Age', with deliberately unequal bin widths.
# Each bin includes its lower edge and excludes its upper edge, e.g. [5, 10) ...
# ... except that the upper bound on the LAST BIN ONLY is inclusive!
my_bins = make_array(0, 5, 10, 15, 25, 40, 65, 102)

**Question:** how many bins are there?

In [None]:
# ...

Use the table method `bin` to count the number of movies in each bin:

In [None]:
# Count how many movies have an 'Age' in each bin defined by my_bins;
# the counts are stored in a column labeled 'Age count'
binned_data = top_movies.bin('Age', bins = my_bins)
binned_data

**Question:** how many movies are in the bin [5, 10)?

In [None]:
# ...

**Question:** does the `binned_data` table contain as many rows as you would expect?

In [None]:
# ...

In [None]:
# Total number of movies that landed in some bin
binned_data.column('Age count').sum()

Let's try some new bins:

In [None]:
# np.arange(0, 102, 25) produces the bin edges 0, 25, 50, 75, 100
# (the stop value, 102, is itself never included)
binned_data = top_movies.bin('Age', bins = np.arange(0, 102, 25))
binned_data

Are all 200 of the movies accounted for?

In [None]:
# Total number of movies counted under the new bins
binned_data.column('Age count').sum()

**Question:** why is there a movie missing?

In [None]:
# ...

## Histograms ##

In [None]:
# Rebuild binned_data from my_bins, since the name was last assigned
# using the evenly spaced 25-year bins
binned_data = top_movies.bin('Age', bins = my_bins)
binned_data

Use the table method `hist` to generate a histogram that visualizes the distribution of movie ages:

In [None]:
# Let's make our first histogram!
# bins= supplies the bin edges to use; unit= names the unit of the 'Age' values.
top_movies.hist('Age', bins = my_bins, unit = 'Year')

**Question:** which bin contains the largest number of movies? 

In [None]:
# ...

Bins with unequal widths are valid, but typically we use equally-spaced bins.

**Question:** how do we easily generate an array of bins from 0 to 110 (inclusive), all of width 10?

In [None]:
# Bin edges 0, 10, 20, ..., 110, all of width 10.
# np.arange excludes its stop value, so stop just past 110 (at 111)
# to make sure 110 itself is included as the final edge.
even_bins = np.arange(0, 111, 10)

In [None]:
# Let's try equally spaced bins instead.
# Same histogram call as before, but with even_bins as the bin edges.
top_movies.hist('Age', bins=even_bins, unit = 'Year')

If you don't want to manually define bins, the `hist` method can do it automatically!

In [None]:
# Use .hist without providing an argument for bins,
# leaving the choice of bins to the method itself
top_movies.hist('Age', unit='Year')

The three histograms above all visualize the same distribution of movie ages, but they look quite different due to choice of bins.

In [None]:
# Convert each bin's count into a percent of all binned movies,
# then store it in a new 'Percent' column
age_count_array = binned_data.column('Age count')
total_binned = age_count_array.sum()
percent_in_each_bin = 100 * age_count_array / total_binned
binned_data = binned_data.with_columns('Percent', percent_in_each_bin)
binned_data

## Height ##

### Question: What is the height of the [40, 65) bin?

In [None]:
# Step 1: Calculate the percent of movies in the [40, 65) bin.
# We know it should be 57 out of 200 movies.
age_count_array = binned_data.column('Age count')
# The bin edges are 0, 5, 10, 15, 25, 40, 65, 102, so the bin that starts at 40
# is at index 5. Multiply by 100 so that `percent` really is a percentage,
# on the same 0-100 scale as the 'Percent' column of binned_data.
percent = 100 * age_count_array.item(5) / sum(age_count_array)
percent # we are expecting 100 * 57/200 = 28.5 (percent)

In [None]:
# Step 2: Calculate the width of the 40-65 bin
# (its upper edge minus its lower edge)
width = 65 - 40

In [None]:
# Step 3: Area of rectangle = height * width
#         --> height = percent / width
# (the bar's area stands for the bin's share of the movies,
#  so dividing that share by the bin's width gives the bar's height)
height = percent/width
height

### What are the heights of the rest of the bins?

In [None]:
# Show the binned table again: its 'bin' column is what we use next to get bin widths
binned_data

Use the `np.diff()` function to calculate the bin widths:

In [None]:
# np.diff returns the differences between consecutive entries of the 'bin' column,
# so the result has one fewer entry than binned_data has rows
bin_widths_array = np.diff(binned_data.column('bin'))
bin_widths_array

Use the `exclude` table method to remove the last row from the `binned_data` table, so we can add the bin widths to the table:

In [None]:
# Drop the last row of binned_data (index -1) so the table has exactly as many
# rows as there are bin widths, then attach the widths as a 'width' column
binned_data_and_widths = binned_data.exclude(-1).with_column('width', bin_widths_array)
binned_data_and_widths

Calculate bin heights using the formula
\begin{equation}
    \text{height} = \frac{\text{percent in bin}}{\text{width of bin}}
\end{equation}

In [None]:
# Height of each bar = (percent of movies in the bin) / (width of the bin)
percent_column = binned_data_and_widths.column('Percent')
width_column = binned_data_and_widths.column('width')
bin_heights_array = percent_column / width_column
binned_data_and_widths = binned_data_and_widths.with_columns('height', bin_heights_array)
binned_data_and_widths

In [None]:
# Do these heights look right?
# Redraw the my_bins histogram to compare its bars against the 'height' column.
top_movies.hist('Age', bins = my_bins, unit = 'Year')