# Lecture 25 – Visualizing Categorical Variables

## Data 94, Spring 2021

In [None]:
from datascience import *
import numpy as np

Table.interactive_plots()

## Bar charts

In [None]:
schools = Table.read_table('data/r1_with_students.csv')

In [None]:
schools

In [None]:
schools.group('Control')

In [None]:
schools.group('Control').barh('Control')

### Example 1: Top 10 songs on Spotify

You can download an up-to-date copy of this data from [Spotify Charts](https://spotifycharts.com/regional).

In [None]:
# NOTE(review): header=1 presumably means the column labels are on the
# file's second line (the first line being a note) — confirm against the CSV.
streams = Table.read_table(
    'data/regional-global-daily-latest.csv',
    header=1,
)

# Keep just the track name and stream count for the first ten rows.
top_10 = (
    streams.select('Track Name', 'Streams')
    .take(np.arange(10))
)

In [None]:
top_10

In [None]:
top_10.barh('Track Name')

### Example 2: Artists with the most songs in the Spotify Top 200 right now

In [None]:
streams

In [None]:
# Count rows per artist, order by that count (largest first), and keep
# only the artists whose count is above 2.
(
    streams.group('Artist')
    .sort('count', descending=True)
    .where('count', are.above(2))
)

In [None]:
# Same per-artist counts as a horizontal bar chart, one bar per artist.
(
    streams.group('Artist')
    .sort('count', descending=True)
    .where('count', are.above(2))
    .barh('Artist')
)

### Example 3: Number of students at the 15 largest universities (in our dataset)

In [None]:
schools

In [None]:
# The 15 rows with the largest Number_students, biggest first.
(
    schools.select('University', 'Number_students')
    .sort('Number_students', descending=True)
    .take(np.arange(15))
)

In [None]:
# Horizontal bar chart of Number_students for those same 15 rows.
(
    schools.select('University', 'Number_students')
    .sort('Number_students', descending=True)
    .take(np.arange(15))
    .barh('University')
)

### Quick Check 1

In [None]:
schools

In [None]:
# Exercise template: the `...` below are placeholders to be filled in.
# As written, the literal Ellipsis object is passed as each argument, so
# this cell is not expected to produce a meaningful chart until completed.
# The first step keeps only rows whose State is one of the five listed.
schools.where('State', are.contained_in(['CA', 'TX', 'FL', 'NY', 'PA'])) \
       .group('State', ...) \
       .select(..., ...) \
       .barh(...)

### Note: bar order

### Disclaimer

In [None]:
schools.take(np.arange(5)).barh('Control')

## Grouped bar charts

In [None]:
# Run this cell.
def remove_comma(s):
    """Convert a number written with thousands separators to an int.

    Strings such as ``'1,234,567'`` have their commas removed before
    conversion. Values that are not strings (for example, a cell that was
    already parsed as a number because it contained no commas) are passed
    straight to ``int`` instead of raising ``AttributeError``.

    Raises ``ValueError`` if the text left after removing commas is not a
    valid integer literal.
    """
    # Only strings have commas to strip; numeric values convert directly.
    if isinstance(s, str):
        s = s.replace(',', '')
    return int(s)

# Load the two GDP tables; the column at index 3 of the PPP table is
# discarded up front.
nominal = Table.read_table('data/gdp-nominal.csv')
ppp = Table.read_table('data/gdp-ppp.csv').drop(3)

# Match rows of the two tables on 'Country/Territory', discard the columns
# at indices 1 and 3 of the joined table, and give the two GDP columns
# shorter labels.
gdp = (
    nominal.join('Country/Territory', ppp)
    .drop(1, 3)
    .relabeled(
        ['GDP(US$million)', 'GDP(millions of current Int$)'],
        ['GDP Nominal', 'GDP PPP'],
    )
)

# Replace both GDP columns with integer versions (remove_comma strips the
# thousands separators), then order the rows by nominal GDP, largest first.
gdp = gdp.with_columns(
    'GDP Nominal', gdp.apply(remove_comma, 'GDP Nominal'),
    'GDP PPP', gdp.apply(remove_comma, 'GDP PPP'),
).sort('GDP Nominal', descending=True)

In [None]:
gdp

In [None]:
# Bar chart of nominal GDP for the first 15 rows of gdp (which was sorted
# by 'GDP Nominal', descending, when it was built).
(
    gdp.select('Country/Territory', 'GDP Nominal')
    .take(np.arange(15))
    .barh('Country/Territory')
)

In [None]:
# Bar chart of GDP PPP for the first 15 rows of gdp. The rows are still in
# nominal-GDP order here, since gdp is not re-sorted.
(
    gdp.select('Country/Territory', 'GDP PPP')
    .take(np.arange(15))
    .barh('Country/Territory')
)

In [None]:
gdp

In [None]:
gdp.take(np.arange(15)).barh('Country/Territory')

We can sort by GDP PPP, too:

In [None]:
gdp.sort('GDP PPP', descending = True).take(np.arange(15)).barh('Country/Territory')

Another example:

In [None]:
schools

In [None]:
schools.pivot('Control', 'State')

In [None]:
# Cross-tabulate State (rows) against Control (columns), keeping only the
# states whose 'Private (non-profit)' and 'Public' entries are both above 0.
(
    schools.pivot('Control', 'State')
    .where('Private (non-profit)', are.above(0))
    .where('Public', are.above(0))
)

In [None]:
# Grouped bar chart of the filtered State-by-Control table: one group of
# bars per state.
(
    schools.pivot('Control', 'State')
    .where('Private (non-profit)', are.above(0))
    .where('Public', are.above(0))
    .barh('State')
)

## Customization

In [None]:
# Baseline chart with default styling: states that have both
# 'Private (non-profit)' and 'Public' entries above 0.
(
    schools.pivot('Control', 'State')
    .where('Private (non-profit)', are.above(0))
    .where('Public', are.above(0))
    .barh('State')
)

In [None]:
# Same chart, now with an x-axis label, a title, and an explicit
# width and height passed to barh.
(
    schools.pivot('Control', 'State')
    .where('Private (non-profit)', are.above(0))
    .where('Public', are.above(0))
    .barh(
        'State',
        xaxis_title='Number of Universities',
        title='Number of Private and Public R1 Universities in Each State',
        width=700,
        height=700,
    )
)

### Quick Check 2

In [None]:
top_gdp = gdp.take(np.arange(7))
top_gdp