In [None]:
# Here is a crack at a choropleth map from the altair documentation.
# states is a built-in data structure that contains the map, and
# statedata is a sequence of 51 random numbers that I will paint on the map.
# Altair wants the map and the data in the same data structure, so we
# have to use transform_lookup (the equivalent of pd.join or VLOOKUP)
# to either merge the map into the data or the data into the map
# before there is enough data in one place to attempt a map.

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from vega_datasets import data
# Map geometry: the 'states' feature of the us_10m TopoJSON file.
states = alt.topo_feature(data.us_10m.url, feature='states')
# Bogus data: one random value per row, with the row number reused as "id".
statedata = pd.DataFrame({"value": np.random.random(51)}).assign(
    id=lambda frame: frame.index
)
# No encoding channel is set yet; the lookup only attaches "value" to each shape.
(
    alt.Chart(states)
    .mark_geoshape(stroke='white')
    .encode()
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(statedata, 'id', ["value"]),
    )
    .project('albersUsa')
    .properties(width=500, height=300)
)

In [None]:
# That's nice.  But a little too blue.
# Did I forget to specify .encode(color) ?
states = alt.topo_feature(data.us_10m.url, feature='states')
# Fresh batch of random values, again keyed by row number.
statedata = pd.DataFrame({"value": np.random.random(51)}).assign(
    id=lambda frame: frame.index
)
# Same chart as before, now with "value" mapped to colour as a quantitative field.
(
    alt.Chart(states)
    .mark_geoshape(stroke='white')
    .encode(color="value:Q")
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(statedata, 'id', ["value"]),
    )
    .project('albersUsa')
    .properties(width=500, height=300)
)


In [None]:
# What's wrong with West Virginia, Virginia, Wyoming, and Washington?
# Oh, the map has more than 50 regions.  How am I supposed to put
# state data in the right place?
# Elsewhere in the documentation, the following toy dataset is used for
# choropleth maps, and it seems to contain the state name -> magic map id number mapping.

# I can open it in a browser if my browser has a JSON viewer plugin.

# Load the JSON straight from its URL into a DataFrame.
income = pd.read_json("https://vega.github.io/vega-datasets/data/income.json")

In [None]:
# Peek at the first rows to see which columns income provides.
income.head()

In [None]:
# It has lots of income data I don't care about, but it does have states and IDs.
# Build the name -> id lookup directly from the two columns rather than walking
# the frame row by row.  A name that appears on several rows keeps the id from
# its last row, exactly as repeated dict assignment would.
decoder = dict(zip(income["name"], income["id"]))
print(decoder)

In [None]:
# Number of distinct names in the lookup.
print(len(decoder))

In [None]:
# Ok, so it's DC and PR in this dataset, and there are gaps in the numbering where you 
# might expect American Samoa and Guam. 

In [None]:
# After fruitlessly searching the census website for 10 minutes, I got a
# population chart from https://en.wikipedia.org/wiki/2010_United_States_census
# and as you will see, I will come to regret it.

# First attempt: read the local file with read_csv's default settings.
population=pd.read_csv("state-population.csv")

In [None]:
# Display the frame to see how the file was parsed.
population

In [None]:
# A word on separators.  Simple "flat" files have three punctuation marks:
# line separators, which usually don't give trouble,
# field separators like tab, space, and comma, and
# quotation marks.
# Q:  Why are the quotation marks necessary?
# A:  Because fields sometimes contain spaces, commas, tabs, or, sadly, newline characters.

In [None]:
# An arcane piece of data lore for you: when you have to fit unruly variable-length
# data into a flat format (not something structured like XML or JSON),
# *put the variable-length field at the end* of the line.  One piece of very useful bioinformatics
# workhorse software for a long time used spaces as field separators and had one field
# that was allowed to contain spaces.  It was at the end of the line, so graduate students
# around the world wrote parsers that separated fields with the first 17 spaces and left
# the remaining spaces in the 18th field.

In [None]:
# D'oh.  Set field separator to tab.
# Re-read the same file, this time splitting fields on tab characters.
population=pd.read_csv("state-population.csv", sep="\t")
population

In [None]:
# You can't fool me, I've seen what commas do to the digestive system of a pandas.
# Remove them and store the results as new comma-free columns "2000" and "2010".
for census_year in ("2000", "2010"):
    population[census_year] = population[census_year + " Population"].str.replace(",", "")
population.head()

In [None]:
# Order the rows by the "2010" column and display the result.
population.sort_values(by="2010")

In [None]:
# Something is not right here... my populations are sorted in alphabetical
# (jargon: lexicographic) order.  Let us ignore that.

In [None]:
# Running total of the "2010" column, taken in sorted order.
cumulative = population.sort_values(by="2010").loc[:, "2010"].cumsum()
cumulative

In [None]:
# shouldn't have ignored that.  cumsum() is not summing numbers but concatenating
# strings.  

In [None]:
# Look at the sorted frame once more.
population.sort_values(by="2010")

In [None]:
# Now really create numerical population columns: drop the commas, then parse as numbers.
population["2000"] = population["2000 Population"].str.replace(",", "").pipe(pd.to_numeric)
population["2010"] = population["2010 Population"].str.replace(",", "").pipe(pd.to_numeric)


In [None]:
# Recompute the running total of "2010" over the rows in ascending "2010" order.
cumulative = population.sort_values(by="2010", ascending=True)["2010"].cumsum()
cumulative

In [None]:
# This is better.  Let's plot it.
# x is just the position in the smallest-to-largest ordering; y is the running total.
# (matplotlib is imported with the other libraries at the top of the notebook.)
fig, ax = plt.subplots()
ax.plot(np.arange(len(cumulative)), cumulative)
ax.set_xlabel("States, smallest to largest (2010)")
ax.set_ylabel("Cumulative 2010 population")
ax.set_title("Cumulative population of states sorted by size");

In [None]:
# Same running total, drawn as one bar per position.
plt.bar(x=np.arange(len(cumulative)), height=cumulative)
# Note this graph, *because of the way we constructed it*, has
# "first-derivative-like" first differences positive and
# "second-derivative-like" second differences also positive.
#  Eeenteresting.  Not suspicion-inspiring at all.

In [None]:
# so the middle value is the 26th entry of the smallest-to-largest running total.
# Pick it by position: the labels on cumulative are row numbers from the
# population table, which only line up with size rank if the file is pre-sorted.
cumulative.iloc[25]

In [None]:
# and the final value is the last entry of the running total.
# Taken by position, so it does not depend on which row label the largest state has.
cumulative.iloc[-1]


In [None]:
# Insert more-cattle-than-people joke here
# The first entry of the running total is the smallest population on its own;
# taken by position rather than by a hard-coded row label.
cumulative.iloc[0]

In [None]:
# The fraction of the population in the smallest 26 states:
# 26th running total (position 25) divided by the grand total (last position).
cumulative.iloc[25] / cumulative.iloc[-1]

In [None]:
# I can't draw my population data on a map until I integrate the numerical 
# map IDs with the state name, and best practice when you have two pieces 
# of data in different data frames is to join them.  Let us examine our fields:


In [None]:
# One row is enough to list population's column names.
population.head(1)

In [None]:
# First rows of income, to see its column names.
income.head()

In [None]:
# Copy income's "name" column into a new column "State" to facilitate joining.
income["State"] = income["name"]

In [None]:
# Try the join directly on the "State" column.  A failure here is part of the
# story, so catch it and show the message instead of halting Restart & Run All.
try:
    join_attempt = population.join(income, on="State")
except ValueError as err:
    join_attempt = f"join failed: {err}"
join_attempt

In [None]:
# pd.join is complaining about data types.  Can I clean up the types?
# Cast both "State" columns to str.
population["State"] = population["State"].astype(str)
income["State"] = income["State"].astype(str)

In [None]:
# No that doesn't work ... search engine ...  specify join fields with
# DataFrame.set_index("fieldname")
population.set_index("State").join(other=income.set_index("State"), how="left")

In [None]:
# Wait a minute, that looked like it joined but it didn't.  Why could it 
# be that "California" does not join with "California" ?


In [None]:
# Wrap the first state name from each frame in asterisks to expose stray characters.
print(f"*{population.State[0]}*")
print(f"*{income.State[0]}*")

In [None]:
# That would do it.  population has trailing spaces in its state name column.
# Can I just use square bracket notation on population.State?
# Note: the slice below is applied to the Series itself, not to the strings inside it.
test = population.State[:-1]
print("*"+test[0]+"*")

In [None]:
# apparently not.

In [None]:
# Can I use pd.Series.str and square brackets?
# The .str accessor applies the slice to each string element instead.
test = population.State.str[:-1]
print("*"+test[0]+"*")

In [None]:
# Yes, I can.  


In [None]:
# Trim the stray whitespace around the state names so they can match income.State.
# strip() leaves already-clean names untouched, so re-running this cell is
# harmless (no vacationing in Michiga next summer).
# Assumes the stray trailing character is whitespace, as the "*...*" printout suggested.

population["State"] = population["State"].str.strip()


In [None]:
# New dataframe joinpop: population indexed by State, left-joined with income indexed by State.
joinpop = (
    population.set_index("State")
    .join(other=income.set_index("State"), how="left")
)
joinpop.shape

In [None]:
# First rows of the joined frame.
joinpop.head()

In [None]:
# And add a Boolean column with "bigger or smaller than 26th smallest state":
# True where the row's 2010 value is at or below the 2010 value of population row 25.
joinpop["flag"] = joinpop["2010"].le(population.loc[25, "2010"])

joinpop["flag"].value_counts()

In [None]:
# joinpop includes lots of duplicated population data.  (As long as I'm not
# summing it I should be fine.)
# Check the dtype of the id column after the join.
joinpop["id"].dtype

In [None]:
# dtype of the id column in the income frame.
income["id"].dtype

In [None]:
# Display the joined frame.
joinpop

In [None]:
# (rows, columns) of the joined frame.
joinpop.shape

In [None]:
# Back to the map.  Merge joinpop.flag into the map shapes by id.

states = alt.topo_feature(data.us_10m.url, feature='states')
# The lookup attaches "flag" to each shape; no encoding channel uses it yet.
(
    alt.Chart(states)
    .mark_geoshape(stroke='white')
    .encode()
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(joinpop, 'id', ["flag"]),
    )
    .project('albersUsa')
    .properties(width=500, height=300)
)

In [None]:
# Merge joinpop["flag"] into the map shapes by id (still without a colour encoding).

states = alt.topo_feature(data.us_10m.url, feature='states')
flag_by_id = dict(lookup='id', from_=alt.LookupData(joinpop, 'id', ["flag"]))
(
    alt.Chart(states)
    .mark_geoshape(stroke='white')
    .encode()
    .transform_lookup(**flag_by_id)
    .project('albersUsa')
    .properties(width=500, height=300)
)

In [None]:
# Include .encode(color=): colour each shape by "flag", treated as a nominal field.

states = alt.topo_feature(data.us_10m.url, feature='states')
(
    alt.Chart(states)
    .mark_geoshape(stroke='white')
    .encode(color=alt.Color("flag:N"))
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(joinpop, 'id', ["flag"]),
    )
    .project('albersUsa')
    .properties(width=500, height=300)
)

In [None]:
# And I could fine-tune the colors and the labels and get rid of the missing values 
# for PR and DC but I'm ready to declare victory for today.

In [None]:
# We have the actual numbers, so we could plot the population data itself,
# not just a large-small marker.  Here "2010" is encoded as a nominal field.

states = alt.topo_feature(data.us_10m.url, feature='states')
(
    alt.Chart(states)
    .mark_geoshape(stroke='white')
    .encode(color=alt.Color("2010:N"))
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(joinpop, 'id', ["flag", "2010"]),
    )
    .project('albersUsa')
    .properties(width=500, height=300)
)

In [None]:
# Ha, ha.  Not going to get full marks for that one. Population is not a categorical
# variable, and bad things happen if you instruct altair otherwise.  There are only 10
# colors, so each color appears 5 times.

In [None]:
# Population must be quantitative (":Q") in the color specification.
states = alt.topo_feature(data.us_10m.url, feature='states')
(
    alt.Chart(states)
    .mark_geoshape(stroke='white')
    .encode(color=alt.Color("2010:Q"))
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(joinpop, 'id', ["flag", "2010"]),
    )
    .project('albersUsa')
    .properties(width=500, height=300)
)

In [None]:
# Insert anti-Californian joke here, since they are eating most of the blue
# ink being a large state with 33 million people.

# What's the difference between an anti-Californian joke and an anti-Wyomingan joke?

In [None]:
# Ratio of the 2010 values in population rows 0 and 50.
population.loc[0, "2010"] / population.loc[50, "2010"]

In [None]:
# The anti-Californian joke gets 60x more people mad at you.

In [None]:
# We need to add a title: same quantitative map, with the title passed to alt.Chart.
states = alt.topo_feature(data.us_10m.url, feature='states')
(
    alt.Chart(
        states,
        title="Pointless choropleth map: states by 2010 census population",
    )
    .mark_geoshape(stroke='white')
    .encode(color=alt.Color("2010:Q"))
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(joinpop, 'id', ["flag", "2010"]),
    )
    .project('albersUsa')
    .properties(width=500, height=300)
)