In [53]:
import pandas as pd
import numpy as np

In [54]:
# Load the city records from the JSON file into a DataFrame.
df = pd.read_json("../data/cities.json")
# A bare trailing expression makes the notebook render the frame.
df

Unnamed: 0,city,growth_from_2000_to_2013,latitude,longitude,population,rank,state
0,New York,4.8%,40.712784,-74.005941,8405837,1,New York
1,Los Angeles,4.8%,34.052234,-118.243685,3884307,2,California
2,Chicago,-6.1%,41.878114,-87.629798,2718782,3,Illinois
3,Houston,11.0%,29.760427,-95.369803,2195914,4,Texas
4,Philadelphia,2.6%,39.952584,-75.165222,1553165,5,Pennsylvania
...,...,...,...,...,...,...,...
995,Weslaco,28.8%,26.159519,-97.990837,37093,996,Texas
996,Keizer,14.4%,44.990119,-123.026208,37064,997,Oregon
997,Spanish Fork,78.1%,40.114955,-111.654923,36956,998,Utah
998,Beloit,2.9%,42.508348,-89.031776,36888,999,Wisconsin


In [55]:
# Central tendency of the city populations: mean vs. median
population = df["population"]
pop_mean = population.mean()
pop_median = population.median()
print(f"Population mean is {pop_mean:,} and median is {pop_median:,}")
# The median sits well below the mean, which tells us most of the cities
# are pretty tiny.
# The same two statistics again, this time pulled out of describe()
population.describe()[["mean", "50%"]]

Population mean is 131,132.443 and median is 68,207.0


mean    131132.443
50%      68207.000
Name: population, dtype: float64

In [56]:
# Ignore the 50 most populous cities: what happens to the mean and median?
# Select on the "rank" column so the result does not depend on row order.
top_removed = df.loc[df["rank"] > 50, "population"]
print(
    f"Population of 51-1000 mean is {top_removed.mean():,.2f} and median is {top_removed.median():,}"
)
# The mean is dragged down by roughly 44k and the median barely budges, which
# reinforces the idea that most of the cities are very small.
# Summarise the same rank-filtered Series; a `df.loc[50:, "population"]` label
# slice would only agree with it while the index happens to follow rank order.
top_removed.describe()[["mean", "50%"]]

Population of 51-1000 mean is 87,027.39 and median is 65,796.0


mean    87027.387368
50%     65796.000000
Name: population, dtype: float64

In [57]:
# Rank of the northern-most city.
# Furthest north means the largest latitude; every row tying for it is kept.
northernmost_latitude = df["latitude"].max()
df[df["latitude"].eq(northernmost_latitude)]
# surprise! Alaska!

Unnamed: 0,city,growth_from_2000_to_2013,latitude,longitude,population,rank,state
62,Anchorage,15.4%,61.218056,-149.900278,300950,63,Alaska


In [58]:
# Which state has the largest number of cities on the list?
# `value_counts()` hands back an ordinary Series (state -> number of cities),
# so the usual Series filtering works on the result.
state_cities = df["state"].value_counts()
most_cities = state_cities.max()
state_cities.loc[state_cities.eq(most_cities)]
# Because value_counts() sorts by count, `state_cities.head(1)` would also
# work here, but only as long as no other state ties for the top count.

state
California    212
Name: count, dtype: int64

In [59]:
# Which state(s) have the smallest number of cities on the list?
fewest_cities = state_cities.min()
state_cities.loc[state_cities.eq(fewest_cities)]
# `state_cities.tail(1)` would be misleading here because multiple states
# have just 1 city and tail(1) shows only one of them.

state
Alaska                  1
Hawaii                  1
District of Columbia    1
Maine                   1
Vermont                 1
Name: count, dtype: int64

# Extension questions
1. Convert the `"growth_from_2000_to_2013"` column into a floating-point number, then find the mean and median changes in city size between 2000 and 2013. If a city has no recorded growth, set it to 0.
2. How many cities had positive growth in this period, and how many had negative growth?
3. Find the city or cities with latitudes more than two standard deviations from the mean.

In [60]:
# Show the dtype pandas assigned to each column.
df.dtypes

city                         object
growth_from_2000_to_2013     object
latitude                    float64
longitude                   float64
population                    int64
rank                          int64
state                        object
dtype: object

In [None]:
# 1. Convert the string percentages (e.g. "4.8%") into float fractions (0.048).
# The raw values carry a percent sign and some are blank, so strip the sign,
# treat blank/missing entries as 0 ("no recorded growth"), then convert.
# This is based on the assumption that just reloading the data with a float
# type would result in everything being a NaN.
growth_col = "growth_from_2000_to_2013"

# The column is overwritten in place, so only convert while it still holds the
# raw strings; without this guard, re-running the cell would call string
# methods on floats and fail (and could divide by 100 a second time).
if not pd.api.types.is_numeric_dtype(df[growth_col]):
    # pandas' vectorised string methods (the book's suggested approach) instead
    # of a Python-level `apply`; unlike the lambda they also let NaN through.
    growth_text = df[growth_col].str.strip("%").replace("", np.nan)
    # to_numeric raises on anything that is not a number, so malformed values
    # are not silently zeroed; only blank/missing entries become 0.
    growth_pct = pd.to_numeric(growth_text).fillna(0.0)
    # Keep pandas' default float64 rather than float32: no precision is lost
    # and downstream comparisons behave the same.
    df[growth_col] = growth_pct / 100

df[growth_col].describe()[["mean", "50%"]]

mean    0.22936
50%     0.09650
Name: growth_from_2000_to_2013, dtype: float64

In [None]:
# 2. How many cities had positive growth, and how many had negative growth?
# The book uses basically the same binning but has no "none" category,
# which IMO is a mistake since it misrepresents no growth as either
# positive or negative.
growth = df["growth_from_2000_to_2013"]
growth_labels = pd.cut(
    growth,
    bins=[-np.inf, 0, np.inf],
    labels=["negative", "positive"],
    include_lowest=True,
).cat.add_categories("none")
# The lower bin is closed on the right, so exactly-zero growth lands in
# "negative"; relabel those rows explicitly.
growth_labels[growth == 0] = "none"
df["growth_category"] = growth_labels

df["growth_category"].value_counts()

growth_category
positive    847
negative    142
none         11
Name: count, dtype: int64

In [63]:
# 3. Cities whose latitude is more than two standard deviations from the mean,
# listed from most to least populous.
latitude = df["latitude"]
lat_mean = latitude.mean()
lat_std = latitude.std()
is_outlier = (latitude > lat_mean + 2 * lat_std) | (latitude < lat_mean - 2 * lat_std)
(
    df.loc[is_outlier, ["city", "latitude", "growth_category", "population", "rank"]]
    .sort_values("population", ascending=False)
)

Unnamed: 0,city,latitude,growth_category,population,rank
43,Miami,25.76168,positive,417650,44
53,Honolulu,21.306944,negative,347884,54
62,Anchorage,61.218056,positive,300950,63
88,Hialeah,25.857596,positive,233394,89
130,Brownsville,25.901747,positive,181860,131
138,Fort Lauderdale,26.122439,positive,172389,139
145,Cape Coral,26.562854,positive,165831,146
149,Pembroke Pines,26.007765,positive,162329,150
173,Hollywood,26.011201,positive,146526,174
187,McAllen,26.203407,positive,136639,188
