In [28]:
import pandas as pd
import numpy as np

In [29]:
# Column names applied to the three CSV columns that are kept.
names = ["date_time", "max_temp", "min_temp"]
# One CSV per city; each filename has the form "<city, + for spaces>,<state>.csv".
data_files = [
    "san+francisco,ca.csv",
    "new+york,ny.csv",
    "springfield,ma.csv",
    "boston,ma.csv",
    "springfield,il.csv",
    "albany,ny.csv",
    "los+angeles,ca.csv",
    "chicago,il.csv",
]
# Read the first three columns of every file, tag each row with the state and
# city parsed from its filename, and stack everything into one frame.
df = pd.concat(
    [
        pd.read_csv(
            f"../data/{f}",
            usecols=[0, 1, 2],
            names=names,  # reuse the list above instead of repeating the literal
            skiprows=1,  # skip the file's first line (presumably its own header row)
        ).assign(
            state=f[-6:-4],  # the two characters just before ".csv"
            city=f.split(",")[0].replace("+", " "),  # text before the comma, "+" -> space
        )
        for f in data_files
    ],
    ignore_index=True,
)
df.describe()

Unnamed: 0,max_temp,min_temp
count,5824.0,5824.0
mean,4.989011,-0.903846
std,7.67643,7.98309
min,-25.0,-28.0
25%,0.0,-6.0
50%,4.0,-2.0
75%,11.0,6.0
max,23.0,17.0


1. Does the data for each city start and end at roughly the same time? How do you know?
2. What is the lowest minimum temperature recorded for each city in the dataset?
3. What is the highest maximum temperature recorded in each *state* in the dataset?

In [30]:
# Peek at the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,date_time,max_temp,min_temp,state,city
0,2018-12-11 00:00:00,13,8,ca,san francisco
1,2018-12-11 03:00:00,13,8,ca,san francisco
2,2018-12-11 06:00:00,13,8,ca,san francisco
3,2018-12-11 09:00:00,13,8,ca,san francisco
4,2018-12-11 12:00:00,13,8,ca,san francisco


In [31]:
# Question 1, possible approaches:
#  - derive a separate date field and compare each city's first and last dates
#  - or group by date and look at the first and last counts
# date_time is a "<date> <time>" string, so a whitespace split yields both pieces.
date_time_parts = df["date_time"].str.split()
df["date"] = date_time_parts.str.get(0)
df["time"] = date_time_parts.str.get(1)
df.head()

Unnamed: 0,date_time,max_temp,min_temp,state,city,date,time
0,2018-12-11 00:00:00,13,8,ca,san francisco,2018-12-11,00:00:00
1,2018-12-11 03:00:00,13,8,ca,san francisco,2018-12-11,03:00:00
2,2018-12-11 06:00:00,13,8,ca,san francisco,2018-12-11,06:00:00
3,2018-12-11 09:00:00,13,8,ca,san francisco,2018-12-11,09:00:00
4,2018-12-11 12:00:00,13,8,ca,san francisco,2018-12-11,12:00:00


In [None]:
# Question 1: does every city have a reading at both the earliest and the latest
# timestamp in the dataset?
# DataFrame.count() returns one entry per column, so len(frame.count()) is just
# the number of columns and comparing two of those is True for any data. Count
# the distinct (city, state) pairs among the matching rows instead.
# date_time holds "YYYY-MM-DD HH:MM:SS" strings, so string min/max is chronological.
location_cols = ["city", "state"]
n_locations = len(df[location_cols].drop_duplicates())
n_at_start = len(
    df.query("date_time == date_time.min()")[location_cols].drop_duplicates()
)
n_at_end = len(
    df.query("date_time == date_time.max()")[location_cols].drop_duplicates()
)
n_at_start == n_at_end == n_locations
# True means every city/state pair has a row at both the first and the last timestamp

True

In [None]:
# The book's approach: earliest timestamp per (city, state).
# Swap "min" for "max" to get the latest timestamp.
df.groupby(["city", "state"])["date_time"].agg("min")

city           state
albany         ny       2018-12-11 00:00:00
boston         ma       2018-12-11 00:00:00
chicago        il       2018-12-11 00:00:00
los angeles    ca       2018-12-11 00:00:00
new york       ny       2018-12-11 00:00:00
san francisco  ca       2018-12-11 00:00:00
springfield    il       2018-12-11 00:00:00
               ma       2018-12-11 00:00:00
Name: date_time, dtype: object

In [41]:
# Question 2: lowest min_temp recorded for each (city, state) pair
df.groupby(["city", "state"])["min_temp"].agg("min")

city           state
albany         ny      -19
boston         ma      -14
chicago        il      -28
los angeles    ca        4
new york       ny      -14
san francisco  ca        3
springfield    il      -25
               ma      -20
Name: min_temp, dtype: int64

In [40]:
# Question 3: highest max_temp recorded in each state
df.groupby("state")["max_temp"].agg("max")

state
ca    23
il    16
ma    17
ny    15
Name: max_temp, dtype: int64

# Extension questions
1. Run describe on each of the min and max temperature results from state and city queries.
2. Running describe works, but only shows the first and last few rows from each result. Using `pd.set_option` to change the value of `display_max_rows` makes it possible to see all the results. Reset the option to 10 rows.
3. What is the average distance in temperature (max - min) for each of the cities in the dataset?

In [None]:
# Book's answer: run describe() on the temperature columns of each (state, city) group.
# Until now describe had only been called directly on a Series or DataFrame, so
# routing it through groupby(...).apply was not something I would have found myself.
df.groupby(["state", "city"])[["min_temp", "max_temp"]].apply(
    lambda grp: grp.describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,min_temp,max_temp
state,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ca,los angeles,count,728.000000,728.000000
ca,los angeles,mean,10.637363,17.054945
ca,los angeles,std,2.705200,2.708640
ca,los angeles,min,4.000000,12.000000
ca,los angeles,25%,9.000000,15.000000
...,...,...,...,...
ny,new york,min,-14.000000,-12.000000
ny,new york,25%,-4.000000,2.000000
ny,new york,50%,0.000000,4.000000
ny,new york,75%,2.000000,7.000000


In [None]:
# The exercise says "use pd.set_option" and names `display_max_rows`; the key used
# here is `display.max_rows`, and nothing in the prompt explains how to get from
# one to the other.
pd.set_option("display.max_rows", 1000)
df.groupby(["state", "city"])[["min_temp", "max_temp"]].apply(
    lambda grp: grp.describe()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,min_temp,max_temp
state,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ca,los angeles,count,728.0,728.0
ca,los angeles,mean,10.637363,17.054945
ca,los angeles,std,2.7052,2.70864
ca,los angeles,min,4.0,12.0
ca,los angeles,25%,9.0,15.0
ca,los angeles,50%,11.0,16.0
ca,los angeles,75%,12.0,19.0
ca,los angeles,max,17.0,23.0
ca,san francisco,count,728.0,728.0
ca,san francisco,mean,8.252747,12.604396


In [48]:
# Put the row-display limit back to 10, as the exercise asks.
pd.options.display.max_rows = 10

In [None]:
# 3. overall temperature range per city: highest max_temp minus lowest min_temp
by_city = df.groupby(["state", "city"])
by_city["max_temp"].max() - by_city["min_temp"].min()

state  city         
ca     los angeles      19
       san francisco    12
il     chicago          37
       springfield      41
ma     boston           31
       springfield      35
ny     albany           32
       new york         29
dtype: int64

In [56]:
# Step 1 of "mean of the ranges": per-row spread between max_temp and min_temp
df["range"] = df["max_temp"].sub(df["min_temp"])

In [57]:
# Average of the per-row range for each (city, state) pair
df.groupby(["city", "state"])["range"].agg("mean")

city           state
albany         ny       6.318681
boston         ma       6.010989
chicago        il       4.340659
los angeles    ca       6.417582
new york       ny       5.263736
san francisco  ca       4.351648
springfield    il       6.934066
               ma       7.505495
Name: range, dtype: float64

# Extension 3
What a horribly explained problem.

What is actually being calculated is:
- For each city, take the range (max − min) of the minimum temperatures
- Take the range (max − min) of the maximum temperatures
- Take the mean of those two numbers

This sounds like a method that could show some measure of variability in a climate, but I also can't find any sources where it might be used.

In [None]:
# Book version: per (state, city), take max - min of each temperature column,
# then average the two resulting ranges.
df.groupby(["state", "city"])[["min_temp", "max_temp"]].apply(
    lambda grp: (grp.max() - grp.min()).mean()
)


state  city         
ca     los angeles      12.0
       san francisco     8.0
il     chicago          34.0
       springfield      35.5
ma     boston           26.0
       springfield      28.5
ny     albany           26.5
       new york         26.5
dtype: float64