In [1]:
import pandas as pd
import numpy as np

In [4]:
names = ["date_time", "max_temp", "min_temp"]
data_files = [
    "san+francisco,ca.csv",
    "new+york,ny.csv",
    "springfield,ma.csv",
    "boston,ma.csv",
    "springfield,il.csv",
    "albany,ny.csv",
    "los+angeles,ca.csv",
    "chicago,il.csv",
]
df = pd.concat(
    [
        pd.read_csv(
            f"../data/{f}",
            usecols=[0, 1, 2, 19],
            names=["date_time", "max_temp", "min_temp", "precipMM"],
            skiprows=1,
        ).assign(state=f[-6:-4], city=f.split(",")[0].replace("+", " "))
        for f in data_files
    ],
    ignore_index=True,
)
df.describe()


Unnamed: 0,max_temp,min_temp,precipMM
count,5824.0,5824.0,5824.0
mean,4.989011,-0.903846,0.501082
std,7.67643,7.98309,1.898486
min,-25.0,-28.0,0.0
25%,0.0,-6.0,0.0
50%,4.0,-2.0,0.0
75%,11.0,6.0,0.1
max,23.0,17.0,30.8


1. Which cities had, on at least three occasions, precipitation of 15mm or more.
2. Find cities that had at least three measurements of 10mm of precipitation or more when the temperature was at or below 0°C.
3. For each precipitation measurement, calculate the proportion of that city's total precipitation.
4. For each city, determine the greatest proportion of that city's total precipitation to fall in a given period.

In [26]:
# create a new column for proportion of total precipitation for each city
df["precipProp"] = df.groupby("city")["precipMM"].transform(lambda x: x / x.sum())
# SO - when you use transform if you treat the arg like a scalar it will be broadcast (treated like a single value)
# and if you treat it like a series or dataframe you can call aggregate functions on it - neat!

# let's look at some non-zero values to eyeball and verify
df.loc[df["precipProp"] > 0].head()

Unnamed: 0,date_time,max_temp,min_temp,precipMM,state,city,precipProp
30,2018-12-14 18:00:00,15,10,0.8,ca,san francisco,0.001907
34,2018-12-15 06:00:00,15,13,0.4,ca,san francisco,0.000954
46,2018-12-16 18:00:00,15,13,4.0,ca,san francisco,0.009537
48,2018-12-17 00:00:00,15,11,0.3,ca,san francisco,0.000715
81,2018-12-21 03:00:00,13,10,0.8,ca,san francisco,0.001907


In [None]:
# which cities had at least three precipitations of 15mm or more
def filter_rainfall(df, amount, number, temp=None):
    if temp is None:
        return df.loc[df["precipMM"] >= amount, "precipMM"].count() >= number
    else:
        return (
            df.loc[
                (df["precipMM"] >= amount) & (df["min_temp"] <= temp), "precipMM"
            ].count()
            >= number
        )


# filter can be passed extra kwargs to plug into the filter function
df.groupby(["city", "state"]).filter(filter_rainfall, amount=15, number=3)[
    ["city", "state"]
].drop_duplicates()

Unnamed: 0,city,state
728,new york,ny
2184,boston,ma
4368,los angeles,ca


In [None]:
# cities with three measurements of 10mm or more with 0° or lower temperatures
df.groupby(["city", "state"]).filter(filter_rainfall, amount=10, number=3, temp=0)[
    ["city", "state"]
].drop_duplicates()

Unnamed: 0,city,state
728,new york,ny
2184,boston,ma
3640,albany,ny


In [27]:
# determine each city's greatest total proportion of rainfall in the given period
df.groupby(["city", "state"])["precipProp"].max()

city           state
albany         ny       0.029228
boston         ma       0.048302
chicago        il       0.057257
los angeles    ca       0.059242
new york       ny       0.055149
san francisco  ca       0.056509
springfield    il       0.030977
               ma       0.023459
Name: precipProp, dtype: float64

# Extension questions
1. Use a lambda to implement the arg-free version of the filter rainfall function
2. Use a lambda to implement the two arg version of the filter rainfall function
3. Use the transform with a lambda

In [None]:
# 1. Use a lambda for filtering rainfall and count
df.groupby(["city", "state"]).filter(
    lambda x: x.loc[x["precipMM"] >= 15, "precipMM"].count() >= 3
)[["city", "state"]].drop_duplicates()

Unnamed: 0,city,state
728,new york,ny
2184,boston,ma
4368,los angeles,ca


In [34]:
# 2. Use a lambda for the filtering rainfall of 10mm 3 times with 0° or below with lambda
df.groupby(["city", "state"]).filter(
    lambda x: x.loc[(x["precipMM"] >= 10) & (x["min_temp"] <= 0), "precipMM"].count()
    >= 3
)[["city", "state"]].drop_duplicates()

Unnamed: 0,city,state
728,new york,ny
2184,boston,ma
3640,albany,ny


In [None]:
# book version of the first lambda approach
# honestly I don't see the point of using the kwargs version of the `filter` call here
# the whole point of kwargs is to make the function reusable but here we're using a
# lambda which is generally treated as a single use function that isn't going to be
# reused
(
    df.groupby(["city", "state"])
    .filter(
        lambda df_, min_mm, times: df_.loc[df_["precipMM"] > min_mm, "precipMM"].count()
        >= times,
        min_mm=15,
        times=3,
    )[["city", "state"]]
    .drop_duplicates()
)

Unnamed: 0,city,state
728,new york,ny
2184,boston,ma
4368,los angeles,ca
