In [6]:
import pandas as pd
import numpy as np
import time

In [5]:
# Load only the six columns used by the filtering benchmarks in this notebook.
# pandas ignores the order of `usecols` and always returns the selected columns
# in file order, so the positions are listed ascending to line up one-to-one
# with `names`.
df = pd.read_csv(
    "../data/nyc-parking-violations-2020.csv",
    header=0,  # row 0 holds the file's own headers, which `names` replaces
    usecols=[1, 2, 3, 7, 33, 37],
    names=["pid", "state", "ptype", "make", "color", "feet"],
)
df.head()

Unnamed: 0,pid,state,ptype,make,color,feet
0,J58JKX,NJ,PAS,HONDA,BK,0
1,KRE6058,PA,PAS,ME/BE,BLK,0
2,444326R,NJ,PAS,LEXUS,BLACK,0
3,F728330,OH,PAS,CHEVR,,0
4,FMY9090,NY,PAS,JEEP,GREY,0


In [10]:
# Find all cars with a registration state of New York (NY), New Jersey (NJ), or Connecticut (CT) using `.loc`.
# A single run is timed with `time.perf_counter`; the selected frame itself is discarded.
start_time = time.perf_counter()
# Alternative using `isin`, which is actually quite quick (0.6s):
# df.loc[df["state"].isin(("NY", "NJ", "CT"))]
df.loc[
    (df["state"] == "NY") | (df["state"] == "NJ") | (df["state"] == "CT")
]  # three separate comparisons OR-ed together: slow at 1.8s
end_time = time.perf_counter()
print(f".loc time {end_time - start_time}")

.loc time 1.8353916670312174


In [None]:
# Now run the same NY / NJ / CT selection with `.query`, timed the same way.
start_time = time.perf_counter()
# Alternative using `isin` inside the query string, quick at 0.7s:
# df.query("state.isin(('NY', 'NJ', 'CT'))")
df.query(
    "(state == 'NY') | (state == 'NJ') | (state == 'CT')"
)  # 0.9s, i.e. about twice as fast as the equivalent `.loc` selection (1.8s)
end_time = time.perf_counter()
print(f".query time {end_time - start_time}")

.query time 0.9077031249762513


Time each selection with the IPython `%timeit` magic, e.g. `%timeit myfunc(args)`.

1. Find cars from New York
2. Find cars from New York with passenger (`PAS`) plates
3. Find white cars from New York with passenger plates
4. Find white cars from New York with passenger plates that were parked more than 1 foot from the curb
5. Find white Toyota cars from New York with passenger plates that were parked more than 1 foot from the curb

Use `df.loc`, `df.query`, and `df.eval` and compare the differences.

In [25]:
# Task 1 - cars from New York.
# Each task compares three forms: a boolean mask passed to `.loc`, a `.query` string,
# and `.loc` fed with the mask produced by `.eval`.
%timeit df.loc[df["state"] == "NY"]
%timeit df.query("state == 'NY'")
%timeit df.loc[df.eval("state == 'NY'")]

820 ms ± 4.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
557 ms ± 1.43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
566 ms ± 20.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
# Task 2 - cars from New York with passenger (`PAS`) plates: `.loc` mask vs `.query` vs `.loc` + `.eval`.
%timeit df.loc[(df["state"] == "NY") & (df["ptype"] == "PAS")]
%timeit df.query("(state == 'NY') & (ptype == 'PAS')")
%timeit df.loc[df.eval("(state == 'NY') & (ptype == 'PAS')")]

1.21 s ± 1.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
666 ms ± 11.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
662 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
# Task 3 - white cars from New York with passenger plates: `.loc` mask vs `.query` vs `.loc` + `.eval`.
%timeit df.loc[(df["state"] == "NY") & (df["ptype"] == "PAS") & (df["color"] == "WHITE")]
%timeit df.query("(state == 'NY') & (ptype == 'PAS') & (color == 'WHITE')")
%timeit df.loc[df.eval("(state == 'NY') & (ptype == 'PAS') & (color == 'WHITE')")]

1.42 s ± 15.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
561 ms ± 944 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
561 ms ± 675 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [28]:
# Task 4 - white cars from New York with passenger plates parked more than 1 foot from the curb (`feet > 1`).
%timeit df.loc[(df["state"] == "NY") & (df["ptype"] == "PAS") & (df["color"] == "WHITE") & (df["feet"] > 1)]
%timeit df.query("(state == 'NY') & (ptype == 'PAS') & (color == 'WHITE') & (feet > 1)")
%timeit df.loc[df.eval("(state == 'NY') & (ptype == 'PAS') & (color == 'WHITE') & (feet > 1)")]

1.41 s ± 1.62 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
559 ms ± 968 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
559 ms ± 1.29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
# Task 5 - white Toyota (`make == 'TOYOT'`) cars from New York with passenger plates parked more than 1 foot from the curb.
%timeit df.loc[(df["state"] == "NY") & (df["ptype"] == "PAS") & (df["color"] == "WHITE") & (df["feet"] > 1) & (df["make"] == "TOYOT")]
%timeit df.query("(state == 'NY') & (ptype == 'PAS') & (color == 'WHITE') & (feet > 1) & (make == 'TOYOT')")
%timeit df.loc[df.eval("(state == 'NY') & (ptype == 'PAS') & (color == 'WHITE') & (feet > 1) & (make == 'TOYOT')")]

1.9 s ± 15.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
728 ms ± 10.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
724 ms ± 1.25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [30]:
# beyond 1 - `.query` also accepts the keywords 'and' / 'or' in place of '&' / '|' - does it make any difference to speed?
%timeit df.query("(state == 'NY') and (ptype == 'PAS') and (color == 'WHITE') and (feet > 1) and (make == 'TOYOT')")
# doesn't look like it: 713 ms here versus 728 ms for the '&' form of the same five-condition query

713 ms ± 2.73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [31]:
# beyond 2 - use a multiplication in the selection to find out which cars are more than 1m (3.28 feet) from the curb
%timeit df.loc[df["feet"] * 3.28 > 1]
%timeit df.query("feet * 3.28 > 1")
# looks like a slight cost increase when using .query including a calculation

43.4 ms ± 430 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
59.1 ms ± 112 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [32]:
# beyond 3 - use the calculation from beyond 2 and are in NY
%timeit df.loc[(df["feet"] * 3.28 > 1) & (df["state"] == "NY")]
%timeit df.query("(feet * 3.28 > 1) and (state == 'NY')")
# calculation hit is more than outweighed by the speed boost for multiple boolean series

511 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
253 ms ± 916 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
