In [1]:
# High-Performance Pandas: eval and query
import numpy as np

# Seeded generator, so every fresh run of the notebook draws the same samples.
rng = np.random.default_rng(seed=42)
# Two vectors of one million random floats each; x is drawn first, then y.
x, y = (rng.random(size=1_000_000) for _ in range(2))
# Baseline timing: vectorized NumPy addition of the two arrays.
%timeit x + y

3.22 ms ± 21.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [2]:
# The same element-wise sum, but produced one pair at a time by a Python
# generator and collected into an array -- for comparison with the
# vectorized `x + y` above.
np.fromiter(
    (a + b for a, b in zip(x, y)),
    dtype=x.dtype,
    count=len(x),
)

array([1.47276651, 0.90040381, 0.87388613, ..., 0.77123543, 1.01142688,
       1.13417028])

In [3]:
# Compound boolean mask: True where x is above 0.5 and y is below 0.5.
mask = (x > 0.5) & (y < 0.5)

In [5]:
# The same mask spelled out step by step: each comparison is stored in its
# own intermediate boolean array before the two are combined.
tmp1 = x > 0.5
tmp2 = y < 0.5
mask = tmp1 & tmp2
mask

array([False, False,  True, ...,  True,  True,  True])

In [6]:
import numexpr

# Evaluate the compound expression from a string with numexpr, then confirm
# that it matches the NumPy-computed mask element for element.
mask_numexpr = numexpr.evaluate("(x > 0.5) & (y < 0.5)")
(mask == mask_numexpr).all()

True

In [7]:
# pandas.eval for Efficient Operations
import pandas as pd

nrows = 100_000
ncols = 100
# Four equally shaped frames of random floats, drawn one after another from rng.
df1, df2, df3, df4 = [
    pd.DataFrame(rng.random((nrows, ncols))) for _ in range(4)
]

# Plain pandas arithmetic: element-wise sum of the four frames.
df1 + df2 + df3 + df4

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.894619,2.350323,2.478404,2.882839,1.343365,1.786434,2.018079,1.987882,1.735450,2.465074,...,1.421749,2.467428,1.879926,2.541935,1.944373,1.107981,1.164714,1.218824,1.826986,1.768287
1,2.639100,2.094697,1.557960,1.780941,1.830018,2.329232,1.231432,1.396454,2.155633,0.942625,...,1.795405,2.723724,1.930859,2.627139,2.206071,1.400315,1.309764,1.844379,1.097728,1.967507
2,1.721849,1.783487,1.823500,1.815596,1.705560,1.581237,2.064727,2.455018,1.240429,2.224491,...,1.706844,2.797325,2.188354,2.324506,1.371082,2.374894,1.794025,1.752218,2.953386,1.836734
3,1.625072,2.173571,0.987042,3.065283,2.585640,1.556933,2.184181,1.968105,2.251560,1.903023,...,2.107717,2.990492,1.863779,2.628333,1.784315,2.597918,2.073985,2.419915,2.845571,1.463960
4,2.166188,2.905083,1.881283,2.200816,3.151786,1.163403,1.296112,1.081093,1.721385,1.290778,...,2.309246,2.270861,2.552256,3.011268,2.576969,2.579716,2.280400,1.183832,2.946427,2.056846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2.469060,1.944152,1.464401,1.554039,2.052452,1.719286,2.299570,1.783415,1.694976,2.036992,...,2.743753,2.413405,1.769985,2.077083,1.421381,1.672501,2.164880,2.417685,2.394320,1.325424
99996,1.613260,2.357866,2.600431,2.142335,3.100055,2.423109,1.782654,3.142578,2.317951,2.182345,...,2.260197,2.369591,1.531543,2.091696,1.110596,2.325127,2.961379,2.178663,2.988712,2.006969
99997,2.710291,1.639400,0.845316,2.859595,1.363385,1.401434,1.242514,2.843363,1.919517,1.235176,...,3.233900,2.189463,2.025656,1.436872,1.593415,1.912365,2.208321,3.380556,1.891715,2.542389
99998,1.888056,2.454579,1.248855,0.990096,2.601651,0.876812,1.730572,2.463358,1.834659,0.949414,...,1.526910,1.147391,1.169447,1.725887,1.964907,1.667066,2.265025,2.822579,2.700696,2.228108


In [8]:
# The same four-frame sum, handed to pd.eval as a string expression.
pd.eval("df1 + df2 + df3 + df4")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.894619,2.350323,2.478404,2.882839,1.343365,1.786434,2.018079,1.987882,1.735450,2.465074,...,1.421749,2.467428,1.879926,2.541935,1.944373,1.107981,1.164714,1.218824,1.826986,1.768287
1,2.639100,2.094697,1.557960,1.780941,1.830018,2.329232,1.231432,1.396454,2.155633,0.942625,...,1.795405,2.723724,1.930859,2.627139,2.206071,1.400315,1.309764,1.844379,1.097728,1.967507
2,1.721849,1.783487,1.823500,1.815596,1.705560,1.581237,2.064727,2.455018,1.240429,2.224491,...,1.706844,2.797325,2.188354,2.324506,1.371082,2.374894,1.794025,1.752218,2.953386,1.836734
3,1.625072,2.173571,0.987042,3.065283,2.585640,1.556933,2.184181,1.968105,2.251560,1.903023,...,2.107717,2.990492,1.863779,2.628333,1.784315,2.597918,2.073985,2.419915,2.845571,1.463960
4,2.166188,2.905083,1.881283,2.200816,3.151786,1.163403,1.296112,1.081093,1.721385,1.290778,...,2.309246,2.270861,2.552256,3.011268,2.576969,2.579716,2.280400,1.183832,2.946427,2.056846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2.469060,1.944152,1.464401,1.554039,2.052452,1.719286,2.299570,1.783415,1.694976,2.036992,...,2.743753,2.413405,1.769985,2.077083,1.421381,1.672501,2.164880,2.417685,2.394320,1.325424
99996,1.613260,2.357866,2.600431,2.142335,3.100055,2.423109,1.782654,3.142578,2.317951,2.182345,...,2.260197,2.369591,1.531543,2.091696,1.110596,2.325127,2.961379,2.178663,2.988712,2.006969
99997,2.710291,1.639400,0.845316,2.859595,1.363385,1.401434,1.242514,2.843363,1.919517,1.235176,...,3.233900,2.189463,2.025656,1.436872,1.593415,1.912365,2.208321,3.380556,1.891715,2.542389
99998,1.888056,2.454579,1.248855,0.990096,2.601651,0.876812,1.730572,2.463358,1.834659,0.949414,...,1.526910,1.147391,1.169447,1.725887,1.964907,1.667066,2.265025,2.822579,2.700696,2.228108


In [9]:
# By the way, we can check that both ways of computing the sum give the same result.
np.allclose(df1 + df2 + df3 + df4, pd.eval("df1 + df2 + df3 + df4"))

True

In [10]:
# Rebind df1..df4 (and add df5) to small 100x3 frames of random integers
# drawn from 0 up to, but not including, 1000.
df1, df2, df3, df4, df5 = [
    pd.DataFrame(rng.integers(low=0, high=1_000, size=(100, 3)))
    for _ in range(5)
]

In [11]:
# pd.eval supports arithmetic operators, comparisons (e.g. <=, !=),
# bitwise operators and boolean operators. Arithmetic example: the plain
# pandas expression and the eval string below are meant to match.
result1 = -df1 * df2 / (df3+df4) - df5 
result2 = pd.eval("-df1 * df2 / (df3 + df4) - df5")
np.allclose(result1, result2)

True

In [12]:
# Attribute access (.T, .iloc) and indexing can also appear inside the
# expression string; compare against the same expression in plain pandas.
result1 = df2.T[0] + df3.iloc[1]
result2 = pd.eval("df2.T[0] + df3.iloc[1]")
np.allclose(result1, result2)

True

In [13]:
# DataFrame.eval for Column-Wise Operations
# A 1000-row frame of random floats with the named columns A, B and C.
df = pd.DataFrame(rng.random((1000, 3)), columns=list("ABC"))
df.head()

Unnamed: 0,A,B,C
0,0.850888,0.966709,0.95869
1,0.820126,0.385686,0.061402
2,0.059729,0.831768,0.652259
3,0.244774,0.140322,0.041711
4,0.818205,0.753384,0.578851


In [14]:
# With the top-level pd.eval, columns are reached through the frame's name
# (df.A, df.B, df.C); result1 is the plain-pandas reference value.
result1 = (df["A"] + df["B"]) / (df["C"] - 1)
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
np.allclose(result1, result2)

True

In [15]:
# DataFrame.eval evaluates the string against df itself, so the columns can
# be written as bare names without the "df." prefix.
result3 = df.eval("(A + B) / (C - 1)")
np.allclose(result1, result3)

True

In [17]:
# Assignment in DataFrame.eval
# Show the frame as it stands before a new column is assigned.
df.head()

Unnamed: 0,A,B,C
0,0.850888,0.966709,0.95869
1,0.820126,0.385686,0.061402
2,0.059729,0.831768,0.652259
3,0.244774,0.140322,0.041711
4,0.818205,0.753384,0.578851


In [18]:
# Create column D from A, B and C. inplace=True makes eval modify df itself
# rather than return a new frame.
df.eval("D = (A + B) / C", inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.850888,0.966709,0.95869,1.895916
1,0.820126,0.385686,0.061402,19.638139
2,0.059729,0.831768,0.652259,1.366782
3,0.244774,0.140322,0.041711,9.23237
4,0.818205,0.753384,0.578851,2.715013


In [19]:
# In the same way, any existing column can be modified:
# here D is overwritten with a different expression.
df.eval("D = (A - B) / C", inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.850888,0.966709,0.95869,-0.120812
1,0.820126,0.385686,0.061402,7.075399
2,0.059729,0.831768,0.652259,-1.183638
3,0.244774,0.140322,0.041711,2.504142
4,0.818205,0.753384,0.578851,0.111982


In [20]:
# Local Variables in DataFrame.eval
# The "@" prefix makes the expression read a Python variable instead of a column.
# NOTE: despite its name, column_mean holds one mean per row (axis=1, i.e.
# taken across the columns of df).
column_mean = df.mean(axis=1)
result1 = df["A"] + column_mean
result2 = df.eval("A + @column_mean")
np.allclose(result1, result2)

True

In [21]:
# The DataFrame.query method:
# First the two filtering spellings it replaces: a boolean mask in plain
# pandas, and the same mask inside a pd.eval string.
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval("df[(df.A < 0.5) & (df.B < 0.5)]")
# Compare with DataFrame.equals rather than np.allclose: allclose only looks
# at the values (and broadcasts shapes), so it cannot confirm that the same
# rows were selected; equals also checks shape and labels.
result1.equals(result2)

True

In [22]:
# The same row filter expressed with DataFrame.query.
result2 = df.query("A < 0.5 and B < 0.5")
# DataFrame.equals checks shape, labels and values, so it confirms that
# query selected exactly the rows held in result1 (np.allclose would only
# compare values and broadcast shapes).
result1.equals(result2)

True

In [23]:
# query also accepts "@name" to refer to a Python variable: keep the rows
# where both A and B are below the mean of column C.
Cmean = df["C"].mean()
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
result2 = df.query("A < @Cmean and B < @Cmean")
# DataFrame.equals checks shape, labels and values, so it confirms that both
# spellings selected exactly the same rows (np.allclose would only compare
# values and broadcast shapes).
result1.equals(result2)

True