# Tips and Tricks for Pandas

In [228]:
import numpy as np
np.random.seed(0)
import pandas as pd

In [229]:
def series_info(series: pd.Series) -> None:
    print(f"ndim: {series.ndim}")
    print(f"shape: {series.shape}")
    print(f"size: {series.size}")
    print(f"dtype: {series.dtype}")
    print(f"values:\n{series}\n")

In [230]:
def df_info(df: pd.DataFrame) -> None:
    print(f"ndim: {df.ndim}")
    print(f"shape: {df.shape}")
    print(f"size: {df.size}")
    print(f"dtype: {df.dtypes}")
    print(f"values:\n{df}\n")

In [231]:
df = pd.DataFrame(
    np.random.rand(10_000, 3),
    columns=["A", "B", "C"]
)

In [232]:
df_info(df)

ndim: 2
shape: (10000, 3)
size: 30000
dtype: A    float64
B    float64
C    float64
dtype: object
values:
             A         B         C
0     0.548814  0.715189  0.602763
1     0.544883  0.423655  0.645894
2     0.437587  0.891773  0.963663
3     0.383442  0.791725  0.528895
4     0.568045  0.925597  0.071036
...        ...       ...       ...
9995  0.744256  0.833002  0.177207
9996  0.463218  0.581561  0.390984
9997  0.763993  0.886611  0.535190
9998  0.820328  0.622359  0.531140
9999  0.852904  0.734230  0.638997

[10000 rows x 3 columns]



## Speed-Up Apply

Use-Case:
Replace the value in a column by 0.0 if it is less than 0.5

In [233]:
def slow_function(df):
    col = "A"
    for idx, row in df.iterrows():
        if row[col] < 0.5:
            row[col] = 0.0

In [234]:
%timeit slow_function(df)

print(df.head())

606 ms ± 66.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
          A         B         C
0  0.548814  0.715189  0.602763
1  0.544883  0.423655  0.645894
2  0.000000  0.891773  0.963663
3  0.000000  0.791725  0.528895
4  0.568045  0.925597  0.071036


In [235]:
def faster_function(df):
    df["B"] = df["B"].apply(lambda x: 0.0 if x < 0.5 else x)

In [236]:
%timeit faster_function(df)

print(df.head())

4.22 ms ± 1.17 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
          A         B         C
0  0.548814  0.715189  0.602763
1  0.544883  0.000000  0.645894
2  0.000000  0.891773  0.963663
3  0.000000  0.791725  0.528895
4  0.568045  0.925597  0.071036


In [237]:
print(f"Speedup by {int(585 / 3.14)}x")

Speedup by 186x


In [238]:
def even_faster_function(df):
    df["C"] = np.where(df["C"] < 0.0, 0.5, df["C"])

In [239]:
%timeit even_faster_function(df)

print(df.head())

609 µs ± 21 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
          A         B         C
0  0.548814  0.715189  0.602763
1  0.544883  0.000000  0.645894
2  0.000000  0.891773  0.963663
3  0.000000  0.791725  0.528895
4  0.568045  0.925597  0.071036


In [240]:
print(f"Speedup by {int(585 * 1000.0 / 53.1)}x")

Speedup by 11016x


# Speed-Up Transform

In [241]:
def slow_function(df):
    col = "A"
    for idx, row in df.iterrows():
        row[col] = row[col] + 1.0

In [242]:
%timeit slow_function(df)

1.01 s ± 54.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [243]:
def faster_function(df):
    df["B"] = df["B"].transform(lambda x: x + 1.0)

In [244]:
%timeit faster_function(df)

5.47 ms ± 295 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [245]:
def even_faster_function(df):
    df["C"] = df["C"] + 1.0

In [246]:
%timeit even_faster_function(df)

233 µs ± 23.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [256]:
%load_ext Cython

In [273]:
%%cython
cimport numpy as np

cpdef np.ndarray[double] f(np.ndarray vals):
    return vals + 1.0

def function(df):
    df["C"] = f(df["C"].values)

In [274]:
%timeit function(df)

169 µs ± 50.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [280]:
import numba

@numba.vectorize
def f(x):
    return x + 1.0

def function(df):
    df["C"] = f(df["C"].values)

TypeError: The decorated object is not a function (got type <class 'numba.np.ufunc.dufunc.DUFunc'>).

In [279]:
%timeit function(df)

246 µs ± 71.6 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Speed-Up Filter

In [247]:
def function(df):
    boolean_idxs = (df["A"] >= 0.5)
    df = df[boolean_idxs]

In [248]:
%timeit function(df)

850 µs ± 36.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [249]:
def faster_function(df):
    boolean_idxs = (df["A"].values >= 0.5)
    df = df.loc[boolean_idxs]

In [250]:
%timeit faster_function(df)

562 µs ± 25.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [253]:
def even_faster_function(df):
    boolean_idxs = np.where(df["A"].values >= 0.5)
    df = df.iloc[boolean_idxs]

In [254]:
%timeit even_faster_function(df)

595 µs ± 40.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
