# Tips and Tricks for Pandas

In [1]:
%load_ext Cython
import numpy as np
np.random.seed(0)
import pandas as pd

In [2]:
def make_df():
    return pd.DataFrame(
        np.random.rand(10_000, 3),
        columns=["A", "B", "C"]
    )

In [3]:
df = make_df()

In [4]:
print(df.head())

          A         B         C
0  0.548814  0.715189  0.602763
1  0.544883  0.423655  0.645894
2  0.437587  0.891773  0.963663
3  0.383442  0.791725  0.528895
4  0.568045  0.925597  0.071036


## Speed-Up Apply

Use-Case:
Replace the value in a column by 0.0 if it is less than 0.5

In [5]:
def slow_function(df):
    col = "A"
    for idx, row in df.iterrows():
        if row[col] < 0.5:
            row[col] = 0.0

In [6]:
%timeit slow_function(df)

print(df.head())

1.58 s ± 169 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
          A         B         C
0  0.548814  0.715189  0.602763
1  0.544883  0.423655  0.645894
2  0.000000  0.891773  0.963663
3  0.000000  0.791725  0.528895
4  0.568045  0.925597  0.071036


In [7]:
def faster_function(df):
    df["B"] = df["B"].apply(lambda x: 0.0 if x < 0.5 else x)

In [8]:
df = make_df()

%timeit faster_function(df)

print(df.head())

6.18 ms ± 853 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
          A         B         C
0  0.758125  0.503319  0.177017
1  0.832537  0.516825  0.926920
2  0.971807  0.675130  0.312100
3  0.575684  0.000000  0.269004
4  0.454180  0.000000  0.610589


In [9]:
def even_faster_function(df):
    df["C"] = np.where(df["C"] < 0.0, 0.5, df["C"])

In [10]:
df = make_df()

%timeit even_faster_function(df)

print(df.head())

680 µs ± 43.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
          A         B         C
0  0.927955  0.902937  0.427617
1  0.510806  0.583200  0.952330
2  0.280789  0.795345  0.975140
3  0.463157  0.712459  0.102832
4  0.387149  0.448450  0.360397


In [25]:
%%cython
cimport cython
cimport numpy as np
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef inner(np.ndarray[double, ndim=1] array):
    for i in range(array.shape[0]):
        if array[i] < 0.5:
            array[i] = 0.0

def cython_apply(df):
    inner(df["A"].values)

In [12]:
df = make_df()

%timeit cython_apply(df)

print(df.head())

55.2 µs ± 1.72 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
          A         B         C
0  0.000000  0.058548  0.436811
1  0.720297  0.824412  0.447509
2  0.000000  0.174104  0.852748
3  0.526098  0.488417  0.380276
4  0.000000  0.799242  0.955156


In [26]:
import numba

@numba.jit
def inner(array):
    for i in range(array.shape[0]):
        if array[i] < 0.5:
            array[i] = 0.0

def numba_apply(df):
    inner(df["A"].values)

In [27]:
df = make_df()

%timeit numba_apply(df)

print(df.head())

55.4 µs ± 7.07 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
          A         B         C
0  0.765605  0.050536  0.997545
1  0.000000  0.209466  0.733057
2  0.000000  0.554136  0.299882
3  0.000000  0.556513  0.967307
4  0.592292  0.325236  0.201376


| Function 	| Time 	| Speed-Up 	|
| ---	|-	|-	|
|  slow_function	| 1.58 * 10^6 &nbsp &nbsp	| - 	|
| faster_function 	| 6.18 * 10^3 	|  255x	|
| even_faster_function  &nbsp  	|  680 &nbsp &nbsp	|  2,323x	|
| cython_function 	|  55	| 28,727x 	|
| numba_function 	| 55 	|  28,727x	|
| c++ 	| 30 	|  52,666x	|

# Speed-Up Transform

In [15]:
def slow_function(df):
    col = "A"
    for idx, row in df.iterrows():
        row[col] = row[col] + 1.0

In [16]:
df = make_df()

%timeit slow_function(df)

print(df.head())

600 ms ± 107 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
          A         B         C
0  8.622652  0.824218  0.284283
1  8.878396  0.641164  0.888156
2  8.761528  0.375030  0.921856
3  8.291822  0.412477  0.464101
4  8.251662  0.813229  0.124318


In [17]:
def faster_function(df):
    df["B"] = df["B"].transform(lambda x: x + 1.0)

In [18]:
df = make_df()

%timeit faster_function(df)

print(df.head())

4.6 ms ± 708 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
          A           B         C
0  0.966403  811.974602  0.130418
1  0.510132  811.113840  0.696690
2  0.479423  811.080034  0.057539
3  0.953299  811.174636  0.619358
4  0.736562  811.642363  0.777584


In [19]:
def even_faster_function(df):
    df["C"] = df["C"] + 1.0

In [20]:
df = make_df()

%timeit even_faster_function(df)

print(df.head())

525 µs ± 102 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
          A         B            C
0  0.290573  0.768656  8111.050974
1  0.679872  0.222246  8111.274793
2  0.677915  0.922029  8111.178734
3  0.941326  0.893525  8111.654297
4  0.451397  0.355714  8111.105789


In [21]:
%%cython
cimport cython
cimport numpy as np
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef inner(np.ndarray[double, ndim=1] array):
    for i in range(array.shape[0]):
        array[i] += 1.0

def cython_transform(df):
    inner(df["A"].values)

In [22]:
df = make_df()

%timeit cython_transform(df)

print(df.head())

31 µs ± 3.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
              A         B         C
0  81111.515565  0.319024  0.048873
1  81111.325628  0.495717  0.687538
2  81111.925181  0.278398  0.565618
3  81111.674131  0.876853  0.916673
4  81111.225720  0.908423  0.656138


In [23]:
import numba

@numba.jit
def inner(array):
    for i in range(array.shape[0]):
        array[i] += 1.0

def numba_transform(df):
    inner(df["A"].values)

In [24]:
df = make_df()

%timeit numba_transform(df)

print(df.head())

31.3 µs ± 5.13 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
              A         B         C
0  81111.205671  0.947660  0.097525
1  81111.719924  0.841147  0.863829
2  81111.439500  0.124873  0.657900
3  81111.050730  0.564794  0.142302
4  81111.403096  0.118479  0.425367


| Function 	| Time 	| Speed-Up 	|
| ---	|-	|-	|
|  slow_function	| 600 * 10^3 &nbsp &nbsp	| - 	|
| faster_function 	| 4.6 * 10^3 	|  130x	|
| even_faster_function  &nbsp  	|  525 &nbsp &nbsp	|  1,142x	|
| cython_function 	|  31	|  19,354x	|
| numba_function 	| 31 	| 19,354x 	|
| c++ 	| 2	| 300,000x 	|