reference : https://inneka.com/programming/python/performance-of-pandas-apply-vs-np-vectorize-to-create-new-column-from-existing-columns/

## 단순 연산에 대한 처리 최적화

In [1]:
import numpy as np
import pandas as pd

# Fix the global NumPy seed so both columns are reproducible across runs.
np.random.seed(0)
N = 1_000_000  # number of rows in the benchmark frame

# Two columns of random integers drawn with a lower bound of 1,
# so column B never contains 0.
A_list = np.random.randint(low=1, high=100, size=N)
B_list = np.random.randint(low=1, high=100, size=N)
df = pd.DataFrame(dict(A=A_list, B=B_list))

# custom function
def divide(a, b):
    """Return ``a / b``, or ``0.0`` when ``b`` compares equal to zero.

    Scalar helper: it handles one pair of values per call.
    """
    return 0.0 if b == 0 else a / b

In [2]:
# True vectorization: a single np.where call over whole arrays.
# Note: the quotient array (third argument) is computed in full before
# np.where selects from it.
%timeit np.where(df['B'].values==0, 0.0, df['A'].values/df['B'].values)
# Operands are taken via `.values`, so the computation runs on ndarrays.

7 ms ± 55.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [3]:
# Same np.where expression, applied to the pandas Series directly (no `.values`).
%timeit np.where(df['B']==0, 0.0, df['A']/df['B'])

7.31 ms ± 17.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [4]:
# np.vectorize -- also called "fake vectorization": it wraps the scalar
# `divide` function so it can be called on the two columns.
%timeit np.vectorize(divide)(df['A'], df['B'])                               

183 ms ± 270 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
# map: call `divide` on paired elements of the two columns and collect
# the results into a list.
%timeit list(map(divide, df['A'], df['B']))                                   

278 ms ± 4.03 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# zip: list comprehension over (a, b) pairs zipped from the two columns.
%timeit [divide(a, b) for a, b in zip(df['A'], df['B'])]                   

327 ms ± 8.43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# itertuples: iterate rows as (A, B) tuples, with the index excluded.
%timeit [divide(a, b) for a, b in df[['A', 'B']].itertuples(index=False)]     
# Merely switching from iterrows to itertuples gives a huge speed-up.

728 ms ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Row-wise apply (axis=1) with raw=True; each row is unpacked into divide(a, b).
%timeit df.apply(lambda row: divide(*row), axis=1, raw=True)            
# Passing the row via *row seems to be somewhat faster.
# raw=True: the function receives an ndarray instead of a Series.

4.93 s ± 30.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Row-wise apply (axis=1) without raw=True; each row is unpacked into divide(a, b).
%timeit df.apply(lambda row: divide(*row), axis=1)              

16.9 s ± 48.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
# Row-wise apply (axis=1) that looks both values up by column label
# instead of unpacking the row.
%timeit df.apply(lambda row: divide(row['A'], row['B']), axis=1)              

In [None]:
# iterrows: iterate (index, row) pairs, discard the index, and look both
# values up by column label.
%timeit [divide(row['A'], row['B']) for _, row in df[['A', 'B']].iterrows()]  