In [1]:
# check CPU information
cpu_info = !lscpu
for inf_item in cpu_info.get_list():
  print(inf_item)

Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
Address sizes:       46 bits physical, 48 bits virtual
CPU(s):              4
On-line CPU(s) list: 0-3
Thread(s) per core:  2
Core(s) per socket:  2
Socket(s):           1
NUMA node(s):        1
Vendor ID:           GenuineIntel
CPU family:          6
Model:               106
Model name:          Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
Stepping:            6
CPU MHz:             3499.983
BogoMIPS:            5799.99
Hypervisor vendor:   KVM
Virtualization type: full
L1d cache:           48K
L1i cache:           32K
L2 cache:            1280K
L3 cache:            55296K
NUMA node0 CPU(s):   0-3
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movb

In [2]:
import numpy as np
import pandas as pd
import timeit

In [3]:
# a very simple arithmetic based method

def sum_nums(a, b):
    return a + b

In [4]:
df = pd.DataFrame({'series1':np.random.randn(1000000), 'series2':np.random.randn(1000000)})

In [5]:
df

Unnamed: 0,series1,series2
0,-0.592514,-0.632610
1,0.555945,1.927692
2,1.741297,1.116981
3,0.253611,-0.301722
4,1.028622,-1.742940
...,...,...
999995,1.328523,0.117499
999996,-0.984490,0.542356
999997,-0.298033,1.113037
999998,-0.597248,-0.742494


## Check to see if all methods result in the same output

In [6]:
series3 = df.apply(lambda df: sum_nums(df['series1'],df['series2']),axis=1)
series3[0:5].to_list(), series3[-5:].to_list()

([-1.2251244064792526,
  2.48363674099452,
  2.8582780926287974,
  -0.04811132198094986,
  -0.7143177980855515],
 [1.446022830998357,
  -0.44213311264394073,
  0.8150038348480322,
  -1.339741748523136,
  -0.6831815537853898])

In [7]:
series3 = [sum_nums(a, b) for a, b in df.itertuples(index=False)]
series3[0:5], series3[-5:]

([-1.2251244064792526,
  2.48363674099452,
  2.8582780926287974,
  -0.04811132198094986,
  -0.7143177980855515],
 [1.446022830998357,
  -0.44213311264394073,
  0.8150038348480322,
  -1.339741748523136,
  -0.6831815537853898])

In [8]:
series3 = [sum_nums(a, b) for a, b in zip(df['series1'],df['series2'])]
series3[0:5], series3[-5:]

([-1.2251244064792526,
  2.48363674099452,
  2.8582780926287974,
  -0.04811132198094986,
  -0.7143177980855515],
 [1.446022830998357,
  -0.44213311264394073,
  0.8150038348480322,
  -1.339741748523136,
  -0.6831815537853898])

In [9]:
series3 = np.vectorize(sum_nums)(df['series1'],df['series2'])
series3[0:5].tolist(), series3[-5:].tolist()

([-1.2251244064792526,
  2.48363674099452,
  2.8582780926287974,
  -0.04811132198094986,
  -0.7143177980855515],
 [1.446022830998357,
  -0.44213311264394073,
  0.8150038348480322,
  -1.339741748523136,
  -0.6831815537853898])

In [10]:
series3 = df['series1'] + df['series2']
series3[0:5].tolist(), series3[-5:].tolist()

([-1.2251244064792526,
  2.48363674099452,
  2.8582780926287974,
  -0.04811132198094986,
  -0.7143177980855515],
 [1.446022830998357,
  -0.44213311264394073,
  0.8150038348480322,
  -1.339741748523136,
  -0.6831815537853898])

In [11]:
series3 = np.add(df['series1'].to_numpy(),df['series2'].to_numpy())
series3[0:5].tolist(), series3[-5:].tolist()

([-1.2251244064792526,
  2.48363674099452,
  2.8582780926287974,
  -0.04811132198094986,
  -0.7143177980855515],
 [1.446022830998357,
  -0.44213311264394073,
  0.8150038348480322,
  -1.339741748523136,
  -0.6831815537853898])

## Timit setup

In [12]:
setup = '''
import numpy as np
import pandas as pd
series = np.array([np.arange(1,1000000),np.arange(2,1000001)]).T
df = pd.DataFrame(series, columns=['series1','series2'])
def sum_nums(a, b):
    return a + b
'''

In [13]:
apply_func = '''
series3 = df.apply(lambda df: sum_nums(df['series1'],df['series2']),axis=1)
'''

In [14]:
iter_tup_func = '''
series3 = [sum_nums(a, b) for a, b in df.itertuples(index=False)]
'''

In [15]:
list_comp_func = '''
series3 = [sum_nums(a, b) for a, b in zip(df['series1'],df['series2'])]
'''

In [16]:
vectorize_func = '''
series3 = np.vectorize(sum_nums)(df['series1'],df['series2'])
'''

In [17]:
df_func = '''
series3 = df['series1'] + df['series2']
'''

In [18]:
numpy_func = '''
series3 = np.add(df['series1'].to_numpy(),df['series2'].to_numpy())
'''

## Run timit for different methods

In [19]:
timeit.timeit(stmt=apply_func, setup=setup, number=1)

6.894508293000399

In [20]:
timeit.timeit(stmt=iter_tup_func, setup=setup, number=10)

4.191379108000547

In [21]:
timeit.timeit(stmt=list_comp_func, setup=setup, number=100)

17.963948019008967

In [22]:
timeit.timeit(stmt=vectorize_func, setup=setup, number=100)

16.306607428006828

In [23]:
timeit.timeit(stmt=df_func, setup=setup, number=1000)

0.9586487359920284

In [24]:
timeit.timeit(stmt=numpy_func, setup=setup, number=1000)

0.8103398569946876

# A more complicated example

You could of couse put logic into the function. However, this will negate the use of both a simple numpy operation, and a simple pandas operation as detailed in the section above.

This section will therefore consider the potential to speed up conditional methods.

In [25]:
def categorise(a, b):
    if a < 0:
        return a * 2 + b
    elif b < 0:
        return a + 2 * b
    else:
        return None

## Check to see if all methods result in the same output

In [26]:
series3 = df.apply(lambda df: categorise(df['series1'],df['series2']),axis=1)
series3[0:5].to_list(), series3[-5:].to_list()

([-1.8176383571371104, nan, nan, -0.349833207662407, -2.457257597027051],
 [nan,
  -1.426622633205623,
  0.5169703831401592,
  -1.9369898277201019,
  -1.4663032240581628])

In [27]:
series3 = [categorise(a, b) for a, b in df.itertuples(index=False)]
series3[0:5], series3[-5:]

([-1.8176383571371104, None, None, -0.349833207662407, -2.457257597027051],
 [None,
  -1.426622633205623,
  0.5169703831401592,
  -1.9369898277201019,
  -1.4663032240581628])

In [28]:
series3 = [categorise(a, b) for a, b in zip(df['series1'],df['series2'])]
series3[0:5], series3[-5:]

([-1.8176383571371104, None, None, -0.349833207662407, -2.457257597027051],
 [None,
  -1.426622633205623,
  0.5169703831401592,
  -1.9369898277201019,
  -1.4663032240581628])

In [29]:
series3 = np.vectorize(categorise)(df['series1'],df['series2'])
series3[0:5].tolist(), series3[-5:].tolist()

([-1.8176383571371104, nan, nan, -0.349833207662407, -2.457257597027051],
 [nan,
  -1.426622633205623,
  0.5169703831401592,
  -1.9369898277201019,
  -1.4663032240581628])

## Timeit setup

In [30]:
setup = '''
import numpy as np
import pandas as pd
series = np.array([np.arange(1,1000000),np.arange(2,1000001)]).T
df = pd.DataFrame(series, columns=['series1','series2'])
def categorise(a, b):
    if a < 0:
        return a * 2 + b
    elif b < 0:
        return a + 2 * b
    else:
        return None
'''

In [31]:
apply_func = '''
series3 = df.apply(lambda df: categorise(df['series1'],df['series2']),axis=1)
'''

In [32]:
iter_tup_func = '''
series3 = [categorise(a, b) for a, b in df.itertuples(index=False)]
'''

In [33]:
list_comp_func = '''
series3 = [categorise(a, b) for a, b in zip(df['series1'],df['series2'])]
'''

In [34]:
vectorize_func = '''
series3 = np.vectorize(categorise)(df['series1'],df['series2'])
'''

## Run timit for different methods

In [35]:
timeit.timeit(stmt=apply_func, setup=setup, number=1)

6.753696040002978

In [36]:
timeit.timeit(stmt=iter_tup_func, setup=setup, number=10)

4.192175960997702

In [37]:
timeit.timeit(stmt=list_comp_func, setup=setup, number=100)

18.83312189701246

In [38]:
timeit.timeit(stmt=vectorize_func, setup=setup, number=100)

14.077709199002129

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=52f7c90b-3e2d-4013-927e-2e7a52de7d0a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>