In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## Create fake DataFrame

In [None]:
def create_dataset(size, seed=None):
  """Build a synthetic DataFrame of people for the BMI benchmarks.

  Args:
    size: Number of rows to generate.
    seed: Optional integer seed. When given, the draws come from a private
      ``np.random.RandomState(seed)`` so the same frame is produced on every
      call; when ``None`` (the default) the global ``np.random`` state is
      used.

  Returns:
    A DataFrame with integer columns "age" (18-79), "weight_kg" (40-149)
    and "height_cm" (140-209); the upper bounds passed to ``randint`` are
    exclusive.
  """
  # RandomState (rather than default_rng) keeps a seeded call identical to
  # ``np.random.seed(seed)`` followed by an unseeded call.
  rng = np.random if seed is None else np.random.RandomState(seed)
  # Dict entries are evaluated in order, so the draw order is age, weight, height.
  return pd.DataFrame({
    "age": rng.randint(18, 80, size),
    "weight_kg": rng.randint(40, 150, size),
    "height_cm": rng.randint(140, 210, size),
  })


### Functions to calculate BMI

In [None]:
def calculate_BMI(row):
  """Compute the body-mass index for a single record.

  Args:
    row: Any object indexable by "weight_kg" and "height_cm", such as a
      DataFrame row.

  Returns:
    ``weight_kg / height_cm**2`` multiplied by 10,000.
  """
  weight_kg = row["weight_kg"]
  height_cm = row["height_cm"]
  kg_per_cm2 = weight_kg / (height_cm * height_cm)
  # 10_000 = 100**2 rescales the cm^2 denominator to m^2.
  return kg_per_cm2 * 10_000

In [None]:
def iterrows_method(df):
  """Fill a "BMI" column by looping over the rows one at a time.

  Each row yielded by ``df.iterrows()`` is passed to ``calculate_BMI`` and
  the result is written back into ``df`` with a label-based ``.loc``
  assignment. Progress is reported through ``tqdm``.

  Args:
    df: DataFrame whose rows provide what ``calculate_BMI`` reads.

  Returns:
    The same ``df`` object, modified in place.
  """
  progress = tqdm(df.iterrows())
  for row_label, record in progress:
    bmi_value = calculate_BMI(record)
    # One scalar write per row; the first write creates the "BMI" column.
    df.loc[row_label, "BMI"] = bmi_value
  return df

In [None]:
def apply_method(df):
  """Fill a "BMI" column using a row-wise ``DataFrame.apply``.

  Args:
    df: DataFrame whose rows provide what ``calculate_BMI`` reads.

  Returns:
    The same ``df`` object, with "BMI" added or overwritten in place.
  """
  # axis=1 passes each row (not each column) to calculate_BMI.
  bmi_per_row = df.apply(calculate_BMI, axis=1)
  df["BMI"] = bmi_per_row
  return df

In [None]:
def np_where_method(df):
  """Fill a "BMI" column with one vectorised computation.

  The BMI is computed for whole columns at once and ``np.where`` keeps it
  only for rows whose weight and height are both positive; every other row
  gets 0. Checking the height as well as the weight prevents a zero height
  (a zero denominator) from leaking ``inf`` into the result.

  Args:
    df: DataFrame with numeric "weight_kg" and "height_cm" columns.

  Returns:
    The same ``df`` object, with "BMI" added or overwritten in place.
  """
  weight = df["weight_kg"]
  height = df["height_cm"]
  is_valid = (weight > 0) & (height > 0)
  # np.where evaluates both branches, so the division runs for every row;
  # the non-finite values it yields on invalid rows are then replaced by 0.
  bmi = (weight / (height * height)) * 10_000
  df["BMI"] = np.where(is_valid, bmi, 0)
  return df

## Calculate BMI

### iterrows method

In [None]:
# Fresh 100,000-row frame (age, weight_kg, height_cm) for the iterrows benchmark.
df = create_dataset(100_000)

In [None]:
%%timeit
# iterrows_method writes "BMI" into df in place, so the repeated timing
# passes all operate on the same frame.
iterrows_method(df)

100000it [01:20, 1244.84it/s]
100000it [01:20, 1236.19it/s]
100000it [01:19, 1256.01it/s]
100000it [01:19, 1256.17it/s]
100000it [01:17, 1282.12it/s]
100000it [01:17, 1288.55it/s]

1 loop, best of 5: 1min 17s per loop





In [None]:
# Preview the first rows to confirm the BMI column was filled in.
df.head()

Unnamed: 0,age,weight_kg,height_cm,BMI
0,52,91,191,24.944492
1,19,109,195,28.665352
2,26,85,148,38.805698
3,52,112,206,26.392685
4,20,143,193,38.390292


### Apply method

In [None]:
# Rebuild the 100,000-row frame so the apply benchmark starts without a BMI column.
df = create_dataset(100_000)

In [None]:
%%timeit
# Time the row-wise DataFrame.apply variant on the 100,000-row frame.
apply_method(df)

1 loop, best of 5: 1.73 s per loop


In [None]:
# Preview the first rows to confirm the BMI column was filled in.
df.head()

Unnamed: 0,age,weight_kg,height_cm,BMI
0,68,120,174,39.635355
1,67,116,186,33.529888
2,58,132,169,46.216869
3,20,64,194,17.004995
4,49,104,189,29.114526


### np.where method

In [None]:
# Rebuild the 100,000-row frame so the np.where benchmark starts without a BMI column.
df = create_dataset(100_000)

In [None]:
%%timeit
# Time the vectorised np.where variant on the 100,000-row frame.
np_where_method(df)

100 loops, best of 5: 2.53 ms per loop


In [None]:
# Preview the first rows to confirm the BMI column was filled in.
df.head()

Unnamed: 0,age,weight_kg,height_cm,BMI
0,57,64,169,22.408179
1,70,54,149,24.323229
2,63,77,144,37.133488
3,64,128,191,35.086757
4,37,137,177,43.729452


### With 500k rows

In [None]:
# Scale up to 500,000 rows, starting with the iterrows variant.
df = create_dataset(500_000)

In [None]:
%%timeit
# The recorded run of this cell was stopped with a KeyboardInterrupt during
# its third pass; each completed pass took roughly 28 minutes.
iterrows_method(df)

500000it [27:55, 298.48it/s]
500000it [27:39, 301.23it/s]
83551it [04:21, 319.63it/s]


KeyboardInterrupt: ignored

### SO SLOW! The `iterrows` run on 500k rows was interrupted after ~28 minutes per pass

In [None]:
# Fresh 500,000-row frame for the apply benchmark.
df = create_dataset(500_000)

In [None]:
%%timeit
# Time the row-wise DataFrame.apply variant on the 500,000-row frame.
apply_method(df)

1 loop, best of 5: 8.82 s per loop


In [None]:
# Preview the first rows to confirm the BMI column was filled in.
df.head()

Unnamed: 0,age,weight_kg,height_cm,BMI
0,21,45,183,13.437248
1,27,116,207,27.07181
2,52,53,183,15.826092
3,22,135,172,45.632774
4,77,79,141,39.736432


In [None]:
# Fresh 500,000-row frame for the np.where benchmark.
df = create_dataset(500_000)

In [None]:
%%timeit
# Time the vectorised np.where variant on the 500,000-row frame.
np_where_method(df)

100 loops, best of 5: 8.25 ms per loop


In [None]:
# Preview the first rows to confirm the BMI column was filled in.
df.head()

Unnamed: 0,age,weight_kg,height_cm,BMI
0,60,52,153,22.213678
1,21,42,163,15.807896
2,79,88,208,20.340237
3,73,71,181,21.67211
4,30,116,207,27.07181
