In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## Create fake DataFrame

In [None]:
def create_dataset(size=500_000, seed=None):
  """Build a synthetic DataFrame of random integer body measurements.

  Args:
    size: Number of rows to generate.
    seed: Optional integer seed. When given, the data is drawn from a
      private ``RandomState(seed)`` so repeated calls return identical
      frames. When ``None`` (the default), NumPy's global random state is
      used, so every call yields different data unless the caller seeded
      ``np.random`` themselves.

  Returns:
    A DataFrame with the integer columns ``age`` (18 to 79),
    ``weight_kg`` (40 to 149) and ``height_cm`` (140 to 209). The upper
    bound passed to ``randint`` is exclusive.
  """
  # The np.random module and a RandomState instance both expose randint
  # with the same signature, so one code path serves both cases.
  rng = np.random if seed is None else np.random.RandomState(seed)
  return pd.DataFrame({
    "age": rng.randint(18, 80, size),
    "weight_kg": rng.randint(40, 150, size),
    "height_cm": rng.randint(140, 210, size),
  })


In [None]:
df = create_dataset()
df.head()

Unnamed: 0,age,weight_kg,height_cm
0,40,48,151
1,59,115,143
2,27,106,163
3,38,82,176
4,54,78,196


In [None]:
len(df)

500000

### Functions to calculate BMI

In [None]:
def calculate_BMI(row):
  """Return the body-mass index for a single record.

  Args:
    row: Any object indexable by the keys ``"weight_kg"`` and
      ``"height_cm"`` (for example a DataFrame row or a dict).

  Returns:
    ``weight_kg / height_cm**2`` scaled by 10_000. The factor matches
    100 * 100, i.e. converting the squared height from cm to m.
  """
  weight = row["weight_kg"]
  height = row["height_cm"]
  weight_per_cm2 = weight / (height * height)
  return weight_per_cm2 * 10_000

In [122]:
def iterrows_method(df):
  """Add a "BMI" column to ``df`` by looping over it row by row.

  Every row from ``df.iterrows()`` is passed to ``calculate_BMI`` and the
  result is written back into ``df`` one cell at a time through ``df.loc``.

  Args:
    df: DataFrame containing the columns that ``calculate_BMI`` reads.

  Returns:
    The same ``df`` object, modified in place.
  """
  # iterrows() is a generator without a length, so pass the row count to
  # tqdm explicitly; without it only a running counter can be shown.
  for index, row in tqdm(df.iterrows(), total=len(df)):
    df.loc[index, "BMI"] = calculate_BMI(row)
  return df

In [127]:
def apply_method(df):
  """Add a "BMI" column to ``df`` using a row-wise ``DataFrame.apply``.

  Args:
    df: DataFrame containing the columns that ``calculate_BMI`` reads.

  Returns:
    The same ``df`` object, modified in place.
  """
  # axis="columns" is the named form of axis=1: the function receives one
  # row at a time.
  bmi_per_row = df.apply(calculate_BMI, axis="columns")
  df["BMI"] = bmi_per_row
  return df

In [130]:
def np_where_method(df):
  """Add a "BMI" column to ``df`` with whole-column arithmetic and ``np.where``.

  Rows whose ``weight_kg`` is not strictly positive get a BMI of 0; all
  other rows get ``weight_kg / height_cm**2 * 10_000``. Only the weight is
  guarded: a ``height_cm`` of 0 is not special-cased.

  Args:
    df: DataFrame with numeric ``weight_kg`` and ``height_cm`` columns.

  Returns:
    The same ``df`` object, modified in place.
  """
  weight = df["weight_kg"]
  height = df["height_cm"]
  # Computed for every row up front; np.where then picks per row between
  # this value and the constant 0.
  bmi_all_rows = (weight / (height * height)) * 10_000
  df["BMI"] = np.where(weight > 0, bmi_all_rows, 0)
  return df

## Calculate BMI

### iterrows method

In [123]:
df = create_dataset(size=10_000)

In [124]:
%%timeit
iterrows_method(df)

10000it [00:03, 3138.92it/s]
10000it [00:03, 3147.17it/s]
10000it [00:03, 3195.76it/s]
10000it [00:03, 3143.94it/s]
10000it [00:03, 3096.37it/s]
10000it [00:03, 3075.92it/s]

1 loop, best of 5: 3.14 s per loop





### Apply method

In [128]:
df = create_dataset(size=10_000)

In [129]:
%%timeit
apply_method(df)

1 loop, best of 5: 174 ms per loop


### np.where method

In [133]:
df = create_dataset(size=10_000)

In [134]:
%%timeit
np_where_method(df)

The slowest run took 6.18 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 5: 708 µs per loop


### With 500k rows

In [None]:
df = create_dataset()

In [None]:
%%timeit
iterrows_method(df)

500000it [27:55, 298.48it/s]
500000it [27:39, 301.23it/s]
83551it [04:21, 319.63it/s]


KeyboardInterrupt: ignored

### SO SLOW! The `iterrows` timing above was stopped by hand after about an hour

In [None]:
df = create_dataset()

In [None]:
%%timeit
apply_method(df)

1 loop, best of 5: 8.82 s per loop


In [None]:
df.head()

Unnamed: 0,age,weight_kg,height_cm,BMI
0,21,45,183,13.437248
1,27,116,207,27.07181
2,52,53,183,15.826092
3,22,135,172,45.632774
4,77,79,141,39.736432


In [None]:
df = create_dataset()

In [None]:
%%timeit
np_where_method(df)

100 loops, best of 5: 8.25 ms per loop


In [None]:
df.head()

Unnamed: 0,age,weight_kg,height_cm,BMI
0,60,52,153,22.213678
1,21,42,163,15.807896
2,79,88,208,20.340237
3,73,71,181,21.67211
4,30,116,207,27.07181
