https://medium.com/the-modern-scientist/make-pandas-code-120x-faster-a-forbidden-mathematical-jutsu-87103030eb9c

In [15]:
import pandas as pd
import numpy as np

def generate_df(size, seed=None):
    """Build a synthetic DataFrame of ``size`` randomly generated people.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        When given, the values are drawn from a private
        ``np.random.RandomState(seed)`` so repeated calls with the same seed
        return identical frames. When omitted, values are drawn from NumPy's
        global random state, in the same column order as before.

    Returns
    -------
    pandas.DataFrame
        Columns: ``age``, ``avg_sleeping``, ``gender``, ``annual_income``,
        ``phone_number``, ``favourite_food``.
    """
    # The np.random module and a RandomState instance expose the same
    # randint/choice API, so one code path serves both the seeded and the
    # unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Dict values are evaluated top to bottom, so the draws happen in the
    # order the columns are listed. randint's upper bound is exclusive.
    return pd.DataFrame({
        'age': rng.randint(1, 100, size),
        'avg_sleeping': rng.randint(1, 24, size),
        'gender': rng.choice(['Male', 'Female'], size),
        'annual_income': rng.randint(1000, 100000, size),
        'phone_number': rng.randint(1_111_111_111, 1_999_999_999, size),
        'favourite_food': rng.choice(['pizza', 'burger', 'chips', 'nachos'], size),
    })

# Build the 10-million-row demo frame, then print its column/dtype/memory summary.
df = generate_df(size=10_000_000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 6 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   age             int32 
 1   avg_sleeping    int32 
 2   gender          object
 3   annual_income   int32 
 4   phone_number    int32 
 5   favourite_food  object
dtypes: int32(4), object(2)
memory usage: 305.2+ MB


In [16]:
def reward_function(row):
    """Return the total bonus for a single person record.

    ``row`` is indexed by column name and must provide 'avg_sleeping',
    'annual_income' and 'age'; 'gender' is read only when the age is in the
    60-90 band.

    The bonus is the sum of these percentages of the annual income:
      * 10% when avg_sleeping >= 6 and the income is within 5000..10000 (inclusive),
      * 20% for 'Female' or 18% for 'Male' when the age is within 60..90 (inclusive),
      * 10% unconditionally.
    """
    income = row['annual_income']
    percentages = []

    if row['avg_sleeping'] >= 6 and 5000 <= income <= 10000:
        percentages.append(10)

    if 60 <= row['age'] <= 90:
        gender = row['gender']
        if gender == 'Female':
            percentages.append(20)
        elif gender == 'Male':
            percentages.append(18)

    # Everyone receives the base 10%; it is added last.
    percentages.append(10)

    # sum() starts from 0 and adds left to right, so the accumulation order
    # (and therefore the floating-point result) is unchanged.
    return sum(income * pct / 100 for pct in percentages)

# Wrapper that binds a function's arguments so timeit can call it with no arguments.

def wrapper(func, *args, **kwargs):
    """Return a zero-argument callable that runs ``func(*args, **kwargs)``.

    The arguments are captured when ``wrapper`` is called; ``func`` itself is
    only invoked when the returned callable is called, and its result is
    passed through.
    """
    return lambda: func(*args, **kwargs)

In [17]:
def loop_function(size):
    """Generate ``size`` rows and fill a 'bonus' column one row at a time.

    Each row is visited with ``iterrows`` and its reward is written back with
    a label-based ``.loc`` assignment. This is the row-by-row variant used in
    the timings.
    """
    frame = generate_df(size)
    for row_label, record in frame.iterrows():
        # generate_df does not create 'bonus', so the first assignment adds the column.
        frame.loc[row_label, 'bonus'] = reward_function(record)

    return frame

import timeit

# Row counts to benchmark; stored as strings and converted with int() below.
sizes = ['10','50', '100','150','1_000','1_500','10_000','15_000']
time_loop = []

for size in map(int, sizes):
    # Bind the size so timeit receives a zero-argument callable.
    wrap = wrapper(loop_function, size)
    # Total time for 10 runs; each run also regenerates the DataFrame.
    n = timeit.timeit(wrap, number=10)
    time_loop.append(n)
    print(f'Size: {size} | Time: {n}')

Size: 10 | Time: 0.0510417000000416
Size: 50 | Time: 0.09340029999998478
Size: 100 | Time: 0.148123299999952
Size: 150 | Time: 0.215199900000016
Size: 1000 | Time: 1.4127091000000291
Size: 1500 | Time: 2.008473600000002
Size: 10000 | Time: 12.948629200000028
Size: 15000 | Time: 20.841306799999984


In [18]:
def apply_function(size):
    """Generate ``size`` rows and add a 'reward' column via ``DataFrame.apply``.

    ``reward_function`` is called once per row (``axis=1``) and the resulting
    values are stored under 'reward'. Returns the augmented frame.
    """
    frame = generate_df(size)
    rewards = frame.apply(reward_function, axis=1)
    frame['reward'] = rewards
    return frame

import timeit

# Row counts to benchmark; stored as strings and converted with int() below.
sizes = ['10','50', '100','150','1_000','1_500','10_000','15_000','100_000']
time_apply = []

for size in map(int, sizes):
    # Bind the size so timeit receives a zero-argument callable.
    wrap = wrapper(apply_function, size)
    # Total time for 10 runs; each run also regenerates the DataFrame.
    n = timeit.timeit(wrap, number=10)
    time_apply.append(n)
    print(f'Size: {size} | Time: {n}')

Size: 10 | Time: 0.028500500000063766
Size: 50 | Time: 0.0368045999999822
Size: 100 | Time: 0.046288300000014715
Size: 150 | Time: 0.05759090000003653
Size: 1000 | Time: 0.27724269999998796
Size: 1500 | Time: 0.36603949999994256
Size: 10000 | Time: 2.3970189000000346
Size: 15000 | Time: 3.391621700000087
Size: 100000 | Time: 22.998310100000026


In [22]:
def reward_function_part(s,a,g,y):
    """Return the total bonus computed from scalar inputs.

    Parameters
    ----------
    s : value compared against the 6-hour sleep threshold.
    a : annual income; every bonus component is a percentage of it.
    g : gender label; only 'Female' and 'Male' earn the age-band bonus.
    y : age, checked against the inclusive 60..90 band.

    The result is the sum of: 10% of ``a`` when ``s >= 6`` and ``a`` is within
    5000..10000 (inclusive); 20% ('Female') or 18% ('Male') of ``a`` when ``y``
    is within 60..90; and a further 10% of ``a`` unconditionally.
    """
    bonus = 0

    sleeps_enough = s >= 6
    if sleeps_enough and 5000 <= a <= 10000:
        bonus += a * 10 / 100

    # The age band is shared by both gender branches, so it is tested once.
    if 60 <= y <= 90:
        if g == 'Female':
            bonus += a * 20 / 100
        elif g == 'Male':
            bonus += a * 18 / 100

    # Base 10% is always added last.
    return bonus + a * 10 / 100

def vectorize_function(size):
    """Generate ``size`` rows and compute every row's reward with ``np.vectorize``.

    Parameters
    ----------
    size : int
        Number of rows to generate; ``0`` yields an empty result.

    Returns
    -------
    numpy.ndarray
        One float reward per generated row, in row order. The DataFrame
        itself is not returned.
    """
    df = generate_df(size)

    # Fixing the output dtype lets np.vectorize handle size-0 input (without
    # otypes it raises ValueError there) and skips the extra probing call it
    # would otherwise make on the first element. reward_function_part always
    # returns a float, so results for non-empty input are unchanged.
    vectorized_reward = np.vectorize(reward_function_part, otypes=[float])

    return vectorized_reward(
        df['avg_sleeping'],
        df['annual_income'],
        df['gender'],
        df['age'],
    )

import timeit

# Row counts to benchmark; stored as strings and converted with int() below.
sizes = ['10','50', '100','150','1_000','1_500','10_000','15_000','100_000']
time_vector = []

for size in map(int, sizes):
    # Bind the size so timeit receives a zero-argument callable.
    wrap = wrapper(vectorize_function, size)
    # Total time for 10 runs; each run also regenerates the DataFrame.
    n = timeit.timeit(wrap, number=10)
    time_vector.append(n)
    print(f'Size: {size} | Time: {n}')

Size: 10 | Time: 0.02737629999978708
Size: 50 | Time: 0.02261610000005021
Size: 100 | Time: 0.020013699999935852
Size: 150 | Time: 0.019712999999910608
Size: 1000 | Time: 0.026163999999880616
Size: 1500 | Time: 0.032264800000120886
Size: 10000 | Time: 0.09880669999984093
Size: 15000 | Time: 0.13260869999999159
Size: 100000 | Time: 0.9364877999998953
