In [1]:
# Check out performance differences

In [2]:
import pandas as pd
import numpy as np

In [3]:
# NOTE(review): no statement follows the options, so this line times nothing;
# the -n 10 setting is not carried over to later %timeit calls.
%timeit -n 10

In [4]:
# Generate random input

def reset_df(n_rows=1000000, seed=None):
    """(Re)create the module-level ``df`` with random binary claim indicators.

    Rebinds the global ``df`` to a fresh DataFrame with two integer columns,
    each holding only the values 0 or 1:

    * ``form_code`` -- type of Medicaid form:
      0 = U (Institutional), 1 = H (Professional)
    * ``ipop_ind`` -- inpatient or outpatient indicator:
      0 = inpatient, 1 = outpatient

    Parameters
    ----------
    n_rows : int, default 1000000
        Number of rows to generate (1M rows when omitted).
    seed : int or None, default None
        When given, values are drawn from a private ``RandomState(seed)`` so
        the generated data is reproducible and numpy's global random state is
        left untouched. When ``None``, numpy's global random state is used.

    Returns
    -------
    None
        The result is published through the global ``df``.
    """
    global df
    # randint samples from the half-open interval [low, high):
    # randint(0, 2) therefore yields only 0 or 1, because 2 is not included.
    rng = np.random if seed is None else np.random.RandomState(seed)
    df = pd.DataFrame(rng.randint(0, 2, size=(n_rows, 2)),
                      columns=["form_code", "ipop_ind"])

# Build the initial frame, then show its dimensions and preview the first rows.
reset_df()
print(df.shape)
df.head()

(1000000, 2)


Unnamed: 0,form_code,ipop_ind
0,1,1
1,0,1
2,0,0
3,0,1
4,0,0


In [5]:
# method 1: simple
def update_simple(df):
    """Add a ``claim_type`` column to ``df`` using a plain row-by-row loop.

    This is the deliberately naive baseline of the comparison: it walks the
    frame with ``iterrows`` and classifies each row as

    * ``'P'`` -- Professional (``form_code == 1``)
    * ``'I'`` -- Institutional Inpatient (otherwise, ``ipop_ind == 0``)
    * ``'O'`` -- Institutional Outpatient (everything else)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``form_code`` and ``ipop_ind`` columns. It is modified in
        place: the ``claim_type`` column is added or overwritten.

    Returns
    -------
    None
    """
    claim_type_list = []
    for _, row in df.iterrows():
        if row['form_code'] == 1:
            # Professional
            claim_type = 'P'
        elif row['ipop_ind'] == 0:
            # Institutional Inpatient
            claim_type = 'I'
        else:
            # Institutional Outpatient
            # (must be an assignment; a comparison here would leave
            # claim_type unset or stale from the previous row)
            claim_type = 'O'

        claim_type_list.append(claim_type)

    df['claim_type'] = claim_type_list
    
# Discard the current frame and regenerate it (reset_df rebinds the global df),
# presumably so the timing below starts from a frame without claim_type.
df = None
reset_df()

In [6]:
# Time a single run of the iterrows baseline over the whole frame.
%time update_simple(df)

KeyboardInterrupt: 

In [7]:
# https://jakevdp.github.io/PythonDataScienceHandbook/01.07-timing-and-profiling.html

In [8]:
# method 2: apply
def update_standard(x):
    """Return the claim-type code for a single row.

    ``x`` is any object exposing ``form_code`` and ``ipop_ind`` attributes
    (for example one row handed over by ``DataFrame.apply(..., axis=1)``).

    Returns ``'P'`` (Professional) when ``form_code`` equals 1; otherwise
    ``'I'`` (Institutional Inpatient) when ``ipop_ind`` equals 0, and
    ``'O'`` (Institutional Outpatient) in every remaining case.
    """
    is_professional = x.form_code == 1
    if is_professional:
        return 'P'

    # Institutional claim: split on the inpatient/outpatient indicator.
    return 'I' if x.ipop_ind == 0 else 'O'
        
%timeit df['claim_type'] = df.apply(update_simple, axis=1)

df = None
reset_df()

AttributeError: ("'Series' object has no attribute 'iterrows'", 'occurred at index 0')

In [None]:
# method 3: pandarallel
# using simple library to runs apply function on multiple processor

from pandarallel import pandarallel
from math import sin

pandarallel.initialize()
%time df['claim_type'] = df.parallel_apply(update_simple, axis=1)

df = None
reset_df()

In [None]:
# https://github.com/nalepae/pandarallel
# Multiprocessing for pandas, for mortals