In [1]:
# Now the next question:
# Can we improve even more from yesterday?

# I thought about this tonight as I am coming home from
# Micro Center. What if we turn the classification problem
# into a mathematical problem?

In [2]:
# First in ordre to determine if a given health insurance 
# claim is Professional or Institutional.

# form_code: type of Medicaid Form
# 0 = U (Institutional), 1 = H (Professional)

# ipop_ind : inpatient or outpatient indicator
# 0 = inpatient, 1 = outpatient

In [3]:
# The logics go like this
# If form_code is H, then Professional
# Else if ipop_ind is 0 then Inpatient (Institutional)
# Else Outpatient (Institutional)

# In this particular problem, these are the three classification.
# How can I model this as basic math?

In [4]:
# Idea: Multiple the two columns
# form_code * ipopind = (answer)

# Professional P: 0 * (any inpatient code) is 0
# Institutional IP: 1 * (1 Inpatient) is 1
# Institutional OP: 1 * (2 Outpatient) is 2

In [19]:
# Finally the outcome will be 0, 1, 2

In [20]:
# Let's code it up and see what happen :)
# I wonder if it will yeld any sort of performance improvement.
# Really excited to try even if it does not work.

In [35]:
import pandas as pd
import numpy as np

In [53]:
df = None
sample_size = 1000000

def reset_df():
    global df
    # randint -- [lowbound, upbound)
    # randint(1, 2) -- [0, 2) 
    # this will generate 0 or 1, because 2 is not included

    # 100K rows
    df = pd.DataFrame(np.random.randint(0, 2, size=(sample_size, 2)),
                      columns=["form_code", "ipop_ind"])

    # Using broadcasting to increment test data
    df = df + [0, 1]

reset_df()
# Check the size of dataframe
print(df.shape)
df.head()

(1000000, 2)


Unnamed: 0,form_code,ipop_ind
0,0,1
1,0,2
2,0,1
3,1,2
4,0,1


In [45]:
%timeit df["claim_type"] = df["form_code"] * df["ipop_ind"]

df.head(5)

3.45 ms ± 180 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Unnamed: 0,form_code,ipop_ind,claim_type
0,1,1,1
1,0,2,0
2,0,2,0
3,1,2,2
4,0,2,0


In [46]:
%timeit df.groupby(["claim_type"]).count()

print(df.groupby(["claim_type"]).count())

16.6 ms ± 120 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
            form_code  ipop_ind
claim_type                     
0              500102    500102
1              249634    249634
2              250264    250264


In [24]:
# This even faster the 4 cores multi-processing solution
# from yesterday. By order of magnitude of 4. 1000X faster.

In [25]:
# In this case, I don't think running them off multiple worker
# will be nesscary. But as practice and exploration.

# I suspect the overhead will be more expensive (slow).

In [28]:
from pandarallel import pandarallel
from math import sin

pandarallel.initialize()

def update_standard(x):
    return x.form_code * x.ipop_ind

reset_df()
%timeit df['claim_type'] = df.parallel_apply(update_standard, axis=1)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
7 s ± 102 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
# This look like a apply method didn't write properly.

In [30]:
# Let's move on to see how much imrpovement
# If there are extra juice to sqeeze out if we write this in numpy.

In [101]:
# This is the numpy array of the same version above
data = np.random.randint(0, 2, size=(sample_size, 2))
data = data + [0, 1]
data

array([[1, 1],
       [0, 2],
       [0, 1],
       ...,
       [1, 1],
       [1, 2],
       [1, 1]])

In [102]:
# https://stackoverflow.com/questions/4455076/how-to-access-the-ith-column-of-a-numpy-multidimensional-array
# Reference: How to get the first columns 
data[:,0]

array([1, 0, 0, ..., 1, 1, 1])

In [103]:
data[:,1]

array([1, 2, 1, ..., 1, 2, 1])

In [104]:
claim_type = data[:,0] * data[:,1]
claim_type.shape

(1000000,)

In [105]:
# TypeError: data type not understood
np.insert(data, -1, claim_type, axis=1)

array([[1, 1, 1],
       [0, 0, 2],
       [0, 0, 1],
       ...,
       [1, 1, 1],
       [1, 2, 2],
       [1, 1, 1]])

In [106]:
%timeit new_data = np.insert(data, -1, (data[:,0] * data[:,1]), axis=1)

9.12 ms ± 124 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [107]:
# The documentation online is correct base on
# this one simple example. Pandas with broadcast/map on >100K rows
# Pandas' performance is excellent.