# Funnel Analysis

In [1]:
%matplotlib notebook
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import HTML, display

In [2]:
# Layout of the timestamps stored in the `date` column of events.csv.
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'


def dateparser(x):
    """Parse timestamp text laid out as DATE_FORMAT.

    Accepts a single string or a whole column; the work is delegated to
    pd.to_datetime with an explicit format.
    """
    # The previous implementation called pd.datetime.strptime; the
    # pd.datetime alias was deprecated in pandas 1.0 and removed in 2.0.
    return pd.to_datetime(x, format=DATE_FORMAT)


#TODO: add dtype={"deviceid":np.intxx} instead of low_memory. currently, it is guessing dtype
#      only after everything is loaded into memory
df = pd.read_csv('events.csv', low_memory=False)
# Convert the column in one call after loading instead of passing a
# per-value callback through read_csv's date_parser argument, which is
# deprecated in pandas 2.0.
df['date'] = dateparser(df['date'])
# let us print how many records are we talking about here
df.shape

(5980, 3)

## Group data into a funnel by deviceid
Now, let us group the data frame by device to build a funnel: one row per device, with a count of events for each funnel step.

We will keep only those devices whose *First Launch* count is exactly 1, so that devices that were upgraded after an earlier install are ignored.

In [3]:
# Column names of the events frame used throughout the funnel code.
GROUP_KEY = 'deviceid'    # events are grouped by this column
TIME_FIELD = 'date'       # events are sorted by this column
EVENT_FIELD = 'event'     # event name, matched against FUNNEL_STEPS

# Funnel step names, listed from the top of the funnel to the bottom.
# Kept as a list: it is also used as a column selector on the funnel frame.
FUNNEL_STEPS = [
    'First Launch',
    'Register Button',
    'Provide Email',
    'Validate Email',
    'Terms & Conditions Show',
    'Terms & Conditions Accept',
]


def funnelize_by_timestamp(group, funnel_steps):
    """Count funnel events for one group while honouring funnel order.

    Events are read in row order, so `group` is expected to be sorted
    chronologically by the caller. An event is counted only when its step
    is at most one past the step of the most recently counted event. The
    walk starts *before* the first step, so nothing is counted until the
    group's first `funnel_steps[0]` event is seen. Counting an earlier step
    again moves the current position back to that step.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single group; must contain the EVENT_FIELD column.
    funnel_steps : list
        Funnel step names in funnel order.

    Returns
    -------
    list of int
        Counted events per step, aligned with `funnel_steps`.
    """
    # -1 means "no step reached yet". Starting at 0 would treat the first
    # step as already done and count step 1 for groups that never hit step 0.
    curr = -1
    funnel_cts = [0] * len(funnel_steps)

    # Only the event column is read, so iterate it directly instead of
    # building a Series for every row with iterrows().
    for evt in group[EVENT_FIELD]:
        if evt in funnel_steps:
            idx = funnel_steps.index(evt)
            if idx <= curr + 1:
                funnel_cts[idx] += 1
                curr = idx
    return funnel_cts
    

def funnelize_wo_timestamp(group, funnel_steps):
    """Tally funnel events for one group, ignoring the order they occurred in.

    Every value in the group's EVENT_FIELD column that appears in
    `funnel_steps` adds one to that step's tally; all other events are
    skipped.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single group; must contain the EVENT_FIELD column.
    funnel_steps : list
        Funnel step names.

    Returns
    -------
    list of int
        Event tallies, aligned position-by-position with `funnel_steps`.
    """
    tallies = [0] * len(funnel_steps)

    for event_name in group[EVENT_FIELD]:
        try:
            slot = funnel_steps.index(event_name)
        except ValueError:
            # Not one of the funnel steps: nothing to tally.
            continue
        tallies[slot] += 1
    return tallies

# Put the events in chronological order (in place). The order-insensitive
# counter applied below does not depend on this; it matters only for
# funnelize_by_timestamp, which reads events in row order.
df.sort_values(by=TIME_FIELD, inplace=True)

# For every GROUP_KEY value, build a list with one event count per funnel step.
funnel_cts = df.groupby(GROUP_KEY).apply(funnelize_wo_timestamp, FUNNEL_STEPS)

# Expand those lists into a frame: one row per group, one column per step.
funnel = pd.DataFrame(
    funnel_cts.tolist(),
    index=funnel_cts.index,
    columns=FUNNEL_STEPS,
)

# Keep only the groups that recorded exactly one 'First Launch' event.
has_single_launch = funnel['First Launch'] == 1
funnel = funnel.loc[has_single_launch]

funnel.shape

(1692, 6)

That is the number of devices in the funnel that have done *First Launch* exactly once.

Now, let us normalize the counts so that each event becomes a 1/0 indicator per device. For example, if a device hit *Register Button* 5 times, the normalized funnel records it as just 1. This way, we can easily dedupe the event counts.

In [7]:
# Deduplicate per group: every non-zero event count becomes 1, so summing a
# column gives the number of devices that hit that step at least once.
funnel_norm = funnel.copy()
funnel_norm[funnel_norm != 0] = 1

top_of_funnel = funnel_norm[FUNNEL_STEPS[0]].sum()
latest_step_count = top_of_funnel

# First row is the table header; one row per funnel step is appended below.
htmltbl = [[
    '<b>Event</b>',
    '<b>Devices</b>',
    '<b>% / ' + FUNNEL_STEPS[0] + '</b>',
    '<b>% / Prev Step</b>',
]]
for fs in FUNNEL_STEPS:
    fs_devices = funnel_norm[fs].sum()
    # Drop-off relative to the first step and relative to the step before.
    pct_lost_from_top = round(100 - 100.0 * fs_devices / top_of_funnel, 2)
    pct_lost_from_prev = round(100 - 100.0 * fs_devices / latest_step_count, 2)
    htmltbl.append([fs, fs_devices, pct_lost_from_top, pct_lost_from_prev])
    latest_step_count = fs_devices

# Render each row as a run of <td> cells, then join the rows with <tr> tags.
table_rows = [
    '<td>{}</td>'.format('</td><td>'.join(map(str, row)))
    for row in htmltbl
]
table_markup = '<h4>Drop Off Funnel</h4><table><tr>{}</tr></table>'.format(
    '</tr><tr>'.join(table_rows)
)
display(HTML(table_markup))


0,1,2,3
Event,Devices,% / First Launch,% / Prev Step
First Launch,1692,0.0,0.0
Register Button,1408,16.78,16.78
Provide Email,1131,33.16,19.67
Validate Email,864,48.94,23.61
Terms & Conditions Show,592,65.01,31.48
Terms & Conditions Accept,293,82.68,50.51


In [5]:
# Number of devices that reached each funnel step, drawn as a bar chart.
devices_per_step = funnel_norm[FUNNEL_STEPS].sum()
devices_per_step.plot.bar()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f5b3dd0cbd0>