In [67]:
import pandas as pd

# Read the five CSV tables from ./data.
df_user_data = pd.read_csv('./data/user_table.csv')
df_home = pd.read_csv('./data/home_page_table.csv')
df_search = pd.read_csv('./data/search_page_table.csv')
df_payment_page = pd.read_csv('./data/payment_page_table.csv')
df_payment_confirm = pd.read_csv('./data/payment_confirmation_table.csv')

# Report (rows, columns) of each table, in the order the message names them.
loaded_shapes = tuple(
    table.shape
    for table in (df_user_data, df_home, df_search, df_payment_page, df_payment_confirm)
)
print('Shape of df_user_data: {}, df_home: {}, df_search: {}, df_payment_page: {}, df_payment_confirm: {}'.format(*loaded_shapes))

Shape of df_user_data: (90400, 4), df_home: (90400, 2), df_search: (45200, 2), df_payment_page: (6030, 2), df_payment_confirm: (452, 2)


In [63]:
from functools import reduce; 

def mergedf(df_list, on_col, how_join):
    """Merge a sequence of DataFrames left to right on a shared key.

    The frames are folded pairwise with ``reduce``: the first two are merged,
    the result is merged with the third, and so on.

    Chaining merges with pandas' default ``_x``/``_y`` suffixes can re-create a
    label that an earlier step already produced (for example when several
    frames each carry a ``page`` column). That yields duplicate column labels,
    which newer pandas releases reject with ``MergeError``. To keep every
    output label unique, each step uses ``_x``/``_y`` when that is
    collision-free and otherwise falls back to ``_x1``/``_y1``, ``_x2``/``_y2``,
    and so on.

    Args:
        df_list: non-empty sequence of DataFrames that all contain ``on_col``.
        on_col: column label (or list of labels) to join on.
        how_join: join type forwarded to ``pd.merge`` (e.g. ``'outer'``).

    Returns:
        The merged DataFrame. A single-element ``df_list`` returns that
        element unchanged.

    Raises:
        TypeError: if ``df_list`` is empty (raised by ``reduce``).
    """
    key_cols = set(on_col) if pd.api.types.is_list_like(on_col) else {on_col}

    def _pick_suffixes(left, right):
        # Non-key labels present on both sides are the ones pandas will suffix.
        shared = (set(left.columns) & set(right.columns)) - key_cols
        # Labels that keep their name and therefore must not be re-created.
        untouched = (set(left.columns) | set(right.columns)) - shared
        attempt = 0
        while True:
            tag = '' if attempt == 0 else str(attempt)
            suffixes = ('_x' + tag, '_y' + tag)
            renamed = {'{}{}'.format(col, sfx) for col in shared for sfx in suffixes}
            if not (renamed & untouched):
                return suffixes
            attempt += 1

    def _merge_pair(left, right):
        return pd.merge(left, right, on=on_col, how=how_join,
                        suffixes=_pick_suffixes(left, right))

    return reduce(_merge_pair, df_list)

# Join the user table with every page table on user_id (outer join).
tables_to_join = [df_user_data, df_home, df_search, df_payment_page, df_payment_confirm]
wide_events = mergedf(tables_to_join, 'user_id', 'outer')

# Stack the per-table page columns into one 'page' column and discard the
# helper column that records which source column each value came from.
long_events = pd.melt(
    wide_events,
    id_vars=['user_id', 'date', 'device', 'sex'],
    value_name='page',
)
df = long_events.drop(['variable'], axis=1)
print(df.head(10))

   user_id        date   device     sex     page_x       page_y page_x page_y
0   450007  2015-02-28  Desktop  Female  home_page          NaN    NaN    NaN
1   756838  2015-01-13  Desktop    Male  home_page          NaN    NaN    NaN
2   568983  2015-04-09  Desktop    Male  home_page  search_page    NaN    NaN
3   190794  2015-02-18  Desktop  Female  home_page  search_page    NaN    NaN
4   537909  2015-01-15  Desktop    Male  home_page          NaN    NaN    NaN
5   993454  2015-03-03  Desktop    Male  home_page          NaN    NaN    NaN
6   377150  2015-04-15  Desktop  Female  home_page  search_page    NaN    NaN
7   137258  2015-01-27   Mobile    Male  home_page          NaN    NaN    NaN
8   608249  2015-04-12  Desktop  Female  home_page          NaN    NaN    NaN
9   615615  2015-02-24   Mobile  Female  home_page  search_page    NaN    NaN
   user_id        date   device     sex       page
0   450007  2015-02-28  Desktop  Female  home_page
1   756838  2015-01-13  Desktop    Male 

In [64]:
# Column names used by the funnel analysis: grouping key, timestamp, event name.
GROUP_KEY, TIME_FIELD, EVENT_FIELD = 'user_id', 'date', 'page'

# Funnel step names, in the order they are counted and reported.
FUNNEL_STEPS = [
    'home_page',
    'search_page',
    'payment_page',
    'payment_confirmation_page',
]


def funnelize(group, funnel_steps, event_field=None):
    """Count how many rows of ``group`` fall on each funnel step.

    Args:
        group: DataFrame with the events of one group (e.g. the sub-frame
            handed over by ``DataFrame.groupby(...).apply``).
        funnel_steps: sequence of step names to count.
        event_field: name of the column holding the event name. When omitted,
            the module-level ``EVENT_FIELD`` is used.

    Returns:
        list of int aligned with ``funnel_steps``. Missing events and events
        not listed in ``funnel_steps`` are ignored.
    """
    if event_field is None:
        event_field = EVENT_FIELD

    # One vectorised tally per group instead of walking the rows with
    # iterrows(); value_counts() skips missing values by default.
    tally = group[event_field].value_counts().to_dict()
    return [int(tally.get(step, 0)) for step in funnel_steps]


# Order the event rows by the grouping key (in place).
df.sort_values(GROUP_KEY, inplace=True)

# Run funnelize once per GROUP_KEY value; each group yields one list holding
# the number of its events that match each funnel step.
events_by_group = df.groupby(GROUP_KEY)
funnel_cts = events_by_group.apply(funnelize, FUNNEL_STEPS)

# Expand those lists into a table: one row per group, one column per step.
funnel = pd.DataFrame(
    funnel_cts.tolist(),
    index=funnel_cts.index,
    columns=FUNNEL_STEPS,
)

funnel.head()

Unnamed: 0_level_0,home_page,search_page,payment_page,payment_confirmation_page
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
17,1,1,0,0
28,1,0,0,0
37,1,1,0,0
38,1,1,1,0
55,1,0,0,0


In [46]:
# Aggregate funnel summary: total event count for each step across all groups.
step_totals = funnel[FUNNEL_STEPS].sum()
print(step_totals)

home_page                    90400
search_page                  45200
payment_page                  6030
payment_confirmation_page      452
dtype: int64


In [47]:
# Deduplicated funnel summary: every non-zero count becomes a 1 so that each
# group contributes at most once per step, then the indicators are totalled.
funnel_norm = funnel.mask(funnel != 0, 1)
funnel_analysis = funnel_norm[FUNNEL_STEPS].sum()
print(funnel_analysis)

home_page                    90400
search_page                  45200
payment_page                  6030
payment_confirmation_page      452
dtype: int64


In [48]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
from __future__ import division

# Fill colours, indexed by section position.
colors = [
    'rgb(32,155,160)',
    'rgb(253,93,124)',
    'rgb(28,119,139)',
    'rgb(182,231,235)',
]

# Section labels and sizes come straight from the funnel summary Series.
phases, values = list(funnel_analysis.index), list(funnel_analysis)

In [49]:
# Number of funnel sections to draw.
n_phase = len(phases)

# Fixed drawing geometry.
plot_width = 400  # width given to the largest section
section_h = 100   # height of one section
section_d = 10    # vertical gap between consecutive sections

# Scale factor that maps the largest value onto plot_width.
unit_width = plot_width / max(values)

# Width of every section, truncated to an integer.
phase_w = []
for value in values:
    phase_w.append(int(value * unit_width))

In [50]:
# Total drawing height: every section plus the gaps between them.
height = section_h * n_phase + section_d * (n_phase-1)

shapes = []   # one path shape per funnel section
label_y = []  # vertical centre of each section, used to place its labels

for i in range(n_phase):
    # The top edge uses this section's width; the bottom edge uses the next
    # section's width, except for the last section, which keeps its own width.
    next_i = i if i == n_phase - 1 else i + 1
    points = [phase_w[i]/2, height, phase_w[next_i]/2, height - section_h]

    # Closed outline, mirrored around x = 0.
    path = 'M {0} {1} L {2} {3} L -{2} {3} L -{0} {1} Z'.format(*points)

    section_color = colors[i]
    shapes.append({
        'type': 'path',
        'path': path,
        'fillcolor': section_color,
        'line': {
            'width': 1,
            'color': section_color
        }
    })

    # Y-axis location for this section's details (phase name and value)
    label_y.append(height - (section_h / 2))

    # Move down to the top of the next section.
    height = height - (section_h + section_d)

In [51]:
def _text_column(x_position, texts):
    """Build a text-only Scatter trace placing ``texts`` at ``x_position``, one per section."""
    return go.Scatter(
        x=[x_position]*n_phase,
        y=label_y,
        mode='text',
        text=texts,
        textfont=dict(color='rgb(200,200,200)', size=15)
    )


# Phase names are drawn at x = -350, phase values at x = 350.
label_trace = _text_column(-350, phases)
value_trace = _text_column(350, values)

In [52]:
# Shared settings: one background colour for paper and plot area, and axes
# without tick labels or zero line.
background = 'rgba(44,58,71,1)'
hidden_axis = dict(showticklabels=False, zeroline=False)

layout = go.Layout(
    title='Funnel Chart',
    shapes=shapes,
    height=560,
    width=800,
    showlegend=True,
    paper_bgcolor=background,
    plot_bgcolor=background,
    xaxis=dict(hidden_axis),
    yaxis=dict(hidden_axis)
)

# The two text traces carry the labels; the funnel itself comes from the shapes.
data = [label_trace, value_trace]
fig = go.Figure(data=data, layout=layout)
iplot(fig)