# Sankey diagrams for customer flow

In [2]:
import pandas as pd
import plotly.graph_objects as go

### Import transition probabilities and transform data

In [3]:
# We use the probabilities that have not been normalized
df = pd.read_csv("data/transition_probabilities_count.csv")
df.columns = ['source', 'checkout', 'dairy', 'drinks', 'fruit', 'spices']
df

Unnamed: 0,source,checkout,dairy,drinks,fruit,spices
0,dairy,1839,13095,1041,886,913
1,drinks,2098,106,5821,855,846
2,entrance,0,2141,1143,2810,1351
3,fruit,2562,1219,697,7586,644
4,spices,946,1213,1024,571,2524


In [4]:
# Convert to long format
df1 = df.melt(id_vars=["source"])
df1.columns = ['source', 'target', 'value']
df1.head()

Unnamed: 0,source,target,value
0,dairy,checkout,1839
1,drinks,checkout,2098
2,entrance,checkout,0
3,fruit,checkout,2562
4,spices,checkout,946


In [5]:
# Mapping for the labels
mapping = {
    "entrance": 0,
    "dairy": 1,
    "drinks": 2,
    "fruit": 3,
    "spices": 4,
    "checkout": 5,
}

In [6]:
# Use mapping to change strings to numbers
df1["source"] = df1["source"].map(mapping)
df1["target"] = df1["target"].map(mapping)

In [7]:
df1.head()

Unnamed: 0,source,target,value
0,1,5,1839
1,2,5,2098
2,0,5,0
3,3,5,2562
4,4,5,946


### Plot sankey diagram

In [8]:
fig = go.Figure(
    go.Sankey(
        node=dict(
            label=list(mapping.keys()),
        ),
        link=dict(
            source=df1["source"],
            target=df1["target"],
            value=df1["value"],
        ),
    )
)

fig.update_layout(title_text="Customer flow", width=1200, height=800)
fig.show()

### Exclude self-loops

In [9]:
# Remove rows where source and target are the same
df2 = df1[df1["source"] != df1["target"]].copy()
df2.head()

Unnamed: 0,source,target,value
0,1,5,1839
1,2,5,2098
2,0,5,0
3,3,5,2562
4,4,5,946


### Plot again

In [10]:
fig = go.Figure(
    go.Sankey(
        node=dict(
            label=list(mapping.keys()),
        ),
        link=dict(
            source=df2["source"],
            target=df2["target"],
            value=df2["value"],
        ),
    )
)

fig.update_layout(title_text="Customer flow without self-loops", width=1200, height=800)
fig.show()

## Create sankey diagram for steps

In [11]:
df = pd.read_csv("data/data_clean.csv", index_col=[0])
df

Unnamed: 0,customer_no,timestamp,location
0,10001,2019-09-02 07:02:00,entrance
1,10001,2019-09-02 07:03:00,dairy
2,10001,2019-09-02 07:04:00,dairy
3,10001,2019-09-02 07:05:00,checkout
4,10002,2019-09-02 07:02:00,entrance
...,...,...,...
61371,51509,2019-09-06 21:50:00,drinks
61372,51509,2019-09-06 21:51:00,checkout
61373,51510,2019-09-06 21:49:00,entrance
61374,51510,2019-09-06 21:50:00,spices


In [12]:
# Change the shape of the dataframe to get one column for each step
# https://stackoverflow.com/a/35024259
df_steps = df.groupby("customer_no")["location"].apply(lambda x: pd.Series(x.values)).unstack().copy()

In [13]:
df_steps.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
customer_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001,entrance,dairy,dairy,checkout,,,,,,,...,,,,,,,,,,
10002,entrance,dairy,dairy,dairy,checkout,,,,,,...,,,,,,,,,,
10003,entrance,dairy,dairy,checkout,,,,,,,...,,,,,,,,,,
10004,entrance,dairy,dairy,dairy,dairy,checkout,,,,,...,,,,,,,,,,
10005,entrance,spices,checkout,,,,,,,,...,,,,,,,,,,


In [14]:
def transform_df(df, n):

    df_ = df.copy()

    # Add missing columns
    cols = ["checkout", "dairy", "drinks", "fruit", "spices"]
    for c in cols:
        if c not in df_.columns:
            df_[c] = 0

    # Rename first column
    df_.reset_index(inplace=True)
    df_.columns = ["source", f"dairy_{n+1}", f"drinks_{n+1}", f"fruit_{n+1}", f"spices_{n+1}", f"checkout_{n+1}"]

    df_["source"] = df_["source"].apply(lambda x: f"{x}_{n}")

    # Convert to long format
    df_ = df_.melt(id_vars=["source"])
    df_.columns = ['source', 'target', 'value']
    
    return df_

In [15]:
P1 = pd.crosstab(
    index=df_steps[0],
    columns=df_steps[1],
)

P1 = transform_df(P1, 0)
P1.head()

Unnamed: 0,source,target,value
0,entrance_0,dairy_1,2141
1,entrance_0,drinks_1,1143
2,entrance_0,fruit_1,2810
3,entrance_0,spices_1,1351
4,entrance_0,checkout_1,0


In [16]:
P2 = pd.crosstab(
    index=df_steps[1],
    columns=df_steps[2],
)

P2 = transform_df(P2, 1)
P2.head()

Unnamed: 0,source,target,value
0,dairy_1,dairy_2,250
1,drinks_1,dairy_2,245
2,fruit_1,dairy_2,597
3,spices_1,dairy_2,207
4,dairy_1,drinks_2,1489


In [17]:
P3 = pd.crosstab(
    index=df_steps[2],
    columns=df_steps[3],
)

P3 = transform_df(P3, 2)
P3.head()

Unnamed: 0,source,target,value
0,dairy_2,dairy_3,215
1,drinks_2,dairy_3,278
2,fruit_2,dairy_3,367
3,spices_2,dairy_3,145
4,dairy_2,drinks_3,1523


In [18]:
P4 = pd.crosstab(
    index=df_steps[3],
    columns=df_steps[4],
)

P4 = transform_df(P4, 3)
P4.head()

Unnamed: 0,source,target,value
0,dairy_3,dairy_4,192
1,drinks_3,dairy_4,241
2,fruit_3,dairy_4,295
3,spices_3,dairy_4,92
4,dairy_3,drinks_4,1381


In [19]:
P_final = pd.concat([P1, P2, P3, P4])
P_final.head()

Unnamed: 0,source,target,value
0,entrance_0,dairy_1,2141
1,entrance_0,drinks_1,1143
2,entrance_0,fruit_1,2810
3,entrance_0,spices_1,1351
4,entrance_0,checkout_1,0


In [20]:
# Mapping for the labels
mapping = {
    "entrance_0": 0,
    "dairy_0": 1,
    "drinks_0": 2,
    "fruit_0": 3,
    "spices_0": 4,
    "checkout_0": 5,
    "entrance_1": 6,
    "dairy_1": 7,
    "drinks_1": 8,
    "fruit_1": 9,
    "spices_1": 10,
    "checkout_1": 11,
    "entrance_2": 12,
    "dairy_2": 13,
    "drinks_2": 14,
    "fruit_2": 15,
    "spices_2": 16,
    "checkout_2": 17,
    "entrance_3": 18,
    "dairy_3": 19,
    "drinks_3": 20,
    "fruit_3": 21,
    "spices_3": 22,
    "checkout_3": 23,
    "entrance_4": 24,
    "dairy_4": 25,
    "drinks_4": 26,
    "fruit_4": 27,
    "spices_4": 28,
    "checkout_4": 29,
}

In [21]:
# Use mapping to change strings to numbers
P_final["source"] = P_final["source"].map(mapping)
P_final["target"] = P_final["target"].map(mapping)
P_final.head()

Unnamed: 0,source,target,value
0,0,7,2141
1,0,8,1143
2,0,9,2810
3,0,10,1351
4,0,11,0


In [22]:
# Create sankey diagram
fig = go.Figure(
    go.Sankey(
        node=dict(
            label=list(mapping.keys()),
        ),
        link=dict(
            source=P_final["source"],
            target=P_final["target"],
            value=P_final["value"],
        ),
    )
)

fig.update_layout(title_text="Customer flow", width=1200, height=800)
fig.show()