In [1]:
import pandas as pd 
import numpy as np

# Initialise the sample data: one list per column, one entry per row.
# 'N/A' in lvl4 marks rows that have no fourth level.
data = {'lvl1':['A','A','A','A','A','A','A','A','B','B','B'], 
        'lvl2':['AP','AP','AP','AC','AC','AC','AB','AB','BE','BR','BA'], 
        'lvl3':['APP','APE','APA','ACT','ACC','ACE','ABL','ABO','BET','BRE','BAK'], 
        'lvl4':['APPL','APEX','APAR','ACTO','ACCE','ACER','ABLE','ABOU','N/A','N/A','N/A'], 
        'count':[5, 2, 3, 8, 2, 10, 1, 3, 4, 6, 3]} 

# Create the DataFrame and turn the 'N/A' placeholders into real missing values.
# np.nan is used instead of np.NaN because the capitalised alias was removed in NumPy 2.0.
df = pd.DataFrame(data).replace('N/A', np.nan)

# Display the result.
df

Unnamed: 0,lvl1,lvl2,lvl3,lvl4,count
0,A,AP,APP,APPL,5
1,A,AP,APE,APEX,2
2,A,AP,APA,APAR,3
3,A,AC,ACT,ACTO,8
4,A,AC,ACC,ACCE,2
5,A,AC,ACE,ACER,10
6,A,AB,ABL,ABLE,1
7,A,AB,ABO,ABOU,3
8,B,BE,BET,,4
9,B,BR,BRE,,6
10,B,BA,BAK,,3


In [5]:
# Distinct level-2 labels; a set is used for de-duplication, so the order is arbitrary.
[*set(df['lvl2'].values)]

['AB', 'AP', 'BA', 'BE', 'AC', 'BR']

In [2]:
def genSankey(df,cat_cols=None,value_cols='',title='Sankey Diagram'):
    """Build a plotly Sankey figure dictionary from a hierarchical DataFrame.

    Each column in ``cat_cols`` is one level of the diagram. Every pair of
    adjacent levels contributes links from the left column's label to the
    right column's label, weighted by the summed ``value_cols`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing the columns named in ``cat_cols`` and
        ``value_cols``.
    cat_cols : list of str
        Column names ordered from the left-most level to the right-most one.
        At least two are required.
    value_cols : str
        Name of the single column holding the link weights.
    title : str
        Title placed in the figure layout.

    Returns
    -------
    dict
        ``{'data': [sankey_trace], 'layout': layout}`` where the trace is a
        plain dict with ``type='sankey'``. The link ``source``, ``target`` and
        ``value`` entries are pandas Series.

    Raises
    ------
    ValueError
        If fewer than two columns are given in ``cat_cols``; a link needs a
        source level and a target level.

    Notes
    -----
    * Nodes are listed in order of first appearance (level by level, then row
      order), so the output is reproducible between runs.
    * Missing values never become nodes, and rows whose source or target is
      missing contribute no link (``groupby`` drops missing keys by default).
    * A label that occurs in several levels is a single node coloured by the
      first level it appears in.
    """
    cat_cols = [] if cat_cols is None else list(cat_cols)
    if len(cat_cols) < 2:
        raise ValueError('genSankey needs at least two columns in cat_cols to build links')

    # one colour per level; the palette is cycled when there are more levels than colours
    colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']

    # collect each distinct, non-missing label once, together with the colour of its level;
    # dicts keep insertion order, which gives the node order
    nodeColors = {}
    for level, catCol in enumerate(cat_cols):
        levelColor = colorPalette[level % len(colorPalette)]
        for label in df[catCol].dropna().unique():
            nodeColors.setdefault(label, levelColor)
    labelList = list(nodeColors)
    colorList = list(nodeColors.values())
    labelIndex = {label: position for position, label in enumerate(labelList)}

    # transform df into source-target pairs, one frame per pair of adjacent levels
    levelPairs = []
    for sourceCol, targetCol in zip(cat_cols[:-1], cat_cols[1:]):
        pairDf = df[[sourceCol, targetCol, value_cols]].copy()
        pairDf.columns = ['source','target','count']
        levelPairs.append(pairDf)

    # sum the weights of identical links in a single pass
    sourceTargetDf = (
        pd.concat(levelPairs, ignore_index=True)
        .groupby(['source','target'])['count']
        .sum()
        .reset_index()
    )

    # add node indices for each source-target pair (dict lookup instead of list.index)
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].map(labelIndex)
    sourceTargetDf['targetID'] = sourceTargetDf['target'].map(labelIndex)

    # creating the sankey diagram
    data = dict(
        type='sankey',
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(
            color = "black",
            width = 0.5
          ),
          label = labelList,
          color = colorList
        ),
        link = dict(
          source = sourceTargetDf['sourceID'],
          target = sourceTargetDf['targetID'],
          value = sourceTargetDf['count']
        )
      )

    layout =  dict(
        title = title,
        font = dict(
          size = 10
        )
    )

    fig = dict(data=[data], layout=layout)
    return fig

In [3]:
import pandas as pd
import plotly
import plotly.plotly as py

# Build the four-level Sankey figure, with links weighted by the 'count' column.
fig = genSankey(
    df,
    cat_cols=['lvl1', 'lvl2', 'lvl3', 'lvl4'],
    value_cols='count',
    title='Word Etymology',
)
# fig is a plain dict, so plotly's figure validation is switched off when rendering.
plotly.offline.plot(fig, validate=False)

'temp-plot.html'