In [2]:
import pandas as pd

In [3]:
from bokeh.io import show
from bokeh.sampledata.les_mis import data

In [5]:
# Load the matrix file holding one KL coefficient for every pair of genres.
df = pd.read_csv(
    "kl_div_no_idle.csv",
    header='infer',  # pandas default: column names are taken from the first row
)

In [6]:
# The first CSV column has no header (pandas labels it 'Unnamed: 0'); call it 'Genre'.
df = df.rename({'Unnamed: 0': 'Genre'}, axis='columns')

In [7]:
# Reshape the genre-by-genre matrix into long format:
# one row per (row genre, column genre) pair with its coefficient.
df = (
    df.set_index('Genre')
      .stack()
      .reset_index()
)

In [8]:
df.head()

Unnamed: 0,Genre,level_1,0
0,Action,Action,0.0
1,Action,Action sports,1.149823
2,Action,Adults only,0.781951
3,Action,Adventure,0.127276
4,Action,Agriculture,1.621293


In [9]:
df.describe()

Unnamed: 0,0
count,20736.0
mean,inf
std,
min,0.0
25%,0.7680417
50%,2.447244
75%,inf
max,inf


In [10]:
# Replace the auto-generated long-format labels ('Genre', 'level_1', 0) with meaningful names.
df.columns = [
    'SourceID',       # genre that labelled the matrix row
    'DestinationID',  # genre that labelled the matrix column
    'Coeff',          # KL coefficient for that pair
]

In [11]:
df.columns

Index(['SourceID', 'DestinationID', 'Coeff'], dtype='object')

In [12]:
df.head()

Unnamed: 0,SourceID,DestinationID,Coeff
0,Action,Action,0.0
1,Action,Action sports,1.149823
2,Action,Adults only,0.781951
3,Action,Adventure,0.127276
4,Action,Agriculture,1.621293


In [13]:
# Drop the diagonal: pairs where a genre is compared with itself.
df = df.loc[df['SourceID'].ne(df['DestinationID'])]

In [14]:
df.describe()

Unnamed: 0,Coeff
count,20592.0
mean,inf
std,
min,0.02132919
25%,0.7860181
50%,2.486397
75%,inf
max,inf


In [505]:
# Genres to include in the similarity analysis (listed once, used for both endpoints).
selected_genres = [
    "Horror", "Action", "Adventure", "Children", "Anthology",
    "Biography", "Comedy", "Arts/crafts", "Computers", "Adults only",
    "Animals", "Music", "News", "Sitcom", "Shopping",
    "Skateboarding", "Animated", "Drama", "Education", "Sports event",
]

# Keep only the pairs whose source AND destination are both selected.
# The explicit .copy() makes df_sim an independent frame, so the column
# assignments performed on it later modify df_sim itself rather than a
# slice of df (and do not raise SettingWithCopyWarning).
df_sim = df[
    df['SourceID'].isin(selected_genres)
    & df['DestinationID'].isin(selected_genres)
].copy()

In [506]:
# Split the selected pairs by whether their coefficient is infinite.
# The mask compares against the float infinity directly: isin(['inf']) tests a
# float column against the *string* 'inf', which only matches when pandas
# happens to coerce the string to a number.
is_inf = df_sim['Coeff'] == float('inf')
df_inf = df_sim[is_inf]
# Complement of the mask; equivalent to the former concat + drop_duplicates(keep=False)
# anti-join as long as every (SourceID, DestinationID) row is unique.
df_without_inf = df_sim[~is_inf]


In [507]:
df_without_inf.describe()

Unnamed: 0,Coeff
count,243.0
mean,0.739991
std,0.87053
min,0.029862
25%,0.142291
50%,0.445591
75%,0.879155
max,3.683492


In [508]:
# Selected pairs between two different genres, smallest coefficient first.
df_sim.loc[df_sim['SourceID'].ne(df_sim['DestinationID'])].sort_values('Coeff')

Unnamed: 0,SourceID,DestinationID,Coeff
16792,Shopping,Music,0.029862
71,Action,Horror,0.030952
10224,Horror,Action,0.031033
12788,Music,Shopping,0.032328
3198,Biography,Comedy,0.035499
4342,Comedy,Biography,0.038360
3239,Biography,Horror,0.038992
10246,Horror,Biography,0.042110
3215,Biography,Drama,0.052502
6790,Drama,Biography,0.054781


In [509]:
# Order rows by source genre, then by increasing coefficient within each source.
df_sim = df_sim.sort_values(['SourceID', 'Coeff'])

In [510]:
df_sim

Unnamed: 0,SourceID,DestinationID,Coeff
71,Action,Horror,0.030952
22,Action,Biography,0.058676
47,Action,Drama,0.069230
30,Action,Comedy,0.074089
3,Action,Adventure,0.127276
117,Action,Sitcom,0.140832
127,Action,Sports event,0.151585
88,Action,Music,0.210409
116,Action,Shopping,0.242864
5,Action,Animals,0.314255


In [511]:
df_sim.describe()

Unnamed: 0,Coeff
count,342.0
mean,inf
std,
min,0.029862
25%,0.191329
50%,0.782311
75%,inf
max,inf


In [512]:
from numpy import inf

# Replace infinite coefficients with the finite stand-in value 10 so the column
# can be used in arithmetic afterwards.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df_sim['Coeff']: an in-place call on that
# intermediate Series is not guaranteed to write through to df_sim (it does not
# under pandas Copy-on-Write, and recent pandas versions warn about it).
df_sim['Coeff'] = df_sim['Coeff'].replace(inf, 10)

In [513]:
# Sanity check: list any rows whose coefficient is still infinite (expected: none).
# Compare against the float infinity; isin(['inf']) would test against the string
# 'inf', which can return an empty result even when infinities are present.
df_sim[df_sim['Coeff'] == float('inf')]

Unnamed: 0,SourceID,DestinationID,Coeff


In [514]:
import numpy as np

# Overwrite Coeff with its reciprocal, so a small coefficient becomes a large value.
# NOTE: this rewrites the column in place, so running the cell a second time
# inverts the values back.
df_sim['Coeff'] = df_sim['Coeff'].rdiv(1)

# Alternative rescalings, currently disabled:
#df_sim['Coeff']= ((10.1 - df_sim['Coeff'])*10)
#df_sim['Coeff']= np.log(df_sim['Coeff']*100)/np.log(df_sim['Coeff'].max()*100)

In [532]:
# Rows whose source genre is 'Action', largest Coeff first (at most 20 rows).
action_rows = df_sim['SourceID'].eq('Action')
df_sim.loc[action_rows].sort_values('Coeff', ascending=False).head(20)

Unnamed: 0,SourceID,DestinationID,Coeff,source,target
71,Action,Horror,32.308192,0,12
22,Action,Biography,17.042621,0,7
47,Action,Drama,14.444709,0,11
30,Action,Comedy,13.497199,0,9
3,Action,Adventure,7.856962,0,2
117,Action,Sitcom,7.100679,0,16
127,Action,Sports event,6.596965,0,18
88,Action,Music,4.752647,0,13
116,Action,Shopping,4.117537,0,15
5,Action,Animals,3.182125,0,3


In [516]:
#building chord using holoviews and bokeh libraries
import holoviews as hv
import bokeh as bk
from holoviews import opts, dim

hv.extension('bokeh')

In [517]:
df_sim.columns

Index(['SourceID', 'DestinationID', 'Coeff'], dtype='object')

In [518]:
# Integer ids for the edge endpoints.
# Both columns are encoded against ONE shared, alphabetically sorted genre list.
# Encoding each column with its own astype('category') gives the same genre
# different codes in 'source' and 'target' whenever the two columns do not
# contain exactly the same set of genres.
genre_order = sorted(set(df_sim['SourceID']) | set(df_sim['DestinationID']))
df_sim['source'] = pd.Categorical(df_sim['SourceID'], categories=genre_order).codes
df_sim['target'] = pd.Categorical(df_sim['DestinationID'], categories=genre_order).codes

In [519]:
# Node table: one row per genre; the row position (default RangeIndex) is the node id.
# Labels are sorted explicitly so that position i is the i-th genre alphabetically,
# independent of the current row order of df_sim, and genres that occur only as a
# destination are included as well.
# NOTE(review): the ids line up with the integer 'source'/'target' codes only if
# those codes follow the same alphabetical order over the same genre set - confirm.
node_labels = sorted(set(df_sim['SourceID']) | set(df_sim['DestinationID']))
nodes_df = pd.DataFrame(node_labels, columns=['SourceID'])

In [520]:
# Wrap the node table in a holoviews Dataset keyed on 'index'; the preview below
# shows the resulting 'index' column next to the genre label.
nodes_sim = hv.Dataset(nodes_df, kdims='index')
nodes_sim.data.head()

Unnamed: 0,index,SourceID
0,0,Action
1,1,Adults only
2,2,Adventure
3,3,Animals
4,4,Animated


In [521]:
# Every distinct Coeff value in ascending order (despite the name, not only the
# smallest ones).
smallest_coeff = sorted(df_sim['Coeff'].dropna().unique())

In [523]:
# Build the chord diagram from the edge table (source id, target id, Coeff)
# and the node Dataset.
edges_sim = df_sim[['source', 'target', 'Coeff']]
chord = hv.Chord((edges_sim, nodes_sim))
# Optional filter, currently disabled:
#   .select(Coeff = smallest_coeff,selection_mode='nodes')

In [524]:
# Styling: colour edges by their Coeff value and nodes by their id (both mapped
# as strings onto the Category20 palette), label nodes with the genre name, and
# render on a 1000 x 1000 canvas.
chord.opts(
    opts.Chord(
        cmap='Category20',
        edge_color=dim('Coeff').str(),
        node_color=dim('index').str(),
        labels='SourceID',
        width=1000,
        height=1000,
    )
)