In [1]:
"""
Author(s): Steven van den Broek, Yuqin Cui
Created: 2019-05-05
Edited: 2019-05-05
"""

In [58]:
import numpy as np
import pandas as pd
import panel as pn
import param
import hvplot.pandas
from colorcet import palette

In [40]:
fname = 'GephiMatrix_author_similarity.csv'
f = open(fname, 'r')

# Get author names
line1 = f.readline()
names = line1[1:].split(';');

seen = {}
dupes = []

for index, name in enumerate(names):
    if name not in seen:
        seen[name] = 1
    else:
        if seen[name] == 1:
            dupes.append((index, name))
        seen[name] += 1

# add 1, 2 etc after the name
for pair in dupes:
    index = pair[0]
    name = pair[1]
    for i in range(seen[name]):
        names[index] = name + str((i+1))
        #print(names[index])

# Read csv
df = pd.read_csv(f, names=names, sep=';')

# Fix it
df = df.reset_index(level=1)
names.append("delete")
names = [name.replace('_', ' ') for name in names]
df.columns = names
del df["delete"]
df.set_index([df.columns], inplace=True)

# Get names again for later use
names = df.columns.tolist()

# Get 150*150 sub matrix since otherwise the plot is very slow..
df = df.head(150)[names[0:150]]
names = df.columns.tolist()
df.head()

Unnamed: 0,Jim Thomas,Eleftherios Koutsofios,Lawrence A. Rowe,Jonathan I. Helfman,Mary Brewster,Min Chen,Christopher G. Healey,Mac Holden,M. Gray,Jason Stewart,...,Daniel Keim,Kenneth Utting,Ted Selker,Gabor T. Herman,Laxmi Parida,Jeannine Pinto,Jonathan Furner,Clayton Lewis,Francis P. Boscoe,Ian M. Soboroff
Jim Thomas,1.0,0.097384,0.12563,0.109691,0.275405,0.195006,0.196239,0.317986,0.407599,0.0,...,0.098332,0.311889,0.179382,0.235521,0.185396,0.13484,0.147984,0.279826,0.262875,0.058026
Eleftherios Koutsofios,0.097384,1.0,0.223607,0.14462,0.281718,0.235008,0.38318,0.365148,0.132508,0.0,...,0.115145,0.172133,0.201843,0.426963,0.172848,0.0,0.400819,0.434813,0.225279,0.258199
Lawrence A. Rowe,0.12563,0.223607,1.0,0.242536,0.094491,0.181902,0.123579,0.255155,0.027778,0.0,...,0.08092,0.072169,0.246183,0.292925,0.052705,0.0,0.128037,0.220971,0.087186,0.072169
Jonathan I. Helfman,0.109691,0.14462,0.242536,1.0,0.229175,0.294118,0.17384,0.049507,0.053897,0.0,...,0.06423,0.525105,0.417957,0.284179,0.076696,0.0,0.341589,0.064312,0.225554,0.35007
Mary Brewster,0.275405,0.281718,0.094491,0.229175,1.0,0.297927,0.336302,0.115728,0.356966,0.0,...,0.175167,0.245495,0.255883,0.319844,0.099602,0.253546,0.338754,0.150334,0.483312,0.163663


In [41]:
solid = df.reset_index()
liquid = solid.melt(id_vars='index', value_vars=list(df.columns[1:]), value_name="Similarity", var_name="name2")
liquid.columns = ['name1', 'name2', 'similarity']
liquid.head()

Unnamed: 0,name1,name2,similarity
0,Jim Thomas,Eleftherios Koutsofios,0.097384
1,Eleftherios Koutsofios,Eleftherios Koutsofios,1.0
2,Lawrence A. Rowe,Eleftherios Koutsofios,0.223607
3,Jonathan I. Helfman,Eleftherios Koutsofios,0.14462
4,Mary Brewster,Eleftherios Koutsofios,0.281718


In [42]:
liquid.hvplot.heatmap('name1', 'name2', 'similarity',
                      height=500, width=600, flip_yaxis=True, xaxis=None, yaxis=None)

In [43]:
#convert similarity into unsimilarity (1.0 - similarity)
for name in names:
    df[name] = 1 - df[name]
df.head()

Unnamed: 0,Jim Thomas,Eleftherios Koutsofios,Lawrence A. Rowe,Jonathan I. Helfman,Mary Brewster,Min Chen,Christopher G. Healey,Mac Holden,M. Gray,Jason Stewart,...,Daniel Keim,Kenneth Utting,Ted Selker,Gabor T. Herman,Laxmi Parida,Jeannine Pinto,Jonathan Furner,Clayton Lewis,Francis P. Boscoe,Ian M. Soboroff
Jim Thomas,0.0,0.902616,0.87437,0.890309,0.724595,0.804994,0.803761,0.682014,0.592401,1.0,...,0.901668,0.688111,0.820618,0.764479,0.814604,0.86516,0.852016,0.720174,0.737125,0.941974
Eleftherios Koutsofios,0.902616,0.0,0.776393,0.85538,0.718282,0.764992,0.61682,0.634852,0.867492,1.0,...,0.884855,0.827867,0.798157,0.573037,0.827152,1.0,0.599181,0.565187,0.774721,0.741801
Lawrence A. Rowe,0.87437,0.776393,0.0,0.757464,0.905509,0.818098,0.876421,0.744845,0.972222,1.0,...,0.91908,0.927831,0.753817,0.707075,0.947295,1.0,0.871963,0.779029,0.912814,0.927831
Jonathan I. Helfman,0.890309,0.85538,0.757464,0.0,0.770825,0.705882,0.82616,0.950493,0.946103,1.0,...,0.93577,0.474895,0.582043,0.715821,0.923304,1.0,0.658411,0.935688,0.774446,0.64993
Mary Brewster,0.724595,0.718282,0.905509,0.770825,0.0,0.702073,0.663698,0.884272,0.643034,1.0,...,0.824833,0.754505,0.744117,0.680156,0.900398,0.746454,0.661246,0.849666,0.516688,0.836337


In [44]:
#This is just the method online: https://gmarti.gitlab.io/ml/2017/09/07/how-to-sort-distance-matrix.html
#We have to clean data and modified the method

import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn import datasets
from fastcluster import linkage
dist_mat = squareform(pdist(df))

In [45]:
def seriation(Z,N,cur_index):
    '''
        input:
            - Z is a hierarchical tree (dendrogram)
            - N is the number of points given to the clustering process
            - cur_index is the position in the tree for the recursive traversal
        output:
            - order implied by the hierarchical tree Z
            
        seriation computes the order implied by a hierarchical tree (dendrogram)
    '''
    if cur_index < N:
        return [cur_index]
    else:
        left = int(Z[cur_index-N,0])
        right = int(Z[cur_index-N,1])
        return (seriation(Z,N,left) + seriation(Z,N,right))
    
def compute_serial_matrix(dist_mat,method="ward"):
    '''
        input:
            - dist_mat is a distance matrix
            - method = ["ward","single","average","complete"]
        output:
            - seriated_dist is the input dist_mat,
              but with re-ordered rows and columns
              according to the seriation, i.e. the
              order implied by the hierarchical tree
            - res_order is the order implied by
              the hierarhical tree
            - res_linkage is the hierarhical tree (dendrogram)
        
        compute_serial_matrix transforms a distance matrix into 
        a sorted distance matrix according to the order implied 
        by the hierarchical tree (dendrogram)
    '''
    N = len(dist_mat)
    flat_dist_mat = squareform(dist_mat)
    res_linkage = linkage(flat_dist_mat, method=method,preserve_input=True)
    res_order = seriation(res_linkage, N, N + N-2)
    seriated_dist = np.zeros((N,N))
    a,b = np.triu_indices(N,k=1)
    seriated_dist[a,b] = dist_mat[ [res_order[i] for i in a], [res_order[j] for j in b]]
    seriated_dist[b,a] = seriated_dist[a,b]
    
    return seriated_dist, res_order, res_linkage

In [52]:
ordered_dist_mat = {}
ordered_dist_mat['ward'], res_order, res_linkage = compute_serial_matrix(dist_mat,"ward")
ordered_dist_mat['single'], res_order, res_linkage = compute_serial_matrix(dist_mat,"single")
ordered_dist_mat['average'], res_order, res_linkage = compute_serial_matrix(dist_mat,"average")
ordered_dist_mat['complete'], res_order, res_linkage = compute_serial_matrix(dist_mat,"complete")

solid = pd.DataFrame(ordered_dist_mat['ward'])
solid.index = names
solid.columns = names
solid

Unnamed: 0,Jim Thomas,Eleftherios Koutsofios,Lawrence A. Rowe,Jonathan I. Helfman,Mary Brewster,Min Chen,Christopher G. Healey,Mac Holden,M. Gray,Jason Stewart,...,Daniel Keim,Kenneth Utting,Ted Selker,Gabor T. Herman,Laxmi Parida,Jeannine Pinto,Jonathan Furner,Clayton Lewis,Francis P. Boscoe,Ian M. Soboroff
Jim Thomas,0.000000,1.243522,1.555156,1.295865,1.851939,2.124743,2.512458,2.218384,1.926663,2.371333,...,2.296816,2.457861,2.395389,2.546512,2.453286,2.233329,2.244022,2.329723,2.364660,2.438057
Eleftherios Koutsofios,1.243522,0.000000,2.040694,2.037526,2.061392,2.378482,2.826163,2.551903,1.998967,2.665069,...,2.404376,2.616371,2.559524,2.624666,2.617896,2.497058,2.489411,2.574095,2.796494,2.832488
Lawrence A. Rowe,1.555156,2.040694,0.000000,1.766817,2.327296,2.129100,2.773459,2.395397,2.584579,2.276586,...,2.125880,2.242053,1.968006,1.806564,2.346016,2.515937,1.871250,2.187927,2.427487,2.029075
Jonathan I. Helfman,1.295865,2.037526,1.766817,0.000000,2.057465,2.407062,2.361378,2.373719,2.297918,2.338859,...,2.406117,2.675967,2.515774,2.699179,2.544512,2.385047,2.366037,2.221385,2.420321,2.432876
Mary Brewster,1.851939,2.061392,2.327296,2.057465,0.000000,1.382579,1.908125,1.920476,1.609275,2.000509,...,2.372942,2.679563,2.572671,2.755059,2.518690,2.410354,2.300479,2.422663,2.238732,2.288706
Min Chen,2.124743,2.378482,2.129100,2.407062,1.382579,0.000000,2.122823,1.922009,1.915280,1.996209,...,2.158856,2.377998,2.228691,2.408918,2.331612,2.250613,1.877511,2.171857,2.203221,1.938375
Christopher G. Healey,2.512458,2.826163,2.773459,2.361378,1.908125,2.122823,0.000000,1.700806,1.855404,1.725084,...,2.730340,2.778787,2.409245,2.986631,2.602119,2.274354,2.390703,2.414110,2.213465,2.399321
Mac Holden,2.218384,2.551903,2.395397,2.373719,1.920476,1.922009,1.700806,0.000000,2.012223,1.821983,...,2.473697,2.398618,2.149382,2.925052,2.587635,2.382199,2.073748,2.370305,2.118189,2.037097
M. Gray,1.926663,1.998967,2.584579,2.297918,1.609275,1.915280,1.855404,2.012223,0.000000,1.731145,...,2.531880,2.760393,2.540697,2.874982,2.432645,1.983777,2.378341,2.529616,2.130931,2.473760
Jason Stewart,2.371333,2.665069,2.276586,2.338859,2.000509,1.996209,1.725084,1.821983,1.731145,0.000000,...,2.488829,2.661936,2.159919,2.586660,2.503113,2.294953,2.263934,2.425331,1.933243,1.981392


In [65]:
pn.extension()

class Matrix_dropdown(param.Parameterized):
    reordering = param.ObjectSelector(default="ward",objects=["ward","single","average","complete"])
    
    def view(self):
        solid = pd.DataFrame(ordered_dist_mat[self.reordering])
        solid.index = names
        solid.columns = names
        solid.reset_index(inplace=True)
        liquid = solid.melt(id_vars='index', value_vars=list(df.columns[1:]), var_name="name2")
        liquid.columns = ['name1', 'name2', 'distance']
        return liquid.hvplot.heatmap('name1', 'name2', 'distance',
                      height=500, width=600, flip_yaxis=True, xaxis=None, yaxis=None, cmap=palette['kbc'])

matrix = Matrix_dropdown(name='Adjacency Matrix')
from bokeh.embed import components
pn.Column(matrix.param, matrix.view)