In [71]:
"""
Author(s): Steven van den Broek, Yuqin Cui
Created: 2019-05-08
Edited: 2019-05-08
"""

'\nAuthor(s): Steven van den Broek, Yuqin Cui\nCreated: 2019-05-08\nEdited: 2019-05-08\n'

In [72]:
import numpy as np
import pandas as pd
import panel as pn
import param
import hvplot.pandas
from colorcet import palette

In [73]:
# Semicolon-separated author-similarity matrix.
fname = 'GephiMatrix_author_similarity.csv'
# Only the leading MAX_AUTHORS x MAX_AUTHORS sub-matrix is kept,
# since otherwise the plot is very slow.
MAX_AUTHORS = 150

# The context manager guarantees the file handle is closed, even if parsing fails.
with open(fname, 'r') as f:
    # Get author names from the header row. The first character is skipped
    # (presumably the leading ';' of the empty corner cell -- TODO confirm),
    # and the line terminator is stripped so the last name does not end in '\n'.
    line1 = f.readline()
    names = line1[1:].rstrip('\n').split(';')

    # Make repeated names unique: the first occurrence keeps its name, every
    # later occurrence gets its occurrence number appended (name2, name3, ...).
    # `seen` ends up holding the number of occurrences of each original name.
    seen = {}
    for index, name in enumerate(names):
        seen[name] = seen.get(name, 0) + 1
        if seen[name] > 1:
            names[index] = name + str(seen[name])

    # Read the remaining lines of the file (the header was consumed above).
    df = pd.read_csv(f, names=names, sep=';')

# Fix the column alignment. read_csv is expected to have produced a two-level
# index here (presumably because each data row has two more fields than there
# are names -- verify against the CSV). Level 1 is moved back into the columns,
# which shifts every column one place to the right; the surplus last column is
# labelled "delete" and dropped.
df = df.reset_index(level=1)
names.append("delete")
names = [name.replace('_', ' ') for name in names]
df.columns = names
del df["delete"]
# Label the rows with the same names as the columns (requires a square matrix).
df = df.set_index([df.columns])

# Get names again for later use
names = df.columns.tolist()

# Keep only the leading sub-matrix and refresh the name list to match.
df = df.head(MAX_AUTHORS)[names[0:MAX_AUTHORS]]
names = df.columns.tolist()

In [74]:
# Convert each similarity score s into a dissimilarity 1 - s, one column at a time.
# NOTE: `df` is overwritten in place, so re-running this cell flips the values back.
for name in names:
    df[name] = df[name].rsub(1)

In [75]:
# Based on the method described at https://gmarti.gitlab.io/ml/2017/09/07/how-to-sort-distance-matrix.html
# We had to clean the data first and modify the method to fit our input.

import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn import datasets
from fastcluster import linkage

In [81]:
# Adapted from the method described at https://gmarti.gitlab.io/ml/2017/09/07/how-to-sort-distance-matrix.html

def seriation(Z, N, cur_index):
    """Return the leaf order implied by a hierarchical tree (dendrogram).

    Parameters
    ----------
    Z : 2-D array
        Linkage matrix; for a cluster id ``c >= N``, ``Z[c - N, 0]`` and
        ``Z[c - N, 1]`` hold the ids of its left and right children.
    N : int
        Number of points given to the clustering process. Ids below ``N``
        are leaves.
    cur_index : int
        Id of the node whose subtree is traversed.

    Returns
    -------
    list
        Leaf ids of the subtree, left to right.

    The traversal uses an explicit stack instead of recursion, so very deep
    (chain-like) trees do not hit Python's recursion limit, and the result is
    built by appending rather than by repeated list concatenation.
    """
    order = []
    pending = [cur_index]
    while pending:
        node = pending.pop()
        if node < N:
            order.append(node)
        else:
            # Push the right child first so the left subtree is emitted first.
            pending.append(int(Z[node - N, 1]))
            pending.append(int(Z[node - N, 0]))
    return order
    
def compute_serial_matrix(df, method="ward", dist_metric="euclidean"):
    """Cluster the rows of ``df`` and return their distance matrix in seriated order.

    Parameters
    ----------
    df : DataFrame or 2-D array
        Observations; pairwise distances are computed between its rows.
    method : str
        Linkage method passed to ``linkage``
        (e.g. "ward", "single", "average", "complete").
    dist_metric : str
        Metric passed to ``pdist``.

    Returns
    -------
    seriated_dist : ndarray of shape (N, N)
        The pairwise distance matrix with rows and columns re-ordered
        according to the order implied by the hierarchical tree.
    res_order : list
        The order implied by the hierarchical tree.
    res_linkage : ndarray
        The hierarchical tree (dendrogram) returned by ``linkage``.
    """
    # pdist already yields the condensed (1-D) form that linkage expects, so
    # the square matrix is only needed for the final re-ordering.
    flat_dist_mat = pdist(df, metric=dist_metric)
    dist_mat = squareform(flat_dist_mat)
    N = len(dist_mat)
    res_linkage = linkage(flat_dist_mat, method=method, preserve_input=True)

    # The traversal starts at id 2N - 2, used here as the root of the tree.
    res_order = seriation(res_linkage, N, N + N - 2)
    # dist_mat is symmetric with a zero diagonal, so permuting rows and
    # columns together yields the seriated matrix directly.
    seriated_dist = dist_mat[np.ix_(res_order, res_order)]
    return seriated_dist, res_order, res_linkage

In [82]:
def reorderrow(df, order):
    """Return the values of ``df`` as an array with its columns permuted by ``order``.

    Despite the name, the permutation is applied along the second axis
    (columns); the result is a plain array, not a DataFrame.
    """
    return df.values[:, order]

def reordercol(df, order):
    """Return a new DataFrame whose rows are those of ``df`` taken in ``order``.

    Despite the name, the re-ordering is applied to the rows: ``order`` is a
    sequence of row positions, and row ``order[k]`` of ``df`` becomes row ``k``
    of the result. Index labels travel with their rows.

    The rows are selected positionally, so the input frame is never modified
    (no temporary helper column is added to it) and duplicate index labels are
    handled.
    """
    return df.iloc[list(order)]

In [83]:
def to_liquid(matrix, labels=None):
    """Melt a square matrix into long format for plotting.

    Parameters
    ----------
    matrix : 2-D array-like
        Square matrix of values.
    labels : sequence of str, optional
        Labels for the rows and columns of ``matrix``, in positional order.
        Defaults to the module-level ``names``.

    Returns
    -------
    DataFrame
        One row per matrix cell, with columns ``name1`` (row label),
        ``name2`` (column label) and ``similarity`` (cell value).

    Every column is melted. Selecting the value columns through the global
    ``df.columns[1:]`` left the first label's column out of the result.
    """
    if labels is None:
        labels = names
    labels = list(labels)

    solid = pd.DataFrame(matrix)
    solid.index = labels
    solid.columns = labels
    # Turn the row labels into a regular column so melt can keep them as ids.
    solid = solid.rename_axis('name1').reset_index()
    liquid = solid.melt(id_vars='name1', var_name='name2', value_name='similarity')
    return liquid

In [84]:
def dis_to_similarity(grid):
    """Replace every value ``v`` of ``grid`` by ``1 - v``, in place.

    ``grid`` is a 2-D structure: either a NumPy array or a mutable sequence of
    mutable rows (e.g. a list of lists). Nothing is returned. An empty grid is
    left untouched.
    """
    if isinstance(grid, np.ndarray):
        # Vectorised in-place update; avoids a Python-level loop per cell.
        grid[...] = 1 - grid
        return
    for row in grid:
        for j in range(len(row)):
            row[j] = 1 - row[j]

In [85]:
# Initialise Panel for use in the notebook before any Panel object is displayed.
pn.extension()

class Matrix_dropdown(param.Parameterized):
    """Parameterized view of the author matrix as a seriated heatmap.

    The two selectors choose the linkage method and the distance metric that
    are passed to ``compute_serial_matrix``; ``view`` builds the heatmap for
    the current selection from the module-level ``df`` and ``names``.
    """
    # Linkage method used for the hierarchical clustering.
    reordering = param.ObjectSelector(default="ward",objects=["ward","single","average","complete", "centroid", "weighted", "median"])
    # Distance metric used between the rows of the matrix.
    metric = param.ObjectSelector(default="euclidean", objects=["euclidean", "minkowski", "cityblock", "sqeuclidean", "cosine", "correlation", "hamming", "jaccard", "chebyshev", "canberra", "braycurtis"])

    def view(self):
        """Return a heatmap of the similarity matrix in seriated order."""
        _, res_order, _ = compute_serial_matrix(df, self.reordering, dist_metric=self.metric)
        # Apply the same permutation to the rows and then to the columns.
        reordered_matrix_col = reordercol(df, res_order)
        reordered_matrix = reorderrow(reordered_matrix_col, res_order)
        # df holds dissimilarities at this point; convert back for display.
        dis_to_similarity(reordered_matrix)
        result = to_liquid(reordered_matrix)
        # to_liquid labels position k with names[k], but after seriation that
        # position holds author names[res_order[k]]. Translate the labels so each
        # cell carries the names of the authors it actually compares
        # (assumes the entries of `names` are unique).
        relabel = {names[position]: names[source] for position, source in enumerate(res_order)}
        result['name1'] = result['name1'].map(relabel)
        result['name2'] = result['name2'].map(relabel)
        return result.hvplot.heatmap('name1', 'name2', 'similarity',
                      height=500, width=600, flip_yaxis=True, xaxis=None, yaxis=None, cmap=palette['kbc'])

# Create the parameterized object that backs the widgets and the plot.
matrix = Matrix_dropdown(name='Adjacency Matrix')
# NOTE(review): `components` is not used anywhere in the visible notebook --
# confirm whether this import is still needed.
from bokeh.embed import components
# Display the parameter widgets together with the heatmap produced by view().
pn.Column(matrix.param, matrix.view)