In [2]:
"""
Author(s): Steven van den Broek, Yuqin Cui
Created: 2019-05-08
Edited: 2019-05-08
"""

'\nAuthor(s): Steven van den Broek, Yuqin Cui\nCreated: 2019-05-08\nEdited: 2019-05-08\n'

In [1]:
import numpy as np
import pandas as pd
import panel as pn
import param
import hvplot.pandas
from colorcet import palette

In [2]:
fname = 'GephiMatrix_author_similarity.csv'

# Side length of the top-left sub-matrix that is kept; the full matrix makes
# the plot very slow.
SUBMATRIX_SIZE = 150

# The context manager guarantees the file handle is closed once the header
# and the body have been read.
with open(fname, 'r') as f:
    # Get author names from the header line (its first character is skipped).
    # NOTE(review): readline() keeps the line terminator, so the final element
    # of `names` carries it -- confirm whether that is intended.
    line1 = f.readline()
    names = line1[1:].split(';')

    # Make repeated author names unique: the first occurrence keeps its name,
    # the k-th occurrence (k >= 2) gets k appended ("name2", "name3", ...).
    seen = {}
    for index, name in enumerate(names):
        seen[name] = seen.get(name, 0) + 1
        if seen[name] > 1:
            names[index] = name + str(seen[name])

    # Read the remaining lines of the csv (the header was consumed above)
    df = pd.read_csv(f, names=names, sep=';')

# Fix it: move index level 1 back into the columns, relabel every column
# (the extra last one is called "delete") and drop that last column.
df = df.reset_index(level=1)
names.append("delete")
names = [name.replace('_', ' ') for name in names]
df.columns = names
del df["delete"]
# Use the author names as row labels as well
df.set_index([df.columns], inplace=True)

# Get names again for later use
names = df.columns.tolist()

# Keep only the SUBMATRIX_SIZE x SUBMATRIX_SIZE sub matrix since otherwise the plot is very slow
df = df.head(SUBMATRIX_SIZE)[names[0:SUBMATRIX_SIZE]]
names = df.columns.tolist()

In [3]:
# Convert similarity into dissimilarity (1.0 - similarity) for every author column at once
df[names] = 1 - df[names]

In [4]:
# The reordering method below follows: https://gmarti.gitlab.io/ml/2017/09/07/how-to-sort-distance-matrix.html
# We had to clean the data and modify the method to fit it.

import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn import datasets
from fastcluster import linkage

In [5]:
# The output of linkage is stepwise dendrogram, 
# which is represented as an (N − 1) × 4 NumPy array with floating point entries (dtype=numpy.double). 
# The first two columns contain the node indices which are joined in each step. The input nodes are
# labeled 0,..., N − 1, and the newly generated nodes have the labels N,...2N-2.
# The third column contains the distance between the two nodes at each step, ie. the
# current minimal distance at the time of the merge. The fourth column counts the
# number of points which comprise each new node.

#Idea is from: https://gmarti.gitlab.io/ml/2017/09/07/how-to-sort-distance-matrix.html

# Traverse the hierarchical tree generated by linkage
def traversal_tree(hier_tree,number_of_node, current_index):
    """Return the leaf indices below ``current_index`` in traversal order.

    Parameters
    ----------
    hier_tree : sequence of rows
        Stepwise dendrogram; row ``k`` describes the node labelled
        ``number_of_node + k`` and its first two entries are the labels of
        the two children that were merged.
    number_of_node : int
        Number of leaves; labels below this value are leaves.
    current_index : int
        Label of the node whose subtree is traversed.

    Returns
    -------
    list
        Leaf labels, visiting the second child (column 1) of every merge
        before the first child (column 0).

    The walk uses an explicit stack instead of recursion, so a very deep
    (chain-like) tree cannot hit Python's recursion limit.
    """
    leaf_order = []
    pending = [current_index]
    while pending:
        node = pending.pop()
        if node < number_of_node:
            leaf_order.append(node)
            continue
        merge = hier_tree[node - number_of_node]
        # Push child 0 first so that child 1 is popped (visited) first.
        pending.append(int(merge[0]))
        pending.append(int(merge[1]))
    return leaf_order

In [6]:
def compute_serial_matrix(df,method="ward", dist_metric = "euclidean"):
    """Cluster the rows of ``df`` hierarchically and derive a row ordering.

    Parameters
    ----------
    df : array-like
        Observations, one per row; passed to ``pdist``.
    method : str
        Linkage method handed to ``linkage``.
    dist_metric : str
        Distance metric handed to ``pdist``.

    Returns
    -------
    (reordered_index, hierar_tree)
        ``reordered_index`` is the leaf order obtained by traversing the
        tree from its root; ``hierar_tree`` is the value returned by
        ``linkage``.

    Raises
    ------
    ValueError
        If the chosen metric produces NaN distances for this data.
    """
    # pdist already yields the condensed distance vector, which is exactly
    # what was previously rebuilt via squareform(squareform(...)).
    condensed = pdist(df, metric=dist_metric)
    if np.isnan(condensed).any():
        raise ValueError(
            "metric %r produced NaN distances; cannot build a linkage" % (dist_metric,)
        )
    # hierarchical tree computed by the "fastcluster" package
    hierar_tree = linkage(condensed, method=method, preserve_input=True)
    # The tree has one row per merge, i.e. (number of leaves - 1) rows
    n_points = len(hierar_tree) + 1
    # The order implied by the hierarchical tree, starting at the root node
    reordered_index = traversal_tree(hierar_tree, n_points, 2 * n_points - 2)
    return reordered_index, hierar_tree


#linkage(squareform(pdist(df, metric="euclidean")), method="ward",preserve_input=True)

In [7]:
def reorderrow(df, order):
    """Return the values of ``df`` as an array with its columns permuted.

    Despite the name, ``order`` is applied along the second axis: column
    ``k`` of the result is column ``order[k]`` of ``df``.
    """
    return df.values[:, order]

def reordercol(df, order):
    """Return a new frame whose rows are the rows of ``df`` taken in ``order``.

    Despite the name, ``order`` holds row positions: row ``k`` of the result
    is row ``order[k]`` of ``df``.

    The rows are selected by position, so ``df`` itself is never modified
    (no temporary helper column is added to it) and duplicate index labels
    are handled.

    Raises
    ------
    IndexError
        If ``order`` contains a position outside the frame.
    """
    return df.iloc[list(order)]

In [8]:
def to_liquid(matrix):
    """Melt a square matrix into long form for plotting.

    Parameters
    ----------
    matrix : pandas.DataFrame or 2-D array-like
        A DataFrame supplies its own row and column labels. Anything else
        is labelled on both axes with the module-level ``names`` list.

    Returns
    -------
    pandas.DataFrame
        One row per matrix cell with the columns ``name1`` (row label),
        ``name2`` (column label) and ``similarity`` (cell value). Every
        column of the matrix is included.
    """
    if isinstance(matrix, pd.DataFrame):
        row_labels = list(matrix.index)
        col_labels = list(matrix.columns)
        values = matrix.to_numpy()
    else:
        row_labels = list(names)
        col_labels = list(names)
        values = matrix
    solid = pd.DataFrame(values, index=row_labels, columns=col_labels)
    # Turn the row labels into a regular column called 'index'
    solid = solid.reset_index()
    # Melt every label column, including the first one
    liquid = solid.melt(id_vars='index', value_vars=col_labels, var_name="name2")
    liquid.columns = ['name1', 'name2', 'similarity']
    return liquid

In [9]:
def dis_to_similarity(grid):
    """Replace every entry ``x`` of ``grid`` with ``1 - x``, in place.

    ``grid`` is a mutable sequence of mutable rows (for example a list of
    lists or a 2-D numpy array). Nothing is returned; an empty grid is left
    untouched.
    """
    for row in grid:
        # Slice assignment writes back into the row object itself, so the
        # caller's grid is updated.
        row[:] = [1 - value for value in row]

In [10]:
# pn.extension()

# class Matrix_dropdown(param.Parameterized):
#     reordering = param.ObjectSelector(default="ward",objects=["ward","single","average","complete", "centroid", "weighted", "median"])
#     metric = param.ObjectSelector(default="euclidean", objects=["euclidean", "minkowski", "cityblock", "sqeuclidean", "cosine", "correlation", "hamming", "jaccard", "chebyshev", "canberra", "braycurtis"])
#     def view(self):
#         res_order, res_linkage = compute_serial_matrix(df,self.reordering, dist_metric = self.metric)
#         reordered_matrix_col = reordercol(df, res_order)
#         reordered_matrix = reorderrow(reordered_matrix_col, res_order)
#         dis_to_similarity(reordered_matrix)     
#         result = to_liquid(reordered_matrix)
#         return result.hvplot.heatmap('name1', 'name2', 'similarity',
#                       height=500, width=600, flip_yaxis=True, xaxis=None, yaxis=None, cmap=palette['kbc'])

# matrix = Matrix_dropdown(name='Adjacency Matrix')
# from bokeh.embed import components
# pn.Column(matrix.param, matrix.view)

In [11]:
# Initialise Panel for this notebook before any Panel object is displayed
pn.extension()

class Matrix_dropdown(param.Parameterized):
    """Selectors for the matrix reordering and a heatmap view driven by them.

    The view reads the module-level ``df`` (which holds dissimilarities,
    i.e. ``1 - similarity``) and ``names``.
    """
    # Linkage method used for the reordering; "none" keeps the original order
    reordering = param.ObjectSelector(default="ward",objects=["ward","single","average","complete", "centroid", "weighted", "median","none"])
    # Distance metric used when comparing rows for the reordering
    metric = param.ObjectSelector(default="euclidean", objects=["euclidean", "minkowski", "cityblock", "sqeuclidean", "cosine", "correlation", "hamming", "jaccard", "chebyshev", "canberra", "braycurtis"])

    @param.depends('reordering', 'metric')
    def view(self):
        """Build the similarity heatmap for the current selector values."""
        if self.reordering == "none":
            # df stores dissimilarities; convert back so that both branches
            # plot actual similarities.
            similarity = 1 - df
        else:
            res_order, _ = compute_serial_matrix(df,self.reordering, dist_metric = self.metric)
            reordered_matrix_col = reordercol(df, res_order)
            reordered_matrix = reorderrow(reordered_matrix_col, res_order)
            dis_to_similarity(reordered_matrix)
            # Carry the author names along with the permutation so that each
            # cell stays attached to the right pair of authors.
            ordered_names = [names[i] for i in res_order]
            similarity = pd.DataFrame(reordered_matrix, index=ordered_names, columns=ordered_names)
        result = to_liquid(similarity)
        return result.hvplot.heatmap('name1', 'name2', 'similarity',
                      height=500, width=600, flip_yaxis=True, xaxis=None, yaxis=None, cmap=palette['kbc'])

# Create the selector object whose parameters drive the heatmap
matrix = Matrix_dropdown(name='Adjacency Matrix')
# NOTE(review): `components` is not referenced anywhere in the visible cells
from bokeh.embed import components
# Stack the parameter widgets on top of the heatmap view
pn.Column(matrix.param, matrix.view)