In [148]:
import pandas as pd 
import numpy as np
import math

In [149]:
def read_file(file):
    """Load a header-less CSV file and return it without its first row.

    The file is parsed with ``header=None``, so the columns are labelled
    0..n-1 and the first line of the file becomes the row labelled 0.
    That row is removed; every column is kept.

    A previous version looped over the columns and called
    ``df.drop(column, axis=1)`` for column 0 without keeping the result, so
    it never removed anything. That dead loop is gone; the returned frame is
    the same as before.

    Parameters
    ----------
    file : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed data minus the row labelled 0. The remaining rows keep
        their original labels (1, 2, ...).
    """
    df = pd.read_csv(file, header=None)
    # Drop the row that came from the first line of the file.
    return df.drop(index=0)

In [150]:
# Load the data set from data/4clusters.csv through read_file.
df = read_file("data/4clusters.csv")

### Create the Distance Matrix

In [151]:
def euclideanDist(point, pointArray):
    """Return the Euclidean distance from ``point`` to each row of ``pointArray``.

    ``point`` is subtracted from ``pointArray``, the differences are squared
    and summed along axis 1, and the square root of each sum is returned.
    """
    offsets = pointArray - point
    squared_totals = np.sum(offsets ** 2, axis=1)
    return np.sqrt(squared_totals)

In [152]:
def calcDistMatrix(df, distFunctionVect):
    """Build the pairwise distance matrix for the rows of ``df``.

    ``df`` is converted to a NumPy array and ``distFunctionVect(row, all_rows)``
    is called once per row; each call supplies one row of the result.

    Returns a ``pandas.DataFrame`` whose i-th row holds the values returned
    for the i-th input row.
    """
    # must be fully numeric and normalized df
    points = np.array(df)

    # one vectorized call per point: distance from that point to all points
    rows = [distFunctionVect(point, points) for point in points]

    return pd.DataFrame(rows)

In [23]:
# Pairwise distances between all rows of df, using euclideanDist as the metric.
dist_matrix = calcDistMatrix(df, euclideanDist)
dist_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
0,0.0,2.236068,2.236068,3.605551,31.144823,4.242641,33.24154,4.0,28.442925,6.403124,...,28.79236,32.649655,33.526109,32.649655,32.140317,36.055513,34.71311,35.22783,36.013886,36.013886
1,2.236068,0.0,3.162278,5.09902,29.068884,2.236068,31.144823,3.605551,26.305893,7.211103,...,27.202941,30.870698,32.015621,30.870698,31.016125,34.481879,33.376639,34.058773,35.014283,35.128336
2,2.236068,3.162278,0.0,2.0,32.015621,4.123106,34.058773,2.236068,29.154759,4.242641,...,27.313001,31.38471,31.953091,31.38471,30.265492,34.539832,32.984845,33.376639,34.058773,34.0
3,3.605551,5.09902,2.0,0.0,34.014703,6.082763,36.055513,3.605551,31.144823,3.162278,...,28.178006,32.449961,32.695565,32.449961,30.594117,35.341194,33.526109,33.734256,34.234486,34.058773
4,31.144823,29.068884,32.015621,34.014703,0.0,28.0,2.236068,31.016125,3.605551,35.057096,...,31.890437,30.528675,35.805028,30.528675,40.311289,36.359318,39.204592,41.868843,44.598206,45.96738
5,4.242641,2.236068,4.123106,6.082763,28.0,0.0,30.016662,3.162278,25.079872,7.28011,...,25.0,28.635642,29.832868,28.635642,29.0,32.280025,31.256999,32.015621,33.060551,33.24154
6,33.24154,31.144823,34.058773,36.055513,2.236068,30.016662,0.0,33.0,5.09902,37.013511,...,32.526912,30.805844,36.235342,30.805844,41.036569,36.619667,39.698866,42.449971,45.254834,46.69047
7,4.0,3.605551,2.236068,3.605551,31.016125,3.162278,33.0,0.0,28.017851,4.123106,...,25.079872,29.154759,29.732137,29.154759,28.160256,32.310989,30.805844,31.256999,32.015621,32.015621
8,28.442925,26.305893,29.154759,31.144823,3.605551,25.079872,5.09902,28.017851,0.0,32.0,...,28.425341,27.294688,32.449961,27.294688,36.796739,33.12099,35.805028,38.418745,41.10961,42.449971
9,6.403124,7.211103,4.242641,3.162278,35.057096,7.28011,37.013511,4.123106,32.0,0.0,...,26.07681,30.610456,30.413813,30.610456,27.892651,33.12099,31.016125,31.048349,31.400637,31.144823


In [153]:
class Leaf:
    """Dendrogram leaf: a single item stored in ``data`` at height 0."""

    def __init__(self, data):
        # A leaf always has type 'leaf' and height 0; only the payload varies.
        self.n_type = 'leaf'
        self.height = 0
        self.data = data

    def __repr__(self):
        return f"type: {self.n_type}, height: {self.height}, data: {self.data}"

    def to_dict(self):
        """Return the leaf as a dict with the keys "type", "height" and "data"."""
        return {
            "type": self.n_type,
            "height": self.height,
            "data": self.data,
        }

class Node:
    """Internal dendrogram node: a type tag, a merge height and child nodes."""

    def __init__(self, n_type, height, nodes):
        self.n_type = n_type
        self.height = height
        self.nodes = nodes

    def __repr__(self):
        return f"type: {self.n_type}, height: {self.height}, nodes: {self.nodes}"

    def to_dict(self):
        """Return the node as a plain dict with "type", "height" and "nodes".

        When ``nodes`` is a list or tuple, every child that has a
        ``to_dict`` method is converted with it, so a whole tree becomes
        nested dicts and lists that ``json.dumps`` can serialise. Children
        without ``to_dict`` are passed through unchanged, and so is a
        ``nodes`` value that is not a list or tuple.
        """
        children = self.nodes
        if isinstance(children, (list, tuple)):
            children = [
                child.to_dict() if hasattr(child, "to_dict") else child
                for child in children
            ]
        return {
            "type": self.n_type,
            "height": self.height,
            "nodes": children,
        }


In [154]:
def min_matrix(dist_matrix):
    """Locate the smallest entry of ``dist_matrix``.

    ``DataFrame.idxmin`` gives, for every column, the row label of that
    column's minimum (NaN entries are skipped). Those per-column minima are
    scanned in column order and the first strictly smallest one wins.

    The earlier version iterated over the *values* of the ``idxmin`` result
    (row labels) as if they were column labels and read the cell with row
    and column swapped. That only gave the right answer for symmetric
    matrices. This version pairs each column with its own row label and
    reads ``dist_matrix.at[row, col]``.

    Parameters
    ----------
    dist_matrix : pandas.DataFrame
        Numeric matrix. Each column is expected to hold at least one
        non-NaN value.

    Returns
    -------
    tuple
        ``(min_row, min_col, min_val)``: row label, column label and value
        of the minimum. ``(0, 0, np.inf)`` is returned when no entry is
        smaller than infinity.
    """
    min_row = 0
    min_col = 0
    min_val = np.inf
    # items() yields (column label, row label of that column's minimum)
    for col, row in dist_matrix.idxmin().items():
        candidate = dist_matrix.at[row, col]
        if candidate < min_val:
            min_row, min_col, min_val = row, col, candidate

    return min_row, min_col, min_val

In [155]:
def generate_starting_clusters(dist_matrix):
    """Return the level-0 cluster table: ``{0: [every column label]}``.

    Iterating ``dist_matrix`` yields its column labels, so each column
    starts out as its own cluster.
    """
    return {0: [label for label in dist_matrix]}

In [236]:
def init_dendegram(dist_matrix):
    """Return a list with one ``Leaf`` per column label of ``dist_matrix``.

    Leaves appear in the order the column labels are iterated.
    """
    return [Leaf(label) for label in dist_matrix]

In [156]:
def single_link_dist(s, r):
    """Single-link merge rule: the element-wise minimum of ``s`` and ``r``.

    Both arguments are handed to ``np.minimum`` unchanged.
    """
    closest = np.minimum(s, r)
    return closest

In [237]:
import copy
import re
# Build a fresh distance matrix from df for the clustering run.
dist_matrix = calcDistMatrix(df, euclideanDist)
def hcluster(dist_matrix, threshold = np.inf):
    """Agglomerative single-link clustering over a square distance matrix.

    Starting with one cluster per column label, the two closest clusters are
    merged repeatedly until one cluster is left. A merged cluster is
    labelled ``"(<s>,<r>)"`` from the labels of its two parts, so the final
    label spells out the whole merge order, e.g. ``"((0,1),2)"``.

    Changes from the earlier version:

    * the unfinished ``Node(...)`` statement, which made the cell a syntax
      error, is removed (no dendrogram objects are built here);
    * the caller's ``dist_matrix`` is no longer modified; all work happens
      on a private copy;
    * the row of the merged cluster is written with a single ``.loc``
      assignment instead of chained indexing, which could silently write to
      a temporary copy;
    * labels are built with an f-string, so they do not depend on how NumPy
      scalars are printed inside a tuple.

    Parameters
    ----------
    dist_matrix : pandas.DataFrame
        Square matrix of pairwise distances whose index and columns carry
        the same labels in the same order.
    threshold : float, optional
        Accepted for interface compatibility; it is not used by the
        algorithm.

    Returns
    -------
    list
        The last level of the cluster table: a one-element list holding the
        label of the final merged cluster (or the single input label when
        the matrix is 1x1).

    Raises
    ------
    ValueError
        If the matrix is not square, or if no finite off-diagonal distance
        is left to merge on.
    """
    if dist_matrix.shape[0] != dist_matrix.shape[1]:
        raise ValueError("dist_matrix must be square")

    # Private float copy with a NaN diagonal, so that 0 isn't always
    # reported as the min val and the caller's frame stays untouched.
    values = dist_matrix.to_numpy(dtype=float, copy=True)
    np.fill_diagonal(values, np.nan)
    working = pd.DataFrame(values, index=dist_matrix.index, columns=dist_matrix.columns)

    clusters = generate_starting_clusters(working)
    current_height = 0

    while len(clusters[current_height]) > 1:
        s, r, min_val = min_matrix(working)  # s = row, r = column
        if not np.isfinite(min_val):
            raise ValueError("no finite off-diagonal distance left to merge")

        # Clusters that are not part of this merge, in their current order.
        survivors = [
            label for label in clusters[current_height]
            if label != s and label != r
        ]

        # Label of the merged cluster; same character filter as before.
        new_cluster = re.sub('[^A-Za-z0-9,()]+', '', f"({s},{r})")

        # The new cluster goes first in the next level.
        labels = [new_cluster] + survivors
        clusters[current_height + 1] = labels

        # Single link: distance to the merged cluster is the smaller of the
        # distances to s (row s) and to r (column r). The entries for s and
        # r themselves are dropped by the reindex.
        merge = single_link_dist(working.loc[s], working[r]).reindex(survivors)

        new_matrix = pd.DataFrame(np.nan, index=labels, columns=labels)
        if survivors:
            # Distances among untouched clusters carry over unchanged.
            new_matrix.loc[survivors, survivors] = working.loc[survivors, survivors].to_numpy()
            # Fill both the column and the row of the merged cluster so the
            # matrix stays symmetric.
            new_matrix.loc[survivors, new_cluster] = merge.to_numpy()
            new_matrix.loc[new_cluster, survivors] = merge.to_numpy()

        working = new_matrix
        current_height += 1
    return clusters[current_height]



In [238]:
# Run the clustering on the distance matrix; hcluster returns the final
# level of its cluster table, which is displayed below.
end_grid = hcluster(dist_matrix)
end_grid

[type: leaf, height: 0, data: 0, type: leaf, height: 0, data: 1, type: leaf, height: 0, data: 2, type: leaf, height: 0, data: 3, type: leaf, height: 0, data: 4, type: leaf, height: 0, data: 5, type: leaf, height: 0, data: 6, type: leaf, height: 0, data: 7, type: leaf, height: 0, data: 8, type: leaf, height: 0, data: 9, type: leaf, height: 0, data: 10, type: leaf, height: 0, data: 11, type: leaf, height: 0, data: 12, type: leaf, height: 0, data: 13, type: leaf, height: 0, data: 14, type: leaf, height: 0, data: 15, type: leaf, height: 0, data: 16, type: leaf, height: 0, data: 17, type: leaf, height: 0, data: 18, type: leaf, height: 0, data: 19, type: leaf, height: 0, data: 20, type: leaf, height: 0, data: 21, type: leaf, height: 0, data: 22, type: leaf, height: 0, data: 23, type: leaf, height: 0, data: 24, type: leaf, height: 0, data: 25, type: leaf, height: 0, data: 26, type: leaf, height: 0, data: 27, type: leaf, height: 0, data: 28, type: leaf, height: 0, data: 29, type: leaf, height:

['((((22,26),((30,32),(((((((23,25),27),24),28),29),21),((((36,35),(37,38)),33),(31,34))))),((16,15),((19,(18,20)),((8,14),((((4,6),10),13),17))))),((((((((2,3),0),1),5),7),12),11),9))']