In [646]:
import pandas as pd 
import numpy as np
import math

In [689]:
def read_file(file):
    """Load a headerless CSV and strip its first column and first row.

    The file is parsed with ``header=None``, so columns carry positional
    integer labels. Every column whose label equals 0 is removed, and then
    the row whose index label is 0 is removed.

    Args:
        file: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A new DataFrame without column 0 and without row 0. The remaining
        rows and columns keep their original labels.
    """
    raw = pd.read_csv(file, header=None)
    leading_cols = [label for label in raw.columns if int(label) == 0]
    trimmed = raw.drop(columns=leading_cols)
    return trimmed.drop(index=[0])

In [690]:
# Load the mammal-milk table; read_file removes the first column and first row.
df = read_file("data/mammal_milk.csv")
# Bare expression so the notebook renders the resulting frame.
df

Unnamed: 0,1,2,3,4,5
1,90.1,2.6,1.0,6.9,0.35
2,88.5,1.4,3.5,6.0,0.24
3,88.4,2.2,2.7,6.4,0.18
4,90.3,1.7,1.4,6.2,0.4
5,90.4,0.6,4.5,4.4,0.1
6,87.7,3.5,3.4,4.8,0.71
7,86.9,4.8,1.7,5.7,0.9
8,82.1,5.9,7.9,4.7,0.78
9,81.9,7.4,7.2,2.7,0.85
10,81.6,10.1,6.3,4.4,0.75


### Create Distance Matrix

In [691]:
def euclideanDist(point, pointArray):
    """Euclidean distance from one point to every row of an array.

    Args:
        point: Coordinates of the reference point.
        pointArray: Array holding one point per row; the squared
            differences are summed along axis 1.

    Returns:
        One distance per row of ``pointArray``.
    """
    offsets = pointArray - point
    squared_totals = np.sum(offsets ** 2, axis=1)
    return np.sqrt(squared_totals)

In [692]:
def calcDistMatrix(df, distFunctionVect):
    """Build the full pairwise distance matrix for the rows of ``df``.

    Args:
        df: Fully numeric (and already normalized) table, one point per row.
        distFunctionVect: Vectorized distance function called as
            ``distFunctionVect(point, all_points)``; it must return the
            distances from ``point`` to every row of ``all_points``.

    Returns:
        DataFrame whose row ``i`` holds the distances from point ``i`` to
        every point, labelled by position on both axes.
    """
    points = np.array(df)
    # One vectorized call per point yields that point's whole matrix row.
    rows = [distFunctionVect(point, points) for point in points]
    return pd.DataFrame(rows)

In [693]:
# Pairwise Euclidean distances between all rows of df.
dist_matrix = calcDistMatrix(df, euclideanDist)
# Bare expression so the notebook renders the matrix.
dist_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.0,3.327477,2.493772,1.225765,4.759464,4.107262,4.160829,11.292692,12.108262,12.767145,...,10.436647,17.986453,25.586879,24.954809,22.404073,31.871029,33.145927,33.892772,60.736809,57.378414
1,3.327477,0.0,1.205653,2.79385,2.798142,2.59247,4.229137,9.085791,10.223116,11.574113,...,8.392789,16.036571,22.821089,22.975282,20.206078,29.446657,30.724837,31.445224,57.964835,54.750837
2,2.493772,1.205653,0.0,2.374532,3.715696,2.347531,3.319398,9.147131,10.182284,11.222072,...,8.344633,15.989384,23.197036,22.963545,20.271122,29.638124,30.913887,31.652945,58.362222,55.04564
3,1.225765,2.79385,2.374532,0.0,3.762978,4.007006,4.664762,11.380879,12.212391,13.176589,...,10.564568,18.172782,25.507311,25.14657,22.537524,31.935873,33.213702,33.94131,60.648269,57.364858
4,4.759464,2.798142,3.715696,3.762978,0.0,4.176374,6.329297,10.444731,11.368047,13.090168,...,9.995804,17.397414,23.883486,24.222923,21.514646,30.532933,31.816819,32.448459,58.693973,55.739079
5,4.107262,2.59247,2.347531,4.007006,4.176374,0.0,2.462945,7.575282,8.229192,9.452069,...,6.778643,14.293009,22.165207,21.246837,18.732221,28.175452,29.455494,30.154098,57.071443,53.756324
6,4.160829,4.229137,3.319398,4.664762,6.329297,2.462945,0.0,7.981504,8.426891,8.891147,...,6.909421,14.17145,22.73264,21.118002,18.7494,28.394013,29.659231,30.416484,57.626752,54.065302
7,11.292692,9.085791,9.147131,11.380879,10.444731,7.575282,7.981504,0.0,2.604784,4.532207,...,1.538473,7.129264,15.171437,13.956733,11.290899,20.657309,21.938879,22.645858,49.737661,46.361973
8,12.108262,10.223116,10.182284,12.212391,11.368047,8.229192,8.426891,2.604784,0.0,3.330165,...,2.809555,6.3618,16.015567,13.187968,11.019188,20.531987,21.800287,22.434572,49.838439,46.366717
9,12.767145,11.574113,11.222072,13.176589,13.090168,9.452069,8.891147,4.532207,3.330165,0.0,...,4.531622,6.413462,17.035387,12.877209,11.177768,20.731679,21.968898,22.651711,50.329514,46.662602


In [694]:
class Leaf:
    """Terminal dendrogram entry wrapping a single data label."""

    def __init__(self, data):
        # Leaves always carry the fixed type tag 'leaf' and sit at height 0.
        self.n_type = 'leaf'
        self.height = 0
        self.data = data
        # String form of the label, kept for lookups by name.
        self.str_rep = str(data)

    def __repr__(self):
        return "type: {}, height: {}, data: {}".format(
            self.n_type, self.height, self.data
        )

    def to_dict(self):
        """Return the leaf as a plain dict with keys type, height and data."""
        return {
            "type": self.n_type,
            "height": self.height,
            "data": self.data,
        }

class Node:
    """Internal dendrogram entry that groups child entries.

    Attributes are stored exactly as passed in: a type tag, a height, the
    list of children, and the string name used to look the node up.
    """

    def __init__(self, n_type, height, nodes, str_rep):
        self.n_type = n_type
        self.height = height
        self.nodes = nodes
        self.str_rep = str_rep

    def __repr__(self):
        return "type: {}, height: {}, nodes: {}".format(
            self.n_type, self.height, self.nodes
        )

    def to_dict(self):
        """Return the node as a plain dict with keys type, height and nodes."""
        return {
            "type": self.n_type,
            "height": self.height,
            "nodes": self.nodes,
        }


In [695]:
def min_matrix(dist_matrix):
    """Locate the smallest non-NaN entry of a distance matrix.

    The search runs over the raw values, so the reported row and column
    always address the cell that actually holds the minimum, whether or not
    the matrix is symmetric.

    Args:
        dist_matrix: DataFrame of pairwise distances. NaN cells (for
            example a masked diagonal) are ignored.

    Returns:
        Tuple ``(min_row, min_col, min_val)``: the row label, the column
        label and the value of the smallest entry. When several cells tie,
        the first one in row-major order is reported. If the frame is empty
        or contains only NaN, ``(0, 0, np.inf)`` is returned.
    """
    values = dist_matrix.to_numpy(dtype=float)
    if values.size == 0 or np.isnan(values).all():
        # Nothing to pick: fall back to the neutral "no minimum" result.
        return 0, 0, np.inf

    # nanargmin works on the flattened array; unravel back to 2-D positions.
    row_pos, col_pos = np.unravel_index(np.nanargmin(values), values.shape)
    row_pos, col_pos = int(row_pos), int(col_pos)

    min_row = dist_matrix.index[row_pos]
    min_col = dist_matrix.columns[col_pos]
    return min_row, min_col, values[row_pos, col_pos]

In [696]:
def generate_starting_clusters(dist_matrix):
    """Create the level-0 clustering in which every column is its own cluster.

    Args:
        dist_matrix: Distance matrix whose column labels name the points.

    Returns:
        Dict with the single key 0 mapped to the list of column labels.
    """
    singletons = [label for label in dist_matrix]
    return {0: singletons}

In [697]:
def init_dendegram(dist_matrix):
    """Start the dendrogram with one ``Leaf`` per column of the matrix.

    Args:
        dist_matrix: Distance matrix whose column labels name the points.

    Returns:
        List of ``Leaf`` objects, in column order.
    """
    return [Leaf(label) for label in dist_matrix]

In [698]:
def single_link(s, r):
    """Single-link merge rule: element-wise minimum of two distance vectors.

    Args:
        s: Distances from the first cluster being merged.
        r: Distances from the second cluster being merged.

    Returns:
        Element-wise ``np.minimum`` of ``s`` and ``r``.
    """
    closest = np.minimum(s, r)
    return closest

In [699]:
def complete_link(s, r):
    """Complete-link merge rule: element-wise maximum of two distance vectors.

    Args:
        s: Distances from the first cluster being merged.
        r: Distances from the second cluster being merged.

    Returns:
        Element-wise ``np.maximum`` of ``s`` and ``r``.
    """
    farthest = np.maximum(s, r)
    return farthest

In [712]:
import copy
import re
import json
def hcluster(dist_matrix, threshold = np.inf, merge_func = single_link):
    """Agglomerative hierarchical clustering over a square distance matrix.

    Starting from one cluster per point, the two closest clusters are merged
    repeatedly until a single cluster remains. The resulting dendrogram is
    written to ``dendegram.json`` in the current working directory. When a
    finite ``threshold`` is given the dendrogram is also cut into flat
    clusters.

    The caller's ``dist_matrix`` is left untouched; all work happens on a
    private copy.

    Args:
        dist_matrix: Square DataFrame of pairwise distances with the same
            labels on both axes.
        threshold: Cut height passed to ``cut_dgram``. ``np.inf`` (the
            default) skips cutting.
        merge_func: Linkage rule called as ``merge_func(row_s, column_r)``
            with two label-indexed Series; it must return a Series of the
            merged cluster's distances to every label.

    Returns:
        Dict mapping cluster number to the list of leaf labels when
        ``threshold`` is finite, otherwise ``None``.

    Raises:
        ValueError: If ``dist_matrix`` has no rows.
    """
    if len(dist_matrix) == 0:
        raise ValueError("dist_matrix must contain at least one point")

    # Private float copy with a NaN diagonal, so a point's zero distance to
    # itself is never reported as the minimum and the caller's frame is not
    # modified.
    values = dist_matrix.to_numpy(dtype=float, copy=True)
    np.fill_diagonal(values, np.nan)
    dist_matrix = pd.DataFrame(values, index=dist_matrix.index, columns=dist_matrix.columns)

    clusters = generate_starting_clusters(dist_matrix)
    dgram = init_dendegram(dist_matrix)
    current_height = 0
    min_val = None  # stays None when the input holds a single point

    while len(clusters[current_height]) > 1:

        s, r, min_val = min_matrix(dist_matrix) #s = row, r = column

        # Set up next level of clusters
        next_level = list(clusters[current_height])
        next_level.remove(s)
        next_level.remove(r)

        # Name the merged cluster "(s,r)". Formatting the labels with str()
        # instead of the tuple repr keeps NumPy scalar type names out of
        # the cluster name.
        new_cluster = re.sub('[^A-Za-z0-9,()]+', '', f"({s},{r})")

        # Insert new cluster
        next_level.insert(0, new_cluster)
        clusters[current_height+1] = next_level

        a = next(item for item in dgram if item.str_rep == str(s))
        b = next(item for item in dgram if item.str_rep == str(r))

        # NOTE(review): node heights are stored as merge distance + 1, and
        # the root gets the last merge distance added once more after the
        # loop. Kept unchanged because threshold cuts depend on these
        # values -- confirm this is the intended height definition.
        dgram.append(Node('node', min_val+1, [a.to_dict(), b.to_dict()], new_cluster))
        dgram.remove(a)
        dgram.remove(b)

        # Distances from the merged cluster to every surviving cluster.
        s_values = dist_matrix.loc[s]
        r_values = dist_matrix[r]
        merge = merge_func(s_values, r_values)
        merge = merge.drop(s)
        merge = merge.drop(r)

        # Surviving pairwise distances carry over; the new cluster's row and
        # column start as NaN and are then filled from `merge`.
        new_matrix = dist_matrix.reindex(index=next_level, columns=next_level)
        if not merge.empty:
            new_matrix[new_cluster] = merge
            # Direct .loc assignment: a chained `.loc[row].at[col] = ...`
            # may write to a temporary copy and leave the row unset.
            new_matrix.loc[new_cluster] = merge

        dist_matrix = new_matrix
        current_height += 1

    root = dgram[0]
    if min_val is not None:
        # At least one merge happened, so the remaining entry is a Node.
        root.n_type = 'root'
        root.height += min_val

    dendegram = root.to_dict()
    # Context manager guarantees the file is flushed and closed.
    with open("dendegram.json", 'w') as f:
        json.dump(dendegram, f, indent = 4)

    if threshold != np.inf:
        cuts = cut_dgram(dendegram, threshold)
        return create_final_clusters(cuts)
    else:
        print("Output dendegram to dendegram.json")


In [701]:
def cut_dgram(dgram, threshold):
    """Cut a dendrogram into subtrees that all lie below ``threshold``.

    The tree is walked with an explicit stack. An entry whose height is at
    or above ``threshold`` is replaced by its two children; any other entry
    is kept whole. Entries without a ``'nodes'`` key (leaves) are always
    kept, so a threshold of 0 or less yields one piece per leaf instead of
    raising ``KeyError``.

    Args:
        dgram: Dendrogram as a nested dict with ``'height'`` on every entry
            and ``'nodes'`` (a two-element list) on internal entries.
        threshold: Height at or above which an internal entry is split.

    Returns:
        List of subtree dicts in the order the stack visits them (the
        second child is visited before the first).
    """
    pending = [dgram]
    kept = []

    while pending:
        curr = pending.pop()

        if curr['height'] >= threshold and 'nodes' in curr:
            pending.append(curr['nodes'][0])
            pending.append(curr['nodes'][1])
        else:
            kept.append(curr)

    return kept


In [702]:
def find_leaves(dgram):
    """Collect the data labels of every leaf under a dendrogram entry.

    Entries typed ``'node'`` or ``'root'`` are expanded into their two
    children; everything else is treated as a leaf.

    Args:
        dgram: Dendrogram (or subtree) as a nested dict.

    Returns:
        List of the leaves' ``'data'`` values in stack-visit order (the
        second child is visited before the first).
    """
    pending = [dgram]
    leaf_entries = []

    while pending:
        entry = pending.pop()
        if entry['type'] in ('node', 'root'):
            pending.append(entry['nodes'][0])
            pending.append(entry['nodes'][1])
        else:
            leaf_entries.append(entry)

    # Return all the leaf data
    return [leaf['data'] for leaf in leaf_entries]
        


In [703]:
def create_final_clusters(dgrams):
    """Turn a list of dendrogram subtrees into numbered flat clusters.

    Args:
        dgrams: Subtree dicts, one per cluster.

    Returns:
        Dict mapping 0, 1, 2, ... (in input order) to the list of leaf
        labels found under the corresponding subtree.
    """
    return {number: find_leaves(tree) for number, tree in enumerate(dgrams)}


In [706]:

# Build a fresh distance matrix for this run.
dist_matrix = calcDistMatrix(df, euclideanDist)
# Single-link clustering (hcluster's default merge_func), cut at threshold 14.
end_gram_single = hcluster(dist_matrix, 14)
print(end_gram_single)


{0: [24, 23], 1: [22, 21, 20, 17, 16, 18, 19, 8, 13, 7, 15, 10, 9, 4, 14, 6, 11, 5, 2, 1, 3, 12, 0]}


In [707]:
# Build a fresh distance matrix for this run.
dist_matrix = calcDistMatrix(df, euclideanDist)
# Complete-link clustering, cut at the same threshold of 14.
end_gram_comp = hcluster(dist_matrix, 14, complete_link)
print(end_gram_comp)

{0: [24, 23], 1: [9, 7, 15, 10, 8, 13], 2: [6, 14, 5, 11, 2, 1, 3, 12, 0, 4], 3: [16, 18, 19], 4: [22, 21, 20, 17]}


### Analysis

In [716]:
def all_together(file, threshold=np.inf, dist_func=single_link):
    """Run the whole pipeline on a CSV file and list the resulting clusters.

    Reads the file, builds the Euclidean distance matrix, clusters it with
    ``hcluster`` and prints one ``Cluster <n>`` line per flat cluster.

    Args:
        file: CSV path passed to ``read_file``.
        threshold: Cut height forwarded to ``hcluster``. With the default
            ``np.inf`` no cut is made and nothing is listed.
        dist_func: Linkage rule forwarded to ``hcluster`` as its
            ``merge_func`` (for example ``single_link`` or
            ``complete_link``).
    """
    df = read_file(file)
    dist_matrix = calcDistMatrix(df, euclideanDist)
    end_gram = hcluster(dist_matrix, threshold, dist_func)

    if end_gram is None:
        # hcluster returns None when no finite threshold was given, so
        # there are no flat clusters to list.
        return

    for gram in end_gram:
        print(f"Cluster {gram}")

    

In [717]:
# End-to-end run on the mammal-milk data: complete link, cut at threshold 14.
all_together('data/mammal_milk.csv', 14, complete_link)

Cluster 0
Cluster 1
Cluster 2
Cluster 3
Cluster 4
