##### Import libraries

In [1]:
import pandas as pd
import numpy as np

from my_library import normalize_data, distance, jaccard_coefficient, display_dist_matrix

# Hierarchical clustering algorithm implementation AGNES

##### Compute the pairwise distance matrix and get the index of its minimum off-diagonal value

In [2]:
def calcul_distances(df: pd.DataFrame, labels: list):
    """Build the symmetric pairwise distance matrix between the rows of ``df``.

    The ``'id'`` and ``'class'`` columns, when present, are treated as
    metadata: they are left out of the binary-data check *and* out of the
    values handed to the distance helpers, so identifiers and class labels
    never contribute to a distance.

    If every remaining column holds only 0/1 values, each pair of rows is
    compared with ``jaccard_coefficient``; otherwise ``distance`` is used and
    receives the feature column index through its ``columns`` argument.

    Args:
        df: One observation per row.
        labels: One label per observation. Only its length is used, to size
            the returned matrix.

    Returns:
        A ``(len(labels), len(labels))`` float ``numpy.ndarray`` with a zero
        diagonal, where ``[i, j]`` and ``[j, i]`` both hold the value returned
        by the selected helper for rows ``i`` and ``j``.
    """
    ignore_columns = ['id', 'class']
    # Keep only the feature columns; metadata must not leak into the helpers.
    features = df.loc[:, ~df.columns.isin(ignore_columns)]

    # Jaccard only makes sense when every feature is a 0/1 indicator.
    # The generator short-circuits on the first non-binary column.
    use_jaccard = all(
        features[col].isin([0, 1]).all() for col in features.columns
    )

    n_rows = len(features)
    # Extract each row once instead of on every (i, j) pair.
    rows = [features.iloc[i] for i in range(n_rows)]

    Z = np.zeros((len(labels), len(labels)))
    for i in range(n_rows):
        for j in range(i + 1, n_rows):
            if use_jaccard:
                value = jaccard_coefficient(rows[i], rows[j])
            else:
                value = distance(rows[i], rows[j], columns=features.columns)
            # Fill both triangles so the matrix stays symmetric.
            Z[i, j] = Z[j, i] = value
    return Z

In [3]:
def np_min_ign_diagonal(z: np.ndarray):
    """Return the (row, column) position of the smallest off-diagonal entry.

    Cells are visited in row-major order and the diagonal is skipped. On ties
    the first cell visited wins. When the matrix has no off-diagonal cell
    (fewer than two rows), ``(0, 1)`` is returned.

    Args:
        z: Two-dimensional array indexed as ``z[row, col]``; ``len(z)`` is
            used as the size of both axes.

    Returns:
        A ``(row_index, col_index)`` tuple of Python ints.
    """
    size = len(z)
    off_diagonal_cells = (
        (row, col)
        for row in range(size)
        for col in range(size)
        if row != col
    )
    # min() only replaces its current best on a strict "<", so the earliest
    # cell is kept on ties; the default covers matrices with no candidates.
    return min(off_diagonal_cells, key=lambda cell: z[cell], default=(0, 1))

# AGNES algorithm

In [4]:
def hierarchical_clustering_algo_agnes(df: pd.DataFrame, options: dict = None):
    """Run AGNES agglomerative clustering and display every merge step.

    Starting with one cluster per row, the two closest clusters are merged
    repeatedly until a single cluster is left. The distance between a merged
    cluster and any other cluster is the minimum of the two former distances
    (single linkage). Before each merge the current distance matrix is shown
    through ``display_dist_matrix``.

    Args:
        df: Input data. Must contain an ``'id'`` column used for the cluster
            labels; an optional ``'class'`` column is ignored. The ``'id'``
            column of the caller's frame is never modified by the merging.
        options: Currently unused; accepted for interface compatibility.

    Returns:
        None. The function only prints / displays its progress.

    Raises:
        KeyError: If ``df`` has no ``'id'`` column.
    """
    # NOTE(review): the return value is discarded, so normalize_data presumably
    # works in place on the caller's frame -- confirm against my_library.
    normalize_data(df, None)

    print('df :')
    print(df, end='\n\n')

    # Take an independent, object-dtype copy of the ids. `.values` may hand
    # back a view on the DataFrame's own storage (or a read-only array under
    # pandas Copy-on-Write), so writing merged names such as 'L2-L3' into it
    # would either corrupt the caller's 'id' column or raise.
    labels = df['id'].to_numpy(dtype=object, copy=True)

    # Keep only the feature columns; 'class' is optional.
    df = df.drop(columns=['id', 'class'], errors='ignore')
    Z = calcul_distances(df, labels)

    while len(Z) > 1:
        display_dist_matrix(labels, Z)

        # Closest pair of clusters (diagonal ignored).
        row_index, col_index = np_min_ign_diagonal(Z)

        # Merge cluster `col_index` into cluster `row_index` (single linkage).
        for j in range(len(Z)):
            if j in (row_index, col_index):
                # The diagonal entry is kept as is, and everything belonging
                # to `col_index` is removed just below.
                continue
            closest = min(Z[row_index, j], Z[col_index, j])
            Z[row_index, j] = Z[j, row_index] = closest

        # Drop the absorbed cluster's row and column.
        Z = np.delete(Z, col_index, axis=0)
        Z = np.delete(Z, col_index, axis=1)

        labels[row_index] = f'{labels[row_index]}-{labels[col_index]}'
        labels = np.delete(labels, col_index, axis=0)

    print(df)


# Test

In [5]:
# Load the sample dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('testText.csv')
# Notebook-level copy of the ids. It is not passed to the call below:
# the clustering function reads the 'id' column from `data` itself.
labels = data['id'].values

# Run AGNES on the sample data; each merge step is displayed as cell output.
hierarchical_clustering_algo_agnes(data)

df :
   id  c1  c2  c3  c4  c5  c6  c7  class
0  L1   1   0   0   0   1   1   1      1
1  L2   0   0   1   0   1   0   0      1
2  L3   0   0   1   0   1   0   0      1
3  L4   1   0   0   1   1   0   0      1
4  L5   0   1   0   0   0   0   0      1
5  L6   1   1   0   0   1   0   1      1

jaccard_coefficient :: c1    1
c2    0
c3    0
c4    0
c5    1
c6    1
c7    1
Name: 0, dtype: int64 -- c1    0
c2    0
c3    1
c4    0
c5    1
c6    0
c7    0
Name: 1, dtype: int64
jaccard_coefficient :: c1    1
c2    0
c3    0
c4    0
c5    1
c6    1
c7    1
Name: 0, dtype: int64 -- c1    0
c2    0
c3    1
c4    0
c5    1
c6    0
c7    0
Name: 2, dtype: int64
jaccard_coefficient :: c1    1
c2    0
c3    0
c4    0
c5    1
c6    1
c7    1
Name: 0, dtype: int64 -- c1    1
c2    0
c3    0
c4    1
c5    1
c6    0
c7    0
Name: 3, dtype: int64
jaccard_coefficient :: c1    1
c2    0
c3    0
c4    0
c5    1
c6    1
c7    1
Name: 0, dtype: int64 -- c1    0
c2    1
c3    0
c4    0
c5    0
c6    0
c7    0
N

Unnamed: 0,L1,L2,L3,L4,L5,L6
L1,0.0,0.8,0.8,0.6,1.0,0.4
L2,0.8,0.0,0.0,0.75,1.0,0.8
L3,0.8,0.0,0.0,0.75,1.0,0.8
L4,0.6,0.75,0.75,0.0,1.0,0.6
L5,1.0,1.0,1.0,1.0,0.0,0.75
L6,0.4,0.8,0.8,0.6,0.75,0.0


Unnamed: 0,L1,L2-L3,L4,L5,L6
L1,0.0,0.8,0.6,1.0,0.4
L2-L3,0.8,0.0,0.75,1.0,0.8
L4,0.6,0.75,0.0,1.0,0.6
L5,1.0,1.0,1.0,0.0,0.75
L6,0.4,0.8,0.6,0.75,0.0


Unnamed: 0,L1-L6,L2-L3,L4,L5
L1-L6,0.0,0.8,0.6,0.75
L2-L3,0.8,0.0,0.75,1.0
L4,0.6,0.75,0.0,1.0
L5,0.75,1.0,1.0,0.0


Unnamed: 0,L1-L6-L4,L2-L3,L5
L1-L6-L4,0.0,0.75,0.75
L2-L3,0.75,0.0,1.0
L5,0.75,1.0,0.0


Unnamed: 0,L1-L6-L4-L2-L3,L5
L1-L6-L4-L2-L3,0.0,0.75
L5,0.75,0.0


   c1  c2  c3  c4  c5  c6  c7
0   1   0   0   0   1   1   1
1   0   0   1   0   1   0   0
2   0   0   1   0   1   0   0
3   1   0   0   1   1   0   0
4   0   1   0   0   0   0   0
5   1   1   0   0   1   0   1
