In [122]:
import math


def entropy(prob_dist):
    """
    Calculates the entropy (in bits) of a probability distribution.

    Arguments:
    prob_dist -- a list (or other iterable) of probabilities, or a dictionary
                 whose values are the probabilities of each outcome

    Returns:
    The entropy of the probability distribution, computed with base-2
    logarithms; outcomes with probability 0 contribute nothing
    """
    # Iterating a dict yields its keys (the outcomes), not the probabilities,
    # so read the values explicitly when a mapping is given.
    probabilities = prob_dist.values() if isinstance(prob_dist, dict) else prob_dist

    # Accumulator deliberately not named `entropy` so it does not shadow the function.
    total = 0
    for p in probabilities:
        # Skip zero-probability outcomes: log2(0) is undefined.
        if p > 0:
            total -= p * math.log2(p)
    return total


In [123]:
# Sanity check: two equally likely outcomes should give an entropy of 1.0.
prob_dist = [1 / 2, 1 / 2]
e = entropy(prob_dist)
print("Entropy = ", e)


Entropy =  1.0


## Import utilities

In [2]:
# import import_ipynb
from my_library import *
import pandas as pd
import numpy as np

# K-NN algorithm implementation

In [3]:
def knn_algo(df: pd.DataFrame, test_data: pd.DataFrame, options: dict = None, ignore_columns: list = None):
    """
    Print the rows of `df` ordered by their distance to the first row of `test_data`.

    The function works on a copy of `df`, so the caller's frame does not gain
    a 'distance' column and can be passed in again unchanged. Nothing is
    returned; the sorted frame is printed.

    Arguments:
    df -- training samples, one per row
    test_data -- frame whose row labelled 0 is the sample to compare against
    options -- forwarded unchanged to `distance`; defaults to an empty dict
    ignore_columns -- column names to skip when checking whether the features
                      are binary; 'id' and 'class' are always skipped as well.
                      The caller's list is not modified.

    Relies on `normalize_data`, `distance` and `jaccard_coefficient`, which are
    defined outside this block.
    """
    # Fresh objects per call: the previous mutable defaults were shared between
    # calls and `append` also modified the list passed in by the caller.
    options = {} if options is None else options
    ignore_columns = [*(ignore_columns or []), 'id', 'class']

    # Copy so the 'distance' column added below never leaks into the caller's
    # frame (where it would be seen as a non-binary feature on the next call).
    df = df.copy()

    # True only when every non-ignored column contains nothing but 0/1 values.
    jaccard = all(
        df[col].isin([0, 1]).all()
        for col in df.columns
        if col not in ignore_columns
    )

    # NOTE(review): the all-binary case goes through normalize + `distance`,
    # while the non-binary case calls `jaccard_coefficient`. Given the flag's
    # name this looks inverted -- confirm the intent before changing it; the
    # branch order is kept so existing output stays the same.
    if jaccard:
        # normalize data
        normalize_data(df, test_data, ignore_columns)

        print('test_data :')
        print(test_data)

        print('df :')
        print(df, end='\n\n')

        # calculate distance
        df['distance'] = df.apply(
            lambda row: distance(row, test_data.loc[0], columns=test_data.columns, options=options), axis=1)
    else:
        # calculate distance
        # `test_data.loc[0]` selects the first row, matching the branch above;
        # `test_data[0]` was a column lookup and raised KeyError.
        # NOTE(review): `row.values` spans every column of `df` (including
        # 'id'/'class'), not only the feature columns -- verify against
        # `jaccard_coefficient`.
        df['distance'] = df.apply(
            lambda row: jaccard_coefficient(row.values, test_data.loc[0].values), axis=1)

    # sort by distance
    df = df.sort_values(by=['distance'])

    print(df)

### Test the algorithm with the data

In [5]:
import numpy as np
import pandas as pd

# Training samples for the K-NN demo.
data = pd.read_csv('testText.csv')

# Alternative query, kept for reference:
# test_data = {
#     'poids': [43],
#     'cheveux': ['brun'],
#     'taile': [1],
#     'vegetarien': [1]
# }

# Single query row: every feature is 0 except c7 and c10.
test_data = pd.DataFrame(
    [[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0]],
    columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11'],
)

knn_algo(data, test_data=test_data, ignore_columns=['name'])


test_data :
   c1  c2  c3  c4  c5  c6  c7  c8  c9  c10  c11
0   0   0   0   0   0   0   1   0   0    1    0
df :
   id  c1  c2  c3  c4  c5  c6  c7  c8  c9  c10  c11      class
0   1   1   1   0   0   0   1   0   0   0    0    0      Sport
1   2   1   0   0   1   0   0   0   1   0    0    0      Sport
2   3   1   0   1   0   0   0   0   0   0    0    0      Sport
3   4   0   0   0   0   0   0   0   0   1    1    1  Politique
4   5   0   0   0   0   0   1   0   0   1    0    0  Politique
5   6   0   0   0   0   0   0   1   0   0    1    0  Politique

distance of each column ::  {'c1': 1, 'c2': 1, 'c3': 0, 'c4': 0, 'c5': 0, 'c6': 1, 'c7': 1, 'c8': 0, 'c9': 0, 'c10': 1, 'c11': 0} / 11 -- result = 0.45454545454545453
distance of each column ::  {'c1': 1, 'c2': 0, 'c3': 0, 'c4': 1, 'c5': 0, 'c6': 0, 'c7': 1, 'c8': 1, 'c9': 0, 'c10': 1, 'c11': 0} / 11 -- result = 0.45454545454545453
distance of each column ::  {'c1': 1, 'c2': 0, 'c3': 1, 'c4': 0, 'c5': 0, 'c6': 0, 'c7': 1, 'c8': 0, 'c9': 0, '

# Hierarchical clustering algorithm implementation (AGNES)

In [13]:

def np_min_ign_diagonal(z: np.ndarray):
    """
    Locate the smallest off-diagonal entry of a square matrix.

    The search starts from position (0, 1) and scans row by row; a cell only
    replaces the current best when it is strictly smaller, so the first
    minimum met in that order wins ties.

    Arguments:
    z -- two-dimensional array indexed as z[row, column]; len(z) is used as
         the size of both dimensions

    Returns:
    A (row_index, col_index) tuple for the smallest off-diagonal value
    """
    size = len(z)
    best = (0, 1)
    for r in range(size):
        for c in range(size):
            # Diagonal cells (an item against itself) never take part.
            if r != c and z[r, c] < z[best]:
                best = (r, c)
    return best


def hierarchical_clustering_algo_agnes(df: pd.DataFrame, options: dict = None):
    """
    Agglomerative clustering: repeatedly merge the two closest clusters.

    At every step the current distance matrix is displayed, the closest pair
    of clusters is merged, and the merged cluster's distance to each other
    cluster becomes the minimum of the two merged rows (single-linkage
    update). The loop stops when a single cluster remains. Nothing is
    returned; progress is printed and shown through `display_dist_matrix`.

    The function works on a copy of `df`, so the caller's frame is neither
    normalized nor relabelled.

    Arguments:
    df -- samples, one per row; must contain an 'id' column (used as cluster
          labels) and a 'class' column (dropped before distances are computed)
    options -- currently unused; kept for interface compatibility

    Relies on `normalize_data`, `calcul_distances`, `display_dist_matrix` and
    `np_min_ign_diagonal`, which are defined outside this block.
    """
    # Copy first: `normalize_data` is called for its effect on `df` (its return
    # value is not used), and the caller's data should stay untouched.
    df = df.copy()

    # normalize data
    normalize_data(df, None)

    print('df :')
    print(df, end='\n\n')

    # Private, object-typed copy of the ids. `.values` can alias the frame's own
    # storage, so writing a merged label into it rewrote the caller's 'id'
    # column (or failed on a read-only array). The object dtype also lets
    # numeric ids be replaced by merged names such as '1-2'.
    labels = df['id'].to_numpy(dtype=object, copy=True)
    df = df.drop(columns=['id', 'class'])
    Z = calcul_distances(df, labels)

    while len(Z) > 1:
        display_dist_matrix(labels, Z)
        # get the index of the minimum value of the matrix
        row_index, col_index = np_min_ign_diagonal(Z)

        # update the matrix: the merged cluster is kept at `row_index`
        for j in range(len(Z)):
            if j == col_index:
                Z[row_index, j] = 0
            elif j != row_index:
                # distance to the merged cluster = closest of its two members
                Z[row_index, j] = min(Z[row_index, j], Z[col_index, j])
                Z[j, row_index] = Z[row_index, j]
        # the absorbed cluster's row and column are no longer needed
        Z = np.delete(Z, col_index, 0)
        Z = np.delete(Z, col_index, 1)

        labels[row_index] = f'{labels[row_index]}-{labels[col_index]}'
        labels = np.delete(labels, col_index, 0)

    print(df)


# Load the dataset from data.csv and run the AGNES clustering on it.
data = pd.read_csv('data.csv')
# NOTE(review): this module-level `labels` is not passed to the call below;
# the function reads the 'id' column from the frame itself.
labels = data['id'].values

hierarchical_clustering_algo_agnes(data)

normilize order of "cheveux" :: ['blond' 'brun' 'rousse']
df :
      id     poids    cheveux  taile  vegetarien  class
0  Sarah  0.111111  [1, 0, 0]    0.5           0      1
1   Dana  0.444444  [1, 0, 0]    1.0           1      0
2   Alex  0.333333  [0, 1, 0]    0.0           1      0
3  Annie  0.555556  [1, 0, 0]    0.0           0      1
4  Emily  0.777778  [0, 0, 1]    0.5           0      1
5    Ali  0.888889  [0, 1, 0]    1.0           0      0
6   John  1.000000  [0, 1, 0]    0.5           0      0
7  Katie  0.000000  [1, 0, 0]    0.0           1      0

distance of each column ::  {'poids': 0.3333333333333333, 'cheveux': 0.0, 'taile': 0.5, 'vegetarien': 1} / 4 -- result = 0.4583333333333333
distance of each column ::  {'poids': 0.2222222222222222, 'cheveux': 0.6666666666666666, 'taile': 0.5, 'vegetarien': 1} / 4 -- result = 0.5972222222222222
distance of each column ::  {'poids': 0.4444444444444445, 'cheveux': 0.0, 'taile': 0.5, 'vegetarien': 0} / 4 -- result = 0.23611111111111

Unnamed: 0,Sarah,Dana,Alex,Annie,Emily,Ali,John,Katie
Sarah,0.0,0.458333,0.597222,0.236111,0.333333,0.486111,0.388889,0.402778
Dana,0.458333,0.0,0.444444,0.527778,0.625,0.527778,0.680556,0.361111
Alex,0.597222,0.444444,0.0,0.472222,0.652778,0.638889,0.541667,0.25
Annie,0.236111,0.527778,0.472222,0.0,0.347222,0.5,0.402778,0.388889
Emily,0.333333,0.625,0.652778,0.347222,0.0,0.319444,0.222222,0.736111
Ali,0.486111,0.527778,0.638889,0.5,0.319444,0.0,0.152778,0.888889
John,0.388889,0.680556,0.541667,0.402778,0.222222,0.152778,0.0,0.791667
Katie,0.402778,0.361111,0.25,0.388889,0.736111,0.888889,0.791667,0.0


Unnamed: 0,Sarah,Dana,Alex,Annie,Emily,Ali-John,Katie
Sarah,0.0,0.458333,0.597222,0.236111,0.333333,0.388889,0.402778
Dana,0.458333,0.0,0.444444,0.527778,0.625,0.527778,0.361111
Alex,0.597222,0.444444,0.0,0.472222,0.652778,0.541667,0.25
Annie,0.236111,0.527778,0.472222,0.0,0.347222,0.402778,0.388889
Emily,0.333333,0.625,0.652778,0.347222,0.0,0.222222,0.736111
Ali-John,0.388889,0.527778,0.541667,0.402778,0.222222,0.0,0.791667
Katie,0.402778,0.361111,0.25,0.388889,0.736111,0.791667,0.0


Unnamed: 0,Sarah,Dana,Alex,Annie,Emily-Ali-John,Katie
Sarah,0.0,0.458333,0.597222,0.236111,0.333333,0.402778
Dana,0.458333,0.0,0.444444,0.527778,0.527778,0.361111
Alex,0.597222,0.444444,0.0,0.472222,0.541667,0.25
Annie,0.236111,0.527778,0.472222,0.0,0.347222,0.388889
Emily-Ali-John,0.333333,0.527778,0.541667,0.347222,0.0,0.736111
Katie,0.402778,0.361111,0.25,0.388889,0.736111,0.0


Unnamed: 0,Sarah-Annie,Dana,Alex,Emily-Ali-John,Katie
Sarah-Annie,0.0,0.458333,0.472222,0.333333,0.388889
Dana,0.458333,0.0,0.444444,0.527778,0.361111
Alex,0.472222,0.444444,0.0,0.541667,0.25
Emily-Ali-John,0.333333,0.527778,0.541667,0.0,0.736111
Katie,0.388889,0.361111,0.25,0.736111,0.0


Unnamed: 0,Sarah-Annie,Dana,Alex-Katie,Emily-Ali-John
Sarah-Annie,0.0,0.458333,0.388889,0.333333
Dana,0.458333,0.0,0.361111,0.527778
Alex-Katie,0.388889,0.361111,0.0,0.541667
Emily-Ali-John,0.333333,0.527778,0.541667,0.0


Unnamed: 0,Sarah-Annie-Emily-Ali-John,Dana,Alex-Katie
Sarah-Annie-Emily-Ali-John,0.0,0.458333,0.388889
Dana,0.458333,0.0,0.361111
Alex-Katie,0.388889,0.361111,0.0


Unnamed: 0,Sarah-Annie-Emily-Ali-John,Dana-Alex-Katie
Sarah-Annie-Emily-Ali-John,0.0,0.388889
Dana-Alex-Katie,0.388889,0.0


      poids    cheveux  taile  vegetarien
0  0.111111  [1, 0, 0]    0.5           0
1  0.444444  [1, 0, 0]    1.0           1
2  0.333333  [0, 1, 0]    0.0           1
3  0.555556  [1, 0, 0]    0.0           0
4  0.777778  [0, 0, 1]    0.5           0
5  0.888889  [0, 1, 0]    1.0           0
6  1.000000  [0, 1, 0]    0.5           0
7  0.000000  [1, 0, 0]    0.0           1
