## Import utilities

In [41]:
# import import_ipynb
from my_library import *
import pandas as pd
import numpy as np

In [42]:
def jaccard_coefficient(l1, l2):
    """Return the Jaccard dissimilarity between two binary vectors.

    Despite its name, this function returns a *distance*: the value is
    ``(a + b) / (a + b + c)``, i.e. ``1 - Jaccard similarity``. It is 0.0
    when both vectors have their ones in exactly the same positions and 1.0
    when they share no one at all. Positions where both vectors hold 0 are
    not counted.

    Args:
        l1: First sequence of 0/1 values.
        l2: Second sequence of 0/1 values. The sequences are paired with
            ``zip``, so trailing elements of the longer one are ignored.

    Returns:
        float: The dissimilarity in ``[0.0, 1.0]``. When neither vector
        contains a 1 (or both are empty) the ratio is undefined; 0.0 is
        returned in that case instead of raising ``ZeroDivisionError``.
    """
    print(f'jaccard_coefficient :: {l1} -- {l2}')
    # Materialise the pairs once so one-shot iterables can be scanned three times.
    pairs = list(zip(l1, l2))
    # a: positions set only in l1 (1, 0)
    a = sum(1 for i, j in pairs if i == 1 and j == 0)
    # b: positions set only in l2 (0, 1)
    b = sum(1 for i, j in pairs if i == 0 and j == 1)
    # c: positions set in both vectors (1, 1)
    c = sum(1 for i, j in pairs if i == j == 1)

    total = a + b + c
    if total == 0:
        # Two all-zero vectors are identical, so their distance is 0.
        return 0.0
    return (a + b) / total

## K-NN algorithm implementation

In [43]:
def knn_algo(df: pd.DataFrame, test_data: pd.DataFrame, options: dict = None, ignore_columns: list = None):
    """Rank the rows of ``df`` by their distance to a single query sample.

    Every column of ``df`` that is not ignored is treated as a feature.

    * If all features contain only 0/1 values, each row is compared with the
      first row of ``test_data`` through ``jaccard_coefficient``. Values are
      matched by position, so ``test_data`` must list its columns in the same
      order as the feature columns of ``df``.
    * Otherwise ``normalize_data`` is called on both frames and each row is
      compared with ``test_data.loc[0]`` through ``distance``. Neither helper
      is defined in this part of the notebook (presumably they come from
      ``my_library`` - verify there).

    Side effects: a ``distance`` column is written into the caller's ``df``,
    and the frame sorted by ascending distance is printed. Nothing is
    returned.

    Args:
        df: Training samples; ``id`` and ``class`` are never used as features.
        test_data: Frame whose first row is the sample to classify.
        options: Extra settings forwarded to ``distance`` (non-binary data
            only). Defaults to an empty dict.
        ignore_columns: Additional column names of ``df`` to leave out of the
            feature set. The list passed in is not modified.
    """
    # Fresh objects on every call: mutable defaults are shared between calls,
    # and appending to the caller's own list would leak 'id'/'class' into it.
    options = {} if options is None else options
    ignore_columns = [*(ignore_columns or []), 'id', 'class']

    # A 'distance' column left behind by a previous call on the same frame is
    # an earlier result, not a feature; without this a second call would
    # wrongly take the non-binary branch.
    non_features = [*ignore_columns, 'distance']
    feature_columns = [col for col in df.columns if col not in non_features]

    # Jaccard is only meaningful when every feature is binary.
    jaccard = all(df[col].isin([0, 1]).all() for col in feature_columns)

    if not jaccard:
        # normalize data
        normalize_data(df, test_data, ignore_columns)

        print(f'test_data :')
        print(test_data)

        print(f'df :')
        print(df, end='\n\n')

        # calculate distance
        df['distance'] = df.apply(
            lambda row: distance(row, test_data.loc[0], columns=test_data.columns, options=options), axis=1)
    else:
        # calculate distance
        query = test_data.values[0]
        print(f'test_data :', query)
        # Only genuine feature columns take part, so user-supplied
        # ignore_columns cannot shift the vector against the query.
        features = df[feature_columns]
        df['distance'] = features.apply(lambda row: jaccard_coefficient(row.values, query), axis=1)

    # sort by distance (nearest neighbours first); rebinding keeps the
    # caller's frame in its original row order
    df = df.sort_values(by=['distance'])

    print(df)

### Test the algorithm with the data

In [44]:
import numpy as np
import pandas as pd

# Labelled samples: an `id`, the 0/1 feature columns c1..c11 and a `class` label.
data = pd.read_csv('testText.csv')

# Alternative query for a dataset with non-binary features, kept for reference:
# test_data = {
#     'poids': [43],
#     'cheveux': ['brun'],
#     'taile': [1],
#     'vegetarien': [1]
# }

# Query sample to classify: only c3, c9 and c11 are set.
query_columns = [f'c{index}' for index in range(1, 12)]
query_values = [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1]
test_data = pd.DataFrame([query_values], columns=query_columns)

knn_algo(data, test_data=test_data)


test_data : [0 0 1 0 0 0 0 0 1 0 1]
jaccard_coefficient :: [1 1 0 0 0 1 0 0 0 0 0] -- [0 0 1 0 0 0 0 0 1 0 1]
jaccard_coefficient :: [1 0 0 1 0 0 0 1 0 0 0] -- [0 0 1 0 0 0 0 0 1 0 1]
jaccard_coefficient :: [1 0 1 0 0 0 0 0 0 0 0] -- [0 0 1 0 0 0 0 0 1 0 1]
jaccard_coefficient :: [0 0 0 0 0 0 0 0 1 1 1] -- [0 0 1 0 0 0 0 0 1 0 1]
jaccard_coefficient :: [0 0 0 0 0 1 0 0 1 0 0] -- [0 0 1 0 0 0 0 0 1 0 1]
jaccard_coefficient :: [0 0 0 0 0 0 1 0 0 1 0] -- [0 0 1 0 0 0 0 0 1 0 1]
   id  c1  c2  c3  c4  c5  c6  c7  c8  c9  c10  c11      class  distance
3   4   0   0   0   0   0   0   0   0   1    1    1  Politique      0.50
2   3   1   0   1   0   0   0   0   0   0    0    0      Sport      0.75
4   5   0   0   0   0   0   1   0   0   1    0    0  Politique      0.75
0   1   1   1   0   0   0   1   0   0   0    0    0      Sport      1.00
1   2   1   0   0   1   0   0   0   1   0    0    0      Sport      1.00
5   6   0   0   0   0   0   0   1   0   0    1    0  Politique      1.00
