In [1]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
! pip install cvxpy
import cvxpy as cp
from tqdm import tqdm

# Charger le dataset Iris
iris = load_iris()

Collecting cvxpy
  Downloading cvxpy-1.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.2 kB)
Collecting osqp>=0.6.2 (from cvxpy)
  Downloading osqp-0.6.7.post3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.9 kB)
Collecting clarabel>=0.5.0 (from cvxpy)
  Downloading clarabel-0.9.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)
Collecting scs>=3.2.4.post1 (from cvxpy)
  Downloading scs-3.2.7.post2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting qdldl (from osqp>=0.6.2->cvxpy)
  Downloading qdldl-0.1.7.post5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)
Downloading cvxpy-1.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading clarabel-0.9.0-cp37-abi3-m

In [2]:
def gaussian_ker(x,y,q):
    """a function to compute the gaussian kernel of two points
    -------------------------------
    inputs : 

    x : array-like, vector
    first vector for which we want to compute the kernel

    y : array-like, vector
    second vector for which we want to compute the kernel

    q : positive float,
    value of the bandwidth of the kernel

    returns:
    ker : float,
    the value of the kernel
    -------------------------------
    """

    ker = np.exp(-q*np.linalg.norm(x-y)**2)
    return ker

In [3]:
def gram_mat(X,q):
    """
    a function to compute the gram matrix of a given dataset

    ----------------------------------------
    inputs : 
    X : array-like object, must be 2D
    the data for which we want to compute the gram matrix

    q : positive float, 
    the bandwidth of the gaussian kernel
    -----------------------------------------

    returns:
    K : the gram matrix
    """
    
    norms = np.linalg.norm(X, axis=1)**2
    dot = X@X.T
    squared_euclidian_distances = norms[:, None] - 2 * dot + norms[None, :]
    K = np.exp(-squared_euclidian_distances*q)
    return K

In [4]:
def compute_seg(x,y,nb=20):
    """
    a function used to compute the segment between two points
    -----------------------------------
    Parameters : 

    x : array-like obj,
    an input, d>=2

    y : array-like obj, 
    the second input

    nb : int, 
    the number of points we want to have between the two points

    Returns : 

    segment : array-like
    an array of shape d (dimension of x), nb
    ---------------------------------------
    """
    d = x.shape[0]
    segment = np.zeros((nb,d))
    points = np.linspace(start=0.,stop=1.,num=nb,endpoint=True)
    for i in range(nb):
        t = points[i]
        segment[i,:] = (1-t)*x + t*y
        
    return segment

In [5]:
def radius(x, sample, beta, bkb, ker_self=1.):
    """
    compute the radius for a given instance x
    -----------------------------------------------
    Parameters : 

    x : 1-D vector,
    the input vector

    sample : matrix, 
    the whole sample, 

    beta : array-like, 
    the calculated beta

    bkb : float,
    the result of beta.T@K@beta

    ker_self : float,
    the value of the kernel of the selected instance with itself, 
    set to 1 by default as we use mostly the gaussian kernel

    returns :

    radius : float,
    the distance between the test instance and the center of the 
    sphere enclosing all the points in the Hilbert space
    -----------------------------------------------
    """
    nb_samp = sample.shape[0]
    temp_k = np.zeros(nb_samp)

    for elem in range(nb_samp):
        temp_k[elem] = gaussian_ker(x, sample[elem], q=48)
        
    return np.sqrt(ker_self - 2*np.dot(temp_k, beta) + bkb)

Application on the Iris dataset

In [6]:
data = iris.data  # Les 4 dimensions des caractéristiques
target = iris.target  # Les vraies étiquettes (pour comparaison)

In [7]:
gram_x = gram_mat(X=data, q=48)

In [8]:
n = len(data)
C = 1
beta = cp.Variable(n)
gram_x += gram_x.T
gram_x /= 2
gram_x = cp.psd_wrap(gram_x)


# Formulation de l'objectif
objective = cp.Maximize(cp.sum(np.ones(n) @ beta) - cp.quad_form(beta, gram_x))

# Contraintes
constraints = [
    beta >= 0,  # 0 <= beta_j
    beta <= C,  # beta_j <= C
    cp.sum(beta) == 1  # La somme des éléments de beta doit être égale à 1
]

# Définir le problème d'optimisation
problem = cp.Problem(objective, constraints)

# Résoudre le problème
problem.solve()

np.float64(0.9918160483862049)

In [9]:
true_beta = beta.value
beta_k_beta = true_beta.T @ gram_x @ true_beta

In [10]:
index_of_sv = list(np.where(true_beta>1e-10)[0])

In [11]:
potential_sv = data[index_of_sv,:]

In [12]:
for i in range(n):
    if true_beta[i] < 1e-10:
        true_beta[i] = 0

In [13]:
gram_x = gram_mat(X=data,q=48)
gram_x += gram_x.T
gram_x /= 2
r = []
beta_k_beta = true_beta.T@gram_x@true_beta
for i in potential_sv:
    temp_K = np.zeros(n)
    for elem in range(n):
        temp_K[elem] = gaussian_ker(i,data[elem,:],q=48)
    r_xi = np.sqrt(1 - 2*np.dot(temp_K, true_beta) + beta_k_beta)
    r.append(r_xi)
rad = max(r)

In [14]:
adjacency_mat = np.zeros((n,n))

for i in tqdm(range(n)):
    for j in range(i,n):
        decision = True
        segment = compute_seg(x=data[i,:], y=data[j,:])
        list_of_val = []
        for point in segment:
            dist = radius(x=point, beta=true_beta, bkb=beta_k_beta, sample=data)
            list_of_val.append(dist)
        for value in list_of_val:
            if value > rad:
                decision = False
        
        if decision == True:
            adjacency_mat[i,j] = 1
        

100%|██████████| 150/150 [03:01<00:00,  1.21s/it]


In [15]:
adjacency_mat = adjacency_mat + adjacency_mat.T
adjacency_mat /= 2
for i in range(n):
    adjacency_mat[i,i] = 0

In [16]:
import networkx as nx
G = nx.from_numpy_array(adjacency_mat)

In [17]:
clusters = list(nx.connected_components(G))
clusters

[{0, 4, 7, 17, 27, 28, 37, 39, 40, 49},
 {1, 2, 3, 9, 12, 25, 29, 30, 34, 45, 47},
 {5},
 {6},
 {8, 38},
 {10, 48},
 {11},
 {13},
 {14},
 {15},
 {16},
 {18},
 {19, 21, 46},
 {20},
 {22},
 {23, 26, 43},
 {24},
 {31},
 {32},
 {33},
 {35},
 {36},
 {41},
 {42},
 {44},
 {50},
 {51},
 {52},
 {53, 89},
 {54},
 {55},
 {56},
 {57, 93},
 {58},
 {59},
 {60},
 {61},
 {62},
 {63, 78, 91},
 {64},
 {65, 75},
 {66, 84},
 {67},
 {68},
 {69, 80, 81},
 {70},
 {71},
 {72},
 {73},
 {74, 97},
 {76},
 {77},
 {79},
 {82, 92},
 {83},
 {85},
 {86},
 {87},
 {88, 94, 95, 96, 99},
 {90},
 {98},
 {100},
 {101, 142},
 {102},
 {103},
 {104},
 {105},
 {106},
 {107},
 {108},
 {109},
 {110, 147},
 {111},
 {112, 139},
 {113},
 {114},
 {115},
 {116, 137},
 {117},
 {118},
 {119},
 {120, 143},
 {121},
 {122},
 {123, 126},
 {124},
 {125},
 {127, 138},
 {128, 132},
 {129},
 {130},
 {131},
 {133},
 {134},
 {135},
 {136},
 {140},
 {141},
 {144},
 {145},
 {146},
 {148},
 {149}]