In [118]:
import numpy as np
import pandas as pd
from scipy.stats import bernoulli
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.svm import SVC

In [14]:
mushrooms = pd.read_csv('mushroom.csv')

In [15]:
mushrooms.head()

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
0,1372,2,2,10,3.807467,1545,11,1.804273,1
1,1461,2,2,10,3.807467,1557,11,1.804273,1
2,1371,2,2,10,3.612496,1566,11,1.804273,1
3,1261,6,2,10,3.787572,1566,11,1.804273,1
4,1305,6,2,10,3.711971,1464,11,0.943195,1


In [16]:
n_samples = mushrooms.shape[0]

In [17]:
mushrooms['label'] = np.ones(n_samples)*(-1)
for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if mushrooms.loc[i,'class'] == 1 and random == 0:
        mushrooms.loc[i,'label'] = 1
    else:
        mushrooms.loc[i,'label'] = -1

In [18]:
mushrooms.groupby('label').count()

Unnamed: 0_level_0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-1.0,46613,46613,46613,46613,46613,46613,46613,46613,46613
1.0,7422,7422,7422,7422,7422,7422,7422,7422,7422


In [21]:
list_of_inertia = np.empty(10)
for i in range(1,11):
    list_of_inertia[i-1] = KMeans(n_clusters=i,init = 'random').fit(mushrooms.drop(['label', 'class'], axis=1).to_numpy()).inertia_

In [42]:
n_cluster = 10

In [43]:
clusterized_data = KMeans(n_clusters=n_cluster, init='random').fit(mushrooms.drop(['label', 'class'], axis=1).to_numpy())

In [44]:
mushrooms['cluster'] = clusterized_data.labels_

In [45]:
mushrooms[['class','label','cluster']].groupby('cluster').sum()

Unnamed: 0_level_0,class,label
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,812,-1678.0
1,896,-932.0
2,738,-2820.0
3,8241,-7043.0
4,3564,-5527.0
5,6345,-7017.0
6,736,-515.0
7,2767,-4392.0
8,4510,-6913.0
9,1066,-2354.0


In [46]:
positive_cluster = np.where(mushrooms[['class','label','cluster']].groupby('cluster').sum().to_numpy()[:,0] == 
                            np.max(mushrooms[['class','label','cluster']].groupby('cluster').sum().to_numpy()[:,0]))[0][0]

In [50]:
positive_centroid = clusterized_data.cluster_centers_[positive_cluster]

In [53]:
dist_to_positive = np.empty(n_cluster)
for i in range(n_cluster):
    dist_to_positive[i] = np.linalg.norm(positive_centroid - clusterized_data.cluster_centers_[i])

In [54]:
negative_cluster = np.where(dist_to_positive == np.max(dist_to_positive))[0][0]

In [68]:
print('the cluster containing positive values is : ', positive_cluster, '\n', 
      'the one containing negative exampels is : ', negative_cluster)

the cluster containing positive values is :  3 
 the one containing negative exampels is :  6


In [63]:
reliable_negatives = mushrooms[mushrooms['cluster'] == negative_cluster]

In [64]:
reliable_negatives = reliable_negatives[reliable_negatives['label'] == -1]

In [1]:
def sigmoid(x,y,a=1,b=-1):
    """a function to compute the sigmoid kernel with the hyperbolic tangent

    Parameters
    ----------------------------
    x : array-like or float
    input vector or real number

    y : array-like or float
    second input vector or float

    a : strictly positive float
    a tuning parameter

    b : strictly negativ float
    a tuning parameter
    ----------------------------
    """
    dim = x.shape[0]
    vect = np.empty(dim)
    if dim == y.shape[0]:
        transfo = a*np.sum(x*y) + b
        d = (np.exp(transfo)-np.exp(-transfo))/(np.exp(transfo)+np.exp(-transfo))
        return d
    else:
        for i in range(dim):
            transfo = a*np.sum(x[i]*y) + b
            vect[i] = (np.exp(transfo)-np.exp(-transfo))/(np.exp(transfo)+np.exp(-transfo))
            return vect

In [152]:
def rbf(x,y,l=1):
    """Gaussian kernel

    Parameters
    -------------------------------
    x : float
    a real number

    y : float
    a real number

    l: float, non zero
    a scale parameter
    -------------------------------
    """
    dim = x.shape[0]
    vect = np.empty(dim)
    if dim == y.shape[0]  :
        d = np.exp(-((np.linalg.norm(x-y))**2)/(2*(l**2)))
        return d
    else :
        for i in range(dim):
            vect[i] = np.exp(-((np.linalg.norm(x[i] - y))**2)/(2*(l**2)))
        return vect

In [67]:
gamma = 1

In [69]:
positives = mushrooms[mushrooms['cluster'] == positive_cluster]
positives = positives[positives['label'] == 1]

In [70]:
positives

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class,label,cluster
2852,321,0,2,10,0.836013,299,11,0.888450,1,1.0,3
2853,232,2,2,5,0.959362,269,11,0.943195,1,1.0,3
2861,176,0,2,5,1.345324,277,11,0.943195,1,1.0,3
2863,291,2,2,5,0.748476,283,11,0.888450,1,1.0,3
2864,273,2,2,5,0.577379,306,11,0.888450,1,1.0,3
...,...,...,...,...,...,...,...,...,...,...,...
53869,67,6,3,2,1.393072,405,12,0.888450,1,1.0,3
53878,64,2,3,2,1.420925,404,12,0.943195,1,1.0,3
53893,85,6,3,2,1.150354,389,12,0.888450,1,1.0,3
53949,63,2,3,2,1.428883,407,12,0.943195,1,1.0,3


In [72]:
reliable_negatives

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class,label,cluster
4624,1592,5,1,10,1.551380,2783,11,0.943195,0,-1.0,6
4674,1396,2,1,10,1.511590,2846,11,0.943195,0,-1.0,6
4703,1522,5,0,10,1.925405,2823,11,0.943195,0,-1.0,6
4707,1440,6,1,10,1.949279,2884,11,1.804273,0,-1.0,6
4714,1844,5,1,10,1.193271,2727,11,0.943195,0,-1.0,6
...,...,...,...,...,...,...,...,...,...,...,...
52090,1849,2,4,11,0.155606,3346,6,1.804273,1,-1.0,6
52091,1691,2,4,11,0.374451,3086,6,0.943195,1,-1.0,6
52092,1510,6,4,11,0.298850,3360,6,0.943195,1,-1.0,6
52093,1699,6,4,11,0.418220,3445,6,0.943195,1,-1.0,6


In [73]:
positives_array = positives.drop(['class','cluster'], axis=1)
negatives_array = reliable_negatives.drop(['class', 'cluster'], axis=1)

In [75]:
data_svm = np.vstack((positives_array,negatives_array))

In [76]:
n_reliable = data_svm.shape[0]

In [86]:
outcome = data_svm[:,8]

In [91]:
data_svm = data_svm[:,:8]

In [92]:
omega = np.empty((n_reliable,n_reliable))
for k in range(n_reliable):
    for i in range(n_reliable):
        omega[k,i] = outcome[k]*outcome[i]*rbf(x=data_svm[k,:],y=data_svm[i,:],l=1)

In [94]:
first_row = np.hstack((0,-np.transpose(outcome)))
first_row = first_row.reshape(1,first_row.shape[0])
bot_of_mat_right = omega + (1/gamma)*np.eye(n_reliable)
bot_of_mat = np.hstack((outcome.reshape(n_reliable,1), bot_of_mat_right))
whole_mat = np.vstack((first_row, bot_of_mat))
right_hand = np.ones(n_reliable+1)
right_hand[0] = 0

In [95]:
coeffs = np.linalg.solve(a=whole_mat,b=right_hand)

In [99]:
b = coeffs[0]
alpha = coeffs[1:coeffs.shape[0]]

In [101]:
test_data = mushrooms.drop(['class','label','cluster'], axis=1).to_numpy()
results = np.empty(n_samples)

In [109]:
for i in tqdm(range(n_samples)):
    results[i] = np.sign(np.sum(alpha*outcome*rbf(x=data_svm,y=test_data[i,:],l=1))+b)

100%|██████████| 54035/54035 [12:36<00:00, 71.38it/s]


In [110]:
results_bis = results

In [111]:
for i in range(n_samples):
    if results[i] == -1:
        results[i] = 0

In [115]:
positives = np.sum(results)
true_positives = 0
mushrooms['result'] = results
for i in range(n_samples):
    if mushrooms.loc[i,'class'] == 1 and results[i] == 1:
        true_positives += 1

In [116]:
precision = true_positives/positives

In [117]:
precision

0.54620288768048

So, a simple LS-SVM applied on a PU dataset has a precision of 0.54, which is not very good as we know that if we do a simple SVM on the fully labelled data, we obtain a precision of 0.99.

In [None]:
omega = np.empty((n_reliable,n_reliable))
for k in range(n_reliable):
    for i in range(n_reliable):
        omega[k,i] = outcome[k]*outcome[i]*rbf(x=data_svm[k,:],y=data_svm[i,:],l=1)

In [147]:
sklearn_classif = SVC(kernel='rbf').fit(X=data_svm,y=outcome)

In [148]:
results_sklearn = sklearn_classif.predict(test_data)

In [149]:
for i in range(n_samples):
    if results_sklearn[i] == -1:
        results_sklearn[i] = 0

In [150]:
positives_sklearn = np.sum(results_sklearn)
true_positives_sklearn = 0
for i in range(n_samples):
    if mushrooms.loc[i,'class'] == 1 and results_sklearn[i] == 1:
        true_positives_sklearn += 1
precision = true_positives_sklearn/positives_sklearn
print('the precision with a rbf kernel is : ', precision)

the precision with a rbf kernel is :  0.5918848785090823


The one from sklearn is a bit more precise. It is done with an rbf kernel.

In [151]:
sklearn_classif = SVC(kernel='sigmoid').fit(X=data_svm,y=outcome)
results_sklearn = sklearn_classif.predict(test_data)
for i in range(n_samples):
    if results_sklearn[i] == -1:
        results_sklearn[i] = 0
positives_sklearn = np.sum(results_sklearn)
true_positives_sklearn = 0
for i in range(n_samples):
    if mushrooms.loc[i,'class'] == 1 and results_sklearn[i] == 1:
        true_positives_sklearn += 1
precision = true_positives_sklearn/positives_sklearn
print('the precision with a sigmoid kernel is : ', precision)

the precision with a sigmoid kernel is :  0.7261417717291475


Done with a sigmoid kernel the precision is a lot higher.