In [1]:
import numpy as np
import pandas as pd
from scipy.stats import bernoulli
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.svm import SVC

In [2]:
mushrooms = pd.read_csv('mushroom.csv')

In [3]:
mushrooms.head()

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
0,1372,2,2,10,3.807467,1545,11,1.804273,1
1,1461,2,2,10,3.807467,1557,11,1.804273,1
2,1371,2,2,10,3.612496,1566,11,1.804273,1
3,1261,6,2,10,3.787572,1566,11,1.804273,1
4,1305,6,2,10,3.711971,1464,11,0.943195,1


In [4]:
n_samples = mushrooms.shape[0]

In [5]:
mushrooms['label'] = np.ones(n_samples)*(-1)
for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if mushrooms.loc[i,'class'] == 1 and random == 0:
        mushrooms.loc[i,'label'] = 1
    else:
        mushrooms.loc[i,'label'] = -1

In [6]:
mushrooms.groupby('label').count()

Unnamed: 0_level_0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-1.0,46789,46789,46789,46789,46789,46789,46789,46789,46789
1.0,7246,7246,7246,7246,7246,7246,7246,7246,7246


In [7]:
list_of_inertia = np.empty(10)
for i in range(1,11):
    list_of_inertia[i-1] = KMeans(n_clusters=i,init = 'random').fit(mushrooms.drop(['label', 'class'], axis=1).to_numpy()).inertia_

In [8]:
n_cluster = 10

In [9]:
clusterized_data = KMeans(n_clusters=n_cluster, init='random').fit(mushrooms.drop(['label', 'class'], axis=1).to_numpy())

In [10]:
mushrooms['cluster'] = clusterized_data.labels_

In [11]:
mushrooms[['class','label','cluster']].groupby('cluster').sum()

Unnamed: 0_level_0,class,label
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1270,-1114.0
1,1964,-3939.0
2,3095,-5421.0
3,926,-1987.0
4,7523,-6229.0
5,4348,-5561.0
6,962,-2837.0
7,720,-1243.0
8,5023,-5585.0
9,3844,-5627.0


In [12]:
positive_cluster = np.where(mushrooms[['class','label','cluster']].groupby('cluster').sum().to_numpy()[:,0] == 
                            np.max(mushrooms[['class','label','cluster']].groupby('cluster').sum().to_numpy()[:,0]))[0][0]

In [13]:
positive_centroid = clusterized_data.cluster_centers_[positive_cluster]

In [14]:
dist_to_positive = np.empty(n_cluster)
for i in range(n_cluster):
    dist_to_positive[i] = np.linalg.norm(positive_centroid - clusterized_data.cluster_centers_[i])

In [15]:
negative_cluster = np.where(dist_to_positive == np.max(dist_to_positive))[0][0]

In [16]:
print('the cluster containing positive values is : ', positive_cluster, '\n', 
      'the one containing negative exampels is : ', negative_cluster)

the cluster containing positive values is :  4 
 the one containing negative exampels is :  0


In [17]:
reliable_negatives = mushrooms[mushrooms['cluster'] == negative_cluster]

In [18]:
reliable_negatives = reliable_negatives[reliable_negatives['label'] == -1]

In [104]:
def sigmoid(x,y,a=1,b=-1):
    """a function to compute the sigmoid kernel with the hyperbolic tangent

    Parameters
    ----------------------------
    x : array-like or float
    input vector or real number

    y : array-like or float
    second input vector or float

    a : strictly positive float
    a tuning parameter

    b : strictly negativ float
    a tuning parameter
    ----------------------------
    """
    dim = x.shape[0]
    vect = np.empty(dim)
    if dim == y.shape[0]:
        transfo = a*np.sum(x*y) + b
        d = np.tanh(transfo)
        return d
    else:
        for i in range(dim):
            transfo = a*np.sum(x[i]*y) + b
            vect[i] = np.tanh(transfo)
            return vect

In [20]:
def rbf(x,y,l=1):
    """Gaussian kernel

    Parameters
    -------------------------------
    x : float
    a real number

    y : float
    a real number

    l: float, non zero
    a scale parameter
    -------------------------------
    """
    dim = x.shape[0]
    vect = np.empty(dim)
    if dim == y.shape[0]  :
        d = np.exp(-((np.linalg.norm(x-y))**2)/(2*(l**2)))
        return d
    else :
        for i in range(dim):
            vect[i] = np.exp(-((np.linalg.norm(x[i] - y))**2)/(2*(l**2)))
        return vect

In [21]:
gamma = 1

In [22]:
positives = mushrooms[mushrooms['cluster'] == positive_cluster]
positives = positives[positives['label'] == 1]

In [23]:
positives

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class,label,cluster
2853,232,2,2,5,0.959362,269,11,0.943195,1,1.0,4
2861,176,0,2,5,1.345324,277,11,0.943195,1,1.0,4
2940,197,2,2,5,0.744497,289,11,0.888450,1,1.0,4
2977,216,2,2,5,0.911614,272,11,0.888450,1,1.0,4
3015,179,2,2,5,0.955383,276,11,0.888450,1,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...
51993,286,3,3,2,2.013794,0,2,1.804273,1,1.0,4
51998,176,3,3,2,2.013794,0,2,1.804273,1,1.0,4
52000,261,3,3,2,2.013794,0,2,0.027372,1,1.0,4
52006,238,3,3,2,2.013794,0,2,0.888450,1,1.0,4


In [24]:
reliable_negatives

Unnamed: 0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class,label,cluster
4707,1440,6,1,10,1.949279,2884,11,1.804273,0,-1.0,0
4952,1281,6,0,10,1.439968,2826,11,1.804273,0,-1.0,0
14089,946,2,0,10,1.077880,2791,6,1.804273,0,-1.0,0
14107,884,6,0,10,0.592444,2815,6,1.804273,0,-1.0,0
14126,1224,2,0,10,0.707834,2918,6,0.943195,0,-1.0,0
...,...,...,...,...,...,...,...,...,...,...,...
53214,1121,3,3,2,0.784287,2813,6,0.888450,1,-1.0,0
53261,1067,3,3,2,0.533610,2761,6,0.888450,1,-1.0,0
53285,1111,3,3,2,0.533610,2773,6,0.027372,1,-1.0,0
53320,1009,3,3,2,1.007110,2739,6,0.888450,1,-1.0,0


In [25]:
positives_array = positives.drop(['class','cluster'], axis=1)
negatives_array = reliable_negatives.drop(['class', 'cluster'], axis=1)

In [26]:
data_svm = np.vstack((positives_array,negatives_array))

In [27]:
n_reliable = data_svm.shape[0]

In [28]:
outcome = data_svm[:,8]

In [29]:
data_svm = data_svm[:,:8]

In [112]:
omega = np.empty((n_reliable,n_reliable))
for k in range(n_reliable):
    for i in range(n_reliable):
        omega[k,i] = outcome[k]*outcome[i]*rbf(x=data_svm[k,:],y=data_svm[i,:],l=1)

In [31]:
first_row = np.hstack((0,-np.transpose(outcome)))
first_row = first_row.reshape(1,first_row.shape[0])
bot_of_mat_right = omega + (1/gamma)*np.eye(n_reliable)
bot_of_mat = np.hstack((outcome.reshape(n_reliable,1), bot_of_mat_right))
whole_mat = np.vstack((first_row, bot_of_mat))
right_hand = np.ones(n_reliable+1)
right_hand[0] = 0

In [32]:
coeffs = np.linalg.solve(a=whole_mat,b=right_hand)

In [33]:
b = coeffs[0]
alpha = coeffs[1:coeffs.shape[0]]

In [34]:
test_data = mushrooms.drop(['class','label','cluster'], axis=1).to_numpy()
results = np.empty(n_samples)

In [35]:
for i in tqdm(range(n_samples)):
    results[i] = np.sign(np.sum(alpha*outcome*rbf(x=data_svm,y=test_data[i,:],l=1))+b)

100%|██████████| 54035/54035 [16:00<00:00, 56.26it/s]


In [36]:
results_bis = results

In [37]:
for i in range(n_samples):
    if results[i] == -1:
        results[i] = 0

In [38]:
positives = np.sum(results)
true_positives = 0
mushrooms['result'] = results
for i in range(n_samples):
    if mushrooms.loc[i,'class'] == 1 and results[i] == 1:
        true_positives += 1

In [39]:
precision = true_positives/positives

In [40]:
precision

0.5452403572107163

So, a simple LS-SVM applied on a PU dataset has a precision of 0.54, which is not very good as we know that if we do a simple SVM on the fully labelled data, we obtain a precision of 0.99.

In [105]:
omega = np.empty((n_reliable,n_reliable))
for k in range(n_reliable):
    for i in range(n_reliable):
        omega[k,i] = outcome[k]*outcome[i]*sigmoid(x=data_svm[k,:],y=data_svm[i,:])

In [106]:
for i in range(n_reliable):
    for j in range(n_reliable):
        if np.isnan(omega[i,j]):
            omega[i,j] = 0

In [107]:
first_row = np.hstack((0,-np.transpose(outcome)))
first_row = first_row.reshape(1,first_row.shape[0])
bot_of_mat_right = omega + (1/gamma)*np.eye(n_reliable)
bot_of_mat = np.hstack((outcome.reshape(n_reliable,1), bot_of_mat_right))
whole_mat = np.vstack((first_row, bot_of_mat))
right_hand = np.ones(n_reliable+1)
right_hand[0] = 0

In [108]:
coeffs = np.linalg.solve(a=whole_mat,b=right_hand)

In [109]:
b = coeffs[0]
alpha = coeffs[1:coeffs.shape[0]]

In [110]:
test_data = mushrooms.drop(['class','label','cluster','result'], axis=1).to_numpy()
results = np.empty(n_samples)

In [111]:
for i in tqdm(range(n_samples)):
    results[i] = np.sign(np.sum(alpha*outcome*sigmoid(x=data_svm,y=test_data[i,:]))+b)
    
results_bis = results
for i in range(n_samples):
    if results[i] == -1:
        results[i] = 0
positives = np.sum(results)
true_positives = 0
mushrooms['result'] = results
for i in range(n_samples):
    if mushrooms.loc[i,'class'] == 1 and results[i] == 1:
        true_positives += 1
precision = true_positives/positives
print(precision)

100%|██████████| 54035/54035 [00:02<00:00, 19494.47it/s]


0.5491810863329324


In [86]:
#with a=1,b=-1 the values seem soo small to be computed...

In [87]:
### Now with the sklearn library

In [88]:
sklearn_classif = SVC(kernel='rbf').fit(X=data_svm,y=outcome)

In [89]:
results_sklearn = sklearn_classif.predict(test_data)

In [90]:
for i in range(n_samples):
    if results_sklearn[i] == -1:
        results_sklearn[i] = 0

In [91]:
positives_sklearn = np.sum(results_sklearn)
true_positives_sklearn = 0
for i in range(n_samples):
    if mushrooms.loc[i,'class'] == 1 and results_sklearn[i] == 1:
        true_positives_sklearn += 1
precision = true_positives_sklearn/positives_sklearn
print('the precision with a rbf kernel is : ', precision)

the precision with a rbf kernel is :  0.5953205816336489


The one from sklearn is a bit more precise. It is done with an rbf kernel.

In [92]:
sklearn_classif = SVC(kernel='sigmoid').fit(X=data_svm,y=outcome)
results_sklearn = sklearn_classif.predict(test_data)
for i in range(n_samples):
    if results_sklearn[i] == -1:
        results_sklearn[i] = 0
positives_sklearn = np.sum(results_sklearn)
true_positives_sklearn = 0
for i in range(n_samples):
    if mushrooms.loc[i,'class'] == 1 and results_sklearn[i] == 1:
        true_positives_sklearn += 1
precision = true_positives_sklearn/positives_sklearn
print('the precision with a sigmoid kernel is : ', precision)

the precision with a sigmoid kernel is :  0.7022766078542971


Done with a sigmoid kernel the precision is a lot higher.

In [100]:
sigmoid(x=test_1,y=test_4)

  d = (np.exp(transfo)-np.exp(-transfo))/(np.exp(transfo)+np.exp(-transfo))
  d = (np.exp(transfo)-np.exp(-transfo))/(np.exp(transfo)+np.exp(-transfo))


nan