In [1]:
import pandas as pd
import numpy as np
from scipy.stats import bernoulli
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from tqdm import tqdm

In [2]:
def rbf(x,y,l=1):
    """Gaussian kernel

    Parameters
    -------------------------------
    x : float
    a real number

    y : float
    a real number

    l: float, non zero
    a scale parameter
    -------------------------------
    """
    dim = x.shape[0]
    vect = np.empty(dim)
    if dim == y.shape[0]  :
        d = np.exp(-((np.linalg.norm(x-y))**2)/(2*(l**2)))
        return d
    else :
        for i in range(dim):
            vect[i] = np.exp(-((np.linalg.norm(x[i] - y))**2)/(2*(l**2)))
        return vect

In [3]:
#we 'create' unlabelled data

mushrooms = pd.read_csv('mushroom.csv')
n_samples = mushrooms.shape[0]

mushrooms['label'] = np.ones(n_samples)*(-1)
for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if mushrooms.loc[i,'class'] == 1 and random == 0:
        mushrooms.loc[i,'label'] = 1
    else:
        mushrooms.loc[i,'label'] = -1

mushrooms.groupby('label').count()

Unnamed: 0_level_0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,class
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-1.0,46601,46601,46601,46601,46601,46601,46601,46601,46601
1.0,7434,7434,7434,7434,7434,7434,7434,7434,7434


In [4]:
n_samples = mushrooms.shape[0]

In [5]:
#we proceed as in the article, by first making clusters. 
#the choice of 8 is not based on the usual 'plot of inertia', but more on the fact that from 8
#and after, there appears a cluster that has less than a thousand positive examples with many unlabelled
#one could thus interpret that it might be the 'true' negative cluster
n_cluster = 8
clusterized_data = KMeans(n_clusters=n_cluster, init='random').fit(mushrooms.drop(['label', 'class'], axis=1).to_numpy())
mushrooms['cluster'] = clusterized_data.labels_
mushrooms[['class','label','cluster']].groupby('cluster').sum()

Unnamed: 0_level_0,class,label
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,8523,-7440.0
1,1517,-1302.0
2,1385,-3598.0
3,3285,-5631.0
4,4749,-6994.0
5,7063,-7708.0
6,2181,-4407.0
7,972,-2087.0


In [6]:
#we now define positive and negative clusters
#the positive cluster is defined by the number of positive examples it contains where the negative cluster is
#defined as the one with the highest distance to the centroid of the positive cluster
#but as it turns out it doesn't work very well in this setting
"""
positive_cluster = np.where(mushrooms[['class','label','cluster']].groupby('cluster').sum().to_numpy()[:,1] == 
                            np.max(mushrooms[['class','label','cluster']].groupby('cluster').sum().to_numpy()[:,1]))[0][0]
positive_centroid = clusterized_data.cluster_centers_[positive_cluster]
dist_to_positive = np.empty(n_cluster)
for i in range(n_cluster):
    dist_to_positive[i] = np.linalg.norm(positive_centroid - clusterized_data.cluster_centers_[i])
negative_cluster = np.where(dist_to_positive == np.max(dist_to_positive))[0][0]
print('the cluster containing positive values is : ', positive_cluster, '\n', 
      'the one containing negative exampels is : ', negative_cluster)
"""
list_of_ratio = []
for i in range(n_cluster):
    list_of_ratio.append(mushrooms[mushrooms['cluster'] == i]['label'].sum()/mushrooms[mushrooms['cluster'] == i]['class'].shape[0])
list_of_ratio = np.array(list_of_ratio)
positive_cluster = np.argmax(list_of_ratio)
negative_cluster = np.argmin(list_of_ratio)
print('the cluster containing positive values is : ', positive_cluster, '\n', 
      'the one containing negative exampels is : ', negative_cluster)

the cluster containing positive values is :  0 
 the one containing negative exampels is :  2


In [7]:
reliable_positives = mushrooms[mushrooms['cluster'] == positive_cluster]
reliable_positives = reliable_positives[reliable_positives['label'] == 1]
reliable_negatives = mushrooms[mushrooms['cluster'] == negative_cluster]
reliable_negatives = reliable_negatives[reliable_negatives['label'] == -1]

In [8]:
print(reliable_positives.shape[0], reliable_negatives.shape[0])

2118 3960


In [9]:
#now, we suppose (it is true here but in theory we have to suppose what ratio do we want of labeled and unlabeled
#data) that 1/2 of the data is positive

In [10]:
reliable_negatives = reliable_negatives.sample(n=reliable_positives.shape[0])

In [11]:
print(reliable_positives.shape[0], reliable_negatives.shape[0])

2118 2118


Now, comes an important choice, because as we change the kernel we use, we have to change the 
phi matrix. To begin, I start with a rbf kernel as it is standard procedure.
In fact, we know that : $ \omega ^T = \hat{\alpha}  \Phi (\textbf{X})$, hence, we can say that $\textbf{for a rbf kernel}$ $\Phi(X) = \kappa (X,0)$. With the 0 corresponding to the 0 of $\mathbb{R}^n$, n=number of samples.

## LS-SVM

In [12]:
#steps to compute alpha and b
#first, computation of the 'omega' matrix
gamma = 1
positives_array = reliable_positives.drop(['class','cluster'], axis=1)
negatives_array = reliable_negatives.drop(['class', 'cluster'], axis=1)
data_svm = np.vstack((positives_array,negatives_array))
n_reliable = data_svm.shape[0]
outcome = data_svm[:,8]
data_svm = data_svm[:,:8]
omega = np.empty((n_reliable,n_reliable))
for k in range(n_reliable):
    for i in range(n_reliable):
        omega[k,i] = outcome[k]*outcome[i]*rbf(x=data_svm[k,:],y=data_svm[i,:],l=10)
#now, computation of the rest of the matrix
first_row = np.hstack((0,-np.transpose(outcome)))
first_row = first_row.reshape(1,first_row.shape[0])
bot_of_mat_right = omega + (1/gamma)*np.eye(n_reliable)
bot_of_mat = np.hstack((outcome.reshape(n_reliable,1), bot_of_mat_right))
whole_mat = np.vstack((first_row, bot_of_mat))
right_hand = np.ones(n_reliable+1)
right_hand[0] = 0

In [13]:
#now, we get the coefficients by solving the set of linear equations
coeffs = np.linalg.solve(a=whole_mat,b=right_hand)
b = coeffs[0]
alpha = coeffs[1:coeffs.shape[0]]

In [None]:
#first test/iteration
test_data = mushrooms.drop(['class','label','cluster'], axis=1).to_numpy()
results = np.empty(n_samples)
for i in tqdm(range(n_samples)):
    results[i] = np.sign(np.sum(alpha*outcome*rbf(x=data_svm,y=test_data[i,:],l=10))+b)


 29%|██▊       | 15436/54035 [05:46<13:14, 48.56it/s]

In [None]:
#computation of the precision
results_bis = results
for i in range(n_samples):
    if results[i] == -1:
        results[i] = 0
positives = np.sum(results)
true_positives = 0
mushrooms['result'] = results
for i in range(n_samples):
    if mushrooms.loc[i,'class'] == 1 and results[i] == 1:
        true_positives += 1
precision = true_positives/positives
print(precision)

In [None]:
#compared to the simple svm and the gmm, it is a lot better

In [None]:
#but not many points labeled positive

In [None]:
negative_cluster

In [None]:
reliable_negatives['class'].sum()/reliable_negatives.shape[0]

In [None]:
#little pb extracting reliable negatives as the rate is not very good

In [None]:
true_positives/mushrooms['class'].sum()

In [None]:
#the true positive rate is quite low

In [None]:
b