In [1]:
%%capture
import pandas as pd
import numpy as np
from scipy.stats import bernoulli
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
!pip install qpsolvers
import qpsolvers
!pip install qpsolvers[cvxopt]
!pip install qpsolvers[open_source_solvers]
!pip install qpsolvers[clarabel]
from qpsolvers import solve_qp

In [2]:
mushroom_whole = pd.read_csv('mushroom.csv')
classes_to_keep = mushroom_whole['class'].copy()
mushroom_whole = mushroom_whole.drop(['class'], axis=1)
cols = mushroom_whole.columns.copy()
mushroom_whole = StandardScaler().fit_transform(X=mushroom_whole.to_numpy())
mushroom_whole = pd.DataFrame(mushroom_whole)
mushroom_whole.columns = cols
mushroom_whole['class'] = classes_to_keep
mushroom_data = mushroom_whole.sample(frac=0.8)
list_train = mushroom_data.index
list_test = []
for i in mushroom_whole.index:
    if i not in list_train:
        list_test.append(i)
mushroom_test = mushroom_whole.filter(items=list_test, axis=0)

mushroom_test = mushroom_test.reset_index(drop=True)
mushroom_data = mushroom_data.reset_index(drop=True)
n_samples = mushroom_data.shape[0]
n_samples_test = mushroom_test.shape[0]

In [3]:
def rbf(x,y,l=1):
    """Gaussian kernel

    Parameters
    -------------------------------
    x : float
    a real number

    y : float
    a real number

    l: float, non zero
    a scale parameter
    -------------------------------
    """
    dim = x.shape[0]
    vect = np.empty(dim)
    if dim == y.shape[0]  :
        d = np.exp((-1)*((np.linalg.norm(x-y))/(2*(l**2))))
        return d
    else :
        for i in range(dim):
            vect[i] = np.exp((-1)*(np.linalg.norm(x[i] - y))/(2*(l**2)))
        return vect

In [4]:
np.random.seed(1452234)
label = np.zeros(n_samples)
for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if mushroom_data.loc[i,'class'] == 1 and random == 0:
        label[i] = 1
    else:
        label[i] = -1
mushroom_data['label'] = label

svm_train = SVC(kernel='sigmoid', probability = True).fit(X=mushroom_data.to_numpy()[:,:-2],
                                                          y=mushroom_data.to_numpy()[:,-1])
probas = svm_train.predict_proba(mushroom_data.to_numpy()[:,:-2])

In [5]:
proba_gap = np.zeros(n_samples)
for i in range(n_samples):
    proba_gap[i] = probas[i,1] - probas[i,0]


mushroom_data['proba_gap'] = proba_gap

n_min = 3 #as in the article

l_boundary = np.mean(np.sort(mushroom_data[mushroom_data['label'] == 1]['proba_gap'])[:n_min])


relab = np.empty(n_samples)
for i in range(n_samples):
    if mushroom_data.loc[i,'proba_gap'] < l_boundary:
        relab[i] = -1
    elif mushroom_data.loc[i,'label'] == 1 or mushroom_data.loc[i,'proba_gap'] >= 0:
        relab[i] = 1
    else:
        relab[i] = 0
mushroom_data['relab'] = relab

In [6]:
B=1000
labeled_data = mushroom_data[mushroom_data['relab'] != 0].copy()
output_labeled = labeled_data['relab'].to_numpy()
list_of_index = labeled_data.index
labeled_data = labeled_data.reset_index(drop=True)
labeled_data = labeled_data.to_numpy()[:,:-4]
unlabeled_data = mushroom_data.drop(index=list_of_index,axis=0)
unlabeled_data = unlabeled_data.to_numpy()[:,:-4]
n_unlabeled = unlabeled_data.shape[0]
n_labels = labeled_data.shape[0]
capital_k = np.zeros((n_labels,n_labels))
kappa = np.zeros(n_labels)


#construction of capital_k
for i in range(n_labels):
    for j in range(i,n_labels):
        capital_k[i,j] = rbf(x=labeled_data[i,:],y=labeled_data[j,:])

capital_k = capital_k + capital_k.T
for i in range(n_labels):
    capital_k[i,i] = 1

capital_k[np.where(np.isnan(capital_k) == True)] = 0

#construction of kappa
ratio_lab_unlab = n_labels/n_unlabeled

for i in range(n_labels):
    vector = np.empty(n_unlabeled)
    for k in range(n_unlabeled):
        vector[k] = rbf(x=labeled_data[i,:],y=unlabeled_data[k,:])    
    kappa[i] = ratio_lab_unlab*np.sum(vector)

kappa = -kappa



ones_transposed = np.ones(n_labels).reshape(1,n_labels)
a_mat = np.vstack((ones_transposed,ones_transposed*-1,
                   np.eye(n_labels),np.eye(n_labels)*-1))
epsilon = (np.sqrt(n_labels)-1)/np.sqrt(n_labels)
ub_mat = np.vstack((n_labels*(1+epsilon),n_labels*(epsilon-1),
                    np.ones(n_labels).reshape(n_labels,1)*B,
                    np.zeros(n_labels).reshape(n_labels,1)))



beta_opti = solve_qp(P=capital_k,q=kappa,G=a_mat,h=ub_mat,solver='cvxopt')


svm_weighted = SVC().fit(X=labeled_data,y=output_labeled,sample_weight=beta_opti)

predictions_weighted = svm_weighted.predict(mushroom_test.to_numpy()[:,:-1])

positive = 0
true_positive = 0
for i in range(n_samples_test):
    if predictions_weighted[i] == 1:
        positive += 1
        if mushroom_test.loc[i,'class'] == 1:
            true_positive += 1

precision_pgpu = true_positive/positive
recall_pgpu = true_positive/mushroom_test[mushroom_test['class'] == 1].shape[0]
f_1_pgpu = (2*precision_pgpu*recall_pgpu)/(precision_pgpu+recall_pgpu)
weird_estim_pgpu = (recall_pgpu**2)/(positive/mushroom_test.shape[0])
print('the precision of pgpu on test set is :', precision_pgpu, '\n',
      'the recall of pgpu on test set is :', recall_pgpu, '\n', 
      'the f_1 score of pgpu on test set is :', f_1_pgpu, '\n', 
      'the weird metric of pgpu on test set is :', weird_estim_pgpu)

the precision of pgpu on test set is : 0.5612013600302229 
 the recall of pgpu on test set is : 0.9973145350788856 
 the f_1 score of pgpu on test set is : 0.7182400580200652 
 the weird metric of pgpu on test set is : 1.0152091328176223


In [7]:
positive

10588

In [8]:
mushroom_test.shape[0]

10807