In [1]:
%%capture
import pandas as pd
import numpy as np
from scipy.stats import bernoulli
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
!pip install qpsolvers
import qpsolvers
!pip install qpsolvers[cvxopt]
!pip install qpsolvers[open_source_solvers]
!pip install qpsolvers[clarabel]
from qpsolvers import solve_qp

In [2]:
raisin_whole_data = pd.read_csv('raisin.csv')
raisin_whole_data.columns = ['area','majlength','minlength','eccentric','cvx',
                             'extent','perimeter','class']
classes_to_keep = raisin_whole_data['class'].copy()
raisin_whole_data = raisin_whole_data.drop(['class'], axis=1)
raisin_whole_data = raisin_whole_data.to_numpy()
raisin_whole_data = StandardScaler().fit_transform(X=raisin_whole_data)
raisin_whole_data = pd.DataFrame(raisin_whole_data)
raisin_whole_data.columns = ['area','majlength','minlength','eccentric','cvx',
                             'extent','perimeter']
raisin_whole_data['class'] = classes_to_keep
raisin_whole_data = raisin_whole_data.replace('Besni',-1)
raisin_whole_data = raisin_whole_data.replace('Kecimen',1)
raisin, raisin_test = train_test_split(raisin_whole_data, train_size=0.8)

n_samples = raisin.shape[0]
n_samples_test = raisin_test.shape[0]

raisin = raisin.reset_index(drop=True)
raisin_test = raisin_test.reset_index(drop=True)

  raisin_whole_data = raisin_whole_data.replace('Kecimen',1)


In [3]:
def rbf(x,y,l=1):
    """Gaussian kernel

    Parameters
    -------------------------------
    x : float
    a real number

    y : float
    a real number

    l: float, non zero
    a scale parameter
    -------------------------------
    """
    dim = x.shape[0]
    vect = np.empty(dim)
    if dim == y.shape[0]  :
        d = np.exp((-1)*((np.linalg.norm(x-y))/(2*(l**2))))
        return d
    else :
        for i in range(dim):
            vect[i] = np.exp((-1)*(np.linalg.norm(x[i] - y))/(2*(l**2)))
        return vect

In [4]:
np.random.seed(1452234)
label = np.zeros(n_samples)
for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if raisin.loc[i,'class'] == 1 and random == 0:
        label[i] = 1
    else:
        label[i] = -1
raisin['label'] = label



svm_train = SVC(kernel='sigmoid', probability = True,
               gamma = 1).fit(X=raisin.to_numpy()[:,:-2], y=raisin.to_numpy()[:,-1])
probas = svm_train.predict_proba(raisin.to_numpy()[:,:-2])

In [5]:
proba_gap = np.zeros(n_samples)
for i in range(n_samples):
    proba_gap[i] = probas[i,1] - probas[i,0]


raisin['proba_gap'] = proba_gap

n_min = 3 #as in the article

l_boundary = np.mean(np.sort(raisin[raisin['label'] == 1]['proba_gap'])[:n_min])


relab = np.empty(n_samples)
for i in range(n_samples):
    if raisin.loc[i,'proba_gap'] < l_boundary:
        relab[i] = -1
    elif raisin.loc[i,'label'] == 1 or raisin.loc[i,'proba_gap'] >= 0:
        relab[i] = 1
    else:
        relab[i] = 0
raisin['relab'] = relab
n_new_pos = raisin[raisin['relab'] == 1].shape[0]
n_new_ne = raisin[raisin['relab'] == -1].shape[0]

if n_new_pos > n_new_ne :
    ordered_raisin = raisin.sort_values('proba_gap')
    ordered_raisin = ordered_raisin.reset_index(drop=True)
    negatively_relab = 0
    position = 0

    while negatively_relab < n_new_pos-n_new_ne:
        if ordered_raisin.loc[position,'relab'] == 0:
            ordered_raisin.loc[position,'relab'] = -1
            position += 1
            negatively_relab += 1
        else:
            position+=1
elif n_new_pos==n_new_ne:
    print('luck')

else:
    ordered_raisin = raisin.sort_values('proba_gap',ascending=False)
    ordered_raisin = ordered_raisin.reset_index(drop=True)
    positively_relab = 0
    position = 0
    while positively_relab<n_new_ne-n_new_pos:
        if ordered_raisin.loc[position,'relab'] == 0:
            ordered_raisin.loc[position,'relab'] = 1
            position += 1
            positively_relab += 1
        else:
            position += 1

raisin = ordered_raisin
print(raisin[raisin['relab'] == 1].shape[0], raisin[raisin['relab'] == -1].shape[0])

115 115


In [6]:
B=2
labeled_data = raisin[raisin['relab'] != 0].copy()
output_labeled = labeled_data['relab'].to_numpy()
list_of_index = labeled_data.index
labeled_data = labeled_data.reset_index(drop=True)
labeled_data = labeled_data.to_numpy()[:,:-4]
unlabeled_data = raisin.drop(index=list_of_index,axis=0)
unlabeled_data = unlabeled_data.to_numpy()[:,:-4]
n_unlabeled = unlabeled_data.shape[0]
n_labels = labeled_data.shape[0]
capital_k = np.zeros((n_labels,n_labels))
kappa = np.zeros(n_labels)


#construction of capital_k
for i in range(n_labels):
    for j in range(i,n_labels):
        capital_k[i,j] = rbf(x=labeled_data[i,:],y=labeled_data[j,:])

capital_k = capital_k + capital_k.T
for i in range(n_labels):
    capital_k[i,i] = 1

capital_k[np.where(np.isnan(capital_k) == True)] = 0

#construction of kappa
ratio_lab_unlab = n_labels/n_unlabeled

for i in range(n_labels):
    vector = np.empty(n_unlabeled)
    for k in range(n_unlabeled):
        vector[k] = rbf(x=labeled_data[i,:],y=unlabeled_data[k,:])    
    kappa[i] = ratio_lab_unlab*np.sum(vector)

kappa = -kappa



ones_transposed = np.ones(n_labels).reshape(1,n_labels)
a_mat = np.vstack((ones_transposed,ones_transposed*-1,
                   np.eye(n_labels),np.eye(n_labels)*-1))
epsilon = (np.sqrt(n_labels)-1)/np.sqrt(n_labels)
ub_mat = np.vstack((n_labels*(1+epsilon),n_labels*(epsilon-1),
                    np.ones(n_labels).reshape(n_labels,1)*B,
                    np.zeros(n_labels).reshape(n_labels,1)))



beta_opti = solve_qp(P=capital_k,q=kappa,G=a_mat,h=ub_mat,solver='cvxopt')


svm_weighted = SVC().fit(X=labeled_data,y=output_labeled,sample_weight=beta_opti)

predictions_weighted = svm_weighted.predict(raisin_test.to_numpy()[:,:-1])

positive = 0
true_positive = 0
for i in range(n_samples_test):
    if predictions_weighted[i] == 1:
        positive += 1
        if raisin.loc[i,'class'] == 1:
            true_positive += 1

precision_pgpu = true_positive/positive
recall_pgpu = true_positive/raisin_test[raisin_test['class'] == 1].shape[0]
f_1_pgpu = (2*precision_pgpu*recall_pgpu)/(precision_pgpu+recall_pgpu)
weird_estim_pgpu = (recall_pgpu**2)/(positive/raisin_test.shape[0])
print('the precision of pgpu on test set is :', precision_pgpu, '\n',
      'the recall of pgpu on test set is :', recall_pgpu, '\n', 
      'the f_1 score of pgpu on test set is :', f_1_pgpu, '\n', 
      'the weird metric of pgpu on test set is :', weird_estim_pgpu)

the precision of pgpu on test set is : 0.7456140350877193 
 the recall of pgpu on test set is : 0.8673469387755102 
 the f_1 score of pgpu on test set is : 0.8018867924528301 
 the weird metric of pgpu on test set is : 1.187827440320919


In [7]:
full_data = pd.read_csv('raisin.csv')
full_data = full_data.replace('Besni',-1)
full_data = full_data.replace('Kecimen',1)
full_data_output = full_data['Class'].to_numpy()
full_data = full_data.to_numpy()[:,:-1]
svm_full = SVC(kernel = 'rbf').fit(X=full_data,y=full_data_output)
predictions_full = svm_full.predict(full_data)
positive_full = 0
true_positive_full = 0
for i in range(full_data.shape[0]):
    if predictions_full[i] == 1:
        positive_full += 1
        if full_data_output[i] == 1:
            true_positive_full += 1
precision_whole = true_positive_full/positive_full
print(precision_whole)

0.7808219178082192


  full_data = full_data.replace('Kecimen',1)


In [8]:
#so the data is not fully separable
#so now let's see what happens with training and test

In [9]:
svm_train = SVC().fit(X=raisin.to_numpy()[:,:-4],y=raisin.to_numpy()[:,-4])

In [10]:
predictions_test = svm_train.predict(raisin_test.to_numpy()[:,:-1])

In [11]:
positive_test_full = 0
true_positive_test_full = 0
for i in range(raisin_test.shape[0]):
    if predictions_test[i] == 1:
        positive_test_full += 1
        if raisin_test.loc[i,'class'] == 1:
            true_positive_test_full += 1

precision_test_labeled = true_positive_test_full/positive_test_full
recall_test = true_positive_test_full/raisin_test[raisin_test['class'] == 1].shape[0]
f_1_test = (2*precision_test_labeled*recall_test)/(precision_test_labeled+recall_test)
weird_metric_test = (recall_test**2)/(positive_test_full/raisin_test[raisin_test['class'] == 1].shape[0])
print('precision : ', precision_test_labeled, '\n', 
      'recall : ', recall_test, '\n', 
      'f_1 score on test : ', f_1_test, '\n',
      'weird metric on test : ', weird_metric_test)

precision :  0.8095238095238095 
 recall :  0.8673469387755102 
 f_1 score on test :  0.8374384236453202 
 weird metric on test :  0.7021379980563653


In [12]:
print(true_positive, '\n', true_positive_test_full)

85 
 85


In [13]:
print(positive, '\n', positive_test_full)

114 
 105


In [14]:
print('precision for the pgpu method : ', precision_pgpu, '\n',
      'precision on the whole data : ', precision_whole, '\n',
      'test precision : ', precision_test_labeled)

precision for the pgpu method :  0.7456140350877193 
 precision on the whole data :  0.7808219178082192 
 test precision :  0.8095238095238095


In [15]:
#so the precision is actually quite bad on the test data

In [16]:
positive

114

In [17]:
raisin_test.shape[0]

180

In [18]:
raisin.groupby('relab').count()

Unnamed: 0_level_0,area,majlength,minlength,eccentric,cvx,extent,perimeter,class,label,proba_gap
relab,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
-1.0,115,115,115,115,115,115,115,115,115,115
0.0,490,490,490,490,490,490,490,490,490,490
1.0,115,115,115,115,115,115,115,115,115,115


In [19]:
n_unlabeled

490

In [20]:
raisin[raisin['label'] == 1].shape[0]

87