In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import bernoulli
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.svm import SVC

In [2]:
def rbf(x,y,l=1):
    """Gaussian kernel

    Parameters
    -------------------------------
    x : float
    a real number

    y : float
    a real number

    l: float, non zero
    a scale parameter
    -------------------------------
    """
    dim = x.shape[0]
    vect = np.empty(dim)
    if dim == y.shape[0]  :
        d = np.exp(-((np.linalg.norm(x-y)))/(2*(l**2)))
        return d
    else :
        for i in range(dim):
            vect[i] = np.exp(-((np.linalg.norm(x[i] - y)))/(2*(l**2)))
        return vect

In [3]:
raisin_whole = pd.read_csv('raisin.csv')
raisin = raisin_whole.sample(frac=0.8)
list_train = raisin.index.copy()
list_test = []
for i in raisin_whole.index:
    if i not in list_train:
        list_test.append(i)
raisin_test = raisin_whole.filter(items=list_test, axis=0)

raisin = raisin.reset_index(drop=True)
raisin_test = raisin_test.reset_index(drop=True)

In [4]:
n_samples = raisin.shape[0]
raisin = raisin.replace('Kecimen', 1)
raisin = raisin.replace('Besni', -1)

  raisin = raisin.replace('Besni', -1)


In [5]:
raisin.columns = ['area', 'maj_length','min_length', 'eccentricity','convex','extent',
                  'perimeter','class']
class_kept = raisin['class']

In [6]:
raisin_positive = raisin[raisin['class'] == 1]
raisin_negative = raisin[raisin['class'] == -1]
#First, the cluster step, to 'initialize' the labels, and the creation of unlabeled data
raisin_copy = raisin.drop(['class'], axis=1)
raisin_copy = StandardScaler().fit_transform(X=raisin_copy)
n_samples = raisin.shape[0]
raisin = pd.DataFrame(raisin_copy)
raisin.columns = ['area', 'maj_length','min_length', 'eccentricity','convex','extent',
                  'perimeter']
raisin['class'] = class_kept 
raisin['label'] = np.ones(n_samples)*(-1)

for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if raisin.loc[i,'class'] == 1 and random == 0:
        raisin.loc[i,'label'] = 1
    else:
        raisin.loc[i,'label'] = -1

labels_ref = raisin['label'].copy().to_numpy()

mean_positive_labels = raisin.drop(['class'], axis=1)[raisin['label'] == 1].to_numpy()[:,:7].mean(axis=0)
centroid_2 = -mean_positive_labels
centroid_3 = np.zeros(7)
centroid_4 = np.ones(7)
initial_points = np.vstack((mean_positive_labels,centroid_2,centroid_3,centroid_4))

n_cluster = 4
clusterized_data = KMeans(n_clusters=n_cluster, init=initial_points).fit(raisin_copy)
raisin['cluster'] = clusterized_data.labels_


for i in range(n_samples):
    if raisin.loc[i,'label'] == 1:
        raisin.loc[i,'label'] = 1
    else:
        raisin.loc[i,'label'] = 0
positive_cluster = np.argmax(raisin[['label','cluster']].groupby('cluster').sum().to_numpy())

list_dist = np.zeros(n_cluster)
for i in range(n_cluster):
    list_dist[i] = np.linalg.norm(positive_cluster-clusterized_data.cluster_centers_[i,:])
negative_cluster = np.argmax(list_dist)

raisin['label'] = labels_ref
reliable_positives = raisin[raisin['cluster'] == positive_cluster]
reliable_positives = reliable_positives[reliable_positives['label'] == 1]
reliable_negatives = raisin[raisin['cluster'] == negative_cluster]
reliable_negatives = reliable_negatives[reliable_negatives['label'] == -1]

In [7]:
#first svm part
gamma = 1
positives_array = reliable_positives.drop(['class','cluster'], axis=1)
negatives_array = reliable_negatives.drop(['class', 'cluster'], axis=1)
data_svm = np.vstack((positives_array,negatives_array))
n_reliable = data_svm.shape[0]
outcome = data_svm[:,7]
data_svm = data_svm[:,:7]
omega = np.zeros((n_reliable,n_reliable))
for k in range(n_reliable):
    for i in range(k,n_reliable):
        omega[k,i] = outcome[k]*outcome[i]*rbf(x=data_svm[k,:],y=data_svm[i,:])
omega_t = np.transpose(omega)
omega = omega_t + omega
for i in range(n_reliable):
    omega[i,i] = 1


#now, computation of the rest of the matrix
first_row = np.hstack((0,-np.transpose(outcome)))
first_row = first_row.reshape(1,first_row.shape[0])
bot_of_mat_right = omega + (1/gamma)*np.eye(n_reliable)
bot_of_mat = np.hstack((outcome.reshape(n_reliable,1), bot_of_mat_right))
whole_mat = np.vstack((first_row, bot_of_mat))
right_hand = np.ones(n_reliable+1)
right_hand[0] = 0

#we get the coefficients
coeffs = np.linalg.solve(a=whole_mat,b=right_hand)
b = coeffs[0]
alpha = coeffs[1:coeffs.shape[0]]

#now we compute the wt \phi(x) and then we order them 
test_data = raisin.drop(['class','label','cluster'], axis=1).to_numpy()
results = np.empty(n_samples)
for i in tqdm(range(n_samples)):
    results[i] = np.sum(alpha*outcome*rbf(x=data_svm,y=test_data[i,:]))
sorted_results = np.sort(results)
good_ratio = int(n_samples/2)
b = sorted_results[good_ratio]

last_results = np.empty(n_samples)
for i in range(n_samples):
    last_results[i] = np.sign(results[i] - b)

raisin['it_results'] = last_results
correct_with_b = 0
for i in range(reliable_positives.shape[0]):
    if raisin.loc[reliable_positives.index[i],'it_results'] == 1:
        correct_with_b += 1
missclass = reliable_positives.shape[0] - correct_with_b

100%|██████████| 720/720 [00:00<00:00, 1583.40it/s]


In [8]:
positive = 0
true_positive = 0
for i in range(n_samples):
    if last_results[i] == 1 and raisin.loc[raisin.index[i],'class'] == 1:
        true_positive += 1
for i in range(n_samples):
    if last_results[i] == 1:
        positive += 1
print(true_positive/positive, positive)

0.8272980501392758 359


In [9]:
data_sklearn = raisin.to_numpy()[:,:7]
labels_sklearn = np.zeros(n_samples)
for i in range(n_samples):
    if raisin.to_numpy()[i,7] == 1:
        labels_sklearn[i] = 1
svc_sklearn = SVC().fit(X=data_sklearn,y=labels_sklearn)

In [10]:
predictions_sklearn = svc_sklearn.predict(data_sklearn)

In [11]:
true_positives_sklearn = 0
positives_sklearn = np.sum(predictions_sklearn)
for i in range(n_samples):
    if raisin.loc[i,'class'] == 1 and predictions_sklearn[i] == 1:
        true_positives_sklearn += 1

In [12]:
print(true_positives_sklearn/positives_sklearn)

0.8389610389610389


In [13]:
#the performance is very close to the unlabeled case

In [14]:
compteur = 0
max_iter = 500
while missclass!=0 and compteur<max_iter:
    compteur += 1
    b = (1+0.05)*b
    last_results = np.empty(n_samples)
    for i in range(n_samples):
        last_results[i] = np.sign(results[i] - b)

    raisin['it_results'] = last_results
    correct_with_b = 0
    for i in range(reliable_positives.shape[0]):
        if raisin.loc[reliable_positives.index[i],'it_results'] == 1:
            correct_with_b += 1
    missclass = reliable_positives.shape[0] - correct_with_b

In [15]:
raisin_it = raisin

In [16]:
compteur=0
max_iter=10
good_ratio = int(n_samples/2)
while True and compteur<max_iter:
    compteur+=1
    for i in range(n_samples):
        if raisin_it.loc[raisin.index[i],'it_results'] == 0:
            raisin_it.loc[raisin.index[i],'it_results'] = -1
    positives_new = raisin_it[raisin_it['it_results'] == 1]
    positives_new = positives_new[positives_new['label'] == 1]
    negatives_new = raisin_it[raisin_it['it_results'] == -1]
    negatives_new = negatives_new[negatives_new['label'] == -1]
    negatives_new = negatives_new.sample(n=positives_new.shape[0])
    #first svm part
    gamma = 1
    positives_array_new = positives_new.drop(['class','cluster','label'], axis=1)
    negatives_array_new = negatives_new.drop(['class', 'cluster','label'], axis=1)
    data_svm_it = np.vstack((positives_array_new,negatives_array_new))
    n_reliable = data_svm_it.shape[0]
    outcome_it = data_svm_it[:,7].copy()
    data_svm_it = data_svm_it[:,:7].copy()
    #compute omega
    omega_it = np.zeros((n_reliable,n_reliable))
    for k in range(n_reliable):
        for i in range(k,n_reliable):
            omega_it[k,i] = outcome_it[k]*outcome_it[i]*rbf(x=data_svm_it[k,:],y=data_svm_it[i,:])
    omega_it_t = np.transpose(omega_it)
    omega_it = omega_it+omega_it_t
    for i in range(n_reliable):
        omega_it[i,i] = 1
    
    first_row_it = np.hstack((0,-np.transpose(outcome_it)))
    first_row_it = first_row_it.reshape(1,first_row_it.shape[0])
    bot_of_mat_right_it = omega_it + (1/gamma)*np.eye(n_reliable)
    bot_of_mat_it = np.hstack((outcome_it.reshape(n_reliable,1), bot_of_mat_right_it))
    whole_mat_it = np.vstack((first_row_it, bot_of_mat_it))
    right_hand_it = np.ones(n_reliable+1)
    right_hand_it[0] = 0
    coeffs_it = np.linalg.solve(a=whole_mat_it,b=right_hand_it)
    b_it = coeffs_it[0]
    alpha_it = coeffs_it[1:coeffs_it.shape[0]]
    test_data_it = raisin_it.drop(['class','label','cluster','it_results'], axis=1).to_numpy()
    results_new = np.empty(n_samples)
    #the results in the previous algo is now 'new_results'

    
    for i in range(n_samples):
        results_new[i] = np.sum(alpha_it*outcome_it*rbf(x=data_svm_it,y=test_data_it[i,:]))
    sorted_results_it = np.sort(results_new)
    b_it = sorted_results_it[good_ratio]
    last_results_it = np.empty(n_samples)
    for i in range(n_samples):
        last_results_it[i] = np.sign(results_new[i] - b)
    correct_with_b_it = 0 
    for i in range(raisin_it[raisin['label'] == 1].shape[0]):
        if last_results_it[i] == 1:
            correct_with_b_it += 1
    missclass_it = positives_new.shape[0] - correct_with_b_it
    compteur_bis = 0
    max_iter_bis = 200
    while missclass_it!=0 and compteur_bis<max_iter_bis:
        compteur_bis += 1
        b_it = (1-0.05)*b_it
        last_results_bis = np.empty(n_samples)
        for i in range(n_samples):
            last_results_bis[i] = np.sign(results_new[i] - b_it)
        correct_with_b_bis = 0

        for i in range(raisin_it[raisin['label'] == 1].shape[0]):
            if last_results_bis[i] == 1:
                correct_with_b_bis += 1
        missclass_it = positives_new.shape[0] - correct_with_b_bis
    stop_counter = 0
    for i in range(n_samples):
        if raisin_it.loc[raisin.index[i],'it_results'] != last_results_bis[i]:
            stop_counter += 1
    if stop_counter == 0:
        break
        compteur = max_iter
    else:
        raisin_it['it_results'] = last_results_bis

In [17]:
raisin_test_class = raisin_test['Class'].copy()
for i in range(raisin_test.shape[0]):
    if raisin_test_class[i] == 'Kecimen':
        raisin_test_class[i] = 1
    else:
        raisin_test_class[i] = -1
raisin_test_class = np.array(raisin_test_class)
raisin_test = raisin_test.to_numpy()[:,:-1]
raisin_test = StandardScaler().fit_transform(X=raisin_test)
data_svm_test = raisin_test

In [18]:
results_test = np.empty(raisin_test.shape[0])
for i in range(raisin_test.shape[0]):
        results_test[i] = np.sign(np.sum(alpha_it*outcome_it*rbf(x=data_svm_it,y=data_svm_test[i,:],l=10))-b_it)
positive_test = 0
true_positive_test = 0
for i in range(raisin_test.shape[0]):
    if results_test[i] == 1:
        positive_test += 1
        if raisin_test_class[i] == 1:
            true_positive_test += 1

In [19]:
precision_it_svm_test = true_positive_test/positive_test
number_of_positive = 0
for i in range(raisin_test.shape[0]):
    if raisin_test_class[i] == 1:
        number_of_positive += 1
recall_it_svm_test = true_positive_test/number_of_positive
f_1_score_test = (2*precision_it_svm_test*recall_it_svm_test)/(precision_it_svm_test+recall_it_svm_test)
weird_estim_it_svm_test = (recall_it_svm_test**2)/(positive_test/number_of_positive)

In [21]:
print('the precision on the test data is : ', precision_it_svm_test,'\n',
      'the recall on the test data is : ', recall_it_svm_test, '\n', 
      'the f_1 score on the test data is :', f_1_score_test, '\n', 
      'the weird metric on the test data is : ', weird_estim_it_svm_test)

the precision on the test data is :  0.7058823529411765 
 the recall on the test data is :  1.0 
 the f_1 score on the test data is : 0.8275862068965517 
 the weird metric on the test data is :  0.7058823529411764


In [22]:
#it is actually not bad at all

In [23]:
predictions_test_sklearn = svc_sklearn.predict(raisin_test)

In [24]:
positive_last = 0
true_positive_last = 0
for i in range(raisin_test.shape[0]):
    if predictions_test_sklearn[i] == 1:
        positive_last += 1
        if raisin_test_class[i] == 1:
            true_positive_last += 1

precision_oracle = true_positive_last/positive_last
recall_oracle = true_positive_last/number_of_positive
f_1_score_oracle = (2*precision_oracle*recall_oracle)/(precision_oracle+recall_oracle)
weird_estim_oracle = (recall_oracle**2)/(positive_last/number_of_positive)

In [25]:
print('precision of sklearn svm on test data : ', precision_oracle, '\n', 
      'recall of sklearn svm on test data : ', recall_oracle, '\n', 
      'f_1 score of sklearn svm on test data : ', f_1_score_oracle, '\n', 
      'weird metric of sklearn svm on test data : ', weird_estim_oracle)

precision of sklearn svm on test data :  0.8725490196078431 
 recall of sklearn svm on test data :  0.9270833333333334 
 f_1 score of sklearn svm on test data :  0.898989898989899 
 weird metric of sklearn svm on test data :  0.8089256535947713


In [26]:
#a svm trained on a fully labeled dataset only has 8% more precision than the iterative ls-svm on unlabeled
#data, which is not bad knowing that the data is not fully separable