In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from scipy.stats import bernoulli
from tqdm import tqdm

In [2]:
def rbf(x,y,l_squared=1):
    """Gaussian kernel

    Parameters
    -------------------------------
    x : float
    a real number

    y : float
    a real number

    l: float, non zero
    a scale parameter
    -------------------------------
    """
    dim = x.shape[0]
    vect = np.zeros(dim)
    type_x = x.shape
    type_y = y.shape
    if len(type_x) == len(type_y):
        d = np.exp(-((np.linalg.norm(x-y))**2)/(2*l_squared))
        return d
    else :
        for i in range(dim):
            vect[i] = np.exp(-((np.linalg.norm(x[i] - y))**2)/(2*l_squared))
        return vect


In [3]:
raisin_whole = pd.read_csv('raisin.csv')
raisin_whole = raisin_whole.replace('Kecimen', 1)
raisin_whole = raisin_whole.replace('Besni', -1)
classes_to_keep = raisin_whole['Class'].copy()
colnames = raisin_whole.drop(['Class'], axis=1).columns
raisin_whole = StandardScaler().fit_transform(X=raisin_whole.drop(['Class'], axis=1).to_numpy())
raisin_whole = pd.DataFrame(raisin_whole)
raisin_whole.columns = colnames
raisin_whole['Class'] = classes_to_keep
raisin, raisin_test = train_test_split(raisin_whole,train_size=0.8)
raisin = raisin.reset_index(drop=True)
raisin_test = raisin_test.reset_index(drop=True)

  raisin_whole = raisin_whole.replace('Besni', -1)


In [4]:
n_samples = raisin.shape[0]

In [5]:
raisin['label'] = np.zeros(n_samples)
for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if raisin.loc[i,'Class'] == 1 and random == 0:
        raisin.loc[i,'label'] = 1

n_cluster = 5
clustering = KMeans(n_clusters=n_cluster).fit(X=raisin.to_numpy()[:,:-2])
raisin['cluster'] = clustering.labels_

list_of_ratio = np.zeros(5)
for i in range(5):
    list_of_ratio[i] = raisin[raisin['cluster'] == i]['label'].sum()/raisin[raisin['cluster'] == i].shape[0]

positive_cluster = np.argmax(list_of_ratio)
    
#we cannot exactly compute the ratios because the classes are so unbalanced that in any cases the number of positive
#instances will be very small compared to the ones of unlabelled instances

list_of_dist = np.zeros(5)
for i in range(5):
    list_of_dist[i] = np.linalg.norm(clustering.cluster_centers_[positive_cluster,:] - clustering.cluster_centers_[i,:])

negative_cluster = np.argmax(list_of_dist)

In [6]:
for i in range(n_samples):
    if raisin.loc[i,'label'] == 0:
        raisin.loc[i,'label'] = -1

df_unlab_pop = raisin[raisin['label'] == -1]
list_of_pop = pd.DataFrame(df_unlab_pop.groupby('cluster')['Class'].count())
list_of_pop.columns = ['pop']
list_of_pop['dist'] = list_of_dist #distance to the positive cluster
list_of_pop = list_of_pop.sort_values('dist',ascending=False)
list_of_pop['cumsum'] = np.cumsum(list_of_pop['pop'])

In [7]:
reliable_positives = raisin[raisin['label'] == 1]
n_positives = reliable_positives.shape[0]
last_step = np.where(np.array(list_of_pop['cumsum'])>n_positives)[0][0]
index_ordered_distance = list(list_of_pop.index)
if last_step == 0:
    reliable_negatives = raisin[raisin['cluster'] == negative_cluster]
    reliable_negatives = reliable_negatives[reliable_negatives['label'] == -1]
else:
    compteur=0
    reliable_negatives = raisin[raisin['cluster'] == negative_cluster]
    reliable_negatives = reliable_negatives[reliable_negatives['label'] == -1]
    while compteur<last_step:
        interm_negatives = raisin[raisin['cluster'] == index_ordered_distance[compteur+1]]
        interm_negatives = interm_negatives[interm_negatives['label'] == -1]
        reliable_negatives = pd.concat([reliable_negatives,interm_negatives])
        compteur += 1
    del interm_negatives, compteur

#let's now delete the useless variables for the next steps
del df_unlab_pop, list_of_pop, last_step, clustering, classes_to_keep
del list_of_dist, index_ordered_distance

In [8]:
reliable_negatives = reliable_negatives.sample(n=n_positives)

In [9]:
#little precision that is quite 'funny', they use a 'random svm' for the first step of classification of the unlabelled
#instances

In [10]:
#Step of initialization of labels
train_clf_data = pd.concat([reliable_positives,reliable_negatives])
index_of_labels = list(train_clf_data.index)
unlabelled_data = raisin.drop(labels=index_of_labels,axis=0)
index_of_unlabelled = list(unlabelled_data.index)
first_step_clf = SVC().fit(X=train_clf_data.drop(['Class','label','cluster'],axis=1).to_numpy(),
                          y=train_clf_data['label'].to_numpy())
unlabelled_data['relab'] = first_step_clf.predict(unlabelled_data.drop(['Class','label','cluster'],axis=1).to_numpy())

In [11]:
gamma = 1
good_ratio = 1/2
max_iter = 100
compteur = 0
train_clf_data['relab'] = train_clf_data['label'].copy()
updated_data = pd.concat([train_clf_data,unlabelled_data])
updated_data['is_label'] = np.zeros(n_samples)
for i in range(n_samples):
    if i in index_of_labels:
        updated_data.loc[updated_data.index[i],'is_label'] = 1
updated_data = updated_data.reset_index(drop=True)
up_data_np = updated_data.to_numpy()[:,:-5].copy()




while compteur<max_iter:
    compteur += 1
    labels = updated_data['relab'].to_numpy().reshape(1,-1)
    first_row = np.hstack((np.array(0).reshape(1,1),(-1)*labels))
    
    #computation of omega and the coefficients
    omega = np.zeros((n_samples,n_samples))
    for i in range(n_samples):
        for k in range(i,n_samples):
            omega[i,k] = rbf(x=up_data_np[i,:],y=up_data_np[k,:],l_squared=1)*labels[0,i]*labels[0,k]
            omega[k,i] = omega[i,k]
        omega[i,i] = 1

    bot_right = omega + gamma*np.eye(n_samples)
    bot = np.hstack((updated_data['relab'].to_numpy().reshape(n_samples,1), bot_right))
    whole_mat = np.vstack((first_row, bot))
    
    del bot_right, bot, first_row

    right_side = np.vstack((np.zeros(1).reshape(1,1),np.ones(n_samples).reshape(n_samples,1)))

    coeffs = np.linalg.solve(a=whole_mat,b=right_side)


    alpha = coeffs[1:]

    #once we have the coefficients, we can compute the labels of the unlabelled instances

    train_clf_data = pd.concat([reliable_positives,reliable_negatives])
    index_of_labels = list(train_clf_data.index)
    unlabelled_data = raisin.drop(labels=index_of_labels,axis=0)
    index_of_unlabelled = list(unlabelled_data.index)

    to_det_b = np.zeros(n_samples)
    for i in range(n_samples):
        to_det_b[i] = np.sum(alpha*labels*rbf(x=up_data_np,y=up_data_np[i,:],l_squared=1))

    b = np.sort(to_det_b)[int(good_ratio*n_samples)]
    
    check_array = np.zeros(n_samples)
    count_diff = 0
    
    for i in range(n_samples):
        if i in index_of_labels:
            check_array[i] = 1
        else:
            check_array[i] = np.sign(to_det_b[i]-b)
            if check_array[i] != updated_data.loc[i,'relab']:
                count_diff += 1

    if count_diff == 0:
        break
    else:
        updated_data['relab'] = check_array    

In [12]:
positives = 0
true_positives = 0
unlabelled_test = updated_data[updated_data['is_label'] == 0]
for i in unlabelled_test.index:
    if unlabelled_test.loc[i,'relab'] == 1:
        positives += 1
        if unlabelled_test.loc[i,'Class'] == 1:
            true_positives += 1
    

In [13]:
precision = true_positives/positives
recall = true_positives/updated_data[updated_data['Class'] == 1].shape[0]
f_1 = (2*precision*recall)/(precision+recall)
weird = (recall**2)/(positives/n_samples)

In [14]:
print('precision : ', precision, '\n', 
      'recall : ', recall, '\n', 
      'f_1 : ', f_1, '\n', 
      'weird : ', weird)

precision :  0.7359154929577465 
 recall :  0.5789473684210527 
 f_1 :  0.648062015503876 
 weird :  0.8497522531309742


In [15]:
small_test = raisin_test

In [16]:
small_results = np.zeros(small_test.shape[0])
small_test_np = small_test.to_numpy()[:,:-1]

In [17]:
for i in tqdm(range(small_test.shape[0])):
    small_results[i] =np.sign(np.sum(alpha*labels*rbf(x=up_data_np,y=small_test_np[i,:],l_squared=1))-b)

100%|██████████| 180/180 [00:01<00:00, 154.73it/s]


In [18]:
pos_sm_t = 0
tp_sm_t = 0
for i in range(small_test.shape[0]):
    if small_results[i] == 1:
        pos_sm_t += 1
        if small_test.loc[small_test.index[i],'Class'] == 1:
            tp_sm_t += 1

precision_test = tp_sm_t/pos_sm_t
recall_test = tp_sm_t/raisin_test[raisin_test['Class'] == 1].shape[0]
f_1_test = (2*precision_test*recall_test)/(precision_test+recall_test)
weird_test = (recall_test**2)/(pos_sm_t/raisin_test.shape[0])

In [19]:
pos_sm_t

95

In [20]:
tp_sm_t

72

In [21]:
tp_sm_t/pos_sm_t

0.7578947368421053

In [22]:
print('test precision : ', precision_test, '\n', 
      'test recall : ', recall_test, '\n', 
      'f_1 test : ', f_1_test, '\n', 
      'weird test : ', weird_test)

test precision :  0.7578947368421053 
 test recall :  0.8089887640449438 
 f_1 test :  0.782608695652174 
 weird test :  1.240034817507093


In [23]:
compteur

12