In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import bernoulli
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.svm import SVC

In [2]:
def rbf(x,y,l=1):
    """Gaussian kernel

    Parameters
    -------------------------------
    x : float
    a real number

    y : float
    a real number

    l: float, non zero
    a scale parameter
    -------------------------------
    """
    dim = x.shape[0]
    vect = np.empty(dim)
    if dim == y.shape[0]  :
        d = np.exp(-((np.linalg.norm(x-y))**2)/(2*(l**2)))
        return d
    else :
        for i in range(dim):
            vect[i] = np.exp(-((np.linalg.norm(x[i] - y))**2)/(2*(l**2)))
        return vect

In [3]:
raisin = pd.read_csv('raisin.csv')

In [4]:
n_samples = raisin.shape[0]
for i in range(n_samples):
    if raisin.loc[i,'Class'] == 'Kecimen':
        raisin.loc[i,'Class'] = 1
    else:
        raisin.loc[i,'Class'] = -1

In [5]:
raisin

Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,Extent,Perimeter,Class
0,87524,442.246011,253.291155,0.819738,90546,0.758651,1184.040,1
1,75166,406.690687,243.032436,0.801805,78789,0.684130,1121.786,1
2,90856,442.267048,266.328318,0.798354,93717,0.637613,1208.575,1
3,45928,286.540559,208.760042,0.684989,47336,0.699599,844.162,1
4,79408,352.190770,290.827533,0.564011,81463,0.792772,1073.251,1
...,...,...,...,...,...,...,...,...
895,83248,430.077308,247.838695,0.817263,85839,0.668793,1129.072,-1
896,87350,440.735698,259.293149,0.808629,90899,0.636476,1214.252,-1
897,99657,431.706981,298.837323,0.721684,106264,0.741099,1292.828,-1
898,93523,476.344094,254.176054,0.845739,97653,0.658798,1258.548,-1


In [6]:
raisin.columns = ['area', 'maj_length','min_length', 'eccentricity','convex','extent',
                  'perimeter','class']
class_kept = raisin['class']

In [7]:
raisin_positive = raisin[raisin['class'] == 1]
raisin_negative = raisin[raisin['class'] == -1]
#First, the cluster step, to 'initialize' the labels, and the creation of unlabeled data
raisin_copy = raisin.drop(['class'], axis=1)
raisin_copy = StandardScaler().fit_transform(X=raisin_copy)
n_samples = raisin.shape[0]
raisin = pd.DataFrame(raisin_copy)
raisin.columns = ['area', 'maj_length','min_length', 'eccentricity','convex','extent',
                  'perimeter']
raisin['class'] = class_kept 
raisin['label'] = np.ones(n_samples)*(-1)

for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if raisin.loc[i,'class'] == 1 and random == 0:
        raisin.loc[i,'label'] = 1
    else:
        raisin.loc[i,'label'] = -1

labels_ref = raisin['label'].copy().to_numpy()

mean_positive_labels = raisin.drop(['class'], axis=1)[raisin['label'] == 1].to_numpy()[:,:7].mean(axis=0)
other_centroids = np.random.uniform(low=-1,high=1, size=(3,7))
initial_points = np.vstack((mean_positive_labels,other_centroids))
n_cluster = 4
clusterized_data = KMeans(n_clusters=n_cluster, init=initial_points).fit(raisin_copy)
raisin['cluster'] = clusterized_data.labels_


for i in range(n_samples):
    if raisin.loc[i,'label'] == 1:
        raisin.loc[i,'label'] = 1
    else:
        raisin.loc[i,'label'] = 0
positive_cluster = np.argmax(raisin[['label','cluster']].groupby('cluster').sum().to_numpy())

list_dist = np.zeros(n_cluster)
for i in range(n_cluster):
    list_dist[i] = np.linalg.norm(positive_cluster-clusterized_data.cluster_centers_[i,:])
negative_cluster = np.argmax(list_dist)

raisin['label'] = labels_ref
reliable_positives = raisin[raisin['cluster'] == positive_cluster]
reliable_positives = reliable_positives[reliable_positives['label'] == 1]
reliable_negatives = raisin[raisin['cluster'] == negative_cluster]
reliable_negatives = reliable_negatives[reliable_negatives['label'] == -1]



In [8]:
#first svm part
gamma = 1
positives_array = reliable_positives.drop(['class','cluster'], axis=1)
negatives_array = reliable_negatives.drop(['class', 'cluster'], axis=1)
data_svm = np.vstack((positives_array,negatives_array))
n_reliable = data_svm.shape[0]
outcome = data_svm[:,7]
data_svm = data_svm[:,:7]
omega = np.empty((n_reliable,n_reliable))
for k in range(n_reliable):
    for i in range(k,n_reliable):
        omega[k,i] = outcome[k]*outcome[i]*rbf(x=data_svm[k,:],y=data_svm[i,:])
omega_t = np.transpose(omega)
omega = omega_t + omega
for i in range(n_reliable):
    omega[i,i] = 1


#now, computation of the rest of the matrix
first_row = np.hstack((0,-np.transpose(outcome)))
first_row = first_row.reshape(1,first_row.shape[0])
bot_of_mat_right = omega + (1/gamma)*np.eye(n_reliable)
bot_of_mat = np.hstack((outcome.reshape(n_reliable,1), bot_of_mat_right))
whole_mat = np.vstack((first_row, bot_of_mat))
right_hand = np.ones(n_reliable+1)
right_hand[0] = 0

#we get the coefficients
coeffs = np.linalg.solve(a=whole_mat,b=right_hand)
b = coeffs[0]
alpha = coeffs[1:coeffs.shape[0]]

#now we compute the wt \phi(x) and then we order them 
test_data = raisin.drop(['class','label','cluster'], axis=1).to_numpy()
results = np.empty(n_samples)
for i in tqdm(range(n_samples)):
    results[i] = np.sum(alpha*outcome*rbf(x=data_svm,y=test_data[i,:]))
sorted_results = np.sort(results)
good_ratio = int(n_samples/2)
b = sorted_results[good_ratio]

last_results = np.empty(n_samples)
for i in range(n_samples):
    last_results[i] = np.sign(results[i] - b)

raisin['it_results'] = last_results
correct_with_b = 0
for i in range(reliable_positives.shape[0]):
    if raisin.loc[reliable_positives.index[i],'it_results'] == 1:
        correct_with_b += 1
missclass = reliable_positives.shape[0] - correct_with_b

100%|██████████| 900/900 [00:00<00:00, 1139.00it/s]


In [9]:
positive = 0
true_positive = 0
for i in range(n_samples):
    if last_results[i] == 1 and raisin.loc[raisin.index[i],'class'] == 1:
        true_positive += 1
for i in range(n_samples):
    if last_results[i] == 1:
        positive += 1
print(true_positive/positive, positive)

0.8329621380846325 449


In [10]:
data_sklearn = raisin.to_numpy()[:,:7]
labels_sklearn = np.zeros(n_samples)
for i in range(n_samples):
    if raisin.to_numpy()[i,7] == 1:
        labels_sklearn[i] = 1
svc_sklearn = SVC().fit(X=data_sklearn,y=labels_sklearn)

In [11]:
predictions_sklearn = svc_sklearn.predict(data_sklearn)

In [12]:
true_positives_sklearn = 0
positives_sklearn = np.sum(predictions_sklearn)
for i in range(n_samples):
    if raisin.loc[i,'class'] == 1 and predictions_sklearn[i] == 1:
        true_positives_sklearn += 1

In [13]:
print(true_positives_sklearn/positives_sklearn)

0.8397565922920892


In [14]:
#the performance is very close to the unlabeled case