In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import bernoulli
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
mushrooms = pd.read_csv('mushroom.csv')

In [3]:
def rbf(x,y,l=1):
    """Gaussian kernel

    Parameters
    -------------------------------
    x : float
    a real number

    y : float
    a real number

    l: float, non zero
    a scale parameter
    -------------------------------
    """
    dim = x.shape[0]
    vect = np.empty(dim)
    if dim == y.shape[0]  :
        d = np.exp(-((np.linalg.norm(x-y))**2)/(2*(l**2)))
        return d
    else :
        for i in range(dim):
            vect[i] = np.exp(-((np.linalg.norm(x[i] - y))**2)/(2*(l**2)))
        return vect

In [4]:
#First, the cluster step, to 'initialize' the labels, and the creation of unlabeled data
mushrooms = pd.read_csv('mushroom.csv')
mushrooms_copy = mushrooms.drop(['class'], axis=1)
mushrooms_copy = StandardScaler().fit_transform(X=mushrooms)
n_samples = mushrooms.shape[0]
mushrooms['label'] = np.ones(n_samples)*(-1)

for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if mushrooms.loc[i,'class'] == 1 and random == 0:
        mushrooms.loc[i,'label'] = 1
    else:
        mushrooms.loc[i,'label'] = -1

n_cluster = 8
clusterized_data = KMeans(n_clusters=n_cluster, init='random').fit(mushrooms_copy)
mushrooms['cluster'] = clusterized_data.labels_

list_of_ratio = []
for i in range(n_cluster):
    list_of_ratio.append(mushrooms[mushrooms['cluster'] == i]['label'].sum()/mushrooms[mushrooms['cluster'] == i]['class'].shape[0])
list_of_ratio = np.array(list_of_ratio)
positive_cluster = np.argmax(list_of_ratio)
negative_cluster = np.argmin(list_of_ratio)
print('the cluster containing positive values is : ', positive_cluster, '\n', 
      'the one containing negative exampels is : ', negative_cluster)

reliable_positives = mushrooms[mushrooms['cluster'] == positive_cluster]
reliable_positives = reliable_positives[reliable_positives['label'] == 1]
reliable_negatives = mushrooms[mushrooms['cluster'] == negative_cluster]
reliable_negatives = reliable_negatives[reliable_negatives['label'] == -1]
reliable_negatives = reliable_negatives.sample(n=reliable_positives.shape[0]) #to adjust the class balance ratio

the cluster containing positive values is :  3 
 the one containing negative exampels is :  1


In [5]:
#first svm part
gamma = 1
positives_array = reliable_positives.drop(['class','cluster'], axis=1)
negatives_array = reliable_negatives.drop(['class', 'cluster'], axis=1)
data_svm = np.vstack((positives_array,negatives_array))
n_reliable = data_svm.shape[0]
outcome = data_svm[:,8]
data_svm = data_svm[:,:8]
omega = np.empty((n_reliable,n_reliable))
for k in range(n_reliable):
    for i in range(k,n_reliable):
        omega[k,i] = outcome[k]*outcome[i]*rbf(x=data_svm[k,:],y=data_svm[i,:],l=10)
        omega[i,k] = omega[k,i]


#now, computation of the rest of the matrix
first_row = np.hstack((0,-np.transpose(outcome)))
first_row = first_row.reshape(1,first_row.shape[0])
bot_of_mat_right = omega + (1/gamma)*np.eye(n_reliable)
bot_of_mat = np.hstack((outcome.reshape(n_reliable,1), bot_of_mat_right))
whole_mat = np.vstack((first_row, bot_of_mat))
right_hand = np.ones(n_reliable+1)
right_hand[0] = 0

#we get the coefficients
coeffs = np.linalg.solve(a=whole_mat,b=right_hand)
b = coeffs[0]
alpha = coeffs[1:coeffs.shape[0]]

#now we compute the wt \phi(x) and then we order them 
test_data = mushrooms.drop(['class','label','cluster'], axis=1).to_numpy()
results = np.empty(n_samples)
for i in tqdm(range(n_samples)):
    results[i] = np.sum(alpha*outcome*rbf(x=data_svm,y=test_data[i,:],l=10))
sorted_results = np.sort(results)
good_ratio = int(n_samples/2)
b = sorted_results[good_ratio]

last_results = np.empty(n_samples)
for i in range(n_samples):
    last_results[i] = np.sign(results[i] - b)

mushrooms['it_results'] = last_results
correct_with_b = 0
for i in range(reliable_positives.shape[0]):
    if mushrooms.loc[reliable_positives.index[i],'it_results'] == 1:
        correct_with_b += 1
missclass = reliable_positives.shape[0] - correct_with_b

100%|██████████| 54035/54035 [10:02<00:00, 89.68it/s] 


In [6]:
positive = 0
true_positive = 0
for i in range(n_samples):
    if last_results[i] == 1 and mushrooms.loc[i,'class'] == 1:
        true_positive += 1
for i in range(n_samples):
    if last_results[i] == 1:
        positive += 1
print(true_positive/positive, positive)

0.6552540992708294 27017


In [7]:
compteur = 0
max_iter = 500
while missclass!=0 and compteur<max_iter:
    compteur += 1
    b = (1+0.05)*b
    last_results = np.empty(n_samples)
    for i in range(n_samples):
        last_results[i] = np.sign(results[i] - b)

    mushrooms['it_results'] = last_results
    correct_with_b = 0
    for i in range(reliable_positives.shape[0]):
        if mushrooms.loc[reliable_positives.index[i],'it_results'] == 1:
            correct_with_b += 1
    missclass = reliable_positives.shape[0] - correct_with_b

In [8]:
mushrooms_it = mushrooms

In [9]:
positive = int(n_samples/2)
true_positive = 0
for i in range(n_samples):
    if mushrooms_it.loc[i,'it_results'] == 1 and mushrooms_it.loc[i,'class'] == 1:
        true_positive += 1

precision = true_positive/positive
print(precision)

0.6571417996076544


In [10]:
true_positive

17754

In [11]:
mushrooms_it.groupby('class').count()

Unnamed: 0_level_0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,label,cluster,it_results
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,24360,24360,24360,24360,24360,24360,24360,24360,24360,24360,24360
1,29675,29675,29675,29675,29675,29675,29675,29675,29675,29675,29675


In [14]:
compteur=0
max_iter=10
good_ratio = int(n_samples/2)
while True and compteur<max_iter:
    compteur+=1
    print(compteur)
    for i in range(n_samples):
        if mushrooms_it.loc[i,'it_results'] == 0:
            mushrooms_it.loc[i,'it_results'] = -1
    positives_new = mushrooms_it[mushrooms_it['it_results'] == 1]
    positives_new = positives_new[positives_new['label'] == 1]
    negatives_new = mushrooms_it[mushrooms_it['it_results'] == -1]
    negatives_new = negatives_new[negatives_new['label'] == -1]
    negatives_new = negatives_new.sample(n=positives_new.shape[0])
    #first svm part
    gamma = 1
    positives_array_new = positives_new.drop(['class','cluster','label'], axis=1)
    negatives_array_new = negatives_new.drop(['class', 'cluster','label'], axis=1)
    data_svm_it = np.vstack((positives_array_new,negatives_array_new))
    n_reliable = data_svm_it.shape[0]
    outcome_it = data_svm_it[:,8]
    data_svm_it = data_svm_it[:,:8]
    omega_it = np.empty((n_reliable,n_reliable))
    for k in range(n_reliable):
        for i in range(k,n_reliable):
            omega_it[k,i] = outcome_it[k]*outcome_it[i]*rbf(x=data_svm_it[k,:],y=data_svm_it[i,:],l=10)
            omega_it[i,k] = omega_it[k,i]
    
    first_row_it = np.hstack((0,-np.transpose(outcome_it)))
    first_row_it = first_row_it.reshape(1,first_row_it.shape[0])
    bot_of_mat_right_it = omega_it + (1/gamma)*np.eye(n_reliable)
    bot_of_mat_it = np.hstack((outcome_it.reshape(n_reliable,1), bot_of_mat_right_it))
    whole_mat_it = np.vstack((first_row_it, bot_of_mat_it))
    right_hand_it = np.ones(n_reliable+1)
    right_hand_it[0] = 0
    coeffs_it = np.linalg.solve(a=whole_mat_it,b=right_hand_it)
    b_it = coeffs_it[0]
    alpha_it = coeffs_it[1:coeffs_it.shape[0]]
    test_data_it = mushrooms_it.drop(['class','label','cluster','it_results'], axis=1).to_numpy()
    results_new = np.empty(n_samples)
    #the results in the previous algo is now 'new_results'
    for i in range(n_samples):
        results_new[i] = np.sum(alpha_it*outcome_it*rbf(x=data_svm_it,y=test_data_it[i,:],l=10))
    sorted_results_it = np.sort(results_new)
    b_it = sorted_results_it[good_ratio]
    last_results_it = np.empty(n_samples)
    for i in range(n_samples):
        last_results_it[i] = np.sign(results_new[i] - b)
    mushrooms['it_next_step'] = last_results_it
    correct_with_b_it = 0
    for i in range(positives_new.shape[0]):
        if mushrooms_it.loc[positives_new.index[i],'it_next_step'] == 1:
            correct_with_b_it += 1
    missclass_it = positives_new.shape[0] - correct_with_b_it

    compteur_bis = 0
    max_iter_bis = 200
    while missclass_it!=0 and compteur_bis<max_iter_bis:
        if compteur_bis%25 == 0:
            print(compteur)
        compteur_bis += 1
        b_it = (1+0.05)*b_it
        last_results_bis = np.empty(n_samples)
        for i in range(n_samples):
            last_results_bis[i] = np.sign(results_new[i] - b_it)

        mushrooms_it['it_next_step'] = last_results_bis
        correct_with_b_bis = 0
        #sort out a pb infra bc the condition is going to be redundent if i just do as before, maybe use the original
        #mushrooms 
        #maybe useless bc mushrooms_it still has labels i think
        for i in range(mushrooms_it[mushrooms_it['label'] == 1].shape[0]):
            if last_results_bis[mushrooms_it[mushrooms_it['label'] == 1].index[i]] == 1:
                correct_with_b += 1
        missclass = positives_new.shape[0] - correct_with_b
    stop_counter = 0
    for i in range(n_samples):
        if mushrooms_it.loc[i,'it_results'] != mushrooms_it.loc[i,'it_next_step']:
            stop_counter += 1
    if stop_counter == 0:
        break
    else:
        mushrooms_it['it_results'] = mushrooms_it['it_next_step']

1


KeyboardInterrupt: 

In [None]:
print(compteur, compteur_bis)

In [12]:
for i in range(n_samples):
    if mushrooms_it.loc[i,'it_results'] == 0:
        mushrooms_it.loc[i,'it_results'] = -1
positives_new = mushrooms_it[mushrooms_it['it_results'] == 1]
positives_new = positives_new[positives_new['label'] == 1]
negatives_new = mushrooms_it[mushrooms_it['it_results'] == -1]
negatives_new = negatives_new[negatives_new['label'] == -1]
negatives_new = negatives_new.sample(n=positives_new.shape[0])
#first svm part
gamma = 1
positives_array_new = positives_new.drop(['class','cluster','label'], axis=1)
negatives_array_new = negatives_new.drop(['class', 'cluster','label'], axis=1)
data_svm_it = np.vstack((positives_array_new,negatives_array_new))
n_reliable = data_svm_it.shape[0]
outcome_it = data_svm_it[:,8]
data_svm_it = data_svm_it[:,:8]
omega_it = np.zeros((n_reliable,n_reliable))

In [13]:
for k in tqdm(range(n_reliable)):
        for i in range(k,n_reliable):
            omega_it[k,i] = outcome_it[k]*outcome_it[i]*rbf(x=data_svm_it[k,:],y=data_svm_it[i,:],l=10)

omega_it_t = np.transpose(omega_it)
for i in range(n_reliable):
    omega_it_t[i,i] = 0
omega_it = omega_it+omega_it_t

for i in range(n_reliable):
    omega_it[i,i] = 1

100%|██████████| 9532/9532 [03:09<00:00, 50.31it/s] 


In [14]:
first_row_it = np.hstack((0,-np.transpose(outcome_it)))
first_row_it = first_row_it.reshape(1,first_row_it.shape[0])
bot_of_mat_right_it = omega_it + (1/gamma)*np.eye(n_reliable)
bot_of_mat_it = np.hstack((outcome_it.reshape(n_reliable,1), bot_of_mat_right_it))
whole_mat_it = np.vstack((first_row_it, bot_of_mat_it))
right_hand_it = np.ones(n_reliable+1)
right_hand_it[0] = 0
coeffs_it = np.linalg.solve(a=whole_mat_it,b=right_hand_it)
b_it = coeffs_it[0]
alpha_it = coeffs_it[1:coeffs_it.shape[0]]
test_data_it = mushrooms_it.drop(['class','label','cluster','it_results'], axis=1).to_numpy()
results_new = np.empty(n_samples)

In [15]:
for i in tqdm(range(n_samples)):
    results_new[i] = np.sum(alpha_it*outcome_it*rbf(x=data_svm_it,y=test_data_it[i,:],l=10))
sorted_results_it = np.sort(results_new)
b_it = sorted_results_it[good_ratio]
last_results_it = np.empty(n_samples)
for i in range(n_samples):
    last_results_it[i] = np.sign(results_new[i] - b)
mushrooms['it_next_step'] = last_results_it
correct_with_b_it = 0
for i in range(positives_new.shape[0]):
    if mushrooms_it.loc[positives_new.index[i],'it_next_step'] == 1:
        correct_with_b_it += 1
missclass_it = positives_new.shape[0] - correct_with_b_it

100%|██████████| 54035/54035 [25:52<00:00, 34.81it/s]


In [35]:
transpose_test = np.array(([1,1],[2,2]))

In [36]:
transpose_test

array([[1, 1],
       [2, 2]])

In [37]:
np.transpose(transpose_test)

array([[1, 2],
       [1, 2]])

In [16]:
omega_test_1 = np.zeros((n_reliable,n_reliable))
for k in tqdm(range(n_reliable)):
    for i in range(k,n_reliable):
        omega_test_1[k,i] = outcome_it[k]*outcome_it[i]*rbf(x=data_svm_it[k,:],y=data_svm_it[i,:],l=10)

omega_test_1_t = np.transpose(omega_test_1)
for i in range(n_reliable):
    omega_test_1_t[i,i] = 0
omega_test_1 = omega_test_1+omega_test_1_t

omega_test_2 = np.zeros((n_reliable,n_reliable))
for k in tqdm(range(n_reliable)):
    for i in range(k,n_reliable):
        omega_test_2[k,i] = outcome_it[k]*outcome_it[i]*rbf(x=data_svm_it[k,:],y=data_svm_it[i,:],l=10)
        omega_test_2[i,k] = omega_test_2[k,i]

omega_test_1 - omega_test_2

100%|██████████| 9532/9532 [02:55<00:00, 54.41it/s] 
100%|██████████| 9532/9532 [03:09<00:00, 50.34it/s] 


array([[-1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0., -1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0., -1., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ..., -1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0., -1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0., -1.]])

In [17]:
omega_test_3 = np.empty((n_reliable,n_reliable))
for k in tqdm(range(n_reliable)):
    for i in range(n_reliable):
        omega_test_3[k,i] = outcome_it[k]*outcome_it[i]*rbf(x=data_svm_it[k,:],y=data_svm_it[i,:],l=10)


100%|██████████| 9532/9532 [06:12<00:00, 25.60it/s]


In [18]:
omega_test_3 - omega_test_1

array([[ 1.,  0.,  0., ..., -0., -0., -0.],
       [ 0.,  1.,  0., ..., -0., -0., -0.],
       [ 0.,  0.,  1., ..., -0., -0., -0.],
       ...,
       [-0., -0., -0., ...,  1.,  0.,  0.],
       [-0., -0., -0., ...,  0.,  1.,  0.],
       [-0., -0., -0., ...,  0.,  0.,  1.]])

In [19]:
omega_test_3 - omega_test_2

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
for i in range(n_reliable):
    omega_test_1[i,i] = 1

In [23]:
omega_test_1 - omega_test_2

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
omega_test_1 - omega_test_3

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])