In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import bernoulli
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
mushrooms = pd.read_csv('mushroom.csv')

In [3]:
def rbf(x,y,l=1):
    """Gaussian kernel

    Parameters
    -------------------------------
    x : float
    a real number

    y : float
    a real number

    l: float, non zero
    a scale parameter
    -------------------------------
    """
    dim = x.shape[0]
    vect = np.empty(dim)
    if dim == y.shape[0]  :
        d = np.exp(-((np.linalg.norm(x-y))**2)/(2*(l**2)))
        return d
    else :
        for i in range(dim):
            vect[i] = np.exp(-((np.linalg.norm(x[i] - y))**2)/(2*(l**2)))
        return vect

In [4]:
#First, the cluster step, to 'initialize' the labels, and the creation of unlabeled data
mushrooms = pd.read_csv('mushroom.csv')
mushrooms_copy = mushrooms.drop(['class'], axis=1)
mushrooms_copy = StandardScaler().fit_transform(X=mushrooms)
n_samples = mushrooms.shape[0]
mushrooms['label'] = np.ones(n_samples)*(-1)

for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if mushrooms.loc[i,'class'] == 1 and random == 0:
        mushrooms.loc[i,'label'] = 1
    else:
        mushrooms.loc[i,'label'] = -1

n_cluster = 8
clusterized_data = KMeans(n_clusters=n_cluster, init='random').fit(mushrooms_copy)
mushrooms['cluster'] = clusterized_data.labels_

list_of_ratio = []
for i in range(n_cluster):
    list_of_ratio.append(mushrooms[mushrooms['cluster'] == i]['label'].sum()/mushrooms[mushrooms['cluster'] == i]['class'].shape[0])
list_of_ratio = np.array(list_of_ratio)
positive_cluster = np.argmax(list_of_ratio)
negative_cluster = np.argmin(list_of_ratio)
print('the cluster containing positive values is : ', positive_cluster, '\n', 
      'the one containing negative exampels is : ', negative_cluster)

reliable_positives = mushrooms[mushrooms['cluster'] == positive_cluster]
reliable_positives = reliable_positives[reliable_positives['label'] == 1]
reliable_negatives = mushrooms[mushrooms['cluster'] == negative_cluster]
reliable_negatives = reliable_negatives[reliable_negatives['label'] == -1]
reliable_negatives = reliable_negatives.sample(n=reliable_positives.shape[0]) #to adjust the class balance ratio

the cluster containing positive values is :  0 
 the one containing negative exampels is :  5


In [5]:
#first svm part
gamma = 1
positives_array = reliable_positives.drop(['class','cluster'], axis=1)
negatives_array = reliable_negatives.drop(['class', 'cluster'], axis=1)
data_svm = np.vstack((positives_array,negatives_array))
n_reliable = data_svm.shape[0]
outcome = data_svm[:,8]
data_svm = data_svm[:,:8]
omega = np.empty((n_reliable,n_reliable))
for k in range(n_reliable):
    for i in range(k,n_reliable):
        omega[k,i] = outcome[k]*outcome[i]*rbf(x=data_svm[k,:],y=data_svm[i,:],l=10)
omega_t = np.transpose(omega)
omega = omega_t + omega
for i in range(n_reliable):
    omega[i,i] = 1


#now, computation of the rest of the matrix
first_row = np.hstack((0,-np.transpose(outcome)))
first_row = first_row.reshape(1,first_row.shape[0])
bot_of_mat_right = omega + (1/gamma)*np.eye(n_reliable)
bot_of_mat = np.hstack((outcome.reshape(n_reliable,1), bot_of_mat_right))
whole_mat = np.vstack((first_row, bot_of_mat))
right_hand = np.ones(n_reliable+1)
right_hand[0] = 0

#we get the coefficients
coeffs = np.linalg.solve(a=whole_mat,b=right_hand)
b = coeffs[0]
alpha = coeffs[1:coeffs.shape[0]]

#now we compute the wt \phi(x) and then we order them 
test_data = mushrooms.drop(['class','label','cluster'], axis=1).to_numpy()
results = np.empty(n_samples)
for i in tqdm(range(n_samples)):
    results[i] = np.sum(alpha*outcome*rbf(x=data_svm,y=test_data[i,:],l=10))
sorted_results = np.sort(results)
good_ratio = int(n_samples/2)
b = sorted_results[good_ratio]

last_results = np.empty(n_samples)
for i in range(n_samples):
    last_results[i] = np.sign(results[i] - b)

mushrooms['it_results'] = last_results
correct_with_b = 0
for i in range(reliable_positives.shape[0]):
    if mushrooms.loc[reliable_positives.index[i],'it_results'] == 1:
        correct_with_b += 1
missclass = reliable_positives.shape[0] - correct_with_b

100%|██████████| 54035/54035 [08:11<00:00, 109.84it/s]


In [6]:
positive = 0
true_positive = 0
for i in range(n_samples):
    if last_results[i] == 1 and mushrooms.loc[i,'class'] == 1:
        true_positive += 1
for i in range(n_samples):
    if last_results[i] == 1:
        positive += 1
print(true_positive/positive, positive)

0.65991782951475 27017


In [7]:
compteur = 0
max_iter = 500
while missclass!=0 and compteur<max_iter:
    compteur += 1
    b = (1+0.05)*b
    last_results = np.empty(n_samples)
    for i in range(n_samples):
        last_results[i] = np.sign(results[i] - b)

    mushrooms['it_results'] = last_results
    correct_with_b = 0
    for i in range(reliable_positives.shape[0]):
        if mushrooms.loc[reliable_positives.index[i],'it_results'] == 1:
            correct_with_b += 1
    missclass = reliable_positives.shape[0] - correct_with_b

In [8]:
mushrooms_it = mushrooms

In [9]:
positive = int(n_samples/2)
true_positive = 0
for i in range(n_samples):
    if mushrooms_it.loc[i,'it_results'] == 1 and mushrooms_it.loc[i,'class'] == 1:
        true_positive += 1

precision = true_positive/positive
print(precision)

0.65991782951475


In [10]:
true_positive

17829

In [11]:
mushrooms_it.groupby('class').count()

Unnamed: 0_level_0,cap-diameter,cap-shape,gill-attachment,gill-color,stem-height,stem-width,stem-color,season,label,cluster,it_results
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,24360,24360,24360,24360,24360,24360,24360,24360,24360,24360,24360
1,29675,29675,29675,29675,29675,29675,29675,29675,29675,29675,29675


In [12]:
compteur=0
max_iter=10
good_ratio = int(n_samples/2)
while True and compteur<max_iter:
    compteur+=1
    print(compteur)
    for i in range(n_samples):
        if mushrooms_it.loc[i,'it_results'] == 0:
            mushrooms_it.loc[i,'it_results'] = -1
    positives_new = mushrooms_it[mushrooms_it['it_results'] == 1]
    positives_new = positives_new[positives_new['label'] == 1]
    negatives_new = mushrooms_it[mushrooms_it['it_results'] == -1]
    negatives_new = negatives_new[negatives_new['label'] == -1]
    negatives_new = negatives_new.sample(n=positives_new.shape[0])
    #first svm part
    gamma = 1
    positives_array_new = positives_new.drop(['class','cluster','label'], axis=1)
    negatives_array_new = negatives_new.drop(['class', 'cluster','label'], axis=1)
    data_svm_it = np.vstack((positives_array_new,negatives_array_new))
    n_reliable = data_svm_it.shape[0]
    outcome_it = data_svm_it[:,8]
    data_svm_it = data_svm_it[:,:8]
    #compute omega
    omega_it = np.empty((n_reliable,n_reliable))
    for k in tqdm(range(n_reliable)):
        for i in range(k,n_reliable):
            omega_it[k,i] = outcome_it[k]*outcome_it[i]*rbf(x=data_svm_it[k,:],y=data_svm_it[i,:],l=10)
    omega_it_t = np.transpose(omega_it)
    omega_it = omega_it+omega_it_t
    for i in range(n_reliable):
        omega_it[i,i] = 1
    
    first_row_it = np.hstack((0,-np.transpose(outcome_it)))
    first_row_it = first_row_it.reshape(1,first_row_it.shape[0])
    bot_of_mat_right_it = omega_it + (1/gamma)*np.eye(n_reliable)
    bot_of_mat_it = np.hstack((outcome_it.reshape(n_reliable,1), bot_of_mat_right_it))
    whole_mat_it = np.vstack((first_row_it, bot_of_mat_it))
    right_hand_it = np.ones(n_reliable+1)
    right_hand_it[0] = 0
    coeffs_it = np.linalg.solve(a=whole_mat_it,b=right_hand_it)
    b_it = coeffs_it[0]
    alpha_it = coeffs_it[1:coeffs_it.shape[0]]
    test_data_it = mushrooms_it.drop(['class','label','cluster','it_results'], axis=1).to_numpy()
    results_new = np.empty(n_samples)
    #the results in the previous algo is now 'new_results'

    
    for i in tqdm(range(n_samples)):
        results_new[i] = np.sum(alpha_it*outcome_it*rbf(x=data_svm_it,y=test_data_it[i,:],l=10))

    print('w step done')
    sorted_results_it = np.sort(results_new)
    b_it = sorted_results_it[good_ratio]
    last_results_it = np.empty(n_samples)
    for i in range(n_samples):
        last_results_it[i] = np.sign(results_new[i] - b)
    print('first result done')
    correct_with_b_it = 0 
    for i in mushrooms_it[mushrooms['label'] == 1].index:
        if last_results_it[i] == 1:
            correct_with_b_it += 1
    missclass_it = positives_new.shape[0] - correct_with_b_it
    print('correct with b done')
    compteur_bis = 0
    max_iter_bis = 200
    while missclass_it!=0 and compteur_bis<max_iter_bis:
        if compteur_bis%50 == 0:
            print(compteur_bis)
        compteur_bis += 1
        b_it = (1-0.05)*b_it
        last_results_bis = np.empty(n_samples)
        for i in range(n_samples):
            last_results_bis[i] = np.sign(results_new[i] - b_it)
        correct_with_b_bis = 0

        for i in mushrooms_it[mushrooms['label'] == 1].index:
            if last_results_bis[i] == 1:
                correct_with_b_bis += 1
        missclass_it = positives_new.shape[0] - correct_with_b_bis
    print('correction of b done')
    stop_counter = 0
    for i in range(n_samples):
        if mushrooms_it.loc[i,'it_results'] != last_results_bis[i]:
            stop_counter += 1
    if stop_counter == 0:
        break
        compteur = max_iter
    else:
        mushrooms_it['it_results'] = last_results_bis

1


100%|██████████| 9186/9186 [03:21<00:00, 45.65it/s] 
100%|██████████| 54035/54035 [27:11<00:00, 33.12it/s]


w step done
first result done
correct with b done
0
50
100
150
correction of b done
2


100%|██████████| 9532/9532 [03:25<00:00, 46.48it/s] 
100%|██████████| 54035/54035 [27:01<00:00, 33.33it/s]


w step done
first result done
correct with b done
0
50
100
150
correction of b done
3


  1%|          | 114/9738 [00:04<05:57, 26.93it/s]


KeyboardInterrupt: 

In [None]:
print(compteur, compteur_bis)

for i in range(n_samples):
    if mushrooms_it.loc[i,'it_results'] == 0:
        mushrooms_it.loc[i,'it_results'] = -1
positives_new = mushrooms_it[mushrooms_it['it_results'] == 1]
positives_new = positives_new[positives_new['label'] == 1]
negatives_new = mushrooms_it[mushrooms_it['it_results'] == -1]
negatives_new = negatives_new[negatives_new['label'] == -1]
negatives_new = negatives_new.sample(n=positives_new.shape[0])
#first svm part
gamma = 1
positives_array_new = positives_new.drop(['class','cluster','label'], axis=1)
negatives_array_new = negatives_new.drop(['class', 'cluster','label'], axis=1)
data_svm_it = np.vstack((positives_array_new,negatives_array_new))
n_reliable = data_svm_it.shape[0]
outcome_it = data_svm_it[:,8]
data_svm_it = data_svm_it[:,:8]
omega_it = np.zeros((n_reliable,n_reliable))

for k in tqdm(range(n_reliable)):
        for i in range(k,n_reliable):
            omega_it[k,i] = outcome_it[k]*outcome_it[i]*rbf(x=data_svm_it[k,:],y=data_svm_it[i,:],l=10)

omega_it_t = np.transpose(omega_it)
omega_it = omega_it+omega_it_t
for i in range(n_reliable):
    omega_it[i,i] = 1

first_row_it = np.hstack((0,-np.transpose(outcome_it)))
first_row_it = first_row_it.reshape(1,first_row_it.shape[0])
bot_of_mat_right_it = omega_it + (1/gamma)*np.eye(n_reliable)
bot_of_mat_it = np.hstack((outcome_it.reshape(n_reliable,1), bot_of_mat_right_it))
whole_mat_it = np.vstack((first_row_it, bot_of_mat_it))
right_hand_it = np.ones(n_reliable+1)
right_hand_it[0] = 0
coeffs_it = np.linalg.solve(a=whole_mat_it,b=right_hand_it)
b_it = coeffs_it[0]
alpha_it = coeffs_it[1:coeffs_it.shape[0]]
test_data_it = mushrooms_it.drop(['class','label','cluster','it_results'], axis=1).to_numpy()
results_new = np.empty(n_samples)

for i in tqdm(range(n_samples)):
    results_new[i] = np.sum(alpha_it*outcome_it*rbf(x=data_svm_it,y=test_data_it[i,:],l=10))
sorted_results_it = np.sort(results_new)
b_it = sorted_results_it[good_ratio]
last_results_it = np.empty(n_samples)
for i in range(n_samples):
    last_results_it[i] = np.sign(results_new[i] - b)
mushrooms['it_next_step'] = last_results_it
correct_with_b_it = 0
for i in range(positives_new.shape[0]):
    if mushrooms_it.loc[positives_new.index[i],'it_next_step'] == 1:
        correct_with_b_it += 1
missclass_it = positives_new.shape[0] - correct_with_b_it

compteur_bis = 0
max_iter_bis = 200
while missclass_it!=0 and compteur_bis<max_iter_bis:
    if compteur_bis%25 == 0:
        print(compteur_bis)
    compteur_bis += 1
    b_it = (1-0.05)*b_it
    last_results_bis = np.empty(n_samples)
    for i in range(n_samples):
        last_results_bis[i] = np.sign(results_new[i] - b_it)
        
    mushrooms_it['it_next_step'] = last_results_bis
    correct_with_b_bis = 0

    for i in mushrooms_it[mushrooms['label'] == 1].index:
        if last_results_bis[i] == 1:
            correct_with_b_bis += 1
    missclass_it = positives_new.shape[0] - correct_with_b_bis
stop_counter = 0
for i in range(n_samples):
    if mushrooms_it.loc[i,'it_results'] != mushrooms_it.loc[i,'it_next_step']:
        stop_counter += 1

positives = 0
true_positive = 0
for i in range(n_samples):
    if mushrooms_it.loc[i,'it_next_step'] == 1:
        positives+=1
        if mushrooms_it.loc[i,'class'] == 1:
            true_positive += 1

In [None]:
print(precision)

In [None]:
missclass_it

In [None]:
print('gamberge')

In [13]:
positives = 0
true_positives = 0
for i in range(n_samples):
    if last_results_bis[i] == 1:
        positives += 1
        if mushrooms_it.loc[i,'class'] == 1:
            true_positives += 1

In [14]:
print(true_positives/positives)

0.654671982095398


In [15]:
true_positive

17829

In [16]:
positives

28596