In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import bernoulli
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.svm import SVC

In [2]:
mushrooms = pd.read_csv('mushroom.csv')
mushrooms_train = mushrooms.sample(frac=0.8)
list_train = mushrooms_train.index
list_test = []
for i in mushrooms.index:
    if i not in list_train:
        list_test.append(i)
mushroom_test = mushrooms.filter(items=list_test,axis=0)

mushrooms = mushrooms_train.copy().reset_index(drop=True)
mushroom_test = mushroom_test.reset_index(drop=True)

In [3]:
def rbf(x,y,l=1):
    """Gaussian kernel

    Parameters
    -------------------------------
    x : float
    a real number

    y : float
    a real number

    l: float, non zero
    a scale parameter
    -------------------------------
    """
    dim = x.shape[0]
    vect = np.empty(dim)
    if dim == y.shape[0]  :
        d = np.exp(-((np.linalg.norm(x-y))**2)/(2*(l**2)))
        return d
    else :
        for i in range(dim):
            vect[i] = np.exp(-((np.linalg.norm(x[i] - y))**2)/(2*(l**2)))
        return vect

In [4]:
#First, the cluster step, to 'initialize' the labels, and the creation of unlabeled data
mushrooms_copy = mushrooms.drop(['class'], axis=1)
mushrooms_copy = StandardScaler().fit_transform(X=mushrooms_copy)
n_samples = mushrooms.shape[0]
mushrooms['label'] = np.ones(n_samples)*(-1)

for i in mushrooms.index:
    random = bernoulli.rvs(p=3/4)
    if mushrooms.loc[i,'class'] == 1 and random == 0:
        mushrooms.loc[i,'label'] = 1
    else:
        mushrooms.loc[i,'label'] = -1

n_cluster = 8
clusterized_data = KMeans(n_clusters=n_cluster, init='random').fit(mushrooms_copy)
mushrooms['cluster'] = clusterized_data.labels_

list_of_ratio = []
for i in range(n_cluster):
    list_of_ratio.append(mushrooms[mushrooms['cluster'] == i]['label'].sum()/mushrooms[mushrooms['cluster'] == i]['class'].shape[0])
list_of_ratio = np.array(list_of_ratio)
positive_cluster = np.argmax(list_of_ratio)
negative_cluster = np.argmin(list_of_ratio)
print('the cluster containing positive values is : ', positive_cluster, '\n', 
      'the one containing negative exampels is : ', negative_cluster)

reliable_positives = mushrooms[mushrooms['cluster'] == positive_cluster]
reliable_positives = reliable_positives[reliable_positives['label'] == 1]
reliable_negatives = mushrooms[mushrooms['cluster'] == negative_cluster]
reliable_negatives = reliable_negatives[reliable_negatives['label'] == -1]
reliable_negatives = reliable_negatives.sample(n=reliable_positives.shape[0]) #to adjust the class balance ratio

the cluster containing positive values is :  7 
 the one containing negative exampels is :  2


In [5]:
#first svm part
gamma = 1
positives_array = reliable_positives.drop(['class','cluster'], axis=1)
negatives_array = reliable_negatives.drop(['class', 'cluster'], axis=1)
data_svm = np.vstack((positives_array,negatives_array))
n_reliable = data_svm.shape[0]
outcome = data_svm[:,8]
data_svm = data_svm[:,:8]
omega = np.empty((n_reliable,n_reliable))
for k in range(n_reliable):
    for i in range(k,n_reliable):
        omega[k,i] = outcome[k]*outcome[i]*rbf(x=data_svm[k,:],y=data_svm[i,:],l=10)
omega_t = np.transpose(omega)
omega = omega_t + omega
for i in range(n_reliable):
    omega[i,i] = 1


#now, computation of the rest of the matrix
first_row = np.hstack((0,-np.transpose(outcome)))
first_row = first_row.reshape(1,first_row.shape[0])
bot_of_mat_right = omega + (1/gamma)*np.eye(n_reliable)
bot_of_mat = np.hstack((outcome.reshape(n_reliable,1), bot_of_mat_right))
whole_mat = np.vstack((first_row, bot_of_mat))
right_hand = np.ones(n_reliable+1)
right_hand[0] = 0

#we get the coefficients
coeffs = np.linalg.solve(a=whole_mat,b=right_hand)
b = coeffs[0]
alpha = coeffs[1:coeffs.shape[0]]

#now we compute the wt \phi(x) and then we order them 
test_data = mushrooms.drop(['class','label','cluster'], axis=1).to_numpy()
results = np.empty(n_samples)
for i in tqdm(range(n_samples)):
    results[i] = np.sum(alpha*outcome*rbf(x=data_svm,y=test_data[i,:],l=10))
sorted_results = np.sort(results)
good_ratio = int(n_samples/2)
b = sorted_results[good_ratio]

last_results = np.empty(n_samples)
for i in range(n_samples):
    last_results[i] = np.sign(results[i] - b)

mushrooms['it_results'] = last_results
correct_with_b = 0
for i in range(reliable_positives.shape[0]):
    if mushrooms.loc[reliable_positives.index[i],'it_results'] == 1:
        correct_with_b += 1
missclass = reliable_positives.shape[0] - correct_with_b

100%|██████████| 43228/43228 [07:55<00:00, 90.82it/s] 


In [6]:
positive = 0
true_positive = 0
for i in range(n_samples):
    if last_results[i] == 1 and mushrooms.loc[mushrooms.index[i],'class'] == 1:
        true_positive += 1
for i in range(n_samples):
    if last_results[i] == 1:
        positive += 1
print(true_positive/positive, positive)

0.6345255170499237 21613


In [7]:
compteur = 0
max_iter = 500
while missclass!=0 and compteur<max_iter:
    compteur += 1
    b = (1+0.05)*b
    last_results = np.empty(n_samples)
    for i in range(n_samples):
        last_results[i] = np.sign(results[i] - b)

    mushrooms['it_results'] = last_results
    correct_with_b = 0
    for i in range(reliable_positives.shape[0]):
        if mushrooms.loc[reliable_positives.index[i],'it_results'] == 1:
            correct_with_b += 1
    missclass = reliable_positives.shape[0] - correct_with_b

In [8]:
mushrooms_it = mushrooms

In [9]:
positive = int(n_samples/2)
true_positive = 0
for i in range(n_samples):
    if mushrooms_it.loc[mushrooms.index[i],'it_results'] == 1 and mushrooms_it.loc[mushrooms.index[i],'class'] == 1:
        true_positive += 1

precision = true_positive/positive
print(precision)

0.6344961598963634


In [10]:
true_positive

13714

In [11]:
compteur=0
max_iter=10
good_ratio = int(n_samples/2)
while True and compteur<max_iter:
    compteur+=1
    for i in range(n_samples):
        if mushrooms_it.loc[mushrooms.index[i],'it_results'] == 0:
            mushrooms_it.loc[mushrooms.index[i],'it_results'] = -1
    positives_new = mushrooms_it[mushrooms_it['it_results'] == 1]
    positives_new = positives_new[positives_new['label'] == 1]
    negatives_new = mushrooms_it[mushrooms_it['it_results'] == -1]
    negatives_new = negatives_new[negatives_new['label'] == -1]
    negatives_new = negatives_new.sample(n=positives_new.shape[0])
    #first svm part
    gamma = 1
    positives_array_new = positives_new.drop(['class','cluster','label'], axis=1)
    negatives_array_new = negatives_new.drop(['class', 'cluster','label'], axis=1)
    data_svm_it = np.vstack((positives_array_new,negatives_array_new))
    n_reliable = data_svm_it.shape[0]
    outcome_it = data_svm_it[:,8].copy()
    data_svm_it = data_svm_it[:,:8].copy()
    #compute omega
    omega_it = np.zeros((n_reliable,n_reliable))
    for k in range(n_reliable):
        for i in range(k,n_reliable):
            omega_it[k,i] = outcome_it[k]*outcome_it[i]*rbf(x=data_svm_it[k,:],y=data_svm_it[i,:],l=10)
    omega_it_t = np.transpose(omega_it)
    omega_it = omega_it+omega_it_t
    for i in range(n_reliable):
        omega_it[i,i] = 1
    
    first_row_it = np.hstack((0,-np.transpose(outcome_it)))
    first_row_it = first_row_it.reshape(1,first_row_it.shape[0])
    bot_of_mat_right_it = omega_it + (1/gamma)*np.eye(n_reliable)
    bot_of_mat_it = np.hstack((outcome_it.reshape(n_reliable,1), bot_of_mat_right_it))
    whole_mat_it = np.vstack((first_row_it, bot_of_mat_it))
    right_hand_it = np.ones(n_reliable+1)
    right_hand_it[0] = 0
    coeffs_it = np.linalg.solve(a=whole_mat_it,b=right_hand_it)
    b_it = coeffs_it[0]
    alpha_it = coeffs_it[1:coeffs_it.shape[0]]
    test_data_it = mushrooms_it.drop(['class','label','cluster','it_results'], axis=1).to_numpy()
    results_new = np.empty(n_samples)
    #the results in the previous algo is now 'new_results'

    
    for i in range(n_samples):
        results_new[i] = np.sum(alpha_it*outcome_it*rbf(x=data_svm_it,y=test_data_it[i,:],l=10))
    sorted_results_it = np.sort(results_new)
    b_it = sorted_results_it[good_ratio]
    last_results_it = np.empty(n_samples)
    for i in range(n_samples):
        last_results_it[i] = np.sign(results_new[i] - b)
    correct_with_b_it = 0 
    for i in range(mushrooms_it[mushrooms['label'] == 1].shape[0]):
        if last_results_it[i] == 1:
            correct_with_b_it += 1
    missclass_it = positives_new.shape[0] - correct_with_b_it
    compteur_bis = 0
    max_iter_bis = 200
    while missclass_it!=0 and compteur_bis<max_iter_bis:
        compteur_bis += 1
        b_it = (1-0.05)*b_it
        last_results_bis = np.empty(n_samples)
        for i in range(n_samples):
            last_results_bis[i] = np.sign(results_new[i] - b_it)
        correct_with_b_bis = 0

        for i in range(mushrooms_it[mushrooms['label'] == 1].shape[0]):
            if last_results_bis[i] == 1:
                correct_with_b_bis += 1
        missclass_it = positives_new.shape[0] - correct_with_b_bis
    stop_counter = 0
    for i in range(n_samples):
        if mushrooms_it.loc[mushrooms.index[i],'it_results'] != last_results_bis[i]:
            stop_counter += 1
    if stop_counter == 0:
        break
        compteur = max_iter
    else:
        mushrooms_it['it_results'] = last_results_bis
alpha_it_final = alpha_it.copy()
outcome_it_final = outcome_it.copy()

In [12]:
positives = 0
true_positives = 0
for i in range(n_samples):
    if last_results_bis[i] == 1:
        positives += 1
        if mushrooms_it.loc[mushrooms.index[i],'class'] == 1:
            true_positives += 1

precision_train = true_positives/positives
recall_train = true_positives/mushrooms[mushrooms['class'] == 1].shape[0]
f_1_score_train = (2*recall_train*precision_train)/(precision_train+recall_train)
weird_thing_train = (recall_train**2)/(positives/n_samples)

In [13]:
print('the precision on the training data is :', precision_train, '\n',
      'the recall on the training data is :', recall_train, '\n',
      'the f1 score on the training data is :', f_1_score_train, '\n',
      'the weird estimator is :', weird_thing_train)

the precision on the training data is : 0.6828688416325452 
 the recall on the training data is : 0.652036826838189 
 the f1 score on the training data is : 0.6670967741935484 
 the weird estimator is : 0.8091609067123828


In [14]:
true_positives

15510

In [15]:
positives

22713

In [16]:
false_positive = positives - true_positives
print(false_positive)

7203


In [17]:
svc_sklearn = SVC().fit(X=mushrooms.to_numpy()[:,:8],y=mushrooms.to_numpy()[:,8])

In [18]:
labels_sklearn = svc_sklearn.predict(X=mushrooms.to_numpy()[:,:8])

In [19]:
positives_sklearn = 0
true_positives_sklearn = 0
for i in range(labels_sklearn.shape[0]):
    if labels_sklearn[i] == 1:
        positives_sklearn += 1
        if mushrooms.to_numpy()[i,8] == 1:
            true_positives_sklearn += 1

precision_sk_train = true_positives_sklearn/positives_sklearn
recall_sk_train = true_positives_sklearn/mushrooms[mushrooms['class'] == 1].shape[0]
f_1_score_sk_train = (2*precision_sk_train*recall_sk_train)/(precision_sk_train+recall_sk_train)
weird_estim_sk = (recall_sk_train**2)/(positives_sklearn/n_samples)

In [20]:
print('precision of sklearn svm on training set : ',precision_sk_train, '\n',
     'recall of sklearn svm on training set : ', recall_sk_train, '\n', 
      'f_1 score sklearn svm on training set : ', f_1_score_sk_train, '\n',
      'weird thing sklearn score on training set :', weird_estim_sk)

precision of sklearn svm on training set :  0.6716285806243966 
 recall of sklearn svm on training set :  0.7018119140707109 
 f_1 score sklearn svm on training set :  0.68638858623029 
 weird thing sklearn score on training set : 0.8565946857483658


In [21]:
true_positives_sklearn

16694

In [22]:
positives_sklearn

24856

In [23]:
#So, we got a better result than the svm on the fully labelled data (but reduced by half).
#The method works, however, very slow on large datasets (took 10 hours to run on the full dataset and the kernel
#died before it could end). And the thing is that, the svm on the whole dataset had better performances.

In [24]:
data_svm_test = mushroom_test.to_numpy()[:,:8]
n_samples_test = data_svm_test.shape[0]

In [25]:
results_test = np.empty(n_samples_test)
for i in range(n_samples_test):
        results_test[i] = np.sign(np.sum(alpha_it_final*outcome_it_final*rbf(x=data_svm_it,y=data_svm_test[i,:],l=10))-b_it)
positive_test = 0
true_positive_test = 0
for i in range(n_samples_test):
    if results_test[i] == 1:
        positive_test += 1
        if mushroom_test.loc[i,'class'] == 1:
            true_positive_test += 1


precision_test = true_positive_test/positive_test
recall_test = true_positive_test/mushroom_test[mushroom_test['class'] == 1].shape[0]
f_1_score_test = (2*precision_test*recall_test)/(precision_test+recall_test)
weird_estimation_test = (recall_test**2)/(positive_test/n_samples_test)

In [26]:
print('the precision on the test data is :', precision_test, '\n',
      'the recall on the test data is :', recall_test, '\n',
      'the f1 score on the test data is :', f_1_score_test, '\n',
      'the weird estimator is :', weird_estimation_test)

the precision on the test data is : 0.6688187052870355 
 the recall on the test data is : 0.650985054347826 
 the f1 score on the test data is : 0.6597813925466908 
 the weird estimator is : 0.7991287931280237


In [27]:
#doesn't perform so bad on the test data

In [28]:
labels_sklearn_test = svc_sklearn.predict(data_svm_test)
positives_sk_test = 0
true_positives_sk_test = 0
for i in range(n_samples_test):
    if labels_sklearn_test[i] == 1:
        positives_sk_test += 1
        if mushroom_test.loc[i,'class'] == 1:
            true_positives_sk_test += 1

precision_sk_test = true_positives_sk_test/positives_sk_test
recall_sk_test = true_positives_sk_test/mushroom_test[mushroom_test['class'] == 1].shape[0]
f_1_score_sk_test = (2*precision_sk_test*recall_sk_test)/(recall_sk_test+precision_sk_test)
weird_estim_sk_test = (recall_sk_test**2)/(positives_sk_test/n_samples_test)

In [29]:
print('precision on the test set by sklearn svm : ', precision_sk_test, '\n',
      'recall on the test set by sklearn svm : ', recall_sk_test, '\n', 
      'f1 score on test set by sklearn svm :', f_1_score_sk_test, '\n',
      'weird metric on test set by sklearn svm : ', '\n', weird_estim_sk_test)

precision on the test set by sklearn svm :  0.6652699435938759 
 recall on the test set by sklearn svm :  0.7010869565217391 
 f1 score on test set by sklearn svm : 0.6827090052096254 
 weird metric on test set by sklearn svm :  
 0.8560657861365536


In [30]:
omega[omega.shape[0]-2:,omega.shape[0]-2:]

array([[1., 0.],
       [0., 1.]])