In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from scipy.stats import bernoulli
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
# Load the data, standardise the features, split 80/20 and build the
# positive/unlabelled (PU) training labels.
SEED = 0  # fixes the split and the label masking so a rerun reproduces them
rng = np.random.default_rng(SEED)

mushrooms_whole = pd.read_csv('mushroom.csv')
classes = mushrooms_whole['class'].copy()
colnames = mushrooms_whole.drop(['class'],axis=1).columns.copy()

# Select the features by name rather than by position, so the scaled columns
# always match `colnames` even if 'class' is not the last column of the CSV.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split, so test-set statistics influence the scaling. Consider fitting it on
# the training rows only.
mushrooms_whole = pd.DataFrame(
    StandardScaler().fit_transform(X=mushrooms_whole[colnames].to_numpy()),
    columns=colnames,
    index=mushrooms_whole.index,
)
mushrooms_whole['class'] = classes

# 80% of the rows form the training frame; the remaining rows keep their
# original index and form the held-out frame.
mushrooms = mushrooms_whole.sample(frac=0.8, random_state=rng)
index_train = list(mushrooms.index)
mushrooms_test = mushrooms_whole.drop(labels=index_train,axis=0)
mushrooms = mushrooms.reset_index(drop=True)
n_samples = mushrooms.shape[0]

# PU labelling: each row gets an independent Bernoulli(3/4) draw; a row is
# labelled +1 only when its class is 1 AND its draw is 0, every other row is
# labelled -1 ("unlabelled"). One vectorised draw replaces the per-row loop.
hide_draws = bernoulli.rvs(p=3/4, size=n_samples, random_state=rng)
is_labelled_positive = (mushrooms['class'] == 1).to_numpy() & (hide_draws == 0)
mushrooms['label'] = np.where(is_labelled_positive, 1.0, -1.0)

In [3]:
# Number of bagging rounds (one SVC is fitted per round).
t = 15

# Split the training frame by PU label: +1 rows are the labelled positives,
# -1 rows are the unlabelled pool the rounds sample from.
positive_instances = mushrooms.loc[mushrooms['label'] == 1]
unlabelled_instances = mushrooms.loc[mushrooms['label'] == -1]
n_positives = len(positive_instances)

In [4]:
# Bagging SVM on the training frame: each round fits an SVC on all labelled
# positives plus an equally sized sample of unlabelled rows, then scores the
# rows that were NOT part of that round's training set.
# `df_results` keeps 'class', 'label' and one 'score_{i}' column per round;
# rows that were used for training in round i hold NaN in 'score_{i}'.
df_results = mushrooms.drop(colnames,axis=1)

for i in tqdm(range(t)):
    # Seeding with the round number keeps every round's sample different but
    # reproducible across reruns.
    u_t = unlabelled_instances.sample(n=n_positives, random_state=i)
    train_set = pd.concat([positive_instances, u_t])
    # Features are selected by name instead of relying on 'class' and 'label'
    # being the last two columns.
    f_t = SVC(decision_function_shape='ovr').fit(X=train_set[colnames].to_numpy(),
                                          y=train_set['label'].to_numpy())
    # `drop` already returns a new frame, so no explicit copy is needed.
    to_test = mushrooms.drop(labels=train_set.index,axis=0)
    predictions_t = f_t.decision_function(X=to_test[colnames].to_numpy())
    # Column assignment aligns on the index, leaving NaN for this round's
    # training rows (same result as the former left merge).
    df_results[f'score_{i}'] = pd.Series(predictions_t, index=to_test.index)

100%|██████████| 15/15 [03:58<00:00, 15.89s/it]


In [5]:
# Attach the per-round scores to a copy of the training frame.
mushrooms_bis = mushrooms.copy()
# `df_results` is overwritten here, so on a second run of this cell 'class'
# and 'label' are already gone; errors='ignore' keeps the cell re-runnable.
df_results = df_results.drop(['class','label'], axis=1, errors='ignore')
df_results_gen = mushrooms_bis.merge(df_results, how='left', left_index=True, right_index=True)

In [6]:
# Row-wise mean of the per-round scores (pandas skips NaN by default, so only
# the rounds in which a row was actually scored contribute).
df_results_gen = df_results_gen.assign(average=df_results.mean(axis=1))

In [7]:
# Remove the feature columns from the results frame. errors='ignore' makes the
# cell safe to re-run: the frame is overwritten, so a second run would
# otherwise raise KeyError on the already-removed columns.
df_results_gen = df_results_gen.drop(colnames, axis=1, errors='ignore')

In [8]:
# Metrics on the training frame, computed from the sign of the averaged score.
# Vectorised: the former per-row `.loc` loops did the same work one cell at a
# time. The 'average' column is still overwritten with its sign, as before.
df_results_gen['average'] = np.sign(df_results_gen['average'])

# Rows with a NaN average (never scored in any round) compare False here, so
# they are not counted as predicted positives.
predicted_positive_train = df_results_gen['average'] == 1
positives_train = int(predicted_positive_train.sum())
true_positives_train = int((predicted_positive_train & (mushrooms['class'] == 1)).sum())

# Recall is measured against the class-1 rows that were NOT given a +1 label.
hidden_positives_train = int((mushrooms['class'] == 1).sum()) - n_positives
unlabelled_train = n_samples - n_positives

# Every ratio falls back to NaN when its denominator is zero instead of
# raising ZeroDivisionError.
precision = true_positives_train/positives_train if positives_train else np.nan
recall = true_positives_train/hidden_positives_train if hidden_positives_train else np.nan
f_1_train = (2*precision*recall)/(precision+recall) if precision+recall > 0 else np.nan
# recall^2 divided by the fraction of unlabelled rows predicted positive.
weird_thing = ((recall**2)/(positives_train/unlabelled_train)
               if positives_train and unlabelled_train else np.nan)

In [9]:
# Report the training-frame metrics; the pieces are printed space-separated,
# with an explicit newline item between consecutive metrics.
train_report = (
    'the precision on train set is : ', precision, '\n',
    'recall on train set : ', recall, '\n',
    'f_1 in train set : ', f_1_train, '\n',
    'weird on train : ', weird_thing,
)
print(*train_report)

the precision on train set is :  0.8423801420104454 
 recall on train set :  0.8087323943661971 
 f_1 in train set :  0.8252134172630853 
 weird on train :  1.4293028995579362


In [10]:
# Bagging SVM evaluated on the held-out frame: each round fits an SVC on all
# labelled positives plus an equally sized sample of unlabelled training rows
# and scores every held-out row. `df_results_test` ends up with one
# 'score_{i}' column per round, indexed like `mushrooms_test`.
df_results_test = mushrooms_test.drop(colnames,axis=1).drop(['class'],axis=1)

# The held-out frame never changes between rounds, so copy it and extract its
# feature matrix once instead of once per round. Features are selected by
# name rather than by assuming 'class' is the last column.
to_test_t = mushrooms_test.copy()
test_features = to_test_t[colnames].to_numpy()

for i in tqdm(range(t)):
    # Seeding with the round number keeps every round's sample different but
    # reproducible across reruns.
    u_t_t = unlabelled_instances.sample(n=n_positives, random_state=i)
    train_set = pd.concat([positive_instances, u_t_t])
    f_t_t = SVC(decision_function_shape='ovr').fit(X=train_set[colnames].to_numpy(),
                                          y=train_set['label'].to_numpy())
    predictions_t_t = f_t_t.decision_function(X=test_features)
    df_results_test[f'score_{i}'] = predictions_t_t

100%|██████████| 15/15 [01:51<00:00,  7.46s/it]


In [11]:
# Average the per-round decision scores of every held-out row.
df_results_test_avg = df_results_test.mean(axis='columns')

In [12]:
# Inspection only: display the index of the averaged held-out scores.
df_results_test_avg.index

Index([    5,    18,    23,    24,    27,    31,    36,    46,    50,    52,
       ...
       54001, 54003, 54005, 54011, 54012, 54013, 54016, 54018, 54019, 54027],
      dtype='int64', length=10807)

In [13]:
# Metrics on the held-out frame, computed from the sign of the averaged score.
# Vectorised: this replaces the per-row loops and their integer `Series[i]`
# look-ups, which are ambiguous between label-based and positional access.
n_samples_test = df_results_test.shape[0]
df_results_test_avg = np.sign(df_results_test_avg)

predicted_positive_test = df_results_test_avg == 1
positives_test = int(predicted_positive_test.sum())
true_positives_test = int((predicted_positive_test & (mushrooms_test['class'] == 1)).sum())
actual_positives_test = int((mushrooms_test['class'] == 1).sum())

# Every ratio falls back to NaN when its denominator is zero instead of
# raising ZeroDivisionError.
precision_test = true_positives_test/positives_test if positives_test else np.nan
recall_test = true_positives_test/actual_positives_test if actual_positives_test else np.nan
f_1_test = ((2*precision_test*recall_test)/(precision_test+recall_test)
            if precision_test+recall_test > 0 else np.nan)
# recall^2 divided by the fraction of held-out rows predicted positive.
weird_test = ((recall_test**2)/(positives_test/n_samples_test)
              if positives_test and n_samples_test else np.nan)

In [19]:
# Report the held-out metrics; the pieces are printed space-separated, with an
# explicit newline item between consecutive metrics.
test_report = (
    'precision on test : ', precision_test, '\n',
    'recall on test : ', recall_test, '\n',
    'f_1 on test : ', f_1_test, '\n',
    'weird estim on test : ', weird_test,
)
print(*test_report)

precision on test :  0.8779239766081871 
 recall on test :  0.809162876873842 
 f_1 on test :  0.8421421684634937 
 weird estim on test :  1.293096577866466
