In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from scipy.stats import bernoulli
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
mushrooms_whole = pd.read_csv('mushroom.csv')
mushrooms = mushrooms_whole.sample(frac=0.8)
index_train = list(mushrooms.index)
mushrooms_test = mushrooms_whole.drop(labels=index_train,axis=0)
n_samples = mushrooms.shape[0]
mushrooms = mushrooms.sample(frac=1)
classes = mushrooms['class'].copy()
colnames = mushrooms.drop(['class'],axis=1).columns.copy()
mushrooms = StandardScaler().fit_transform(X=mushrooms.drop(['class'],axis=1).to_numpy())
mushrooms = pd.DataFrame(mushrooms)
mushrooms.columns = colnames
mushrooms['class'] = classes
mushrooms = mushrooms.reset_index(drop=True)
mushrooms['label'] = np.ones(n_samples)*(-1)
for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if mushrooms.loc[i,'class'] == 1 and random == 0:
        mushrooms.loc[i,'label'] = 1

In [3]:
t=15
n_positives = mushrooms[mushrooms['label'] == 1].shape[0]
positive_instances = mushrooms[mushrooms['label'] == 1]
unlabelled_instances = mushrooms[mushrooms['label'] == -1]

In [4]:
mushrooms_inter = mushrooms.copy()
df_results = np.zeros((n_samples,1))
df_results = pd.DataFrame(df_results)
df_results.columns = ['init']
for i in tqdm(range(t)):
    u_t = unlabelled_instances.sample(n=n_positives)
    train_set = pd.concat([positive_instances, u_t])
    index_t = list(train_set.index)
    f_t = SVC(decision_function_shape='ovr').fit(X=train_set.to_numpy()[:,:-2],
                                          y=train_set['label'].to_numpy())
    
    to_test = mushrooms.drop(labels = index_t)
    predictions_t = f_t.decision_function(X=to_test.to_numpy()[:,:-2])
    to_test[f'score_{i}'] = predictions_t
    df_pred = pd.DataFrame(to_test[f'score_{i}'])
    mushrooms_inter = mushrooms_inter.merge(df_pred, how='left', left_index=True,
                                           right_index=True)
    df_results[f'score_{i}'] = mushrooms_inter[f'score_{i}']

100%|██████████| 15/15 [03:54<00:00, 15.61s/it]


In [6]:
df_results = df_results.drop(['init'], axis=1)
df_results = df_results.mean(axis=1)

In [8]:
df_results = df_results.to_numpy()
for i in range(n_samples):
    df_results[i] = np.sign(df_results[i])


positives_train = 0
true_positives_train = 0   

for i in range(n_samples):
    if df_results[i] == 1:
        positives_train += 1
        if mushrooms.loc[i,'class'] == 1:
            true_positives_train += 1

precision_train = true_positives_train/positives_train
recall_train = true_positives_train/(mushrooms[mushrooms['class'] == 1].shape[0])
f_1_train = (2*precision_train*recall_train)/(precision_train+recall_train)
weird_train = (recall_train**2)/(positives_train/n_samples)
print('precision on train is : ', precision_train, '\n', 
      'recall on train is : ', recall_train, '\n', 
      'f_1 on train is : ', f_1_train, '\n', 
      'weird metric on train is : ', weird_train)

precision on train is :  0.37740984350192786 
 recall on train is :  0.3431458472959736 
 f_1 on train is :  0.359463181486782 
 weird metric on train is :  0.2886174249683622
