In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from scipy.stats import bernoulli
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
raisin_whole = pd.read_csv('raisin.csv')
raisin_whole = raisin_whole.replace('Kecimen', 1)
raisin_whole = raisin_whole.replace('Besni', -1)
colnames = ['area', 'maj_length','min_length', 'eccentricity','convex','extent',
            'perimeter']
classes_to_keep = raisin_whole['Class'].copy()
raisin_whole = StandardScaler().fit_transform(X=raisin_whole.drop(['Class'],axis=1))
raisin_whole = pd.DataFrame(raisin_whole)
raisin_whole.columns = colnames
raisin_whole['class'] = classes_to_keep

  raisin_whole = raisin_whole.replace('Besni', -1)


In [3]:
raisin, raisin_test = train_test_split(raisin_whole,test_size=0.2)
raisin = raisin.reset_index(drop=True)
raisin_test = raisin_test.reset_index(drop=True)

In [4]:
n_samples = raisin.shape[0]

In [5]:
raisin['label'] = np.ones(n_samples)*(-1)
for i in range(n_samples):
    random = bernoulli.rvs(p=3/4)
    if raisin.loc[i,'class'] == 1 and random == 0:
        raisin.loc[i,'label'] = 1

In [6]:
t = 15
positive_instances = raisin[raisin['label'] == 1]
n_positives = positive_instances.shape[0]
unlabelled_instances = raisin[raisin['label'] == -1]

In [7]:
df_results = raisin.copy()
df_results = raisin.drop(colnames,axis=1)

for i in tqdm(range(t)):
    u_t = unlabelled_instances.sample(n=n_positives)
    train_set = pd.concat([positive_instances, u_t])
    index_t = list(train_set.index)
    f_t = SVC(decision_function_shape='ovr').fit(X=train_set.to_numpy()[:,:-2],
                                          y=train_set['label'].to_numpy())
    to_test = raisin.copy()
    to_test = to_test.drop(labels=index_t,axis=0)
    predictions_t = f_t.decision_function(X=to_test.to_numpy()[:,:-2])
    to_test[f'score_{i}'] = predictions_t
    to_test = to_test.drop(colnames, axis=1)
    to_test = to_test.drop(['class','label'], axis=1)
    df_results = df_results.merge(to_test, how='left',left_index=True,right_index=True)

100%|██████████| 15/15 [00:00<00:00, 93.99it/s]


In [8]:
raisin_bis = raisin.copy()
df_results = df_results.drop(['class','label'], axis=1)
df_results_gen = raisin_bis.merge(df_results, how='left', left_index=True, right_index=True)

In [9]:
df_results_gen['average'] = df_results.mean(axis=1)
df_results_gen = df_results_gen.drop(colnames,axis=1)

In [10]:
positives_train = 0
true_positives_train = 0

for i in df_results_gen.index:
    df_results_gen.loc[i,'average'] = np.sign(df_results_gen.loc[i,'average'])

for i in df_results_gen.index:
    if df_results_gen.loc[i,'average'] == 1:
        positives_train += 1
        if raisin.loc[i,'class'] == 1:
            true_positives_train += 1

precision = true_positives_train/positives_train
recall = true_positives_train/(raisin[raisin['class']==1].shape[0]-n_positives)
f_1_train = (2*precision*recall)/(precision+recall)
weird_thing = (recall**2)/(positives_train/(n_samples-n_positives))

In [11]:
print('the precision on train set is : ', precision, '\n', 
      'recall on train set : ', recall, '\n', 
      'f_1 in train set : ', f_1_train, '\n',
      'weird on train : ', weird_thing)

the precision on train set is :  0.825503355704698 
 recall on train set :  0.8754448398576512 
 f_1 in train set :  0.8497409326424871 
 weird on train :  1.6331084864000056


In [12]:
df_results_test = raisin_test.copy()
df_results_test = df_results_test.drop(colnames,axis=1)
df_results_test = df_results_test.drop(['class'],axis=1)

for i in tqdm(range(t)):
    u_t_t = unlabelled_instances.sample(n=n_positives)
    train_set = pd.concat([positive_instances, u_t_t])
    f_t_t = SVC(decision_function_shape='ovr').fit(X=train_set.to_numpy()[:,:-2],
                                          y=train_set['label'].to_numpy())
    to_test_t = raisin_test.copy()
    predictions_t_t = f_t_t.decision_function(X=to_test_t.to_numpy()[:,:-1])
    df_results_test[f'score_{i}'] = predictions_t_t

100%|██████████| 15/15 [00:00<00:00, 182.82it/s]


In [13]:
df_results_test_avg = df_results_test.mean(axis=1)

In [14]:
positives_test = 0
true_positives_test = 0
n_samples_test = df_results_test.shape[0]
for i in df_results_test_avg.index:
    df_results_test_avg[i] = np.sign(df_results_test_avg[i])

for i in df_results_test_avg.index:
    if df_results_test_avg[i] == 1:
        positives_test += 1
        if raisin_test.loc[i, 'class'] == 1:
            true_positives_test += 1

precision_test = true_positives_test/positives_test
recall_test = true_positives_test/(raisin_test[raisin_test['class'] == 1].shape[0])
f_1_test = (2*precision_test*recall_test)/(precision_test+recall_test)
weird_test = (recall_test**2)/(positives_test/n_samples_test)

In [15]:
print('precision on test : ', precision_test, '\n', 
      'recall on test : ', recall_test, '\n',
      'f_1 on test : ', f_1_test, '\n',
      'weird estim on test : ', weird_test)

precision on test :  0.75 
 recall on test :  0.8928571428571429 
 f_1 on test :  0.8152173913043479 
 weird estim on test :  1.4349489795918366
