In [3]:
import os
import warnings

# The notebook uses relative "parameter_guidelines/..." paths, so it must run
# from the fingerprinting-toolbox project root. Set the FP_TOOLBOX_ROOT
# environment variable to point at a local checkout; the fallback is the
# original author's machine-specific location.
PROJECT_ROOT = os.environ.get(
    "FP_TOOLBOX_ROOT",
    "C:\\Users\\tsarcevic\\PycharmProjects\\fingerprinting-toolbox",
)
os.chdir(PROJECT_ROOT)

# NOTE(review): this silences every warning category for the whole session,
# including deprecation and convergence warnings — confirm that is intended.
warnings.filterwarnings('ignore')

In [8]:
import pickle
import numpy as np
import pandas as pd
import collections

from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from scipy.stats import chi2
from imblearn.under_sampling import *

from datasets import GermanCredit
from attacks import *
from parameter_guidelines.guidelines import *

In [5]:
# Original (non-fingerprinted) data: labels come from the 'target' column,
# every other column is a feature.
data = Nursery().preprocessed()
y = data['target']
# The feature matrix is converted to a NumPy array; the labels stay a pandas Series.
X = data.drop(columns='target').to_numpy()

In [6]:
# Fingerprinted data: read one fingerprinted CSV (gamma = 1, file suffix sk0)
# and pass it through Nursery().preprocessed.
gamma = 1
fp_dir = "parameter_guidelines/fingerprinted_data/nursery/attr_subset_8/"
fp_file = "universal_g{}_x1_l32_u1_sk0.csv".format(gamma)
fingerprinted_data = pd.read_csv(fp_dir + fp_file)
fp_data = Nursery().preprocessed(fp_data=fingerprinted_data)

# Features and labels are kept as pandas objects (no NumPy conversion here).
y_fp = fp_data['target']
X_fp = fp_data.drop(columns='target')

In [9]:
# Baseline: per-fold KNN accuracy with no attack applied.
# NOTE(review): fp_cross_val_score is project-local; how it combines the
# original (X, y) and fingerprinted (X_fp, y_fp) data is not visible here.
baseline_cv = fp_cross_val_score(
    KNeighborsClassifier(), X, y, X_fp, y_fp, scoring='accuracy'
)
score_baseline = baseline_cv['test_score']
score_baseline

array([0.7037037 , 0.65856481, 0.73842593, 0.69239676, 0.67155538])

In [10]:
def attack_cross_val_score(X, y, target, attack, attack_strength, n_folds=5):
    """Evaluate a KNN classifier trained on attacked data over repeated splits.

    For every seed in ``range(n_folds)`` the data is split 80/20 with
    ``random_state=seed``. The attack is run on the training part only, a
    default ``KNeighborsClassifier`` is fitted on the attacked rows, and its
    accuracy is measured on the untouched test part.

    Parameters
    ----------
    X, y : pandas objects
        Features and labels. The training split is joined with
        ``pd.concat`` before it is handed to the attack.
    target : str
        Name of the label column in the joined training frame.
    attack : object
        Must expose ``run(frame, attack_strength, random_state=...)`` and
        return the attacked frame, still containing the ``target`` column.
    attack_strength
        Passed through to ``attack.run`` unchanged.
    n_folds : int, default 5
        Number of random splits to evaluate.

    Returns
    -------
    list
        One accuracy score per split, in seed order.
    """
    def _evaluate_split(seed):
        # The same seed drives both the split and the attack.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=seed, shuffle=True
        )
        attacked = attack.run(
            pd.concat([X_train, y_train], axis=1), attack_strength, random_state=seed
        )
        classifier = KNeighborsClassifier()
        classifier.fit(attacked.drop(target, axis=1), attacked[target])
        return accuracy_score(y_test, classifier.predict(X_test))

    return [_evaluate_split(seed) for seed in range(n_folds)]

In [11]:
# Random attack: HorizontalSubsetAttack at strength 0.7 on the fingerprinted data.
# NOTE(review): presumably the strength is the fraction of rows removed — confirm.
random_attack = HorizontalSubsetAttack()
attack_utility_scores = attack_cross_val_score(X_fp, y_fp, 'target', random_attack, 0.7)
attack_utility_scores

[0.7334104938271605,
 0.7118055555555556,
 0.7345679012345679,
 0.7291666666666666,
 0.7322530864197531]

### Targeted attack

In [12]:
def targeted_attack_cross_val_score(X, y, attack_strength, n_folds=5, sampler_version=1, sampler_n_neighbors=3):
    """Evaluate a KNN classifier trained on NearMiss-undersampled data.

    For every fold index in ``range(n_folds)`` the data is split 80/20 with
    ``random_state=fold``. The training part is undersampled with
    ``NearMiss`` down to the per-class sizes returned by
    ``strategy(attack_strength, y_train)``. A default
    ``KNeighborsClassifier`` is fitted on the result and scored on the
    untouched test part.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : pandas.Series
        Labels; ``strategy`` calls ``value_counts()`` on the training split.
    attack_strength : float
        Forwarded to ``strategy`` to derive the per-class sample counts.
    n_folds : int, default 5
        Number of random splits to evaluate.
    sampler_version : int, default 1
        ``version`` argument of ``NearMiss``.
    sampler_n_neighbors : int, default 3
        ``n_neighbors`` argument of ``NearMiss``.

    Returns
    -------
    list
        One accuracy score per split, in fold order.
    """
    accuracy = []
    for fold in range(n_folds):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=fold, shuffle=True
        )
        # `strategy` is looked up when this function is called, so it must be
        # defined in the notebook before the first call.
        sampler = NearMiss(version=sampler_version,
                           sampling_strategy=strategy(attack_strength, y_train),
                           n_neighbors=sampler_n_neighbors)
        # Author's note: InstanceHardnessThreshold with a
        # GradientBoostingClassifier estimator was also tried here but failed
        # with an internal error.
        attacked_X, attacked_y = sampler.fit_resample(X_train, y_train)
        model = KNeighborsClassifier()
        model.fit(attacked_X, attacked_y)
        accuracy.append(accuracy_score(y_test, model.predict(X_test)))
    return accuracy

In [13]:
def strategy(attack_strength, y):
    """Build the per-class sample counts that remain after the attack.

    Each class count in ``y`` is scaled by ``1 - attack_strength`` and
    rounded with the built-in ``round``.

    Parameters
    ----------
    attack_strength : float
        Fraction of each class to remove.
    y : pandas.Series
        Class labels.

    Returns
    -------
    dict
        Maps each class label to the rounded number of samples to keep.
    """
    scaled_counts = y.value_counts() * (1 - attack_strength)
    keep_per_class = {}
    for label, scaled in scaled_counts.items():
        keep_per_class[label] = round(scaled)
    return keep_per_class

In [18]:
# Targeted attack with NearMiss version 1 (the function's default sampler_version).
# Author's note: version 1 was judged "the best (and the only one actually
# better than random)".
# NOTE(review): confirm the criterion — the recorded mean accuracy for
# version 2 is higher.
attack_utility_scores = targeted_attack_cross_val_score(X_fp, y_fp, attack_strength=0.7)
np.mean(attack_utility_scores)

0.6371913580246915

In [19]:
# Sweep the NearMiss n_neighbors setting from 1 to 9 (sampler version 1) and
# record the mean accuracy of the targeted attack for each value.
# Author's note: n_neighbors=3 (the default) "shows to be the best".
# NOTE(review): confirm against the printed results.
res = {}
for n in range(1, 10):
    attack_utility_scores = targeted_attack_cross_val_score(
        X_fp, y_fp, 0.7, sampler_n_neighbors=n
    )
    res[n] = np.mean(attack_utility_scores)
print('version 1', res, max(res.values()), sep='\n')

version 1
{1: 0.657716049382716, 2: 0.6479938271604938, 3: 0.6371913580246915, 4: 0.6342592592592593, 5: 0.6308641975308642, 6: 0.6287037037037037, 7: 0.6272376543209877, 8: 0.6281635802469137, 9: 0.6255401234567901}
0.657716049382716


In [20]:
# Targeted attack with NearMiss version 2 (default n_neighbors).
attack_utility_scores = targeted_attack_cross_val_score(
    X_fp, y_fp, attack_strength=0.7, sampler_version=2
)
np.mean(attack_utility_scores)

0.71820987654321

In [21]:
# Targeted attack with NearMiss version 3 (default n_neighbors).
attack_utility_scores = targeted_attack_cross_val_score(
    X_fp, y_fp, attack_strength=0.7, sampler_version=3
)
np.mean(attack_utility_scores)

0.5409722222222222

Update the `targeted_attack_cross_val_score` call below to use the best-performing sampler strategy found above.

In [22]:
# Load the previously computed robustness values from disk.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted files.
robustness_path = 'parameter_guidelines/Nursery/evaluation/robustness_horizontal_universal_c95_e30.pickle'
with open(robustness_path, mode='rb') as infile:
    robustness = pickle.load(infile)

In [23]:
# Display the loaded robustness mapping (keys are gamma values).
robustness

{1: 0.9,
 1.11: 0.9,
 1.25: 0.9,
 1.43: 0.9,
 1.67: 0.9,
 2: 0.9,
 2.5: 0.9,
 3: 0.9,
 4: 0.9,
 5: 0.9,
 10: 0.8,
 18: 0.6}

In [24]:
fpattr = 8   # selects the attr_subset_<fpattr> directory of fingerprinted files
n_exp = 10   # fingerprinted files evaluated per gamma (file suffixes sk0 .. sk<n_exp-1>)

# The directory does not depend on gamma or the experiment index, so build it once.
fp_dir = "parameter_guidelines/fingerprinted_data/nursery/attr_subset_{}/".format(fpattr)

# For every gamma, collect one array of per-fold relative accuracy losses per file.
results = {gamma: [] for gamma in robustness}
for gamma, attack_strength in robustness.items():
    print(gamma, attack_strength)
    for exp in range(n_exp):
        # Load and preprocess one fingerprinted data set.
        fingerprinted_data = pd.read_csv(fp_dir + "universal_g{}_x1_l32_u1_sk{}.csv".format(gamma, exp))
        fp_data = Nursery().preprocessed(fp_data=fingerprinted_data)
        X_fp = fp_data.drop('target', axis=1)
        y_fp = fp_data['target']

        # Baseline utility of the fingerprinted data, with no attack.
        score_baseline = fp_cross_val_score(KNeighborsClassifier(), X, y, X_fp, y_fp, scoring='accuracy')['test_score']

        # Utility after the targeted undersampling attack. The sampler settings
        # are the ones to change when a different configuration is chosen.
        attack_utility_scores = targeted_attack_cross_val_score(X_fp, y_fp, attack_strength,
                                                                sampler_version=2, sampler_n_neighbors=1)

        # Element-wise relative loss per fold.
        # NOTE(review): the baseline folds come from fp_cross_val_score and the
        # attack folds from train_test_split — confirm they are comparable
        # fold by fold.
        rel_loss = (score_baseline - np.asarray(attack_utility_scores)) / score_baseline
        results[gamma].append(rel_loss)

1 0.9
1.11 0.9
1.25 0.9
1.43 0.9
1.67 0.9
2 0.9
2.5 0.9
3 0.9
4 0.9
5 0.9
10 0.8
18 0.6


In [25]:
# Display the collected relative losses: for each gamma, a list with one per-fold array per experiment.
results

{1: [array([ 0.02576754, -0.01874634,  0.08881923,  0.01988775,  0.02279095]),
  array([ 0.08373852, -0.04374241,  0.09546166,  0.0117386 ,  0.04726037]),
  array([0.08842795, 0.04456019, 0.14858841, 0.04758033, 0.04996758]),
  array([ 0.06287921, -0.02344666,  0.08227176,  0.0748843 ,  0.06911277]),
  array([ 0.06554713, -0.07321538,  0.1032419 ,  0.00996497,  0.03636742]),
  array([0.06883044, 0.03358425, 0.13863405, 0.02586844, 0.04165947]),
  array([ 0.09199782, -0.02248521,  0.09560327,  0.01822578,  0.05931745]),
  array([ 0.07921867, -0.00059312,  0.11973279,  0.07303382,  0.06766753]),
  array([ 0.08915145, -0.01305638,  0.12899949,  0.00745438,  0.05131062]),
  array([ 0.02847755, -0.05791045,  0.10311383,  0.01006316,  0.01681937])],
 1.11: [array([ 0.02351011, -0.01275362,  0.08423773,  0.02203208,  0.01348616]),
  array([ 0.07515991, -0.04620061,  0.08367347,  0.01381847,  0.02411887]),
  array([0.06688596, 0.00584112, 0.13483146, 0.01546119, 0.01123256]),
  array([0.036085

In [26]:
# Save the relative-loss results; the file name encodes fpattr and n_exp.
results_path = 'parameter_guidelines/Nursery/evaluation/rel_undersampling_attack_utilities_knn_fpattr{}_e{}.pkl'.format(fpattr, n_exp)
with open(results_path, mode='wb') as outfile:
    pickle.dump(results, outfile)