In [1]:
import os
import warnings

# Project root: the relative "parameter_guidelines/..." paths used further down are
# resolved against the working directory set here.
# Set the FINGERPRINTING_TOOLBOX_ROOT environment variable to run the notebook from
# another checkout; when it is unset, the original hard-coded location is used.
PROJECT_ROOT = os.environ.get(
    "FINGERPRINTING_TOOLBOX_ROOT",
    "C:\\Users\\tsarcevic\\PycharmProjects\\fingerprinting-toolbox",
)
os.chdir(PROJECT_ROOT)

# Silence all warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')

In [2]:
import pickle
import numpy as np
import pandas as pd
import collections

from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from scipy.stats import chi2
from imblearn.under_sampling import *

from datasets import GermanCredit
from attacks import *
from parameter_guidelines.guidelines import *

In [3]:
# Original (non-fingerprinted) Nursery data, split into the 'target' column and the rest.
# NOTE(review): `Nursery` is not imported by name in this notebook; presumably it is
# provided by one of the star imports -- confirm.
data = Nursery().preprocessed()
y = data['target']
X = data.drop(columns='target').to_numpy()

In [4]:
# Fingerprinted data: load the gamma=1 file with the "sk0" suffix and preprocess it.
gamma = 1
fp_dir = "parameter_guidelines/fingerprinted_data/nursery/attr_subset_8/"
fp_file = "universal_g{}_x1_l32_u1_sk0.csv".format(gamma)
fingerprinted_data = pd.read_csv(fp_dir + fp_file)
fp_data = Nursery().preprocessed(fp_data=fingerprinted_data)
# Unlike X, the fingerprinted features are kept as a DataFrame (not converted to
# numpy): attack_cross_val_score concatenates them with the target via pd.concat.
y_fp = fp_data['target']
X_fp = fp_data.drop(columns='target')

In [5]:
# Baseline utility with no attack applied: accuracy test scores of an SVC obtained from
# fp_cross_val_score, which receives both the original and the fingerprinted data.
baseline_cv = fp_cross_val_score(SVC(random_state=9), X, y, X_fp, y_fp, scoring='accuracy')
score_baseline = baseline_cv['test_score']
score_baseline

array([0.68942901, 0.80864198, 0.82445988, 0.8159012 , 0.6483983 ])

In [6]:
def attack_cross_val_score(X, y, target, attack, attack_strength, n_folds=5):
    """Score an SVC trained on attacked data over repeated random hold-out splits.

    Each of the ``n_folds`` rounds draws a shuffled 80/20 train/test split seeded with
    the round index, runs ``attack`` on the training part only, fits a fresh SVC on
    the frame the attack returns and measures accuracy on the untouched test part.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns; joined with ``y`` through ``pd.concat`` before the attack.
    y : pandas.Series
        Target values. Presumably its name equals ``target``, since that column is
        looked up in the attacked frame -- confirm against callers.
    target : str
        Name of the target column inside the attacked frame.
    attack : object
        Object exposing ``run(dataset, attack_strength, random_state=...)`` whose
        return value supports ``drop(target, axis=1)`` and ``[target]``.
    attack_strength : float
        Forwarded unchanged to ``attack.run``.
    n_folds : int, default 5
        Number of split / attack / fit / score rounds.

    Returns
    -------
    list
        Accuracy on the test part of every round, in round order.
    """
    scores = []
    for seed in range(n_folds):
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, test_size=0.2, shuffle=True, random_state=seed
        )
        # The attack operates on features and target together, as one frame.
        attacked = attack.run(pd.concat([X_tr, y_tr], axis=1), attack_strength, random_state=seed)
        classifier = SVC()
        classifier.fit(attacked.drop(target, axis=1), attacked[target])
        predictions = classifier.predict(X_te)
        scores.append(accuracy_score(y_te, predictions))
    return scores

In [7]:
# Random attack: HorizontalSubsetAttack run with strength 0.7 on every training split.
random_attack = HorizontalSubsetAttack()
attack_utility_scores = attack_cross_val_score(X_fp, y_fp, 'target', random_attack, attack_strength=0.7)
attack_utility_scores

[0.8263888888888888,
 0.8190586419753086,
 0.8306327160493827,
 0.8132716049382716,
 0.8194444444444444]

### Targeted attack

In [8]:
def targeted_attack_cross_val_score(X, y, attack_strength, n_folds=5, sampler_version=1, sampler_n_neighbors=3):
    """Score an SVC trained on NearMiss-undersampled data over repeated random splits.

    Each of the ``n_folds`` rounds draws a shuffled 80/20 train/test split seeded with
    the round index, undersamples the training part with ``NearMiss`` down to the
    per-class counts given by ``strategy(attack_strength, y_train)``, fits a fresh SVC
    on the resampled data and measures accuracy on the untouched test part.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target values (``strategy`` calls ``value_counts()`` on the training part).
    attack_strength : float
        Passed to ``strategy``, which keeps ``1 - attack_strength`` of every class.
    n_folds : int, default 5
        Number of split / undersample / fit / score rounds.
    sampler_version : int, default 1
        ``version`` argument of ``NearMiss``.
    sampler_n_neighbors : int, default 3
        ``n_neighbors`` argument of ``NearMiss``.

    Returns
    -------
    list
        Accuracy on the test part of every round, in round order.
    """
    accuracy = []
    for fold in range(n_folds):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=fold, shuffle=True)
        # The per-class target counts are derived from the training labels only.
        sampler = NearMiss(version=sampler_version,
                           sampling_strategy=strategy(attack_strength, y_train),
                           n_neighbors=sampler_n_neighbors)
        # NOTE: InstanceHardnessThreshold (with a GradientBoostingClassifier estimator)
        # was tried as an alternative sampler but failed with an internal error.
        attacked_X, attacked_y = sampler.fit_resample(X_train, y_train)
        model = SVC()
        model.fit(attacked_X, attacked_y)
        acc = accuracy_score(y_test, model.predict(X_test))
        accuracy.append(acc)
    return accuracy

In [9]:
def strategy(attack_strength, y):
    """Compute how many samples of each class remain after the attack.

    Every class count in ``y`` is multiplied by ``1 - attack_strength`` and rounded
    with the built-in ``round``.

    Parameters
    ----------
    attack_strength : float
        Share of each class that is removed.
    y : pandas.Series
        Class labels; only ``value_counts()`` is used.

    Returns
    -------
    dict
        Maps each class label to the rounded number of samples to keep.
    """
    keep_fraction = 1 - attack_strength
    remaining = y.value_counts() * keep_fraction
    return {label: round(count) for label, count in remaining.items()}

In [14]:
# Targeted attack with the default sampler settings: NearMiss version 1, 3 neighbours.
attack_utility_scores = targeted_attack_cross_val_score(X_fp, y_fp, 0.7)
# Mean accuracy over the rounds; only this last expression is displayed by the cell.
# NOTE(review): an earlier remark here called version 1 "the best (and the only one
# actually better than random)". The outputs recorded in this notebook do not support
# that (version 2 and the random attack both show a higher mean) -- re-check.
np.mean(attack_utility_scores)

0.7334104938271604

In [15]:
# Sweep the number of neighbours used by NearMiss version 1 (strength 0.7) and record
# the mean accuracy of the targeted attack for every setting.
res = {}
for n in range(1, 10):
    attack_utility_scores = targeted_attack_cross_val_score(X_fp, y_fp, 0.7, sampler_n_neighbors=n)
    mean_accuracy = np.mean(attack_utility_scores)
    res[n] = mean_accuracy
print('version 1')
print(res)
# Highest mean accuracy of the sweep (in the recorded output it is reached at n=1).
print(max(res.values()))

version 1
{1: 0.7689814814814815, 2: 0.7435956790123457, 3: 0.7334104938271604, 4: 0.7234567901234568, 5: 0.7002314814814815, 6: 0.6812500000000001, 7: 0.6749228395061728, 8: 0.6682098765432098, 9: 0.6604166666666667}
0.7689814814814815


In [16]:
# Targeted attack with NearMiss version 2 (default number of neighbours, 3).
attack_utility_scores = targeted_attack_cross_val_score(X_fp, y_fp, 0.7, sampler_version=2)
# Mean accuracy over the rounds; only this last expression is displayed by the cell.
np.mean(attack_utility_scores)

0.7657407407407407

In [17]:
# Targeted attack with NearMiss version 3 (default number of neighbours, 3);
# the cell displays the mean accuracy over the rounds.
attack_utility_scores = targeted_attack_cross_val_score(X_fp, y_fp, 0.7, sampler_version=3)
np.mean(attack_utility_scores) # version 3

0.6665123456790123

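Based on the experiments above, update the arguments passed to `targeted_attack_cross_val_score` in the evaluation loop below (marked `# CHANGE HERE`) so that it uses the best-performing strategy.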

In [18]:
# Load the stored robustness dictionary; the evaluation loop further down iterates it
# as gamma -> attack_strength.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load pickle
# files that were produced by this project.
with open('parameter_guidelines/Nursery/evaluation/robustness_horizontal_universal_c95_e30.pickle', 'rb') as infile:
    robustness = pickle.load(infile)

In [19]:
# Display the loaded mapping (keys are used as gamma, values as attack strength below).
robustness

{1: 0.9,
 1.11: 0.9,
 1.25: 0.9,
 1.43: 0.9,
 1.67: 0.9,
 2: 0.9,
 2.5: 0.9,
 3: 0.9,
 4: 0.9,
 5: 0.9,
 10: 0.8,
 18: 0.6}

In [20]:
fpattr = 8   # selects the "attr_subset_<fpattr>" directory of fingerprinted files
n_exp = 10   # number of fingerprinted files per gamma (file suffixes sk0 .. sk<n_exp-1>)

# The directory depends only on fpattr, so it is built once instead of in every iteration.
fp_dir = "parameter_guidelines/fingerprinted_data/nursery/attr_subset_{}/".format(fpattr)

# gamma -> list with one array of relative accuracy losses per experiment
results = {gamma: [] for gamma in robustness}
# NOTE: this loop overwrites the notebook-level gamma, fingerprinted_data, fp_data,
# X_fp and y_fp that were set by earlier cells.
for gamma, attack_strength in robustness.items():
    print(gamma, attack_strength)
    for exp in range(n_exp):
        # take one fingerprinted dataset
        fingerprinted_data = pd.read_csv(fp_dir + "universal_g{}_x1_l32_u1_sk{}.csv".format(gamma, exp))
        fp_data = Nursery().preprocessed(fp_data=fingerprinted_data)
        X_fp = fp_data.drop('target', axis=1)
        y_fp = fp_data['target']

        # baseline utility of the fingerprinted data (no attack)
        score_baseline = fp_cross_val_score(SVC(random_state=9), X, y, X_fp, y_fp, scoring='accuracy')['test_score']

        # utility after the targeted (undersampling) attack; adjust the two sampler
        # arguments here when a different sampler configuration is selected
        attack_utility_scores = targeted_attack_cross_val_score(X_fp, y_fp, attack_strength,
                                                                sampler_version=1, sampler_n_neighbors=1)

        # Relative loss per round. The attack scores are a plain list, so they are
        # converted explicitly before the elementwise arithmetic.
        # NOTE(review): this pairs the i-th baseline score with the i-th attack round.
        # The attack rounds use train_test_split(random_state=i); whether
        # fp_cross_val_score evaluates on the same partitions is not visible here --
        # confirm that the pairing is meaningful.
        rel_loss = (score_baseline - np.asarray(attack_utility_scores)) / score_baseline
        results[gamma].append(rel_loss)

1 0.9
1.11 0.9
1.25 0.9
1.43 0.9
1.67 0.9
2 0.9
2.5 0.9
3 0.9
4 0.9
5 0.9
10 0.8
18 0.6


In [21]:
# Display the collected relative losses: for every gamma, one array per experiment.
results

{1: [array([-0.0201455 ,  0.14503817,  0.10622368,  0.09306526, -0.1126658 ]),
  array([ 0.12911523,  0.25199437,  0.17194143,  0.23686381, -0.00256989]),
  array([0.05075293, 0.17721519, 0.14312796, 0.17965933, 0.01936957]),
  array([0.07194245, 0.15975203, 0.16705772, 0.23411704, 0.00681231]),
  array([0.15697963, 0.15392109, 0.17589577, 0.20856062, 0.03357043]),
  array([0.10825893, 0.26937442, 0.21947743, 0.29245027, 0.05486714]),
  array([ 0.07066139,  0.175179  ,  0.1533269 ,  0.20498241, -0.00551511]),
  array([ 0.07261641,  0.1532567 ,  0.19734974,  0.22142261, -0.01258854]),
  array([0.125     , 0.17610063, 0.18190568, 0.18650445, 0.01736812]),
  array([ 0.17497556,  0.15217391,  0.18327822,  0.19677183, -0.04640714])],
 1.11: [array([-0.02927928,  0.14591255,  0.10858469,  0.1398341 , -0.10242826]),
  array([ 0.11283644,  0.19375873,  0.15202232,  0.20795851, -0.05095212]),
  array([0.03579418, 0.17768401, 0.16999536, 0.21235444, 0.08324624]),
  array([ 0.0282167 ,  0.1552619

In [22]:
# Persist the relative-loss results as a pickle; fpattr and n_exp are encoded in the
# file name.
with open('parameter_guidelines/Nursery/evaluation/rel_undersampling_attack_utilities_svm_fpattr{}_e{}.pkl'.format(fpattr, n_exp), 'wb') as outfile:
    pickle.dump(results, outfile)