In [None]:
!pip install xgboost

In [134]:
!pip install ucimlrepo

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [135]:
from utils import *
import numpy as np
from time import time
from ucimlrepo import fetch_ucirepo

from sklearn.model_selection import cross_val_score
from knn_scheme.experimental.blind_scheme import BlindNNScheme
from sklearn.model_selection import RandomizedSearchCV
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

## Breast Cancer: hyperparameter search & baseline performance 

In [272]:
# breast cancer data
breast_cancer, pk = import_dataset_from_file('breast-cancer', 'Id')

# preprocess
breast_cancer = breast_cancer.drop('Id', axis=1)
# separate target from the data
c = len(breast_cancer.columns)
target = breast_cancer.values[:, (c-1)]
breast_cancer = breast_cancer.drop("recurrence", axis=1)
# one-hot encode
breast_cancer = pd.get_dummies(breast_cancer)
# a bit more preprocessing
breast_cancer = breast_cancer.drop(['breast_left', 'irradiat_yes'], axis=1)
data = breast_cancer.values

### - Gradient Boosting

In [47]:
# define the model and possible hyperparameters
n_estimators = range(20, 200, 20)
loss = ['log_loss', 'exponential']
criterion = ['friedman_mse', 'squared_error']
hyperparams = dict(n_estimators=n_estimators,
                  loss=loss,
                  criterion=criterion)

# hyperparameter random search
gb = GradientBoostingClassifier()
clf = RandomizedSearchCV(gb, hyperparams, random_state=0)
search_gb = clf.fit(data, target)
search_gb.best_score_

0.6888687235329705

In [109]:
search_gb.best_params_

{'n_estimators': 40, 'loss': 'exponential', 'criterion': 'friedman_mse'}

### - XGBoost

In [118]:
# define the model and possible hyperparameters
n_estimators = range(20, 200, 20)
max_depth = range(3,12,1)
gamma = [i/10.0 for i in range(0,5)]
colsample_bytree = [i/10.0 for i in range(6,10)]
learning_rate = [i/1000.0 for i in range(1, 10, 2)]
subsample = [i/10.0 for i in range(5,9)]
reg_alpha = [1e-5, 1e-2, 0.1, 1, 100]

hyperparams = dict(n_estimators=n_estimators,
                  max_depth=max_depth,
                  gamma=gamma,
                  colsample_bytree=colsample_bytree, 
                  learning_rate=learning_rate,
                  subsample=subsample,
                  reg_alpha=reg_alpha)

# classifier definition
xgboost = xgb.XGBClassifier()
le = preprocessing.LabelEncoder()
target = le.fit_transform(target)

# random search
clf = RandomizedSearchCV(xgboost, hyperparams, random_state=4)
search_xgb = clf.fit(data, target)
search_xgb.best_score_

0.7273442226255293

In [128]:
search_xgb.best_params_

{'subsample': 0.6,
 'reg_alpha': 0.01,
 'n_estimators': 120,
 'max_depth': 7,
 'learning_rate': 0.009,
 'gamma': 0.4,
 'colsample_bytree': 0.8}

### - Random Forest

In [53]:
# define the model and possible hyperparameters
n_estimators = range(20, 200, 20)
criterion = ['gini', 'entropy', 'log_loss']

hyperparams = dict(n_estimators=n_estimators,
                  criterion=criterion)

# hyperparameter random search
rf = RandomForestClassifier()
clf = RandomizedSearchCV(rf, hyperparams, random_state=0)
search_rf = clf.fit(data, target)
search_rf.best_score_

0.7029643073200241

In [90]:
search_rf.best_params_

{'n_estimators': 120, 'criterion': 'entropy'}

### - Logistic Regression

In [58]:
solver = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
C = range(10, 100, 10)

hyperparams = dict(solver=solver, C=C)

# hyperparameter random search
lr = LogisticRegression(random_state=0)
clf = RandomizedSearchCV(lr, hyperparams, random_state=0)
search_lr = clf.fit(data, target)
search_lr.best_score_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6643678160919539

In [107]:
search_lr.best_params_

{'solver': 'newton-cg', 'C': 10}

### - k-NN

In [61]:
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
n_neighbors = range(1, 20, 1)

hyperparams = dict(algorithm=algorithm, n_neighbors=n_neighbors)

# hyperparameter random search
knn = KNeighborsClassifier()
clf = RandomizedSearchCV(knn, hyperparams, random_state=0)
search_knn = clf.fit(data, target)
search_knn.best_score_

0.7484573502722324

In [108]:
search_knn.best_params_

{'n_neighbors': 10, 'algorithm': 'ball_tree'}

## Breast Cancer: performance under fingerprints

In [273]:
n_exp=10
results = []
secret_key = 2000

In [262]:
search_gb.best_params_

{'n_estimators': 160, 'loss': 'exponential', 'criterion': 'friedman_mse'}

In [264]:
xgb_best_params_bc = {'subsample': 0.6,
 'reg_alpha': 0.01,
 'n_estimators': 120,
 'max_depth': 7,
 'learning_rate': 0.009,
 'gamma': 0.4,
 'colsample_bytree': 0.8}

In [274]:
gb_best_params_bc = {'n_estimators': 40, 'loss': 'exponential', 'criterion': 'friedman_mse'}
knn_best_params_bc = {'n_neighbors': 10, 'algorithm': 'ball_tree'}
lr_best_params_bc = {'solver': 'newton-cg', 'C': 10}
rf_best_params_bc = {'n_estimators': 120, 'criterion': 'entropy'}

In [267]:
le = preprocessing.LabelEncoder()
target = le.fit_transform(target)

### Random fingerprint

In [279]:
gamma = [1, 2, 3, 5]

GB_results_all_r_bc = dict()
XBG_results_all_r_bc = dict()
KNN_results_all_r_bc = dict()
RF_results_all_r_bc = dict()
LR_results_all_r_bc = dict()

for g in gamma:
    GB_results = []
    XGB_results = []
    KNN_results = []
    LR_results = []
    RF_results = []
    for n in range(3):
        # fingerprint the data
        #scheme = BlindNNScheme(gamma=g, xi=1, fingerprint_bit_length=8)

#        fp_dataset = scheme.insertion(dataset_name="breast-cancer", recipient_id=1, secret_key=secret_key, 
#                                     correlated_attributes=["age", "menopause", "inv-nodes", "node-caps"])
        # same prepocessing as above
        fp_dataset = pd.read_csv('knn_scheme/fp_datasets/random/breast_cancer/breast_cancer_fp_{}_random_{}.csv'.format(g, sk))
        fp_dataset = fp_dataset.drop(["Id", "recurrence"], axis=1)
        fp_dataset = pd.get_dummies(fp_dataset)
        fp_dataset = fp_dataset.drop(['breast_left', 'irradiat_yes'], axis=1)
        fp_dataset = fp_dataset.values
        # hyperparameter seach

#         GB_model = GradientBoostingClassifier(**search_gb.best_params_)
        GB_model = GradientBoostingClassifier(**gb_best_params_bc)
        GB_scores = cross_val_score(GB_model, fp_dataset, target, cv=10)
        GB_results.append(np.mean(GB_scores))

        #XGB_model = xgb.XGBClassifier(**search_xgb.best_params_)
#         XGB_model = xgb.XGBClassifier(**xgb_best_params_bc)
#         XGB_scores = cross_val_score(XGB_model, fp_dataset, target, cv=10)
#         XGB_results.append(np.mean(XGB_scores))

#         KNN_model = KNeighborsClassifier(**search_knn.best_params_)
        KNN_model = KNeighborsClassifier(**knn_best_params_bc)
        KNN_scores = cross_val_score(KNN_model, fp_dataset, target, cv=10)
        KNN_results.append(np.mean(KNN_scores))

#         LR_model = LogisticRegression(**search_lr.best_params_)
        LR_model = LogisticRegression(**lr_best_params_bc)
        LR_scores = cross_val_score(LR_model, fp_dataset, target, cv=10)
        LR_results.append(np.mean(LR_scores))

#         RF_model = RandomForestClassifier(**search_rf.best_params_)
        RF_model = RandomForestClassifier(**rf_best_params_bc)
        RF_scores = cross_val_score(RF_model, fp_dataset, target, cv=10)
        RF_results.append(np.mean(RF_scores))

        secret_key = secret_key - 3
        #print(secret_key)
        
    GB_results_all_r_bc[g] = GB_results
    XBG_results_all_r_bc[g] = XGB_results
    KNN_results_all_r_bc[g] = KNN_results
    LR_results_all_r_bc[g] = LR_results
    RF_results_all_r_bc[g] = RF_results

print("Time: " + str(int(time()-start)) + " sec.")

Time: 252244 sec.


In [269]:
XBG_results_all_r_bc

{1: [0.7305418719211823, 0.7305418719211823, 0.7305418719211823],
 2: [0.7342364532019705, 0.7342364532019705, 0.7342364532019705],
 3: [0.7270935960591134, 0.7270935960591134, 0.7270935960591134],
 5: [0.7270935960591134, 0.7270935960591134, 0.7270935960591134]}

In [280]:
GB_results_all_r_bc

{1: [0.7054187192118226, 0.7018472906403941, 0.7054187192118226],
 2: [0.6910098522167487, 0.6910098522167487, 0.6944581280788178],
 3: [0.687807881773399, 0.6915024630541872, 0.6913793103448276],
 5: [0.6880541871921183, 0.6844827586206897, 0.6844827586206897]}

In [281]:
KNN_results_all_r_bc

{1: [0.7201970443349753, 0.7201970443349753, 0.7201970443349753],
 2: [0.7203201970443349, 0.7203201970443349, 0.7203201970443349],
 3: [0.7238916256157635, 0.7238916256157635, 0.7238916256157635],
 5: [0.7236453201970443, 0.7236453201970443, 0.7236453201970443]}

In [282]:
RF_results_all_r_bc

{1: [0.6985221674876847, 0.7198275862068966, 0.716256157635468],
 2: [0.695320197044335, 0.698768472906404, 0.6950738916256157],
 3: [0.6880541871921182, 0.695320197044335, 0.6951970443349754],
 5: [0.7166256157635468, 0.7057881773399015, 0.702216748768473]}

In [283]:
LR_results_all_r_bc

{1: [0.669950738916256, 0.669950738916256, 0.669950738916256],
 2: [0.6525862068965517, 0.6525862068965517, 0.6525862068965517],
 3: [0.6488916256157635, 0.6488916256157635, 0.6488916256157635],
 5: [0.6386699507389163, 0.6386699507389163, 0.6386699507389163]}

### NN fingerprint

### - Gradient Boosting

In [126]:
gamma = [1, 2, 3, 5]

GB_results_all = dict()
XBG_results_all = dict()
KNN_results_all = dict()
RF_results_all = dict()
LR_results_all = dict()

for g in gamma:
    GB_results = XGB_results = KNN_results = LR_results = RF_results = []
    for n in range(n_exp):
        # fingerprint the data
        scheme = BlindNNScheme(gamma=g, xi=1, fingerprint_bit_length=8)

        fp_dataset = scheme.insertion(dataset_name="breast-cancer", recipient_id=1, secret_key=secret_key, 
                                     correlated_attributes=["age", "menopause", "inv-nodes", "node-caps"])
        # same prepocessing as above
        fp_dataset = fp_dataset.drop(["Id", "recurrence"], axis=1)
        fp_dataset = pd.get_dummies(fp_dataset)
        fp_dataset = fp_dataset.drop(['breast_left', 'irradiat_yes'], axis=1)
        fp_dataset = fp_dataset.values
        # hyperparameter seach

#         GB_model = GradientBoostingClassifier(**search_gb.best_params_)
#         GB_scores = cross_val_score(GB_model, fp_dataset, target, cv=10)
#         GB_results.append(np.mean(GB_scores))

        XGB_model = xgb.XGBClassifier(**search_xgb.best_params_)
        XGB_scores = cross_val_score(XGB_model, fp_dataset, target, cv=10)
        XGB_results.append(np.mean(XGB_scores))

#         KNN_model = KNeighborsClassifier(**search_knn.best_params_)
#         KNN_scores = cross_val_score(KNN_model, fp_dataset, target, cv=10)
#         KNN_results.append(np.mean(KNN_scores))

#         LR_model = LogisticRegression(**search_lr.best_params_)
#         LR_scores = cross_val_score(LR_model, fp_dataset, target, cv=10)
#         LR_results.append(np.mean(LR_scores))

#         RF_model = RandomForestClassifier(**search_rf.best_params_)
#         RF_scores = cross_val_score(RF_model, fp_dataset, target, cv=10)
#         RF_results.append(np.mean(RF_scores))

        secret_key = secret_key - 3
        #print(secret_key)
        
    GB_results_all[g] = GB_results
    XBG_results_all[g] = XGB_results
    KNN_results_all[g] = KNN_results
    LR_results_all[g] = LR_results
    RF_results_all[g] = RF_results

print("Time: " + str(int(time()-start)) + " sec.")

Start the blind insertion algorithm of a scheme for fingerprinting categorical data (neighbourhood) ...
	gamma: 1
	xi: 1

Generated fingerprint for recipient 1: 00001111

Generated fingerprint for recipient 1: 00001111
Inserting the fingerprint...

Training balltrees in: 0.02 sec.
Fingerprint inserted.
Time: 1 sec.
Start the blind insertion algorithm of a scheme for fingerprinting categorical data (neighbourhood) ...
	gamma: 1
	xi: 1

Generated fingerprint for recipient 1: 11110010

Generated fingerprint for recipient 1: 11110010
Inserting the fingerprint...

Training balltrees in: 0.02 sec.
Fingerprint inserted.
Time: 1 sec.
Start the blind insertion algorithm of a scheme for fingerprinting categorical data (neighbourhood) ...
	gamma: 1
	xi: 1

Generated fingerprint for recipient 1: 01011010

Generated fingerprint for recipient 1: 01011010
Inserting the fingerprint...

Training balltrees in: 0.02 sec.
Fingerprint inserted.
Time: 1 sec.
Start the blind insertion algorithm of a scheme f

Fingerprint inserted.
Time: 0 sec.
Start the blind insertion algorithm of a scheme for fingerprinting categorical data (neighbourhood) ...
	gamma: 2
	xi: 1

Generated fingerprint for recipient 1: 01010000

Generated fingerprint for recipient 1: 01010000
Inserting the fingerprint...

Training balltrees in: 0.02 sec.
Fingerprint inserted.
Time: 0 sec.
Start the blind insertion algorithm of a scheme for fingerprinting categorical data (neighbourhood) ...
	gamma: 2
	xi: 1

Generated fingerprint for recipient 1: 10001001

Generated fingerprint for recipient 1: 10001001
Inserting the fingerprint...

Training balltrees in: 0.02 sec.
Fingerprint inserted.
Time: 0 sec.
Start the blind insertion algorithm of a scheme for fingerprinting categorical data (neighbourhood) ...
	gamma: 2
	xi: 1

Generated fingerprint for recipient 1: 11000110

Generated fingerprint for recipient 1: 11000110
Inserting the fingerprint...

Training balltrees in: 0.01 sec.
Fingerprint inserted.
Time: 0 sec.
Start the blin

Fingerprint inserted.
Time: 0 sec.
Start the blind insertion algorithm of a scheme for fingerprinting categorical data (neighbourhood) ...
	gamma: 3
	xi: 1

Generated fingerprint for recipient 1: 01010000

Generated fingerprint for recipient 1: 01010000
Inserting the fingerprint...

Training balltrees in: 0.02 sec.
Fingerprint inserted.
Time: 0 sec.
Start the blind insertion algorithm of a scheme for fingerprinting categorical data (neighbourhood) ...
	gamma: 3
	xi: 1

Generated fingerprint for recipient 1: 11111000

Generated fingerprint for recipient 1: 11111000
Inserting the fingerprint...

Training balltrees in: 0.02 sec.
Fingerprint inserted.
Time: 0 sec.
Start the blind insertion algorithm of a scheme for fingerprinting categorical data (neighbourhood) ...
	gamma: 3
	xi: 1

Generated fingerprint for recipient 1: 10000110

Generated fingerprint for recipient 1: 10000110
Inserting the fingerprint...

Training balltrees in: 0.02 sec.
Fingerprint inserted.
Time: 0 sec.
Start the blin

In [124]:
print(np.mean(XBG_results_all[1])-0.7273)
print(np.mean(XBG_results_all[2])-0.7273)
print(np.mean(XBG_results_all[3])-0.7273)
print(np.mean(XBG_results_all[5])-0.7273)
# with 10 exp

0.7261576354679804
0.7278325123152709
0.727487684729064
0.7298522167487684


In [132]:
0.7298522167487684-0.7273

0.0025522167487684877

# Adult data:  hyperparameter search & baseline performance 

In [226]:
import warnings
warnings.filterwarnings('ignore')
os.chdir('C:/Users/tsarcevic/PycharmProjects/dissertation')

In [156]:
from datasets import Adult, Dataset
from utils import fp_cross_val_score
from sklearn.svm import LinearSVC
from sklearn import metrics, preprocessing, model_selection

In [235]:
# fetch dataset
original_data = Adult()

In [236]:
# cleaning the data 
#original_data.dropna()
original_data.dataframe = original_data.dataframe.dropna()
# encode categorical features and drop redundant 
original_data.number_encode_categorical()
original_data.dataframe = original_data.dataframe.drop(['fnlwgt','education'], axis=1)

# define target attribute
X = original_data.get_features()
y = original_data.get_target()

# scale features
scaler = preprocessing.StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns) #, index=X.index)
X.shape

(48842, 12)

### - Gradient Boosting

In [189]:
n_estimators = range(20, 200, 20)
loss = ['log_loss', 'exponential']
criterion = ['friedman_mse', 'squared_error']
hyperparams = dict(n_estimators=n_estimators,
                  loss=loss,
                  criterion=criterion)

gb = GradientBoostingClassifier()
clf = RandomizedSearchCV(gb, hyperparams, random_state=0)
search_gb = clf.fit(X, y)

In [190]:
search_gb.best_score_

0.8692519183306366

In [191]:
search_gb.best_params_

{'n_estimators': 160, 'loss': 'exponential', 'criterion': 'friedman_mse'}

### XGBoost

In [174]:
# define the model and possible hyperparameters
n_estimators = range(20, 200, 20)
max_depth = range(3,12,1)
gamma = [i/10.0 for i in range(0,5)]
colsample_bytree = [i/10.0 for i in range(6,10)]
learning_rate = [i/1000.0 for i in range(1, 10, 2)]
subsample = [i/10.0 for i in range(5,9)]
reg_alpha = [1e-5, 1e-2, 0.1, 1, 100]

hyperparams = dict(n_estimators=n_estimators,
                  max_depth=max_depth,
                  gamma=gamma,
                  colsample_bytree=colsample_bytree, 
                  learning_rate=learning_rate,
                  subsample=subsample,
                  reg_alpha=reg_alpha)

# classifier definition
xgboost = xgb.XGBClassifier()
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

# random search
clf = RandomizedSearchCV(xgboost, hyperparams, random_state=4)
search_xgb = clf.fit(X, y)
search_xgb.best_score_

0.8628025195278752

In [175]:
search_xgb.best_params_

{'subsample': 0.5,
 'reg_alpha': 0.01,
 'n_estimators': 160,
 'max_depth': 10,
 'learning_rate': 0.009,
 'gamma': 0.0,
 'colsample_bytree': 0.9}

### - Random Forest

In [177]:
# define the model and possible hyperparameters
n_estimators = range(20, 200, 20)
criterion = ['gini', 'entropy', 'log_loss']

hyperparams = dict(n_estimators=n_estimators,
                  criterion=criterion)

# hyperparameter random search
rf = RandomForestClassifier()
clf = RandomizedSearchCV(rf, hyperparams, random_state=0)
search_rf = clf.fit(X, y)
search_rf.best_score_

0.8531796560330699

In [178]:
search_rf.best_params_

{'n_estimators': 180, 'criterion': 'entropy'}

### - Logistic Regression

In [179]:
solver = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
C = range(10, 100, 10)

hyperparams = dict(solver=solver, C=C)

# hyperparameter random search
lr = LogisticRegression(random_state=0)
clf = RandomizedSearchCV(lr, hyperparams, random_state=0)
search_lr = clf.fit(X, y)
search_lr.best_score_

0.825437082687057

In [180]:
search_lr.best_params_

{'solver': 'saga', 'C': 90}

### - kNN

In [216]:
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
n_neighbors = range(1, 20, 1)

hyperparams = dict(algorithm=algorithm, n_neighbors=n_neighbors)

# hyperparameter random search
knn = KNeighborsClassifier()
clf = RandomizedSearchCV(knn, hyperparams, random_state=20)
search_knn = clf.fit(X, y)
search_knn.best_score_

0.8438229971472883

In [219]:
search_knn.best_params_

{'n_neighbors': 18, 'algorithm': 'ball_tree'}

In [220]:
cross_val_score(KNeighborsClassifier(**search_knn.best_params_), X, y, cv=10)

array([0.83807574, 0.84749232, 0.84377559, 0.84418509, 0.84930385,
       0.8507371 , 0.83947584, 0.84418509, 0.84418509, 0.84459459])

In [223]:
cross_val_score(KNeighborsClassifier(n_neighbors=16, algorithm='ball_tree'), X, y, cv=10)

array([0.83787103, 0.84380757, 0.84193284, 0.84459459, 0.84766585,
       0.84807535, 0.83988534, 0.8458231 , 0.84438984, 0.84377559])

In [221]:
np.mean([0.83807574, 0.84749232, 0.84377559, 0.84418509, 0.84930385,
       0.8507371 , 0.83947584, 0.84418509, 0.84418509, 0.84459459])

0.84460103

In [224]:
np.mean([0.83787103, 0.84380757, 0.84193284, 0.84459459, 0.84766585,
       0.84807535, 0.83988534, 0.8458231 , 0.84438984, 0.84377559])

0.8437821099999999

## Adult: performance under fingerprints

### - Baseline fingerprint

In [228]:
data_fp_rand = pd.read_csv('knn_scheme/fp_datasets/random/adult/adult_fp_5_random_100.csv')

In [229]:
n_exp=5
results = []
secret_key = 2000

categorical_attributes = list(original_data.categorical_attributes); categorical_attributes.remove('income'); categorical_attributes.remove('education')

In [253]:
search_xgb.best_params_

{'subsample': 0.5,
 'reg_alpha': 0.01,
 'n_estimators': 160,
 'max_depth': 10,
 'learning_rate': 0.009,
 'gamma': 0.0,
 'colsample_bytree': 0.9}

In [255]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

In [259]:
gamma = [5, 10, 20]  # 5 10 20 40

GB_results_all_rand = dict()
XBG_results_all_rand = dict()
KNN_results_all_rand = dict()
RF_results_all_rand = dict()
LR_results_all_rand = dict()

for g in gamma:
    sk = 100
    GB_results = []
    KNN_results = []
    LR_results = []
    RF_results = []
    XGB_results = []
    for n in range(1):
        # fingerprint the data
        #scheme = BlindNNScheme(gamma=g, xi=1, fingerprint_bit_length=64)

        #fp_dataset = scheme.insertion(dataset_name="adult", recipient_id=1, secret_key=secret_key, 
        #                             correlated_attributes=["relationship", "marital-status", "occupation", "workclass",
        #                                              "education-num"])
        # same prepocessing as above
        fp_dataset = pd.read_csv('knn_scheme/fp_datasets/random/adult/adult_fp_{}_random_{}.csv'.format(g, sk))
        sk += 1
        fp_dataset = fp_dataset.drop(["income", 'fnlwgt','education'], axis=1)
        #fp_dataset = fp_dataset.dropna()
        for cat in categorical_attributes:
            label_enc = preprocessing.LabelEncoder()  # the current version of label encoder works in alphanumeric order
            fp_dataset[cat] = label_enc.fit_transform(fp_dataset[cat])
        
        fp_dataset = pd.DataFrame(scaler.fit_transform(fp_dataset), columns=fp_dataset.columns)

        fp_dataset = fp_dataset.values
        # hyperparameter seach

   #     GB_model = GradientBoostingClassifier(**search_gb.best_params_)
  #      GB_scores = cross_val_score(GB_model, fp_dataset, y, cv=10)
 #       GB_results.append(np.mean(GB_scores))
#        print(GB_results)

        XGB_model = xgb.XGBClassifier(**search_xgb.best_params_)
        XGB_scores = cross_val_score(XGB_model, fp_dataset, y, cv=10)
        XGB_results.append(np.mean(XGB_scores))

#         KNN_model = KNeighborsClassifier(**search_knn.best_params_)
#         KNN_scores = cross_val_score(KNN_model, fp_dataset, y, cv=10)
#         KNN_results.append(np.mean(KNN_scores))

#         LR_model = LogisticRegression(**search_lr.best_params_)
#         LR_scores = cross_val_score(LR_model, fp_dataset, y, cv=10)
#         LR_results.append(np.mean(LR_scores))
#         print(LR_results)

#         RF_model = RandomForestClassifier(**search_rf.best_params_)
#         RF_scores = cross_val_score(RF_model, fp_dataset, y, cv=10)
#         RF_results.append(np.mean(RF_scores))
#         print(RF_results)

        secret_key = secret_key - 3
        #print(secret_key)
        
#    GB_results_all_rand[g] = GB_results
    XBG_results_all_rand[g] = XGB_results
    KNN_results_all_rand[g] = KNN_results
    #LR_results_all_rand[g] = LR_results
    #RF_results_all_rand[g] = RF_results

print("Time: " + str(int(time()-start)) + " sec.")

Time: 249796 sec.


In [260]:
XBG_results_all_rand

{5: [0.8619016620603108], 10: [0.8629663170195412], 20: [0.862863988022637]}

In [250]:
KNN_results_all_rand

{5: [0.8426969562844692],
 10: [0.8440891445087966],
 20: [0.8437615274155703],
 40: [0.8447033741660149]}

In [238]:
GB_results_all_rand

{5: [0.8677368920050601,
  0.8250071798792371,
  0.851029887242784,
  0.8672250542158423,
  0.8249048257338943,
  0.8512550412140996,
  0.8682897468977305,
  0.8250277303450282,
  0.8532615764550258,
  0.8678188172353988,
  0.8249867677298589,
  0.8523402634047466,
  0.8680849254390708,
  0.8253143596746462,
  0.8526062752060705]}

In [240]:
LR_results_all_rand

{10: [0.8678392335761835,
  0.8252324092958689,
  0.8517669502572266,
  0.8684740095077863,
  0.8253552594187189,
  0.8517054665161113,
  0.8688834721946289,
  0.8253553013327833,
  0.8517055335786144,
  0.8687606094975593,
  0.8254781388814143,
  0.8532615680722131,
  0.8688834638118159,
  0.8253553139070027,
  0.852319671024891]}

In [241]:
RF_results_all_rand

{10: [0.8678392335761835,
  0.8252324092958689,
  0.8517669502572266,
  0.8684740095077863,
  0.8253552594187189,
  0.8517054665161113,
  0.8688834721946289,
  0.8253553013327833,
  0.8517055335786144,
  0.8687606094975593,
  0.8254781388814143,
  0.8532615680722131,
  0.8688834638118159,
  0.8253553139070027,
  0.852319671024891]}

In [242]:
GB_results_all_rand

{10: [0.8678392335761835,
  0.8252324092958689,
  0.8517669502572266,
  0.8684740095077863,
  0.8253552594187189,
  0.8517054665161113,
  0.8688834721946289,
  0.8253553013327833,
  0.8517055335786144,
  0.8687606094975593,
  0.8254781388814143,
  0.8532615680722131,
  0.8688834638118159,
  0.8253553139070027,
  0.852319671024891]}

### - NN fingerprint

In [187]:
n_exp=5
results = []
secret_key = 2000

categorical_attributes = list(original_data.categorical_attributes); categorical_attributes.remove('income'); categorical_attributes.remove('education')

In [198]:
gamma = [5, 10, 20, 40]

GB_results_all = dict()
#XBG_results_all = dict()
KNN_results_all = dict()
RF_results_all = dict()
LR_results_all = dict()

for g in gamma:
    GB_results = KNN_results = LR_results = RF_results = []
    #XGB_results = []
    for n in range(n_exp):
        # fingerprint the data
        scheme = BlindNNScheme(gamma=g, xi=1, fingerprint_bit_length=64)

        fp_dataset = scheme.insertion(dataset_name="adult", recipient_id=1, secret_key=secret_key, 
                                     correlated_attributes=["relationship", "marital-status", "occupation", "workclass",
                                                      "education-num"])
        # same prepocessing as above
        fp_dataset = fp_dataset.drop(["Id", "income", 'fnlwgt','education'], axis=1)
        fp_dataset = fp_dataset.dropna()
        for cat in categorical_attributes:
            label_enc = preprocessing.LabelEncoder()  # the current version of label encoder works in alphanumeric order
            fp_dataset[cat] = label_enc.fit_transform(fp_dataset[cat])
        
        fp_dataset = pd.DataFrame(scaler.fit_transform(fp_dataset), columns=fp_dataset.columns)

        fp_dataset = fp_dataset.values
        # hyperparameter seach

        GB_model = GradientBoostingClassifier(**search_gb.best_params_)
        GB_scores = cross_val_score(GB_model, fp_dataset, y, cv=10)
        GB_results.append(np.mean(GB_scores))

#         XGB_model = xgb.XGBClassifier(**search_xgb.best_params_)
#         XGB_scores = cross_val_score(XGB_model, fp_dataset, y, cv=10)
#         XGB_results.append(np.mean(XGB_scores))

        KNN_model = KNeighborsClassifier(**search_knn.best_params_)
        KNN_scores = cross_val_score(KNN_model, fp_dataset, y, cv=10)
        KNN_results.append(np.mean(KNN_scores))

        LR_model = LogisticRegression(**search_lr.best_params_)
        LR_scores = cross_val_score(LR_model, fp_dataset, y, cv=10)
        LR_results.append(np.mean(LR_scores))

        RF_model = RandomForestClassifier(**search_rf.best_params_)
        RF_scores = cross_val_score(RF_model, fp_dataset, y, cv=10)
        RF_results.append(np.mean(RF_scores))

        secret_key = secret_key - 3
        #print(secret_key)
        
    GB_results_all[g] = GB_results
    #XBG_results_all[g] = XGB_results
    KNN_results_all[g] = KNN_results
    LR_results_all[g] = LR_results
    RF_results_all[g] = RF_results

print("Time: " + str(int(time()-start)) + " sec.")

Start the blind insertion algorithm of a scheme for fingerprinting categorical data (neighbourhood) ...
	gamma: 5
	xi: 1

Generated fingerprint for recipient 1: 11001011

Generated fingerprint for recipient 1: 11001011
Inserting the fingerprint...

Training balltrees in: 0.55 sec.
Fingerprint inserted.
Time: 100 sec.
Start the blind insertion algorithm of a scheme for fingerprinting categorical data (neighbourhood) ...
	gamma: 5
	xi: 1

Generated fingerprint for recipient 1: 00110011

Generated fingerprint for recipient 1: 00110011
Inserting the fingerprint...

Training balltrees in: 0.52 sec.
Fingerprint inserted.
Time: 113 sec.
Start the blind insertion algorithm of a scheme for fingerprinting categorical data (neighbourhood) ...
	gamma: 5
	xi: 1

Generated fingerprint for recipient 1: 00011111

Generated fingerprint for recipient 1: 00011111
Inserting the fingerprint...

Training balltrees in: 0.46 sec.
Fingerprint inserted.
Time: 100 sec.
Start the blind insertion algorithm of a sc

In [196]:
XBG_results_all

{5: [0.8601613817222823,
  0.8610008030734745,
  0.8601409486158719,
  0.8607346739127701,
  0.8614102992915684],
 10: [0.8619016662517174,
  0.8618812373367133,
  0.8628230505559064,
  0.8615331577972315,
  0.8617378786621366],
 20: [0.8627001962416496,
  0.862352108319355,
  0.8629663463593863,
  0.8622292959191629,
  0.8625978001822425],
 40: [0.8629663505507927,
  0.8629458713389113,
  0.8631915757760179,
  0.8632324839029035,
  0.8629458922959434]}

In [202]:
print(np.mean(GB_results_all[5]) - 0.8693)
print(np.mean(GB_results_all[10]) - 0.8693)
print(np.mean(GB_results_all[20]) - 0.8693)
print(np.mean(GB_results_all[40]) - 0.8693)

-0.023854427717938442
-0.023040582328862746
-0.022295313588455712
-0.022089550949479397


In [222]:
for g in gamma:
    print(g)
    print((np.mean(KNN_results_all[g][10:]) - 0.8446)*100)

5
-0.11393975440034954
10
-0.0306123728641583
20
0.06336579158481292
40
0.09325856702520063


In [217]:
for g in gamma:
    print(g)
    print((np.mean(LR_results_all[g][10:]) - 0.8554)*100)

5
-1.1939397544003527
10
-1.1106123728641615
20
-1.0166342084151903
40
-0.9867414329748025


In [218]:
for g in gamma:
    print(g)
    print((np.mean(RF_results_all[g][:]) - 0.8532)*100)

5
-0.7754427717938439
10
-0.6940582328862743
20
-0.6195313588455709
40
-0.5989550949479394
