In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

try:
    # Current scikit-learn releases expose train_test_split here.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old releases that only ship the cross_validation module.
    from sklearn.cross_validation import train_test_split
    
def prepare_data(name, onehot=True, labelEncode=True, train_size=0.7,
                 random_state=None):
    """Load a named dataset and split it into train and test parts.

    Parameters
    ----------
    name : str
        Dataset identifier. Only ``"adult"`` is supported.
    onehot : bool
        Forwarded to ``prepare_adult``.
    labelEncode : bool
        Accepted for interface compatibility; not used by this function.
    train_size : float
        Passed to ``train_test_split`` as the training proportion.
    random_state : int or None
        Seed forwarded to ``train_test_split``. ``None`` keeps the split
        unseeded, as before.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.

    Raises
    ------
    ValueError
        If ``name`` is not a supported dataset.
    """
    if name == "adult":
        X, y = prepare_adult(target='income', onehot=onehot)
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError("unknown dataset name: {!r}".format(name))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state)

    return X_train, y_train, X_test, y_test

def prepare_adult(target='income', onehot=True):
    """Load ``data/adult.csv`` and return a feature frame plus 0/1 labels.

    The file is parsed with a regex separator that strips whitespace around
    the commas. The ``target`` column is split off as the label, numeric
    feature columns are rescaled to [-1, 1], and the features are optionally
    one-hot encoded with ``pd.get_dummies``.

    Parameters
    ----------
    target : str
        Name of the label column.
    onehot : bool
        When true, pass the feature frame through ``pd.get_dummies``.

    Returns
    -------
    X : pandas.DataFrame
        Feature columns (every column except ``target``).
    y : numpy.ndarray
        0 where the stripped label text starts with ``'<'``, otherwise 1.

    Raises
    ------
    ValueError
        If ``target`` is not a column of the file (from ``list.remove``).
    """
    data = pd.read_csv('data/adult.csv', sep=r'\s*,\s*', engine='python')

    cols = list(data.columns.values)
    print(cols)
    cols.remove(target)

    # Copy so that the scaling below writes into a frame we own.
    X = data[cols].copy()
    y = data[target]

    # Scale the numeric columns in one call and assign the result back by
    # column label. Writing through `X.values` does not work here: for a frame
    # with mixed dtypes it is a temporary copy, so the scaled values are lost.
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        scaler = MinMaxScaler(feature_range=(-1, 1))
        X[numeric_cols] = scaler.fit_transform(X[numeric_cols].astype(float))

    # Labels whose text starts with '<' map to 0, everything else to 1.
    is_low = y.astype(str).str.strip().str.startswith('<')
    labels = np.where(is_low, 0, 1)

    if onehot:
        X = pd.get_dummies(X)

    return X, labels





In [22]:

from sklearn.linear_model import LogisticRegression
from collections import Counter
import argparse
import sys
import matplotlib.pyplot as plt
%run regression_stealer.ipynb
%run utils.ipynb



class LocalRegressionExtractor(RegressionExtractor):

    """
    Extraction target backed by a locally trained scikit-learn
    LogisticRegression.

    The model is fitted once in the constructor; its coefficients and
    intercepts are then used to answer probability and class queries.
    """

    def __init__(self, X, y, multinomial, rounding=None):
        """Fit the target model on (X, y).

        X may be a DataFrame or a 2-D array; y may be a Series or an array.
        multinomial selects the multinomial (lbfgs) model when truthy and
        one-vs-rest otherwise. rounding, when not None, is the number of
        decimals that query_probas rounds probabilities to.
        """
        # Distinct labels in order of first appearance. Plain arrays have no
        # .unique(), so fall back to pd.unique, which keeps the same ordering.
        if hasattr(y, "unique"):
            self.classes = y.unique()
        else:
            self.classes = pd.unique(np.asarray(y))

        # Column labels for a DataFrame; positional indices for a bare array.
        if hasattr(X, "columns"):
            self.features = X.columns.values
        else:
            self.features = np.arange(np.asarray(X).shape[1])
        self.rounding = rounding

        # train a model on the whole dataset
        if multinomial:
            self.model = LogisticRegression(multi_class="multinomial",
                                            solver='lbfgs')
        else:
            self.model = LogisticRegression(multi_class="ovr")
        self.model.fit(X, y)

        self.w = self.model.coef_
        self.intercept = self.model.intercept_
        self.multinomial = multinomial

        RegressionExtractor.__init__(self)

    def num_features(self):
        """Return the number of input features seen at construction."""
        return len(self.features)

    def get_classes(self):
        """Return the distinct labels, in order of first appearance in y."""
        return self.classes

    def query_probas(self, X):
        """Return class probabilities for X, optionally rounded.

        With rounding set, probabilities are rounded to that many decimals
        and each row is renormalised by its sum.
        """
        #
        # There seems to be a bug in the LogisticRegression class, that makes
        # it use the OvR strategy to compute probabilities even when we set
        # 'multi_class = multinomial'. So we call the predict_probas method
        # ourselves.
        #
        # predict_probas is not defined in this cell; presumably it comes from
        # one of the notebooks loaded with %run -- TODO confirm.
        p = predict_probas(X, self.w, self.intercept,
                           multinomial=(self.model.multi_class == "multinomial")
                           )
        print("Gen-Query Prop Size: ",p.shape)
        if self.rounding is None:
            return p
        else:
            p = np.round(p, self.rounding)
            # NOTE(review): a row whose entries all round to 0 has sum 0, so
            # this division yields NaN for that row.
            return p / np.sum(p, axis=1)[:, np.newaxis]

    def query(self, X):
        """Return predicted classes for X via the predict_classes helper."""
        # NOTE(review): self.classes is in first-appearance order, which may
        # differ from the row order of coef_ -- confirm predict_classes
        # expects this ordering.
        return predict_classes(X, self.w, self.intercept, self.classes)
    


def main():
    """Train the local target model on the adult data and run the attack.

    Prints the target classes, the feature labels, the accuracy of the
    target model on the held-out split and the distribution of its
    predictions, then runs the passive extraction with the baseline enabled.
    """
    # Only the first four returned values are used. Slicing works both with
    # the four-value return of the prepare_data defined in this notebook and
    # with a variant that returns extra trailing values.
    splits = prepare_data("adult")
    X_train, y_train, X_test, y_test = splits[:4]

    # multinomial is a flag; pass a real boolean rather than a truthy string.
    ext = LocalRegressionExtractor(X_train, y_train,
                                   multinomial=True,
                                   rounding=None)
    print("Target Classes: ", ext.classes)

    print("Features: ", ext.features)

    y_pred = ext.model.predict(X_test)

    # Scored against the held-out split, so this is test accuracy.
    print('test accuracy: ', accuracy_score(y_test, y_pred))

    print(Counter(y_pred))

    ext.run("adult", X_test, random_seed=0,
            methods=['passive'], baseline=True
            )
    
# Entry point: invoke main() when this code runs as the top-level module.
if __name__ == "__main__":
    main()

Target Classes:  [4 2 1 3 0]
Features:  [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104]
training accuracy:  0.8766635272920846
Counter({4: 47264, 1: 1026, 2: 548, 3: 4})
Target Classes : 5 | num features : 105 | num unknown　: 530 
Generate query set......
Scale_type:  uniform
query set size:  (100000, 105)
passize budget :  265
Scale_type:  uniform
Gen-Query Prop Size:  (265, 5)
w0.shape (5, 106)
finding solution of system of 265 equations with 530 unknowns with BFGS
optimize logit BFGS
         Current function value: 198.234743
         Iterations: 100
         Function evaluations: 102
         Gradient evaluat



Gen-Query Prop Size:  (100000, 5)
adult,passive,265,extr,0.00e+00,0.00e+00,4.03e-07,2.91e-07,2.39e-01
adult,passive,265,base,2.79e-01,3.61e-01,9.84e-01,8.47e-01,1.60e+03
passize budget :  530
Scale_type:  uniform
Gen-Query Prop Size:  (530, 5)
w0.shape (5, 106)
finding solution of system of 530 equations with 530 unknowns with BFGS
optimize logit BFGS
         Current function value: 377.259200
         Iterations: 100
         Function evaluations: 106
         Gradient evaluations: 106
obtained train accuracy of 1.0
opti ran for 1.21 s
Scale_type:  uniform
acc: 1.00e+00, 9.89e-01
Gen-Query Prop Size:  (48842, 5)
l1: 3.05e-07,9.90e-01




Gen-Query Prop Size:  (100000, 5)
adult,passive,530,extr,0.00e+00,0.00e+00,3.05e-07,3.81e-07,2.39e-01
adult,passive,530,base,1.12e-02,2.50e-01,9.90e-01,8.71e-01,2.92e+03
passize budget :  1060
Scale_type:  uniform
Gen-Query Prop Size:  (1060, 5)
w0.shape (5, 106)
finding solution of system of 1060 equations with 530 unknowns with BFGS
optimize logit BFGS
         Current function value: 784.046434
         Iterations: 100
         Function evaluations: 124
         Gradient evaluations: 124
obtained train accuracy of 1.0
opti ran for 1.28 s
Scale_type:  uniform
acc: 1.00e+00, 9.92e-01
Gen-Query Prop Size:  (48842, 5)
l1: 4.62e-06,9.90e-01




Gen-Query Prop Size:  (100000, 5)
adult,passive,1060,extr,0.00e+00,0.00e+00,4.62e-06,6.25e-06,2.42e-01
adult,passive,1060,base,8.35e-03,1.61e-01,9.90e-01,8.79e-01,5.14e+03
passize budget :  2650
Scale_type:  uniform
Gen-Query Prop Size:  (2650, 5)
w0.shape (5, 106)
finding solution of system of 2650 equations with 530 unknowns with BFGS
optimize logit BFGS
         Current function value: 1977.192230
         Iterations: 100
         Function evaluations: 134
         Gradient evaluations: 134
obtained train accuracy of 0.9981132075471698
opti ran for 1.67 s
Scale_type:  uniform
acc: 1.00e+00, 9.90e-01
Gen-Query Prop Size:  (48842, 5)
l1: 2.37e-04,9.89e-01




Gen-Query Prop Size:  (100000, 5)
adult,passive,2650,extr,1.23e-04,4.50e-04,2.37e-04,3.60e-04,4.02e-01
adult,passive,2650,base,9.81e-03,7.57e-02,9.89e-01,8.84e-01,1.24e+04
passize budget :  5300
Scale_type:  uniform
Gen-Query Prop Size:  (5300, 5)
w0.shape (5, 106)
finding solution of system of 5300 equations with 530 unknowns with BFGS
optimize logit BFGS
         Current function value: 4036.885706
         Iterations: 100
         Function evaluations: 148
         Gradient evaluations: 148
obtained train accuracy of 0.999811320754717
opti ran for 2.35 s
Scale_type:  uniform
acc: 1.00e+00, 9.92e-01
Gen-Query Prop Size:  (48842, 5)
l1: 2.21e-04,9.90e-01
Gen-Query Prop Size:  (100000, 5)




adult,passive,5300,extr,1.02e-04,2.30e-04,2.21e-04,1.79e-04,3.31e-01
adult,passive,5300,base,8.29e-03,4.03e-02,9.90e-01,8.86e-01,2.43e+04
passize budget :  10600
Scale_type:  uniform
Gen-Query Prop Size:  (10600, 5)
w0.shape (5, 106)
finding solution of system of 10600 equations with 530 unknowns with BFGS
optimize logit BFGS
         Current function value: 8001.596390
         Iterations: 100
         Function evaluations: 155
         Gradient evaluations: 155
obtained train accuracy of 0.9986792452830189
opti ran for 3.97 s
Scale_type:  uniform
acc: 1.00e+00, 9.97e-01
Gen-Query Prop Size:  (48842, 5)
l1: 8.68e-04,9.90e-01
Gen-Query Prop Size:  (100000, 5)
adult,passive,10600,extr,6.14e-05,9.50e-04,8.68e-04,1.00e-03,7.45e-01
adult,passive,10600,base,2.78e-03,2.07e-02,9.90e-01,8.87e-01,4.65e+04
passize budget :  26500
Scale_type:  uniform




Gen-Query Prop Size:  (26500, 5)
w0.shape (5, 106)
finding solution of system of 26500 equations with 530 unknowns with BFGS
optimize logit BFGS
         Current function value: 19994.152046
         Iterations: 100
         Function evaluations: 168
         Gradient evaluations: 168
obtained train accuracy of 0.9987924528301887
opti ran for 6.72 s
Scale_type:  uniform
acc: 1.00e+00, 9.99e-01
Gen-Query Prop Size:  (48842, 5)
l1: 1.05e-03,9.90e-01
Gen-Query Prop Size:  (100000, 5)




adult,passive,26500,extr,1.02e-04,1.07e-03,1.05e-03,9.13e-04,6.65e-01
adult,passive,26500,base,1.21e-03,8.47e-03,9.90e-01,8.87e-01,1.29e+05
passize budget :  53000
Scale_type:  uniform
Gen-Query Prop Size:  (53000, 5)
w0.shape (5, 106)
finding solution of system of 53000 equations with 530 unknowns with BFGS
optimize logit BFGS
         Current function value: 40026.163091
         Iterations: 100
         Function evaluations: 169
         Gradient evaluations: 169
obtained train accuracy of 0.9965471698113207
opti ran for 10.93 s
Scale_type:  uniform
acc: 1.00e+00, 1.00e+00
Gen-Query Prop Size:  (48842, 5)
l1: 2.91e-03,9.90e-01
Gen-Query Prop Size:  (100000, 5)
adult,passive,53000,extr,2.05e-04,3.52e-03,2.91e-03,3.15e-03,1.81e+00
adult,passive,53000,base,2.05e-04,4.75e-03,9.90e-01,8.87e-01,3.40e+05




In [20]:
%run regression_stealer.ipynb  

In [27]:
%run utils.ipynb
