In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

First we need to simulate MIA. With our dataset, we split it into 3 parts: target model training set (60%), forget set(points that need to be unlearned) (10% of target model training set), and shadow models' data(40%). First we preprocess the data-cateogorical encoding, checking if any values are NA (there are no such values) and take a look at the columns. Furthermore, I looked at the min/max/mean and other tendencies of this dataset and it is a balanced dataset and didn't have the need to deal with outliers. However, I realized the scale could be adjusted for certain columns.

In [2]:
data_url = "/Users/trishanandakumar/Desktop/BURE/Datasets/statlog+german+credit+data/german.data"

columns = [
    'Checking Account Status', 'Duration', 'Credit History', 'Purpose', 'Credit Amount', 
    'Savings', 'Employment Since', 'Installment Rate', 'Personal Info', 
    'Debtors', 'Residence Since', 'Property', 'Age', 'Installment Plans', 
    'Housing', 'Existing Credits', 'Job', 'Liables', 'Telephone', 'Foreign', 
    'Target'
]

Attribute1 = {
    'A11': '< 0 DM',
    'A12': '0 <= ... < 200 DM',
    'A13': '>= 200 DM',
    'A14': 'no checking account'
}
Attribute3 = {
    'A30': 'no credits taken/all paid back duly',
    'A31': 'all credits paid back duly',
    'A32': 'existing credits paid back duly till now',
    'A33': 'delay in paying off in the past',
    'A34': 'critical account/other credits existing'
}
Attribute4 = {
    'A40': 'car (new)',
    'A41': 'car (used)',
    'A42': 'furniture/equipment',
    'A43': 'radio/television',
    'A44': 'domestic appliances',
    'A45': 'repairs',
    'A46': 'education',
    'A47': 'vacation',
    'A48': 'retraining',
    'A49': 'business',
    'A410': 'other'
}
Attribute6 = {
    'A61': '< 100 DM',
    'A62': '100 <= ... < 500 DM',
    'A63': '500 <= ... < 1000 DM',
    'A64': '>= 1000 DM',
    'A65': 'unknown/no savings account'
}
Attribute7 = {
    'A71': 'unemployed',
    'A72': '< 1 year',
    'A73': '1 <= ... < 4 years',
    'A74': '4 <= ... < 7 years',
    'A75': '>= 7 years'
}
Attribute9 = {
    'A91': 'male, divorced/separated',
    'A92': 'female, divorced/separated/married',
    'A93': 'male, single',
    'A94': 'male, married/widowed',
    'A95': 'female, single'
}
Attribute10 = {
    'A101': 'none',
    'A102': 'co-applicant',
    'A103': 'guarantor'
}
Attribute12 = {
    'A121': 'real estate',
    'A122': 'building society savings/life insurance',
    'A123': 'car or other',
    'A124': 'unknown / no property'
}
Attribute14 = {
    'A141': 'bank',
    'A142': 'stores',
    'A143': 'none'
}
Attribute15 = {
    'A151': 'rent',
    'A152': 'own',
    'A153': 'for free'
}
Attribute17 = {
    'A171': 'unemployed/unskilled - non-resident',
    'A172': 'unskilled - resident',
    'A173': 'skilled employee/official',
    'A174': 'management/self-employed/highly qualified'
}
Attribute19 = {
    'A191': 'none',
    'A192': 'yes, registered'
}
Attribute20 = {
    'A201': 'yes',
    'A202': 'no'
}


german_data = pd.read_csv(data_url, sep=' ', header=None, names=columns)

german_data['Checking Account Status'] = german_data['Checking Account Status'].map(Attribute1) 
german_data['Credit History'] = german_data['Credit History'].map(Attribute3)
german_data['Purpose'] = german_data['Purpose'].map(Attribute4)
german_data['Savings'] = german_data['Savings'].map(Attribute6)
german_data['Employment Since'] = german_data['Employment Since'].map(Attribute7)
german_data['Personal Info'] = german_data['Personal Info'].map(Attribute9)
german_data['Debtors'] = german_data['Debtors'].map(Attribute10)
german_data['Property'] = german_data['Property'].map(Attribute12)
german_data['Installment Plans'] = german_data['Installment Plans'].map(Attribute14)
german_data['Housing'] = german_data['Housing'].map(Attribute15)
german_data['Job'] = german_data['Job'].map(Attribute17)
german_data['Telephone'] = german_data['Telephone'].map(Attribute19)    
german_data['Foreign'] = german_data['Foreign'].map(Attribute20)


pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

german_data.head(5)

Unnamed: 0,Checking Account Status,Duration,Credit History,Purpose,Credit Amount,Savings,Employment Since,Installment Rate,Personal Info,Debtors,Residence Since,Property,Age,Installment Plans,Housing,Existing Credits,Job,Liables,Telephone,Foreign,Target
0,< 0 DM,6,critical account/other credits existing,radio/television,1169,unknown/no savings account,>= 7 years,4,"male, single",none,4,real estate,67,none,own,2,skilled employee/official,1,"yes, registered",yes,1
1,0 <= ... < 200 DM,48,existing credits paid back duly till now,radio/television,5951,< 100 DM,1 <= ... < 4 years,2,"female, divorced/separated/married",none,2,real estate,22,none,own,1,skilled employee/official,1,none,yes,2
2,no checking account,12,critical account/other credits existing,education,2096,< 100 DM,4 <= ... < 7 years,2,"male, single",none,3,real estate,49,none,own,1,unskilled - resident,2,none,yes,1
3,< 0 DM,42,existing credits paid back duly till now,furniture/equipment,7882,< 100 DM,4 <= ... < 7 years,2,"male, single",guarantor,4,building society savings/life insurance,45,none,for free,1,skilled employee/official,2,none,yes,1
4,< 0 DM,24,delay in paying off in the past,car (new),4870,< 100 DM,1 <= ... < 4 years,3,"male, single",none,4,unknown / no property,53,none,for free,2,skilled employee/official,2,none,yes,2


In [3]:
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
to_encode_columns = ['Purpose', 'Personal Info', 'Debtors', 'Job', 'Housing', 'Property', 'Installment Plans','Telephone','Foreign']

encoded_val = one_hot_encoder.fit_transform(german_data[to_encode_columns])
feature_names = one_hot_encoder.get_feature_names_out(to_encode_columns)

onehot_df = pd.DataFrame(encoded_val, columns=feature_names, index=german_data.index)

german_data = german_data.drop(to_encode_columns, axis=1)
german_data = pd.concat([german_data, onehot_df], axis=1)

ord_encoder = OrdinalEncoder()

encoded_val_2 = ord_encoder.fit_transform(german_data[['Checking Account Status', 'Credit History']])

german_data[['Checking Account Status', 'Credit History']] = pd.DataFrame(
    encoded_val_2,
    columns=['Checking Account Status', 'Credit History'],
    index=german_data.index
)

savings_category = [['unknown/no savings account', '< 100 DM', '100 <= ... < 500 DM', 
                      '500 <= ... < 1000 DM', '>= 1000 DM']]

savings_encoder = OrdinalEncoder(categories=savings_category)
german_data['Savings'] = savings_encoder.fit_transform(german_data[['Savings']])

employment_category = [['unemployed', '< 1 year', '1 <= ... < 4 years','4 <= ... < 7 years', '>= 7 years']]
employment_encoder = OrdinalEncoder(categories=employment_category)
german_data['Employment Since'] = employment_encoder.fit_transform(german_data[['Employment Since']])
german_data.head(20)


Unnamed: 0,Checking Account Status,Duration,Credit History,Credit Amount,Savings,Employment Since,Installment Rate,Residence Since,Age,Existing Credits,Liables,Target,Purpose_car (new),Purpose_car (used),Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_other,Purpose_radio/television,Purpose_repairs,Purpose_retraining,"Personal Info_male, divorced/separated","Personal Info_male, married/widowed","Personal Info_male, single",Debtors_guarantor,Debtors_none,Job_skilled employee/official,Job_unemployed/unskilled - non-resident,Job_unskilled - resident,Housing_own,Housing_rent,Property_car or other,Property_real estate,Property_unknown / no property,Installment Plans_none,Installment Plans_stores,"Telephone_yes, registered",Foreign_yes
0,1.0,6,1.0,1169,0.0,4.0,4,4,67,2,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
1,0.0,48,3.0,5951,1.0,2.0,2,2,22,1,1,2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,3.0,12,1.0,2096,1.0,3.0,2,3,49,1,2,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,1.0,42,3.0,7882,1.0,3.0,2,4,45,1,2,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.0,24,2.0,4870,1.0,2.0,3,4,53,2,2,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
5,3.0,36,3.0,9055,0.0,2.0,2,4,35,1,2,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
6,3.0,24,3.0,2835,3.0,4.0,3,4,53,1,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
7,0.0,36,3.0,6948,1.0,2.0,2,2,35,1,1,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
8,3.0,12,3.0,3059,4.0,3.0,2,4,61,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
9,0.0,30,1.0,5234,1.0,0.0,4,2,28,2,1,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [4]:
german_data = german_data.sample(frac=1, random_state=42).reset_index(drop=True)


target_len = int(len(german_data)*.6)+1
forget_len = int(target_len*.15)
shadow_len =int(len(german_data)-target_len)

target_data = german_data.iloc[:target_len]
shadow_data = german_data.iloc[target_len:]
keep_set = german_data.iloc[forget_len:]
forget_set = target_data.iloc[:forget_len]
print(shadow_len)

399


In [5]:
scaler = StandardScaler()

In [6]:
target_index = german_data.columns.get_loc('Target')

X = target_data.drop('Target', axis=1) 
Y = target_data['Target']  

X_train, X_test, Y_Train, Y_Test = train_test_split(X,Y, random_state=42)

og_x_scaled_train = scaler.fit_transform(X_train)
og_x_scaled_test = scaler.transform(X_test)

target = LogisticRegression(max_iter=1000)
target.fit(og_x_scaled_train,Y_Train)


In [7]:
shadow_scaler = StandardScaler()

In [8]:
shadow_1_data = shadow_data.iloc[:int(shadow_len/3)]
shadow_2_data = shadow_data.iloc[133:int((shadow_len/3)*2)]
shadow_3_data = shadow_data.iloc[int((shadow_len/3)*2):]

X1 = shadow_1_data.drop('Target', axis=1)
X2 = shadow_2_data.drop('Target', axis=1)
X3 = shadow_3_data.drop('Target', axis=1)

y1 = shadow_1_data['Target']
y2 = shadow_2_data['Target']
y3 = shadow_3_data['Target']

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, stratify=y1, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, stratify=y2, random_state=42)
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, stratify=y3, random_state=42)

X1_train = shadow_scaler.fit_transform(X1_train)
X1_test = shadow_scaler.transform(X1_test)


X2_train = shadow_scaler.fit_transform(X2_train)
X2_test = shadow_scaler.transform(X2_test)


X3_train = shadow_scaler.fit_transform(X3_train)
X3_test = shadow_scaler.transform(X3_test)


shadow_model_1 = LogisticRegression(max_iter=1000)
shadow_model_1.fit(X1_train, y1_train)

shadow_model_2 = LogisticRegression(max_iter=1000)
shadow_model_2.fit(X2_train, y2_train)

shadow_model_3 = LogisticRegression(max_iter=1000)
shadow_model_3.fit(X3_train, y3_train)


In [9]:
"""train_prob_1 = shadow_model_1.predict_proba(X1_train)
test_prob_1 = shadow_model_1.predict_proba(X1_test)

train_prob_2 = shadow_model_1.predict_proba(X2_train)
test_prob_2 = shadow_model_1.predict_proba(X2_test)

train_prob_3 = shadow_model_3.predict_proba(X3_train)
test_prob_3 = shadow_model_3.predict_proba(X3_test)"""

def get_prob_train(train_Set,model_num, y_train):
    if (model_num==1):
        train_prob = shadow_model_1.predict_proba(train_Set)
    elif (model_num==2):
        train_prob = shadow_model_2.predict_proba(train_Set)
    else:
        train_prob = shadow_model_3.predict_proba(train_Set)
    train_confidence = np.max(train_prob, axis=1)
    train_df = pd.DataFrame({
    'Confidence': train_confidence,
    'True_label': y_train.values,
    'Member': 1})
    return train_df

def get_prob_test(test_Set, model_num, y_test):
    if (model_num==1):
        test_prob = shadow_model_1.predict_proba(test_Set)
    elif (model_num==2):
        test_prob = shadow_model_2.predict_proba(test_Set)
    else:
        test_prob = shadow_model_3.predict_proba(test_Set)

    test_confidences = np.max(test_prob, axis=1)
    test_df = pd.DataFrame({
    'Confidence': test_confidences,
    'True_label': y_test.values,
    'Member': 0
    })

    return test_df


In [10]:

attack_data_1 = pd.concat([get_prob_train(X1_train,1, y1_train), get_prob_test(X1_test,1, y1_test)], ignore_index=True)

attack_data_2 = pd.concat([get_prob_train(X2_train,2, y2_train), get_prob_test(X2_test,2, y2_test)], ignore_index=True)

attack_data_3 = pd.concat([get_prob_train(X3_train,3, y3_train), get_prob_test(X3_test,3, y3_test)], ignore_index=True)


attack_data = pd.concat([attack_data_1, attack_data_2, attack_data_3], ignore_index=True)

In [11]:
X = german_data.drop('Target', axis=1) 
Y = german_data['Target']  

X_attack = attack_data.drop('Member',axis=1)
Y_attack = attack_data['Member']

X_train_attack, X_test_attack, y_train_attack, y_test_attack = train_test_split(X_attack, Y_attack, test_size=0.2, random_state=42)

attack = LogisticRegression(max_iter=1000)
attack.fit(X_train_attack, y_train_attack)


In [12]:
target_train = target.predict_proba(og_x_scaled_train)
target_test = target.predict_proba(og_x_scaled_test)

target_train_conf = np.max(target_train, axis=1)
target_test_conf = np.max(target_test, axis=1)

target_train_data = pd.DataFrame({'Confidence': target_train_conf, 'True_label': Y_Train.values, 'Member':1})
target_test_data = pd.DataFrame({'Confidence': target_test_conf, 'True_label': Y_Test.values, 'Member':0})

target_attack_data = pd.concat([target_train_data, target_test_data,], ignore_index=True)

X_target_attack = target_attack_data.drop('Member', axis=1)
Y_target_attack = target_attack_data['Member']

predicted_membership = attack.predict(X_target_attack)
print("Attack Accuracy:", accuracy_score(Y_target_attack, predicted_membership))


Attack Accuracy: 0.7487520798668885


In [None]:
unlearn_scaler = StandardScaler()

In [14]:
X_unlearn = keep_set.drop('Target', axis=1)
Y_unlearn = keep_set['Target']


X_unlearn_train, X_unlearn_test, Y_unlearn_Train, Y_unlearn_Test = train_test_split(X_unlearn,Y_unlearn, random_state=42)

X_unlearn_train = unlearn_scaler.fit_transform(X_unlearn_train)
X_unlearn_test = unlearn_scaler.transform(X_unlearn_test)

target = LogisticRegression(max_iter=1000)
target.fit(og_x_scaled_train,Y_Train)

unlearn = LogisticRegression(max_iter=1000)
unlearn.fit(X_unlearn_train,Y_unlearn_Train )


In [15]:
unlearn_train = unlearn.predict_proba(X_unlearn_train)
unlearn_test = unlearn.predict_proba(X_unlearn_test)

unlearn_train_conf = np.max(unlearn_train, axis=1)
unlearn_test_conf = np.max(unlearn_test, axis=1)

unlearn_train_data = pd.DataFrame({'Confidence': unlearn_train_conf, 'True_label': Y_unlearn_Train.values, 'Member':1})
unlearn_test_data = pd.DataFrame({'Confidence': unlearn_test_conf, 'True_label': Y_unlearn_Test.values, 'Member':0})

unlearn_attack_data = pd.concat([unlearn_train_data, unlearn_test_data,], ignore_index=True)

X_unlearn_attack = unlearn_attack_data.drop('Member', axis=1)
Y_unlearn_attack = unlearn_attack_data['Member']

predicted_membership = attack.predict(X_unlearn_attack)
print("Attack Accuracy:", accuracy_score(Y_unlearn_attack, predicted_membership))

Attack Accuracy: 0.7494505494505495
