In [1]:
import re
import nltk
import os
import numpy as np
import codecs
import multiprocessing as mp
import pandas as pd

from nltk.corpus import stopwords
from collections import Counter
from typing import Dict, List, Tuple

from nltk import stem

stemmer = stem.PorterStemmer()

# find the path of emails
def listdir(directory: str) -> List:
    """
    A specialized version of os.listdir() that ignores files that
    start with a leading period.
    
    Especially dismissing .DS_STORE s.
    """
    filelist = os.listdir(directory)
    return [x for x in filelist if not (x.startswith('.'))]

stopwords = stopwords.words('english')
cut_model = nltk.WordPunctTokenizer()


# Preprocess emails
def enron_processor(emails_dir: str, return_list: list) -> list:    
    """
    * remove numbers
    * remove stopwords
    * add lables
    """
    dirs = [os.path.join(emails_dir, f) for f in os.listdir(emails_dir)]
    for d in dirs:
        emails = [os.path.join(d, f) for f in os.listdir(d)]
        for mail in emails:
            # print(mail)
            with codecs.open(mail, "rb", encoding='utf_8_sig', errors='ignore') as m:
                email_list = []
                line_str = ""
                for line in m:
                    for word in line:
                        if word.startswith("http"):
                            print(word)
                            word = "URL"
                            print(word)
                        word = stemmer.stem(word)
                    line = re.sub(r'[^a-zA-Z\s]', '', string=line)
                    line = line.lower()
                    line = line.strip()
                    tokens = cut_model.tokenize(line)
                    line = [stemmer.stem(token) for token in tokens if token not in stopwords]
           
                    line = ' '.join(line)
                    line_str = line_str+line+" "
                email_list.append(line_str)
                
                if mail.split(".")[-2] == 'spam':
                    email_list.append("spam")
                else: 
                    email_list.append("ham")
                email_list.append(mail)
                return_list.append(email_list)
                

                
return_dict = {}   
root_dir = 'spampy/datasets/enron'
emails_dirs = [os.path.join(root_dir, f) for f in listdir(root_dir)]
return_list = []
for emails_dir in emails_dirs:
    enron_processor(emails_dir, return_list)


messages = pd.DataFrame(return_list, columns=['message', 'label', 'path'])

messages

Unnamed: 0,message,label,path
0,christma tree farm pictur,ham,spampy/datasets/enron\enron1\ham\0001.1999-12-...
1,vastar resourc inc gari product high island la...,ham,spampy/datasets/enron\enron1\ham\0002.1999-12-...
2,calpin daili ga nomin calpin daili ga nomin doc,ham,spampy/datasets/enron\enron1\ham\0003.1999-12-...
3,issu fyi see note alreadi done stella forward ...,ham,spampy/datasets/enron\enron1\ham\0004.1999-12-...
4,meter nov alloc fyi forward lauri allen hou ec...,ham,spampy/datasets/enron\enron1\ham\0005.1999-12-...
...,...,...,...
33711,iso q good news c edaliss val edumm vl eoggra ...,spam,spampy/datasets/enron\enron6\spam\5995.2005-07...
33712,prescript medicin special precis put buck back...,spam,spampy/datasets/enron\enron6\spam\5997.2005-07...
33713,next gener onlin pharmaci readi rock let man r...,spam,spampy/datasets/enron\enron6\spam\5998.2005-07...
33714,bloow time time learn last time longer bed rea...,spam,spampy/datasets/enron\enron6\spam\5999.2005-07...


In [1]:
messages.to_csv("messages.csv")

NameError: name 'messages' is not defined

In [2]:
# Split train and test and store the path in x_train_path

from sklearn.model_selection import train_test_split
messages['label'] = messages['label'].replace('ham', 0)
messages['label'] = messages['label'].replace('spam', 1)

messages_label = messages['label']
message_path = messages['path']
x = messages['message']
y = messages_label

x_train_path = []

# split the dataset into 5 parts randomly and choose 4 of 5 as train dataset and another part as test dataset

dataset41, dataset5, dataset41_y, dataset5_y = train_test_split(x, y, test_size=0.2)
dataset31, dataset4, dataset31_y, dataset4_y = train_test_split(dataset41, dataset41_y, test_size=0.25)
dataset21, dataset3, dataset21_y, dataset3_y = train_test_split(dataset31, dataset31_y, test_size=1/3)
dataset1, dataset2, dataset1_y, dataset2_y = train_test_split(dataset21, dataset21_y, test_size=0.5)

x_train = dataset1.append(dataset2)
x_train = x_train.append(dataset3)
x_train = x_train.append(dataset5)
y_train = dataset1_y.append(dataset2_y)
y_train = y_train.append(dataset3_y)
y_train = y_train.append(dataset5_y)

x_test = dataset4
y_test = dataset4_y

for i, v in x_train.items():
    x_train_path.append((i, message_path[i]))

x_test_path = []
for i, v in x_test.items():
    x_test_path.append((i, message_path[i]))




In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Tf-idf for train datasets
vect = TfidfVectorizer()
tfidf_train = vect.fit_transform(x_train)
tfidf_matrix_train = pd.DataFrame(tfidf_train.toarray(), columns = vect.get_feature_names())
headers = vect.get_feature_names()
tfidf_matrix_train

Unnamed: 0,aa,aaa,aaaaci,aaaahhhhhh,aaadrizzl,aaaenerfax,aaal,aaaplusdirect,aab,aababp,...,zzucpkow,zzvffofbj,zzw,zzx,zzxtfeerekvwkug,zzxxst,zzyudgpd,zzzglvaa,zzzxlqbha,zzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Tf-idf for test datsets
tfidf_test = vect.transform(x_test)
tfidf_matrix_test = pd.DataFrame(tfidf_test.toarray(), columns = vect.get_feature_names())
tfidf_matrix_test

Unnamed: 0,aa,aaa,aaaaci,aaaahhhhhh,aaadrizzl,aaaenerfax,aaal,aaaplusdirect,aab,aababp,...,zzucpkow,zzvffofbj,zzw,zzx,zzxtfeerekvwkug,zzxxst,zzyudgpd,zzzglvaa,zzzxlqbha,zzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
from sklearn.model_selection import KFold
from secml.data import CDataset
from secml.data.splitter import CDataSplitterKFold
from secml.ml.classifiers import CClassifierSVM
from secml.ml.peval.metrics import CMetricAccuracy
from secml.ml.peval.metrics import CMetricConfusionMatrix

from secml.ml.classifiers.multiclass import CClassifierMulticlassOVA
from secml.ml.kernels import CKernelLinear

nb_col = tfidf_matrix_train.size

tr_set = CDataset(tfidf_train, y_train)


# Train the SVM
print("Build SVM")
xval_splitter = CDataSplitterKFold()
clf_lin = CClassifierSVM()


xval_lin_params = {'C': [1]}

print("Find the best params")

best_lin_params = clf_lin.estimate_parameters(
    dataset = tr_set,
    parameters = xval_lin_params,
    splitter = xval_splitter,
    metric = 'accuracy',
    perf_evaluator = 'xval'
)


print("Finish Train")
print("The best training parameters are: ", [(k, best_lin_params[k]) for k in sorted(best_lin_params)])

print("Train SVM")
clf_lin.fit(tr_set.X, tr_set.Y)

Build SVM
Find the best params
Finish Train
The best training parameters are:  [('C', 1)]
Train SVM


CClassifierSVM{'classes': CArray(2,)(dense: [0 1]), 'n_features': 106864, 'preprocess': None, 'n_jobs': 1, 'C': 1.0, 'class_weight': None, 'w': CArray(1, 106864)(sparse: (0, 88839) 0.01876467867083321  (0, 46672) 0.0037529357341666418  (0, 54921) 0.06364231466202051  (0, 44456) 0.06703118972390458  (0, 14691) 0.06972109118812977  (0, 13249) 0.06972109118812977  (0, 13239) 0.06972109118812977  (0, 13169) 0.06972109118812977  (0, 11509) 0.06972109118812977  (0, 10283) 0.06972109118812977  (0, 9756) 0.06972109118812977  (0, 8772) 0.06972109118812977  (0, 8194) 0.06972109118812977  (0, 7829) 0.06972109118812977  (0, 5974) 0.065122674262414  (0, 5886) 0.06972109118812977  (0, 5877) 0.06972109118812977  (0, 5644) 0.06972109118812977  (0, 4469) 0.06972109118812977  (0, 4020) 0.06972109118812977  (0, 3571) 0.06972109118812977  (0, 2885) 0.06972109118812977  (0, 1840) 0.06972109118812977  (0, 1838) 0.06972109118812977  (0, 1767) 0.06972109118812977  : :  (0, 60788) -0.24834298423132914  (0, 607

In [6]:
# Test the Classifier
ts_set = CDataset(tfidf_test, y_test)
print(ts_set)

y_pred = clf_lin.predict(ts_set.X)
metric = CMetricAccuracy()
acc = metric.performance_score(y_true=ts_set.Y, y_pred=y_pred)

confusion_matrix = CMetricConfusionMatrix() 
cm = confusion_matrix.performance_score(y_true=ts_set.Y, y_pred=y_pred)

print("Accuracy on test set: {:.2%}".format(acc))
print("Confusion Matrix: ")
print(cm)
print("False Positive Rate: {:.2%}".format(39/(39+3445)))

CDataset{'X': CArray(6743, 106864)(sparse: (0, 102655) 0.09314222387921284  (0, 98856) 0.12583121502253042  (0, 91160) 0.4280877317095072  (0, 90045) 0.10767997674417029  (0, 86778) 0.18953191449341547  (0, 82276) 0.1808344113807485  (0, 78541) 0.17941693504108056  (0, 65688) 0.11748818870188402  (0, 54704) 0.10472401982594139  (0, 53594) 0.09710279002665793  (0, 41325) 0.19797428506624018  (0, 38496) 0.11879790043949613  (0, 38211) 0.13139514365718494  (0, 37719) 0.12086945931610499  (0, 32983) 0.18496364159392056  (0, 27036) 0.5955380305357458  (0, 21737) 0.22985482934388377  (0, 19402) 0.11502615594704604  (0, 13894) 0.15214166947654723  (0, 11528) 0.23830165863003458  (0, 5652) 0.15989541405088473  (1, 100774) 0.09540634246070365  (1, 96852) 0.13797945702538134  (1, 93529) 0.154933946920909  (1, 91293) 0.12053502225377948  : :  (6742, 70874) 0.1575192649911508  (6742, 63059) 0.0579772735045893  (6742, 55629) 0.16189184470418758  (6742, 55151) 0.12866285138255035  (6742, 53553) 0.11

In [7]:
import random
from secml.array import CArray
from secml.adv.attacks.evasion import CAttackEvasionPGD

nb_attack=100

class_to_attack=1
cnt = 0

ori_examples2_x = []
ori_examples2_y = []
number_list = []
for i in range(nb_attack):
    #take a point at random being the starting point of the attack
    idx_candidates = np.where(y_test == class_to_attack)
    #select nb_init_pts points randomly in candidates and make them move
    rn = np.random.choice(idx_candidates[0].size, 1)
    x0,y0 =ts_set[idx_candidates[0][rn[0]],:].X, ts_set[idx_candidates[0][rn[0]],:].Y
    number_list.append(x_test_path[idx_candidates[0][rn[0]]])
    
    x0=x0.astype(float)
    y0=y0.astype(int)
    x2 = x0.tondarray()[0]
    y2 = y0.tondarray()[0]
    
    ori_examples2_x.append(x2)
    ori_examples2_y.append(y2)
    
number_list

[(19247, 'spampy/datasets/enron\\enron4\\spam\\1602.2004-07-01.GP.spam.txt'),
 (25994,
  'spampy/datasets/enron\\enron5\\spam\\2743.2005-06-29.SA_and_HP.spam.txt'),
 (25828,
  'spampy/datasets/enron\\enron5\\spam\\2504.2005-06-28.SA_and_HP.spam.txt'),
 (27163,
  'spampy/datasets/enron\\enron5\\spam\\4405.2005-07-18.SA_and_HP.spam.txt'),
 (27376,
  'spampy/datasets/enron\\enron5\\spam\\4702.2005-07-19.SA_and_HP.spam.txt'),
 (25983,
  'spampy/datasets/enron\\enron5\\spam\\2724.2005-06-29.SA_and_HP.spam.txt'),
 (29842, 'spampy/datasets/enron\\enron6\\spam\\0843.2004-09-30.BG.spam.txt'),
 (26707,
  'spampy/datasets/enron\\enron5\\spam\\3749.2005-07-06.SA_and_HP.spam.txt'),
 (9673,
  'spampy/datasets/enron\\enron2\\spam\\0581.2002-07-17.SA_and_HP.spam.txt'),
 (4595, 'spampy/datasets/enron\\enron1\\spam\\3265.2004-12-23.GP.spam.txt'),
 (30916, 'spampy/datasets/enron\\enron6\\spam\\2279.2004-12-22.BG.spam.txt'),
 (15762, 'spampy/datasets/enron\\enron3\\spam\\2641.2005-01-23.BG.spam.txt'),
 (3

In [24]:
# Perform adversarial attacks
noise_type = 'l2'  # Type of perturbation 'l1' or 'l2'
dmax = 0.09 # Maximum perturbation
lb, ub = 0, 1  # Bounds of the attack space. Can be set to `None` for unbounded

solver_params = {
    'eta': 0.01,
    'max_iter': 20,
    'eps': 1e-6}

#set lower bound and upper bound respectively to 0 and 1 since all features are Boolean
pgd_attack = CAttackEvasionPGD(
    classifier=clf_lin,
    double_init_ds=tr_set,
    distance=noise_type,
    dmax=dmax,
    lb=lb, ub=ub,
    solver_params=solver_params)


ad_examples_x = []
ad_examples_y = []
cnt = 0
for i in range(len(ori_examples2_x)):
    print("Current Number:", i)
    x0 = ori_examples2_x[i]
    y0 = ori_examples2_y[i]

    y_pred_pgd, _, adv_ds_pgd, _ = pgd_attack.run(x0, y0)

    if y_pred_pgd.item() == 0:
        cnt = cnt + 1

    ad_examples_x.append(adv_ds_pgd.X.tondarray()[0])
    ad_examples_y.append(y_pred_pgd.item())

    attack_pt = adv_ds_pgd.X.tondarray()[0]
print("Accuracy:", cnt/nb_attack)

Current Number: 0


MemoryError: Unable to allocate array with shape (2232946,) and data type float64

In [2]:
ori_examples2_x = np.array(ori_examples2_x)
ori_examples2_y = np.array(ori_examples2_y)
ad_examples_x = np.array(ad_examples_x)
ad_examples_y = np.array(ad_examples_y)

ori_dataframe = pd.DataFrame(ori_examples2_x, columns = vect.get_feature_names())
ad_dataframe = pd.DataFrame(ad_examples_x, columns = vect.get_feature_names())

ad_dataframe['ad_label'] = ad_examples_y
ad_success = ad_dataframe.loc[ad_dataframe.ad_label == 0]
ori_success = ori_dataframe.loc[ad_dataframe.ad_label == 0]
ad_fail = ad_dataframe.loc[ad_dataframe.ad_label == 1]
ori_fail = ori_dataframe.loc[ad_dataframe.ad_label == 1]

ad_success_x = ad_success.drop(columns = ['ad_label'])
ad_fail_x = ad_fail.drop(columns = ['ad_label'])
result = (ad_success_x - ori_success)

vect.idf_
IDF = pd.DataFrame(vect.idf_.T, index=vect.get_feature_names())
IDF.to_csv("idf.csv")
IDF

NameError: name 'np' is not defined

In [1]:
# Method 2
x2result1 = result
x2result1 = np.array(x2result1)
x2result = result
x2result = x2result.multiply(x2result1)

 
sum_number = x2result.sum()/cnt
sum_number = pd.DataFrame(sum_number, columns = ['sum_number'])
sum_number = sum_number.sort_values(by='sum_number', ascending=False, inplace=False)

sum_number_pd = pd.DataFrame(sum_number.index[:100])
sum_number_pd.to_csv("x2result.csv")
sum_number_pd

NameError: name 'result' is not defined

In [11]:
# ori > 0, ad = 0
# The dispearing features
ad1 = ad_success_x
ori1 = ori_success

ori2 = ori1.loc[:, (ori1>=0).all(axis=0)]
ori = ori2.loc[:, ~(ori2==0).all(axis=0)]

ad = ad1.loc[:,ori.columns]
ad = ad.loc[:, (ad>=0).all(axis=0)]
ad = ad.loc[:, ~(ad>0).all(axis=0)]
ad

Unnamed: 0,aaqzufn,abba,abbi,abcd,abdomin,abe,abidjan,abiiiti,abil,abl,...,znvzcwgu,znxfrf,zone,zonedubai,zsmpc,ztop,zuitq,zwhhw,zy,zyban
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
ori1_ad0_columns = ad.columns
ori1_ad0_columns = pd.DataFrame(ori1_ad0_columns)
ori1_ad0_columns.to_csv("ori1_ad0_columns.csv")

In [13]:
# ori = 0, ad > 0
# The adding features
ad11 = ad_success_x
ori11 = ori_success

ori21 = ori11.loc[:, (ori11>=0).all(axis=0)]
ori22 = ori21.loc[:, ~(ori21>0).all(axis=0)]

ad22 = ad11.loc[:,ori22.columns]
ad22 = ad22.loc[:, (ad22>=0).all(axis=0)]
ad22 = ad22.loc[:, ~(ad22==0).all(axis=0)]
ad22

Unnamed: 0,aa,aaa,aaldou,aaqzufn,aaron,aarp,abackof,abacu,abacustech,abandon,...,zrobilem,zsmpc,ztop,zuitq,zwhhw,zwlaszcza,zy,zyban,zyc,zzn
0,0.000155,0.000466,0.000071,0.0,0.000071,0.000027,0.000011,0.000497,0.002197,0.000165,...,0.000089,0.0,0.0,0.0,0.0,0.000089,0.0,0.0,0.000172,0.000011
1,0.000155,0.000466,0.000071,0.0,0.000071,0.000027,0.000011,0.000497,0.002197,0.000165,...,0.000089,0.0,0.0,0.0,0.0,0.000089,0.0,0.0,0.000172,0.000011
2,0.000155,0.000467,0.000071,0.0,0.000071,0.000027,0.000011,0.000497,0.002197,0.000166,...,0.000089,0.0,0.0,0.0,0.0,0.000089,0.0,0.0,0.000172,0.000011
3,0.000155,0.000466,0.000071,0.0,0.000071,0.000027,0.000011,0.000497,0.002197,0.000165,...,0.000089,0.0,0.0,0.0,0.0,0.000089,0.0,0.0,0.000172,0.000011
4,0.000155,0.000466,0.000071,0.0,0.000071,0.000027,0.000011,0.000497,0.002197,0.000165,...,0.000089,0.0,0.0,0.0,0.0,0.000089,0.0,0.0,0.000172,0.000011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000155,0.000466,0.000071,0.0,0.000071,0.000027,0.000011,0.000497,0.002195,0.000165,...,0.000089,0.0,0.0,0.0,0.0,0.000089,0.0,0.0,0.000172,0.000011
96,0.000155,0.000466,0.000071,0.0,0.000071,0.000027,0.000011,0.000497,0.002196,0.000165,...,0.000089,0.0,0.0,0.0,0.0,0.000089,0.0,0.0,0.000172,0.000011
97,0.000155,0.000466,0.000071,0.0,0.000071,0.000027,0.000011,0.000497,0.002196,0.000165,...,0.000089,0.0,0.0,0.0,0.0,0.000089,0.0,0.0,0.000172,0.000011
98,0.000155,0.000466,0.000071,0.0,0.000071,0.000027,0.000011,0.000497,0.002195,0.000165,...,0.000089,0.0,0.0,0.0,0.0,0.000089,0.0,0.0,0.000172,0.000011


In [14]:
ori0_ad1_columns = ad22.columns
ori0_ad1_columns = pd.DataFrame(ori0_ad1_columns)
ori0_ad1_columns.to_csv("ori0_ad1_columns.csv")

In [21]:
# we use the file of "Capstone2" to find these magic words
words14 = " erisk cdnow listbot kaminski beenladen wassup clickathom counterparti sitara enrononlin cera ferc jhherbert topica ena eyeforenergi calger pjm"

spam_cnt = 0
d2 = "spampy/datasets/enron/"
for r in range(1, 7):
    d3 = d2 + "enron" + str(r)+"/spam"
    emails2 = [os.path.join(d3, f) for f in os.listdir(d3)]
    for j in emails2:
        with codecs.open(j, "rb", encoding='utf_8_sig', errors='ignore') as m:
            #print(j)
            choose_email = []
            line_str = ""
            for line in m:
                for word in line:
                    if word.startswith("http"):
                        word = "URL"
                    word = stemmer.stem(word)

                line = re.sub(r'[^a-zA-Z\s]', '', string=line)
                line = line.lower()
                line = line.strip()
                tokens = cut_model.tokenize(line)
                line = [stemmer.stem(token) for token in tokens if token not in stopwords]

                line = ' '.join(line)
                line_str = line_str+line+" "
            line_str = line_str+words14
            choose_email.append(line_str)
        message_14_email = pd.DataFrame(choose_email, columns = ["message"])
        message_14_tf_idf = vect.transform(message_14_email["message"])
        message_14_tf_idf = pd.DataFrame(message_14_tf_idf.toarray(), columns = vect.get_feature_names())
        #print(message_14_tf_idf)
        message_14_y = [1]
        message_14_y = pd.Series(message_14_y)
        message_CData = CDataset(message_14_tf_idf, message_14_y)
        message_14_pred = clf_lin.predict(message_CData.X)
        # print(message_14_pred)
        if message_14_pred == 0:
            spam_cnt = spam_cnt+1
        #break
    print(r, spam_cnt)
print("Count: ", spam_cnt)

1 1291
2 2599
3 3883
4 7747
5 10978
6 14753
Count:  14753


In [1]:
# we get 5 results of different train and test dataset, and get their intersection
list1 = ['ena', 'cdnow', 'ferc', 'listbot', 'enrononlin', 'counterparti', 'clickathom', 'topica', 'kaminski', 'sitara', 'pjm']
list2 = ['cera', 'counterparti', 'kal', 'lokay', 'enrononlin','kaminski', 'wassup', 'topica', 'listbot', 'clickathom','cdnow', 'calger', 'beenladen', 'ena', 'pjm', 'sitara']
list3 = ['ena', 'cdnow', 'wassup', 'sitara', 'cera', 'listbot', 'enrononlin', 'beenladen', 'clickathom','topica', 'calger', 'kaminski', 'jhherbert', 'eyeforenergi', 'pjm']
list4 = ['enrononlin', 'ena', 'beenladen', 'cdnow', 'pjm', 'wassup','listbot','kaminski','lokay', 'calger','eyeforenergi']
list5 = ['enrononlin', 'ena', 'reactionsnet', 'cdnow', 'kaminski', 'lokay', 'sitara', 'counterparti', 'clickathom', 'topica', 'cera', 'eyeforenergi']

final = list(set(list1).intersection(set(list2)))
final = list(set(final).intersection(set(list3)))
final = list(set(final).intersection(set(list4)))
final = list(set(final).intersection(set(list5)))
final

['cdnow', 'kaminski', 'enrononlin', 'ena']

In [5]:
words14str = ""
for item in finalu:
    words14str = words14str + " " + item
    
words14str

' kal topica cdnow ferc lokay cera jhherbert enrononlin sitara listbot calger wassup pjm kaminski counterparti eyeforenergi clickathom reactionsnet ena beenladen'

In [4]:
# find the union of 5 results
finalu = list(set(list1).union(set(list2)))
finalu = list(set(finalu).union(set(list3)))
finalu = list(set(finalu).union(set(list4)))
finalu = list(set(finalu).union(set(list5)))
finalu

['kal',
 'topica',
 'cdnow',
 'ferc',
 'lokay',
 'cera',
 'jhherbert',
 'enrononlin',
 'sitara',
 'listbot',
 'calger',
 'wassup',
 'pjm',
 'kaminski',
 'counterparti',
 'eyeforenergi',
 'clickathom',
 'reactionsnet',
 'ena',
 'beenladen']

In [14]:
# Black-box Attack

from collections import namedtuple
CLF = namedtuple('CLF', 'clf_name clf xval_parameters')

random_state = 999
from secml.ml.classifiers import CClassifierKNN, CClassifierDecisionTree, CClassifierSGD


target_clf_list = [
    CLF(clf_name='kNN',
        clf=CClassifierKNN(),
        xval_parameters={'n_neighbors': [160]}),
    CLF(clf_name='Decision Tree',
        clf=CClassifierDecisionTree(random_state=random_state),
        xval_parameters={'max_depth': [55]}),
    CLF(clf_name='Logistic(SGD)',
        clf=CClassifierSGD(random_state=random_state, regularizer = 'l2', loss='log'),
        xval_parameters={'alpha': [1e-6, 1e-5, 1e-4, 1e-4]}),
]

for i, test_case in enumerate(target_clf_list):

    clf = test_case.clf
    xval_params = test_case.xval_parameters

    print("\nEstimating the best training parameters of {:} ..."
          "".format(test_case.clf_name))

    best_params2 = clf.estimate_parameters(
        dataset=tr_set, parameters=xval_params, splitter=xval_splitter,
        metric='accuracy', perf_evaluator='xval')

    print("The best parameters for '{:}' are: ".format(test_case.clf_name),
          [(k, best_params2[k]) for k in sorted(best_params2)])

    print("Training of {:} ...".format(test_case.clf_name))
    clf.fit(tr_set.X, tr_set.Y)

    # Predictions on test set and performance evaluation
    y_pred1 = clf.predict(ts_set.X)
    acc1 = metric.performance_score(y_true=ts_set.Y, y_pred=y_pred1)

    print("Classifier: {:}\tAccuracy: {:.2%}".format(test_case.clf_name, acc1))


Estimating the best training parameters of kNN ...
The best parameters for 'kNN' are:  [('n_neighbors', 160)]
Training of kNN ...
Classifier: kNN	Accuracy: 96.00%

Estimating the best training parameters of Decision Tree ...
The best parameters for 'Decision Tree' are:  [('max_depth', 55)]
Training of Decision Tree ...
Classifier: Decision Tree	Accuracy: 95.85%

Estimating the best training parameters of Logistic(SGD) ...
The best parameters for 'Logistic(SGD)' are:  [('alpha', 1e-06)]
Training of Logistic(SGD) ...
Classifier: Logistic(SGD)	Accuracy: 99.04%


In [17]:
words14 = " erisk cdnow listbot kaminski beenladen wassup clickathom counterparti sitara enrononlin cera ferc jhherbert topica ena eyeforenergi calger pjm"

In [21]:
number_list2 = [(30577, 'spampy/datasets/enron\\enron6\\spam\\1826.2004-11-26.BG.spam.txt'),
 (33198, 'spampy/datasets/enron\\enron6\\spam\\5306.2005-06-16.BG.spam.txt'),
 (26462,
  'spampy/datasets/enron\\enron5\\spam\\3394.2005-07-04.SA_and_HP.spam.txt'),
 (18083, 'spampy/datasets/enron\\enron4\\spam\\0059.2003-12-25.GP.spam.txt'),
 (27474,
  'spampy/datasets/enron\\enron5\\spam\\4842.2005-07-19.SA_and_HP.spam.txt'),
 (30932, 'spampy/datasets/enron\\enron6\\spam\\2302.2004-12-23.BG.spam.txt'),
 (22308, 'spampy/datasets/enron\\enron4\\spam\\5687.2005-07-22.GP.spam.txt'),
 (18986, 'spampy/datasets/enron\\enron4\\spam\\1253.2004-05-22.GP.spam.txt'),
 (5124, 'spampy/datasets/enron\\enron1\\spam\\5020.2005-08-16.GP.spam.txt'),
 (5040, 'spampy/datasets/enron\\enron1\\spam\\4755.2005-06-27.GP.spam.txt'),
 (10854,
  'spampy/datasets/enron\\enron2\\spam\\5171.2005-07-19.SA_and_HP.spam.txt'),
 (26308,
  'spampy/datasets/enron\\enron5\\spam\\3179.2005-07-02.SA_and_HP.spam.txt'),
 (21344, 'spampy/datasets/enron\\enron4\\spam\\4400.2005-02-22.GP.spam.txt'),
 (15073, 'spampy/datasets/enron\\enron3\\spam\\0131.2004-08-12.BG.spam.txt'),
 (31727, 'spampy/datasets/enron\\enron6\\spam\\3346.2005-02-09.BG.spam.txt'),
 (15656, 'spampy/datasets/enron\\enron3\\spam\\2248.2005-01-05.BG.spam.txt'),
 (26468,
  'spampy/datasets/enron\\enron5\\spam\\3403.2005-07-04.SA_and_HP.spam.txt'),
 (19759, 'spampy/datasets/enron\\enron4\\spam\\2286.2004-09-09.GP.spam.txt'),
 (29829, 'spampy/datasets/enron\\enron6\\spam\\0825.2004-09-29.BG.spam.txt'),
 (3930, 'spampy/datasets/enron\\enron1\\spam\\0903.2004-04-23.GP.spam.txt'),
 (24215,
  'spampy/datasets/enron\\enron5\\spam\\0241.2002-05-20.SA_and_HP.spam.txt'),
 (18644, 'spampy/datasets/enron\\enron4\\spam\\0794.2004-04-09.GP.spam.txt'),
 (21683, 'spampy/datasets/enron\\enron4\\spam\\4855.2005-04-09.GP.spam.txt'),
 (4871, 'spampy/datasets/enron\\enron1\\spam\\4204.2005-04-05.GP.spam.txt'),
 (18359, 'spampy/datasets/enron\\enron4\\spam\\0420.2004-02-14.GP.spam.txt'),
 (10533,
  'spampy/datasets/enron\\enron2\\spam\\3956.2005-07-04.SA_and_HP.spam.txt'),
 (16303, 'spampy/datasets/enron\\enron3\\spam\\4687.2005-06-02.BG.spam.txt'),
 (24129,
  'spampy/datasets/enron\\enron5\\spam\\0124.2002-05-11.SA_and_HP.spam.txt'),
 (21289, 'spampy/datasets/enron\\enron4\\spam\\4326.2005-02-17.GP.spam.txt'),
 (30124, 'spampy/datasets/enron\\enron6\\spam\\1219.2004-10-25.BG.spam.txt'),
 (27280,
  'spampy/datasets/enron\\enron5\\spam\\4570.2005-07-19.SA_and_HP.spam.txt'),
 (27538,
  'spampy/datasets/enron\\enron5\\spam\\4933.2005-07-20.SA_and_HP.spam.txt'),
 (15741, 'spampy/datasets/enron\\enron3\\spam\\2554.2005-01-20.BG.spam.txt'),
 (9704,
  'spampy/datasets/enron\\enron2\\spam\\0715.2002-07-21.SA_and_HP.spam.txt'),
 (21084, 'spampy/datasets/enron\\enron4\\spam\\4050.2005-01-27.GP.spam.txt'),
 (30558, 'spampy/datasets/enron\\enron6\\spam\\1801.2004-11-24.BG.spam.txt'),
 (32844, 'spampy/datasets/enron\\enron6\\spam\\4828.2005-05-15.BG.spam.txt'),
 (25986,
  'spampy/datasets/enron\\enron5\\spam\\2729.2005-06-29.SA_and_HP.spam.txt'),
 (30572, 'spampy/datasets/enron\\enron6\\spam\\1819.2004-11-25.BG.spam.txt'),
 (4282, 'spampy/datasets/enron\\enron1\\spam\\2175.2004-09-15.GP.spam.txt'),
 (25984,
  'spampy/datasets/enron\\enron5\\spam\\2725.2005-06-29.SA_and_HP.spam.txt'),
 (18717, 'spampy/datasets/enron\\enron4\\spam\\0897.2004-04-17.GP.spam.txt'),
 (30334, 'spampy/datasets/enron\\enron6\\spam\\1502.2004-11-08.BG.spam.txt'),
 (33101, 'spampy/datasets/enron\\enron6\\spam\\5176.2005-06-08.BG.spam.txt'),
 (24984,
  'spampy/datasets/enron\\enron5\\spam\\1323.2002-09-26.SA_and_HP.spam.txt'),
 (24140,
  'spampy/datasets/enron\\enron5\\spam\\0141.2002-05-12.SA_and_HP.spam.txt'),
 (18998, 'spampy/datasets/enron\\enron4\\spam\\1268.2004-05-25.GP.spam.txt'),
 (30579, 'spampy/datasets/enron\\enron6\\spam\\1829.2004-11-26.BG.spam.txt'),
 (19564, 'spampy/datasets/enron\\enron4\\spam\\2030.2004-08-15.GP.spam.txt'),
 (4989, 'spampy/datasets/enron\\enron1\\spam\\4568.2005-05-24.GP.spam.txt'),
 (26841,
  'spampy/datasets/enron\\enron5\\spam\\3941.2005-07-14.SA_and_HP.spam.txt'),
 (22396, 'spampy/datasets/enron\\enron4\\spam\\5804.2005-08-10.GP.spam.txt'),
 (18358, 'spampy/datasets/enron\\enron4\\spam\\0419.2004-02-14.GP.spam.txt'),
 (20505, 'spampy/datasets/enron\\enron4\\spam\\3289.2004-11-27.GP.spam.txt'),
 (15902, 'spampy/datasets/enron\\enron3\\spam\\3170.2005-02-17.BG.spam.txt'),
 (9939,
  'spampy/datasets/enron\\enron2\\spam\\1614.2004-07-21.SA_and_HP.spam.txt'),
 (27482,
  'spampy/datasets/enron\\enron5\\spam\\4854.2005-07-19.SA_and_HP.spam.txt'),
 (15306, 'spampy/datasets/enron\\enron3\\spam\\0992.2004-10-19.BG.spam.txt'),
 (25770,
  'spampy/datasets/enron\\enron5\\spam\\2422.2005-06-27.SA_and_HP.spam.txt'),
 (4106, 'spampy/datasets/enron\\enron1\\spam\\1557.2004-07-12.GP.spam.txt'),
 (24511,
  'spampy/datasets/enron\\enron5\\spam\\0667.2002-07-22.SA_and_HP.spam.txt'),
 (18501, 'spampy/datasets/enron\\enron4\\spam\\0606.2004-03-17.GP.spam.txt'),
 (15559, 'spampy/datasets/enron\\enron3\\spam\\1886.2004-12-15.BG.spam.txt'),
 (21836, 'spampy/datasets/enron\\enron4\\spam\\5060.2005-04-29.GP.spam.txt'),
 (10781,
  'spampy/datasets/enron\\enron2\\spam\\4914.2005-07-17.SA_and_HP.spam.txt'),
 (16164, 'spampy/datasets/enron\\enron3\\spam\\4131.2005-04-25.BG.spam.txt'),
 (31970, 'spampy/datasets/enron\\enron6\\spam\\3669.2005-02-27.BG.spam.txt'),
 (4596, 'spampy/datasets/enron\\enron1\\spam\\3266.2004-12-23.GP.spam.txt'),
 (4471, 'spampy/datasets/enron\\enron1\\spam\\2810.2004-11-12.GP.spam.txt'),
 (24555,
  'spampy/datasets/enron\\enron5\\spam\\0728.2002-07-24.SA_and_HP.spam.txt'),
 (32844, 'spampy/datasets/enron\\enron6\\spam\\4828.2005-05-15.BG.spam.txt'),
 (27013,
  'spampy/datasets/enron\\enron5\\spam\\4184.2005-07-16.SA_and_HP.spam.txt'),
 (29916, 'spampy/datasets/enron\\enron6\\spam\\0947.2004-10-07.BG.spam.txt'),
 (21278, 'spampy/datasets/enron\\enron4\\spam\\4309.2005-02-16.GP.spam.txt'),
 (26963,
  'spampy/datasets/enron\\enron5\\spam\\4111.2005-07-15.SA_and_HP.spam.txt'),
 (15481, 'spampy/datasets/enron\\enron3\\spam\\1593.2004-11-28.BG.spam.txt'),
 (24475,
  'spampy/datasets/enron\\enron5\\spam\\0616.2002-07-21.SA_and_HP.spam.txt'),
 (21967, 'spampy/datasets/enron\\enron4\\spam\\5233.2005-05-20.GP.spam.txt'),
 (31654, 'spampy/datasets/enron\\enron6\\spam\\3248.2005-02-05.BG.spam.txt'),
 (32476, 'spampy/datasets/enron\\enron6\\spam\\4338.2005-04-10.BG.spam.txt'),
 (24882,
  'spampy/datasets/enron\\enron5\\spam\\1181.2002-09-11.SA_and_HP.spam.txt'),
 (4203, 'spampy/datasets/enron\\enron1\\spam\\1879.2004-08-18.GP.spam.txt'),
 (20940, 'spampy/datasets/enron\\enron4\\spam\\3865.2005-01-10.GP.spam.txt'),
 (21843, 'spampy/datasets/enron\\enron4\\spam\\5069.2005-04-30.GP.spam.txt'),
 (18853, 'spampy/datasets/enron\\enron4\\spam\\1073.2004-05-05.GP.spam.txt'),
 (22128, 'spampy/datasets/enron\\enron4\\spam\\5449.2005-06-17.GP.spam.txt'),
 (21864, 'spampy/datasets/enron\\enron4\\spam\\5096.2005-05-05.GP.spam.txt'),
 (24822,
  'spampy/datasets/enron\\enron5\\spam\\1100.2002-08-28.SA_and_HP.spam.txt'),
 (32750, 'spampy/datasets/enron\\enron6\\spam\\4705.2005-05-06.BG.spam.txt'),
 (26394,
  'spampy/datasets/enron\\enron5\\spam\\3298.2005-07-03.SA_and_HP.spam.txt'),
 (20780, 'spampy/datasets/enron\\enron4\\spam\\3650.2004-12-23.GP.spam.txt'),
 (32637, 'spampy/datasets/enron\\enron6\\spam\\4554.2005-04-26.BG.spam.txt'),
 (9870,
  'spampy/datasets/enron\\enron2\\spam\\1341.2002-09-06.SA_and_HP.spam.txt'),
 (20879, 'spampy/datasets/enron\\enron4\\spam\\3782.2005-01-04.GP.spam.txt'),
 (26225,
  'spampy/datasets/enron\\enron5\\spam\\3062.2005-07-01.SA_and_HP.spam.txt'),
 (19314, 'spampy/datasets/enron\\enron4\\spam\\1691.2004-07-10.GP.spam.txt'),
 (32846, 'spampy/datasets/enron\\enron6\\spam\\4830.2005-05-15.BG.spam.txt'),
 (30185, 'spampy/datasets/enron\\enron6\\spam\\1300.2004-10-29.BG.spam.txt'),
 (26264,
  'spampy/datasets/enron\\enron5\\spam\\3116.2005-07-02.SA_and_HP.spam.txt'),
 (19415, 'spampy/datasets/enron\\enron4\\spam\\1828.2004-07-23.GP.spam.txt')]

In [22]:
m2_empty = pd.DataFrame()
for j1, j in number_list2:
    # print(j)
    with codecs.open(j, "rb", encoding='utf_8_sig', errors='ignore') as m:
        #print(j)
        choose_email = []
        line_str = ""
        for line in m:
            for word in line:
                if word.startswith("http"):
                    word = "URL"
                word = stemmer.stem(word)

            line = re.sub(r'[^a-zA-Z\s]', '', string=line)
            line = line.lower()
            line = line.strip()
            tokens = cut_model.tokenize(line)
            line = [stemmer.stem(token) for token in tokens if token not in stopwords]

            line = ' '.join(line)
            line_str = line_str+line+" "
        line_str = line_str+words14
        choose_email.append(line_str)
    message_14_email = pd.DataFrame(choose_email, columns = ["message"])
    message_14_tf_idf = vect.transform(message_14_email["message"])
    message_14_tf_idf = pd.DataFrame(message_14_tf_idf.toarray(), columns = vect.get_feature_names())
    
    m2_empty = m2_empty.append(message_14_tf_idf, ignore_index=True)
m2_empty

Unnamed: 0,aa,aaa,aaaaci,aaaahhhhhh,aaadrizzl,aaaenerfax,aaal,aaaplusdirect,aab,aababp,...,zzucpkow,zzvffofbj,zzw,zzx,zzxtfeerekvwkug,zzxxst,zzyudgpd,zzzglvaa,zzzxlqbha,zzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
ori_dataframe
ori_100 = CDataset(ori_dataframe, ori_examples2_y)


PGD_y = [0]*len(ad_success_x)
PGD_y = pd.Series(PGD_y)
PGD_100 = CDataset(ad_success_x, PGD_y)


m2_y = [0]*len(m2_empty)
m2_y = pd.Series(m2_y)
m2_100 = CDataset(m2_empty, m2_y)

for target_clf in target_clf_list:

    # original emails
    y100 = target_clf.clf.predict(ori_100.X)
    acc100 = metric.performance_score(y_true=ori_100.Y, y_pred=y100)
    print("Classifier: {:}\tAccuracy of 100: {:.2%}".format(target_clf.clf_name, acc100))
    
    #PGD
    yPGD = target_clf.clf.predict(PGD_100.X)
    accPGD = metric.performance_score(y_true=PGD_100.Y, y_pred=yPGD)
    print("Classifier: {:}\tAccuracy of PGD: {:.2%}".format(target_clf.clf_name, accPGD))
    
    
    y_m2 = target_clf.clf.predict(m2_100.X)
    acc_m2 = metric.performance_score(y_true=m2_100.Y, y_pred=y_m2)
    print("Classifier: {:}\tAccuracy of Method 2: {:.2%}".format(target_clf.clf_name, acc_m2))

Classifier: kNN	Accuracy of 100: 95.00%
Classifier: kNN	Accuracy of PGD: 16.67%
Classifier: kNN	Accuracy of Method 2: 31.00%
Classifier: Decision Tree	Accuracy of 100: 95.00%
Classifier: Decision Tree	Accuracy of PGD: 100.00%
Classifier: Decision Tree	Accuracy of Method 2: 92.00%
Classifier: Logistic(SGD)	Accuracy of 100: 99.00%
Classifier: Logistic(SGD)	Accuracy of PGD: 61.90%
Classifier: Logistic(SGD)	Accuracy of Method 2: 79.00%
