In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TerrenceZwy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [1]:
import re
import nltk
import os
import numpy as np
import codecs
import multiprocessing as mp
import pandas as pd

from nltk.corpus import stopwords
from collections import Counter
from typing import Dict, List, Tuple

from nltk import stem

stemmer = stem.PorterStemmer()

# find the path of emails
def listdir(directory: str) -> List:
    """
    A specialized version of os.listdir() that ignores files that
    start with a leading period.
    
    Especially dismissing .DS_STORE s.
    """
    filelist = os.listdir(directory)
    return [x for x in filelist if not (x.startswith('.'))]

stopwords = stopwords.words('english')
cut_model = nltk.WordPunctTokenizer()


# Preprocess emails
def enron_processor(emails_dir: str, return_list: list) -> list:    
    """
    * remove numbers
    * remove stopwords
    * add lables
    """
    dirs = [os.path.join(emails_dir, f) for f in os.listdir(emails_dir)]
    for d in dirs:
        emails = [os.path.join(d, f) for f in os.listdir(d)]
        for mail in emails:
            # print(mail)
            with codecs.open(mail, "rb", encoding='utf_8_sig', errors='ignore') as m:
                email_list = []
                line_str = ""
                for line in m:
                    for word in line:
                        if word.startswith("http"):
                            print(word)
                            word = "URL"
                            print(word)
                        word = stemmer.stem(word)
                    line = re.sub(r'[^a-zA-Z\s]', '', string=line)
                    line = line.lower()
                    line = line.strip()
                    tokens = cut_model.tokenize(line)
                    line = [stemmer.stem(token) for token in tokens if token not in stopwords]
           
                    line = ' '.join(line)
                    line_str = line_str+line+" "
                email_list.append(line_str)
                
                if mail.split(".")[-2] == 'spam':
                    email_list.append("spam")
                else: 
                    email_list.append("ham")
                email_list.append(mail)
                return_list.append(email_list)
                

                
return_dict = {}   
root_dir = 'spampy/datasets/enron'
emails_dirs = [os.path.join(root_dir, f) for f in listdir(root_dir)]
return_list = []
for emails_dir in emails_dirs:
    enron_processor(emails_dir, return_list)


messages = pd.DataFrame(return_list, columns=['message', 'label', 'path'])

messages

Unnamed: 0,message,label,path
0,christma tree farm pictur,ham,spampy/datasets/enron\enron1\ham\0001.1999-12-...
1,vastar resourc inc gari product high island la...,ham,spampy/datasets/enron\enron1\ham\0002.1999-12-...
2,calpin daili ga nomin calpin daili ga nomin doc,ham,spampy/datasets/enron\enron1\ham\0003.1999-12-...
3,issu fyi see note alreadi done stella forward ...,ham,spampy/datasets/enron\enron1\ham\0004.1999-12-...
4,meter nov alloc fyi forward lauri allen hou ec...,ham,spampy/datasets/enron\enron1\ham\0005.1999-12-...
...,...,...,...
33711,iso q good news c edaliss val edumm vl eoggra ...,spam,spampy/datasets/enron\enron6\spam\5995.2005-07...
33712,prescript medicin special precis put buck back...,spam,spampy/datasets/enron\enron6\spam\5997.2005-07...
33713,next gener onlin pharmaci readi rock let man r...,spam,spampy/datasets/enron\enron6\spam\5998.2005-07...
33714,bloow time time learn last time longer bed rea...,spam,spampy/datasets/enron\enron6\spam\5999.2005-07...


In [2]:
# Split train and test and store the path in x_train_path

from sklearn.model_selection import train_test_split
messages['label'] = messages['label'].replace('ham', 0)
messages['label'] = messages['label'].replace('spam', 1)

messages_label = messages['label']
message_path = messages['path']
x = messages['message']
y = messages_label

x_train_path = []
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
#print(type(x_train))
for i, v in x_train.items():
    x_train_path.append((i, message_path[i]))


In [3]:
    
x_test_path = []
for i, v in x_test.items():
    x_test_path.append((i, message_path[i]))

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tf-idf for train datasets
vect = TfidfVectorizer()
tfidf_train = vect.fit_transform(x_train)
# print(vect.get_feature_names())
# print(vect.vocabulary_)
tfidf_matrix_train = pd.DataFrame(tfidf_train.toarray(), columns = vect.get_feature_names())
headers = vect.get_feature_names()
tfidf_matrix_train

Unnamed: 0,aa,aaa,aaaa,aaaaci,aaadrizzl,aaaenerfax,aaagrp,aaaplusdirect,aaasash,aababp,...,zzucpkow,zzvffofbj,zzw,zzx,zzxtfeerekvwkug,zzxxst,zzzglvaa,zzzxlqbha,zzzz,zzzzcard
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Tf-idf for test datsets
tfidf_test = vect.transform(x_test)
tfidf_matrix_test = pd.DataFrame(tfidf_test.toarray(), columns = vect.get_feature_names())
tfidf_matrix_test

Unnamed: 0,aa,aaa,aaaa,aaaaci,aaadrizzl,aaaenerfax,aaagrp,aaaplusdirect,aaasash,aababp,...,zzucpkow,zzvffofbj,zzw,zzx,zzxtfeerekvwkug,zzxxst,zzzglvaa,zzzxlqbha,zzzz,zzzzcard
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
from sklearn.model_selection import KFold
from secml.data import CDataset
from secml.data.splitter import CDataSplitterKFold
from secml.ml.classifiers import CClassifierSVM
from secml.ml.peval.metrics import CMetricAccuracy
from secml.ml.peval.metrics import CMetricConfusionMatrix

from secml.ml.classifiers.multiclass import CClassifierMulticlassOVA
from secml.ml.kernels import CKernelLinear

nb_col = tfidf_train.size

tr_set = CDataset(tfidf_train, y_train)


# Train the SVM + RBF Kernal
print("Build SVM")
xval_splitter = CDataSplitterKFold()
clf_lin = CClassifierSVM(kernel=CKernelLinear())
#clf_lin = CClassifierMulticlassOVA(CClassifierSVM, kernel=CKernelRBF())


xval_lin_params = {'C': [0.1, 1, 10, 100]}

print("Find the best params")

best_lin_params = clf_lin.estimate_parameters(
    dataset = tr_set,
    parameters = xval_lin_params,
    splitter = xval_splitter,
    metric = 'accuracy',
    perf_evaluator = 'xval'
)


print("Finish Train")
print("The best training parameters are: ", [(k, best_lin_params[k]) for k in sorted(best_lin_params)])

    

Build SVM
Find the best params
Finish Train
The best training parameters are:  [('C', 1)]


In [94]:
print("Train SVM")
clf_lin.fit(tr_set.X, tr_set.Y)

Train SVM


CClassifierSVM{'classes': CArray(2,)(dense: [0 1]), 'n_features': 103534, 'preprocess': CKernelLinear{'rv': CArray(4192, 103534)(sparse: (0, 907) 0.0163111882311297  (0, 925) 0.015618595065795647  (0, 1449) 0.023467038319695325  (0, 2677) 0.022182347574999157  (0, 3798) 0.05812676823509335  (0, 4285) 0.025284424420541525  (0, 4369) 0.021987602346690793  (0, 4466) 0.04043421314629786  (0, 5376) 0.20935113937481556  (0, 5804) 0.03491877972522973  (0, 5936) 0.020974949385089692  (0, 6225) 0.015314230533234269  (0, 6393) 0.10151749511053956  (0, 7766) 0.029601300203885902  (0, 10364) 0.09873917780582388  (0, 10973) 0.022197667374587827  (0, 12071) 0.05515458663467931  (0, 12867) 0.021369680479150242  (0, 12978) 0.02037585697723595  (0, 13045) 0.02258116046824096  (0, 13201) 0.09114123431925694  (0, 14164) 0.020621760728799846  (0, 14298) 0.028202216510583508  (0, 14824) 0.01744761060314739  (0, 15719) 0.02192926765385204  : :  (4191, 54658) 0.22665775634858365  (4191, 54675) 0.247550159920

In [8]:
# save model
# clf_lin.save("svm_classifier")

'C:\\Users\\TerrenceZwy\\Desktop\\capstone\\svm_classifier.gz'

In [95]:
# Test the Classifier
ts_set = CDataset(tfidf_test, y_test)
print(ts_set)

y_pred = clf_lin.predict(ts_set.X)
metric = CMetricAccuracy()
acc = metric.performance_score(y_true=ts_set.Y, y_pred=y_pred)

confusion_matrix = CMetricConfusionMatrix() 
cm = confusion_matrix.performance_score(y_true=ts_set.Y, y_pred=y_pred)

print("Accuracy on test set: {:.2%}".format(acc))
print("Confusion Matrix: ")
print(cm)
print("False Positive Rate: {:.2%}".format(48/(48+4281)))

CDataset{'X': CArray(8429, 103534)(sparse: (0, 100066) 0.08727638844785315  (0, 99878) 0.0948510324887037  (0, 98075) 0.14733106708645696  (0, 97798) 0.12301551236881204  (0, 97724) 0.08007830953295064  (0, 95903) 0.05436691002747807  (0, 94485) 0.08853304283286532  (0, 93836) 0.07327772736886615  (0, 93835) 0.04016631251773859  (0, 93416) 0.06877078222922754  (0, 90248) 0.1388763337865423  (0, 88524) 0.03490904315013628  (0, 88221) 0.13685892767301638  (0, 86087) 0.13204769571170397  (0, 86039) 0.15167739764071075  (0, 85146) 0.14395979119569316  (0, 84105) 0.15167739764071075  (0, 83959) 0.1334876517822526  (0, 83037) 0.15167739764071075  (0, 82746) 0.15780320649989754  (0, 80453) 0.04404815369389414  (0, 78412) 0.14395979119569316  (0, 76580) 0.05319287464166703  (0, 74976) 0.08566332681403742  (0, 71620) 0.15167739764071075  : :  (8427, 5439) 0.054620854035939054  (8427, 5131) 0.05047364334670651  (8427, 4776) 0.07714342371721604  (8427, 4419) 0.047232795297042184  (8427, 4369) 0.0

In [96]:
import random
from secml.array import CArray
from secml.adv.attacks.evasion import CAttackEvasionPGD



nb_attack=20

#class from which the attack starts
class_to_attack=1
# nb_repet= 5
# acc_attack = np.zeros([nb_repet+1,1],float)
# acc_attack[0,0] = acc
# for rep in range(1,nb_repet+1):
cnt = 0

ori_examples2_x = []
ori_examples2_y = []
number_list = []
for i in range(nb_attack):
    #print("Current Number: ", i)
    #take a point at random being the starting point of the attack
    idx_candidates = np.where(y_test == class_to_attack)
    #select nb_init_pts points randomly in candidates and make them move
    rn = np.random.choice(idx_candidates[0].size, 1)
    x0,y0 =ts_set[idx_candidates[0][rn[0]],:].X, ts_set[idx_candidates[0][rn[0]],:].Y
    # print(x0)
    number_list.append(x_test_path[idx_candidates[0][rn[0]]])
   
    #x0.append(y0[0].item())
    
    x0=x0.astype(float)
    y0=y0.astype(int)
    x2 = x0.tondarray()[0]
    y2 = y0.tondarray()[0]
    
    
    ori_examples2_x.append(x2)
    ori_examples2_y.append(y2)

In [97]:
 number_list

[(21170, 'spampy/datasets/enron\\enron4\\spam\\4166.2005-02-05.GP.spam.txt'),
 (32050, 'spampy/datasets/enron\\enron6\\spam\\3771.2005-03-05.BG.spam.txt'),
 (32612, 'spampy/datasets/enron\\enron6\\spam\\4521.2005-04-24.BG.spam.txt'),
 (32099, 'spampy/datasets/enron\\enron6\\spam\\3835.2005-03-09.BG.spam.txt'),
 (33350, 'spampy/datasets/enron\\enron6\\spam\\5512.2005-06-30.BG.spam.txt'),
 (21652, 'spampy/datasets/enron\\enron4\\spam\\4814.2005-04-05.GP.spam.txt'),
 (4253, 'spampy/datasets/enron\\enron1\\spam\\2073.2004-09-04.GP.spam.txt'),
 (29618, 'spampy/datasets/enron\\enron6\\spam\\0539.2004-09-10.BG.spam.txt'),
 (32159, 'spampy/datasets/enron\\enron6\\spam\\3909.2005-03-14.BG.spam.txt'),
 (22274, 'spampy/datasets/enron\\enron4\\spam\\5642.2005-07-15.GP.spam.txt'),
 (33512, 'spampy/datasets/enron\\enron6\\spam\\5725.2005-07-13.BG.spam.txt'),
 (22282, 'spampy/datasets/enron\\enron4\\spam\\5652.2005-07-16.GP.spam.txt'),
 (31566, 'spampy/datasets/enron\\enron6\\spam\\3136.2005-01-31.BG

In [98]:
# Perform adversarial attacks
noise_type = 'l2'  # Type of perturbation 'l1' or 'l2'
dmax = 0.09 # Maximum perturbation
lb, ub = 0, 1  # Bounds of the attack space. Can be set to `None` for unbounded

solver_params = {
    'eta': 0.01,
    'max_iter': 20,
    'eps': 1e-6}

#set lower bound and upper bound respectively to 0 and 1 since all features are Boolean
pgd_attack = CAttackEvasionPGD(
    classifier=clf_lin,
    double_init_ds=tr_set,
    distance=noise_type,
    dmax=dmax,
    lb=lb, ub=ub,
    solver_params=solver_params)


ad_examples_x = []
ad_examples_y = []
cnt = 0
for i in range(len(ori_examples2_x)):
    print("Current Number:", i)
    x0 = ori_examples2_x[i]
    y0 = ori_examples2_y[i]

    y_pred_pgd, _, adv_ds_pgd, _ = pgd_attack.run(x0, y0)

    # print("Original x0 label: ", y0.item())
    # print("Adversarial example label (PGD): ", y_pred_pgd.item())
    if y_pred_pgd.item() == 0:
        cnt = cnt + 1

    # print("Number of classifier gradient evaluations: {:}"
    #  "".format(pgd_attack.grad_eval))

    # print("Initial sample feature values: ", x0)
    # print("Final sample(s) feature values: ", adv_ds_pgd)

    ad_examples_x.append(adv_ds_pgd.X.tondarray()[0])
    ad_examples_y.append(y_pred_pgd.item())


    attack_pt = adv_ds_pgd.X.tondarray()[0]
    # print("attack_pt:", attack_pt)
print("Accuracy:", cnt/100)

Current Number: 0
Current Number: 1
Current Number: 2
Current Number: 3
Current Number: 4
Current Number: 5
Current Number: 6
Current Number: 7
Current Number: 8
Current Number: 9
Current Number: 10
Current Number: 11
Current Number: 12
Current Number: 13
Current Number: 14
Current Number: 15
Current Number: 16
Current Number: 17
Current Number: 18
Current Number: 19
Accuracy: 0.19


In [60]:
ori_examples2_x = np.array(ori_examples2_x)
ori_examples2_y = np.array(ori_examples2_y)
ad_examples_x = np.array(ad_examples_x)
ad_examples_y = np.array(ad_examples_y)

ori_dataframe = pd.DataFrame(ori_examples2_x, columns = vect.get_feature_names())
print(ori_dataframe.max().sum()/len(vect.get_feature_names()))

ad_dataframe = pd.DataFrame(ad_examples_x, columns = vect.get_feature_names())
ad_dataframe['ad_label'] = ad_examples_y
ad_success = ad_dataframe.loc[ad_dataframe.ad_label == 0]
ori_success = ori_dataframe.loc[ad_dataframe.ad_label == 0]
ad_fail = ad_dataframe.loc[ad_dataframe.ad_label == 1]
ori_fail = ori_dataframe.loc[ad_dataframe.ad_label == 1]
ad_success

0.000990039386812982


Unnamed: 0,aa,aaa,aaaa,aaaaci,aaadrizzl,aaaenerfax,aaagrp,aaaplusdirect,aaasash,aababp,...,zzvffofbj,zzw,zzx,zzxtfeerekvwkug,zzxxst,zzzglvaa,zzzxlqbha,zzzz,zzzzcard,ad_label
0,0.000142,0.00082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.000142,0.00082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.000142,0.00082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.000142,0.000821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.000142,0.000821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,0.000142,0.000818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,0.000142,0.000821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,0.000142,0.000818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,0.000142,0.00082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
10,0.000142,0.000821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [61]:
ori_success

Unnamed: 0,aa,aaa,aaaa,aaaaci,aaadrizzl,aaaenerfax,aaagrp,aaaplusdirect,aaasash,aababp,...,zzucpkow,zzvffofbj,zzw,zzx,zzxtfeerekvwkug,zzxxst,zzzglvaa,zzzxlqbha,zzzz,zzzzcard
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
ad_success_x = ad_success.drop(columns = ['ad_label'])
ad_fail_x = ad_fail.drop(columns = ['ad_label'])
result = (ad_success_x - ori_success)

result2 = pd.DataFrame(result, columns = vect.get_feature_names())
#result2.to_csv("resultnoabs.csv")
result2 = result2.sort_values(by=result2.index.tolist(), axis=1, ascending=False, inplace=False)
result2

Unnamed: 0,enron,louis,vinc,attach,thank,dave,doc,employe,schedul,edu,...,miss,drug,world,pain,legal,within,cheap,medic,http,remov
0,0.016067,0.009212,0.008954,0.007152,0.00713,0.00624,0.005893,0.005466,0.005317,0.005155,...,-0.001552,-0.001957,-0.002071,-0.002084,-0.002318,-0.002556,-0.003108,-0.003361,-0.00348,-0.006112
1,0.016079,0.009218,0.00896,0.007157,0.007135,0.006244,0.005897,0.00547,0.005321,0.005159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.003483,0.0
2,0.016061,0.009208,0.008951,0.00715,0.007127,0.006238,0.005891,0.005464,0.005315,0.005154,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.003479,0.0
3,0.016086,0.009223,0.008964,0.007161,0.007138,0.006247,0.0059,0.005473,0.005323,0.005161,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.016081,0.00922,0.008961,0.007158,0.007136,0.006245,0.005898,0.005471,0.005321,0.00516,...,0.0,-0.001959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.016027,0.009189,0.008931,0.007134,0.007112,0.006224,0.005878,0.005453,0.005303,0.005142,...,0.0,0.0,0.0,0.0,0.0,-0.002549,0.0,0.0,0.0,0.0
6,0.016086,0.009222,0.008964,0.00716,0.007138,0.006247,0.0059,0.005473,0.005323,0.005161,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.003484,0.0
7,0.016032,0.009191,0.008934,0.007136,0.007114,0.006226,0.00588,0.005454,0.005305,0.005144,...,0.0,0.0,-0.002067,0.0,0.0,-0.00255,0.0,0.0,-0.003473,0.0
9,0.016077,0.009217,0.008959,0.007157,0.007134,0.006244,0.005897,0.00547,0.00532,0.005159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.016081,0.00922,0.008962,0.007158,0.007136,0.006245,0.005898,0.005471,0.005321,0.00516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
vect.idf_
IDF = pd.DataFrame(vect.idf_.T, index=vect.get_feature_names())
IDF.to_csv("idf.csv")
IDF

Unnamed: 0,0
aa,5.923149
aaa,7.805881
aaaa,10.039473
aaaaci,10.444938
aaadrizzl,10.444938
...,...
zzxxst,10.444938
zzzglvaa,10.444938
zzzxlqbha,10.444938
zzzz,8.047043


In [64]:
ad_success_x

Unnamed: 0,aa,aaa,aaaa,aaaaci,aaadrizzl,aaaenerfax,aaagrp,aaaplusdirect,aaasash,aababp,...,zzucpkow,zzvffofbj,zzw,zzx,zzxtfeerekvwkug,zzxxst,zzzglvaa,zzzxlqbha,zzzz,zzzzcard
0,0.000142,0.00082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000142,0.00082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000142,0.00082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000142,0.000821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000142,0.000821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.000142,0.000818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.000142,0.000821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.000142,0.000818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.000142,0.00082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.000142,0.000821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
ad_success_x[0:1]

Unnamed: 0,aa,aaa,aaaa,aaaaci,aaadrizzl,aaaenerfax,aaagrp,aaaplusdirect,aaasash,aababp,...,zzucpkow,zzvffofbj,zzw,zzx,zzxtfeerekvwkug,zzxxst,zzzglvaa,zzzxlqbha,zzzz,zzzzcard
0,0.000142,0.00082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
ad_all_x = pd.DataFrame(ad_success_x, columns = vect.get_feature_names())
for left in range(96):
    right = left+1
    extract100 = ad_success_x[left:right].values/vect.idf_                # tf-idf[i] / idf[i]

    extract100_ad = ad_success_x[left:right]
    extract100_ad = extract100_ad.loc[:, (extract100_ad>0).all(axis=0)]   # 第一封邮件中，tf-idf>0的部分
    extract100_ad_columns = extract100_ad.columns.tolist()                # 第一封邮件中，tf-idf>0的部分的列名
    extract100_ad_idf = IDF.loc[extract100_ad_columns]                    # 第一封邮件中，tf-idf>0的部分的idf值
    # extract100_ad_idf = extract100_ad_idf[extract100_ad_idf[0] < 5]       # 第一封邮件中，tf-idf>0的部分的idf值 且idf<7 

    X_idf5 = extract100_ad_idf.index.tolist()


    extract100 = pd.DataFrame(extract100, columns = vect.get_feature_names())
    # print(extract100)
    total_tf = extract100.sum().sum()
    print("total_tf", extract100.sum().sum())
    for index, row in ad_success_x[left:right].iterrows():
        row = row.sort_values(axis=0, ascending = False)
    l = row[0:8700].index.tolist()                                         # 第一封邮件中，tf-idf值最大的100个词
    # print(l)

    X_idf5 = list(set(X_idf5).difference(set(l)))                         # 去除top100中出现的元素

    extract100 = extract100[l]                                           # 第一封邮件中，tf-idf值最大的100个词 的 tf-idf[i] / idf[i]


    extract100_list = extract100.values.tolist()[0]
    c = len(X_idf5)
    c2 =  float(extract100.quantile(axis=1)) #extract100_list[0]
    # c2 = 0.022864698
    
    print(c2)
    ratio = []
    for i in range(8700):
        ratio.append(extract100_list[i]/c2)
    extract100_sum = sum(ratio)     # = 1+b1+c1+d1+......
    # c2 = extract100["co"].values        
    #c2 = extract100[extract100.columns[1]].values                         # c2 = a/IDF[A]
    print("c2", c2)
    y1 = (total_tf/c2-extract100_sum)/(c)
    print("y1", y1)
    
    flag = True
    for y in range(1,1000):
        for A in range(1,5000):
            if abs(y/A - y1) < 1e-4:
                print(y, y1*A)
                flag = False
                break
        if flag == False:
            break

    print(flag, A, y, y/A)

    extract100_re = []
    for j in range(8700):
        extract100_re.append(ratio[j]*A)

    extract100_re
    extract100_re_list = extract100_re
    extract100_re = pd.DataFrame(extract100_re, index = l)
    extract100_re.to_csv("extract100_re.csv")
    
    
    
    
    lint = []
    total = 0
    for item in extract100_re_list:
        newitem = float(item)
        lint.append(round(newitem))
        total = total + round(newitem)
    
    len_y = [math.ceil(y)]*c
    # print(lint)
    lint2 = lint+len_y
    total2 = total + c*y
    lint2 = [i/total2 for i in lint2]
    l2 = l+X_idf5

    dictionary = dict(zip(l2, lint2))
    dictionary
    formulate_idf = IDF.loc[dictionary.keys()][0].tolist()
    formulate_tf = list(dictionary.values())
    # print(formulate_tf)
    # print(formulate_idf)
    tf_idf_np = np.multiply(np.array(formulate_idf), np.array(formulate_tf))
    # print(ad_success_x[:1]['newslett'])
    tf_idf_np.tolist()

    formulate_tf_idf = pd.DataFrame(tf_idf_np.tolist(), index = list(dictionary.keys()))
    formulate_tf_idf = pd.DataFrame(formulate_tf_idf.values.T, index = formulate_tf_idf.columns, columns = formulate_tf_idf.index)
    formulate_tf_idf = formulate_tf_idf.loc[0:1]*total_tf 
    formulate_columns = formulate_tf_idf.columns
    
    ad_df = ad_success_x[left:right][formulate_columns.tolist()]
    other_df = ad_success_x[left:right].drop(formulate_columns.tolist(), axis=1)
    other_columns = other_df.columns.tolist()
    all_columns_order = formulate_columns.tolist()+other_columns
    ad_df = pd.concat([ad_df, other_df], axis = 1)


    formulate0 = [0]* len(other_columns)
    formulate0_pf = pd.DataFrame(formulate0, index = other_columns)
    formulate0_pf = pd.DataFrame(formulate0_pf.values.T, columns = formulate0_pf.index, index = formulate0_pf.columns)
    # ad_temp = ad_success_x[left:right]
    # formulate0_pf[formulate0_pf.loc[:, (ad_temp<0.0001).all(axis=0)].columns.tolist()] = 0

    df = pd.concat([formulate_tf_idf, formulate0_pf], axis = 1)
    df.to_csv("formulate.csv")
    df
    
    diff_ge_ad_values = (df.values - ad_df.values)
    diff_ge_ad = pd.DataFrame(diff_ge_ad_values, columns = all_columns_order)
    foldnum = str(left)
    diff_fold = "Diff/Diff_Generate_ad"+ foldnum + ".csv"
    diff_ge_ad.to_csv(diff_fold)
    print("Sum: ", diff_ge_ad.sum().sum())
    
    formulate_y = [0]
    formulate_y = pd.Series(formulate_y)
    formulate_set = CDataset(df, formulate_y)

    formulate_pred = clf_lin.predict(formulate_set.X)
    print(formulate_pred)

total_tf 1.3479526051811923
2.7972091072776635e-05
c2 2.7972091072776635e-05
y1 0.002747068579165815
1 0.966968139866367
False 352 1 0.002840909090909091
Sum:  -1.0647373118570107e-05
CArray([1])
total_tf 1.6701908796714244
2.802493746356834e-05
c2 2.802493746356834e-05
y1 0.0027621984877831502
1 0.9667694707241026
False 350 1 0.002857142857142857
Sum:  -3.146970843300431e-05
CArray([1])
total_tf 1.9068902906103808
2.8037552183010412e-05
c2 2.8037552183010412e-05
y1 0.0028093953215931025
1 0.9664319906280273
False 344 1 0.0029069767441860465
Sum:  -9.207769147310458e-07
CArray([1])
total_tf 1.8981828980276199
2.8098477105159858e-05
c2 2.8098477105159858e-05
y1 0.0028132653318434244
1 0.967763274154138
False 344 1 0.0029069767441860465
Sum:  -2.707136775913338e-06
CArray([1])
total_tf 2.7821781451339955
2.953701013720293e-05
c2 2.953701013720293e-05
y1 0.002889450848602724
1 0.9679660342819126
False 335 1 0.0029850746268656717
Sum:  5.0636162283047346e-05
CArray([1])
total_tf 1.42413346

KeyboardInterrupt: 

In [102]:
left = 3
right = left+1
extract100 = ad_success_x[left:right].values/vect.idf_                # tf-idf[i] / idf[i]

extract100_ad = ad_success_x[left:right]
extract100_ad = extract100_ad.loc[:, (extract100_ad>0).all(axis=0)]   # 第一封邮件中，tf-idf>0的部分
extract100_ad_columns = extract100_ad.columns.tolist()                # 第一封邮件中，tf-idf>0的部分的列名
extract100_ad_idf = IDF.loc[extract100_ad_columns]                    # 第一封邮件中，tf-idf>0的部分的idf值
# extract100_ad_idf = extract100_ad_idf[extract100_ad_idf[0] < 5]       # 第一封邮件中，tf-idf>0的部分的idf值 且idf<7 

X_idf5 = extract100_ad_idf.index.tolist()


print("len_column: ", len(extract100_ad_columns))
extract100 = pd.DataFrame(extract100, columns = vect.get_feature_names())
# print(extract100)
total_tf = extract100.sum().sum()
print("total_tf", extract100.sum().sum())
for index, row in ad_success_x[left:right].iterrows():
    row = row.sort_values(axis=0, ascending = False)
l = row[0:9600].index.tolist()                                         # 第一封邮件中，tf-idf值最大的100个词
# print(l)

X_idf5 = list(set(X_idf5).difference(set(l)))                         # 去除top100中出现的元素

extract100 = extract100[l]                                            # 第一封邮件中，tf-idf值最大的100个词 的 tf-idf[i] / idf[i]
extract100.to_csv("extract100.csv")

len_column:  9635
total_tf 1.3713921962238185


In [67]:
ad_success_x[left:right][l].to_csv("formulate_ad.csv")
ad_df = ad_success_x[left:right][l]
ad_df

Unnamed: 0,newslett,adress,servic,kind,hi,send,regard,mail,thank,pleas,...,californian,coga,shult,interchang,valenzuela,writer,sticki,negativ,jayshre,quattrocchi
3,0.575448,0.488759,0.3976,0.265295,0.243113,0.201515,0.182148,0.174422,0.151895,0.128445,...,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05,2.8e-05


In [103]:
extract100_list = extract100.values.tolist()[0]
print(extract100)
c = len(X_idf5)
c2 =  float(extract100.quantile(axis=1)) #extract100_list[0]
# c2 = 0.022864698



print(c2)
ratio = []
for i in range(9600):
    ratio.append(extract100_list[i]/c2)
extract100_sum = sum(ratio)     # = 1+b1+c1+d1+......
print("Extract100_sum", extract100_sum)
# c2 = extract100["co"].values        
#c2 = extract100[extract100.columns[1]].values                         # c2 = a/IDF[A]
print("c2", c2)
y1 = (total_tf/c2-extract100_sum)/(c)
print("y1", y1)

   newslett    adress    servic      kind        hi      send    regard  \
0  0.124981  0.062614  0.125408  0.062728  0.062043  0.063156  0.063583   

       mail     thank     pleas  ...        biggst         skoch  \
0  0.062133  0.065738  0.064195  ...  7.958783e-08  7.958783e-08   

           bhat      shultsjw        kgaden         aivar    chambellan  \
0  7.958783e-08  7.958783e-08  7.958783e-08  7.958783e-08  7.958783e-08   

        philtic   enaresearch        srpnet  
0  7.958783e-08  8.721808e-08  7.958783e-08  

[1 rows x 9600 columns]
2.1880581800764738e-05
Extract100_sum 62676.1294018187
c2 2.1880581800764738e-05
y1 0.0026375532773922064


In [105]:
flag = True
for y in range(1,1000):
    for A in range(1,5000):
        if abs(y/A - y1) < 1e-7:
            print(y, y1*A)
            flag = False
            break
    if flag == False:
        break

print(flag, A, y, y/A)

extract100_re = []
for j in range(9600):
    extract100_re.append(ratio[j]*A)

extract100_re
extract100_re_list = extract100_re
extract100_re = pd.DataFrame(extract100_re, index = l)
extract100_re.to_csv("extract100_re.csv")

7 7.000066398198916
False 2654 7 0.0026375282592313487


In [106]:
import math
lint = []
total = 0
for item in extract100_re_list:
    newitem = float(item)
    lint.append(round(newitem))
    total = total + round(newitem)
    
len_y = [math.ceil(y)]*c
# print("len_y", len_y)
# print(lint)
lint2 = lint+len_y
# print(lint2)
total2 = total + c*y
lint2 = [i/total2 for i in lint2]
l2 = l+X_idf5

dictionary = dict(zip(l2, lint2))
formulate_idf = IDF.loc[dictionary.keys()][0].tolist()
formulate_tf = list(dictionary.values())
# print(formulate_tf)
# print(formulate_idf)
tf_idf_np = np.multiply(np.array(formulate_idf), np.array(formulate_tf))
# print(ad_success_x[:1]['newslett'])
tf_idf_np.tolist()

formulate_tf_idf = pd.DataFrame(tf_idf_np.tolist(), index = list(dictionary.keys()))
formulate_tf_idf = pd.DataFrame(formulate_tf_idf.values.T, index = formulate_tf_idf.columns, columns = formulate_tf_idf.index)
formulate_tf_idf = formulate_tf_idf.loc[0:1]*total_tf 
formulate_columns = formulate_tf_idf.columns
formulate_tf_idf

Unnamed: 0,newslett,adress,servic,kind,hi,send,regard,mail,thank,pleas,...,horoscop,recepi,regina,presentaton,piqu,pushkar,glenn,tisman,copom,unhappi
0,0.575449,0.488759,0.397601,0.265296,0.243113,0.201515,0.182148,0.174423,0.151895,0.128445,...,4.904849e-07,5.627827e-07,4.993811e-07,5.793851e-07,5.09903e-07,5.09903e-07,4.147577e-07,6.027848e-07,6.027848e-07,4.39278e-07


In [108]:
ad_df = ad_success_x[left:right][formulate_columns.tolist()]
other_df = ad_success_x[left:right].drop(formulate_columns.tolist(), axis=1)
other_columns = other_df.columns.tolist()
all_columns_order = formulate_columns.tolist()+other_columns
ad_df = pd.concat([ad_df, other_df], axis = 1)


formulate0 = [0]* len(other_columns)
formulate0_pf = pd.DataFrame(formulate0, index = other_columns)
formulate0_pf = pd.DataFrame(formulate0_pf.values.T, columns = formulate0_pf.index, index = formulate0_pf.columns)
# ad_temp = ad_success_x[left:right]
# formulate0_pf[formulate0_pf.loc[:, (ad_temp<0.0001).all(axis=0)].columns.tolist()] = 0

df = pd.concat([formulate_tf_idf, formulate0_pf], axis = 1)
df.to_csv("formulate.csv")
df

Unnamed: 0,newslett,adress,servic,kind,hi,send,regard,mail,thank,pleas,...,zzucpkow,zzvffofbj,zzw,zzx,zzxtfeerekvwkug,zzxxst,zzzglvaa,zzzxlqbha,zzzz,zzzzcard
0,0.575449,0.488759,0.397601,0.265296,0.243113,0.201515,0.182148,0.174423,0.151895,0.128445,...,0,0,0,0,0,0,0,0,0,0


In [109]:
diff_ge_ad_values = (df.values - ad_df.values)
diff_ge_ad = pd.DataFrame(diff_ge_ad_values, columns = all_columns_order)
diff_ge_ad.to_csv("Diff_Generate_ad.csv")
print("Sum: ", diff_ge_ad.sum().sum())

Sum:  -6.932099673581947e-06


In [115]:
diff_ge_ad.max().max()

5.421138021821648e-07

In [176]:
diff_ge_ad_threshold = diff_ge_ad.loc[:, (diff_ge_ad>0.001).all(axis=0)] 
diff_ge_ad_threshold_columns = diff_ge_ad_threshold

In [125]:
formulate_y = [0]
formulate_y = pd.Series(formulate_y)
ad_minus2 = ad_success_x[left:right]
words14_list= ["enrononlin", "fundi", "cera", "dailyupd", "jobsearch", "congrad", "ena", "listbot", "counterparti", "calger", "sitara", "jhherbert", "kaminski", "solarc", "clickathom"]
df1 = df
df1 = df1.sub(1e-3)
# df1 = df1.sub(1e-5)

formulate_set = CDataset(df1, formulate_y)

formulate_pred = clf_lin.predict(formulate_set.X)
print(formulate_pred)

CArray([0])


In [150]:
clf_lin.backward(clf_lin.w)

NameError: name 'w' is not defined

In [135]:
clf_lin.grad_f_params(ad_minus2,1)
# clf_lin.grad_f_x(formulate_set.X, 1)

CArray(3479, 1)(dense: [[0.027182] [0.003052] [0.04914 ] ... [0.011569] [0.001291] [1.      ]])

In [126]:
from secml.ml.classifiers.loss import CSoftmax

scores = clf_lin.predict(formulate_set.X, return_decision_function=True)[1]
scores = CSoftmax().softmax(scores)
scores

CArray(1, 2)(dense: [[0.756857 0.243143]])

In [78]:
from secml.ml.classifiers import CClassifierSGD

xval_splitter = CDataSplitterKFold()
clf_lin_sgd = CClassifierMulticlassOVA(CClassifierSGD, regularizer='l2', loss='log')

#xval_lin_params = {'C': [0.1, 1, 10, 100]}
xval_lin_params = {'alpha': [1e-6,1e-5,1e-4]}

print("Find the best params")

best_lin_params = clf_lin.estimate_parameters(
    dataset = tr_set,
    parameters = xval_lin_params,
    splitter = xval_splitter,
    metric = 'accuracy',
    perf_evaluator = 'xval'
)

Find the best params


In [82]:
clf_lin.fit(tr_set.X, tr_set.Y)

pred_class = clf_lin.predict(formulate_set.X)
print(pred_class)

scores = clf_lin.predict(formulate_set.X, return_decision_function=True)[1]
scores = CSoftmax().softmax(scores)
scores

CArray([1])


CArray(1, 2)(dense: [[0.01376 0.98624]])

In [88]:
from secml.ml.classifiers import CClassifierDecisionTree

xval_splitter = CDataSplitterKFold()
clf_lin_knn = CClassifierDecisionTree(random_state=999)

#xval_lin_params = {'C': [0.1, 1, 10, 100]}
xval_lin_params_tree = {}

print("Find the best params")

best_lin_params = clf_lin.estimate_parameters(
    dataset = tr_set,
    parameters = xval_lin_params_tree,
    splitter = xval_splitter,
    metric = 'accuracy',
    perf_evaluator = 'xval'
)

Find the best params


In [92]:
clf_lin_knn.fit(tr_set.X, tr_set.Y)

pred_class = clf_lin_knn.predict(formulate_set.X)
print(pred_class)

scores = clf_lin_knn.predict(formulate_set.X, return_decision_function=True)[1]
scores = CSoftmax().softmax(scores)
scores

CArray([0])


CArray(1, 2)(dense: [[0.731059 0.268941]])

In [99]:
words14 = "enrononlin fundi cera dailyupd jobsearch congrad ena listbot counterparti calger sitara jhherbert kaminski solarc clickathom"
cnt = 0
d2 = "spampy/datasets/enron/enron1/spam"
emails2 = [os.path.join(d2, f) for f in os.listdir(d2)]
for j in emails2:
    with codecs.open(j, "rb", encoding='utf_8_sig', errors='ignore') as m:
        #print(j)
        choose_email = []
        line_str = ""
        for line in m:
            for word in line:
                if word.startswith("http"):
                    word = "URL"
                word = stemmer.stem(word)

            line = re.sub(r'[^a-zA-Z\s]', '', string=line)
            line = line.lower()
            line = line.strip()
            tokens = cut_model.tokenize(line)
            line = [stemmer.stem(token) for token in tokens if token not in stopwords]

            line = ' '.join(line)
            line_str = line_str+line+" "
        line_str = line_str+words14
        choose_email.append(line_str)
    message_14_email = pd.DataFrame(choose_email, columns = ["message"])
    message_14_tf_idf = vect.transform(message_14_email["message"])
    message_14_tf_idf = pd.DataFrame(message_14_tf_idf.toarray(), columns = vect.get_feature_names())
    #print(message_14_tf_idf)
    message_14_y = [1]
    message_14_y = pd.Series(formulate_y)
    message_CData = CDataset(message_14_tf_idf, message_14_y)
    message_14_pred = clf_lin_knn.predict(message_CData.X)
    print(message_14_pred)
    if message_14_pred == 1:
        cnt = cnt+1
    #break
print("Count: ", cnt)
message_14_tf_idf = message_14_tf_idf.loc[:, (message_14_tf_idf>0).all(axis=0)]
message_14_tf_idf.to_csv("message_14_tf_idf.csv")

CArray([0])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([1])
CArr

CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArr

CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([1])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([1])
CArray([1])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArray([0])
CArr

In [15]:
absresult = abs(ad_success_x - ori_success)
absresult_fail = abs(ad_fail_x - ori_fail)

#absresult = absresult.sort_values(by=absresult.index.tolist(), axis=1, ascending=False, inplace=False)

absresult

Unnamed: 0,aa,aaa,aaaa,aaaaci,aaadrizzl,aaaenerfax,aaagrp,aaaplusdirect,aaasash,aababp,...,zzucpkow,zzvffofbj,zzw,zzx,zzxtfeerekvwkug,zzxxst,zzzglvaa,zzzxlqbha,zzzz,zzzzcard
0,0.0,0.000456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.000455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.000455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,0.0,0.000456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,0.0,0.000457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.000456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.000456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
r = pd.DataFrame(result, columns = vect.get_feature_names())
r = r.loc[0,:].to_frame()
r.to_csv("row0_resultnoabs.csv")

In [39]:
for index, row in absresult[: 1].iterrows():
    #print(row)
#row = absresult.tail(1)
#row = absresult.iloc[-25].to_frame()
    row = row.to_frame()
#row = row.sort_values(by=row.index.tolist(), axis=1, ascending=False, inplace=False)
    row = row.sort_values(by=index, axis=0, ascending=False, inplace=False)
    featuresindex = row.index[:100].to_frame()
    filename = "chosenfiles/row" + str(index) + "_100.csv"
    #print(filename)
    #print(featuresindex)
    featuresindex.to_csv(filename)
    #row = row.head(100).index
    #row = pd.DataFrame(row)
    #row.to_csv("row0_100.csv")
    #break
print(featuresindex)

                 0
enron        enron
louis        louis
vinc          vinc
thank        thank
attach      attach
...            ...
destruct  destruct
resum        resum
htm            htm
calendar  calendar
meter        meter

[100 rows x 1 columns]


In [19]:
for index, row in absresult_fail.iterrows():
    #print(row)
#row = absresult.tail(1)
#row = absresult.iloc[-25].to_frame()
    row = row.to_frame()
#row = row.sort_values(by=row.index.tolist(), axis=1, ascending=False, inplace=False)
    row = row.sort_values(by=index, axis=0, ascending=False, inplace=False)
    featuresindex = row.index[:100].to_frame()
    filename = "chosenfiles_fail/row" + str(index) + "_100.csv"
    #print(filename)
    #print(featuresindex)
    featuresindex.to_csv(filename)
    #row = row.head(100).index
    #row = pd.DataFrame(row)
    #row.to_csv("row0_100.csv")
    #break

In [20]:
ad_fail_result = ad_fail_x - ori_fail
for index, row in ad_fail_result.iterrows():
    row = row.to_frame()
    row = row.loc[(row<0).all(axis=1),:]
   
    filename = "disappearing_fail/row" + str(index) + "_100.csv"
    
    row.to_csv(filename)

In [21]:
for index, row in result.iterrows():
    #print(row)
#row = absresult.tail(1)
#row = absresult.iloc[-25].to_frame()
    row = row.to_frame()
    row = row.loc[(row<0).all(axis=1),:]
    #print(row)
#row = row.sort_values(by=row.index.tolist(), axis=1, ascending=False, inplace=False)
    # row = row.sort_values(by=index, axis=0, ascending=False, inplace=False)
    
    # featuresindex = row.index[:100].to_frame()
    filename = "disappearing/row" + str(index) + "_100.csv"
    
    #print(filename)
    #print(featuresindex)
    
    row.to_csv(filename)
    
    #row = row.head(100).index
    #row = pd.DataFrame(row)
    #row.to_csv("row0_100.csv")
    #break

In [85]:
import csv
with codecs.open("spampy/2081.2004-09-06.GP.spam14.txt", "rb", encoding='utf_8_sig', errors='ignore') as m:
    choose_email = []
    line_str = ""
    for line in m:
        for word in line:
            if word.startswith("http"):
                print(word)
                word = "URL"
                print(word)
            word = stemmer.stem(word)
            add_words.append(word)
            
        line = re.sub(r'[^a-zA-Z\s]', '', string=line)
        line = line.lower()
        line = line.strip()
        tokens = cut_model.tokenize(line)
        line = [stemmer.stem(token) for token in tokens if token not in stopwords]

        line = ' '.join(line)
        line_str = line_str+line+" "
    choose_email.append(line_str)
    
add_words = choose_email[0].split()
print(add_words)
ham_unique = []
with open("ham_unique.csv", "r") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        ham_unique.append(row[1])
ham_unique = ham_unique[1:]

spam_unique = []
with open("spam_unique.csv", "r") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        spam_unique.append(row[1])
spam_unique = spam_unique[1:]


# add_words = list(set(add_words).difference(set(ham_unique)))
add_words = list(set(add_words).difference(set(spam_unique)))
print(add_words)


ori_choose = ori_examples2_x[0]
ori_choose = pd.DataFrame(ori_examples2_x, columns = vect.get_feature_names())
ori_choose = ori_choose.head(1)
ori_choose


choose_email_message = pd.DataFrame(choose_email)
choose_email_message
tfidf_choose = vect.transform(choose_email_message.iloc[:,0])
tfidf_matrix_choose = pd.DataFrame(tfidf_choose.toarray(), columns = vect.get_feature_names())

ad_success_choose = ad_success_x.head(1)
tfidf_matrix_choose = tfidf_matrix_choose.append(ad_success_choose)

tfidf_matrix_choose = tfidf_matrix_choose.loc[:, (tfidf_matrix_choose>=0).all(axis=0)]
tfidf_matrix_choose = tfidf_matrix_choose.loc[:, ~(tfidf_matrix_choose==0).all(axis=0)]
print(tfidf_matrix_choose)

tfidf_matrix_choose.to_csv("Change14/change2.csv")

choose_y_pred = clf_lin.predict(tfidf_choose)
choose_y_pred

['email', 'load', 'wise', 'triatom', 'lauren', 'jocular', 'lifo', 'perplex', 'ringlet', 'romano', 'wichita', 'acced', 'hither', 'blot', 'contract', 'scamp', 'sultan', 'caveman', 'finni', 'atmospher', 'complaint', 'reminisc', 'dihedr', 'itt', 'rica', 'mt', 'patrimoni', 'diminut', 'lampoon', 'jimmi', 'whereupon', 'handicraftsman', 'small', 'alumni', 'ellsworth', 'cater', 'cereal', 'enrononlin', 'fundi', 'cera', 'dailyupd', 'jobsearch', 'congrad', 'ena', 'listbot', 'counterparti', 'calger', 'sitara', 'jhherbert', 'kaminski', 'solarc', 'clickathorm', 'blot', 'cater', 'romano', 'diminut', 'rica', 'atmosph', 'email', 'whereupon', 'lauren', 'reminisc', 'sultan', 'small', 'load', 'jimmi', 'perplex', 'contract', 'complaint', 'mt', 'alumni', 'lifo', 'wise', 'blot', 'cater', 'romano', 'diminut', 'rica', 'atmosph', 'email', 'whereupon', 'lauren', 'reminisc', 'sultan', 'small', 'load', 'jimmi', 'perplex', 'contract', 'complaint', 'mt', 'alumni', 'lifo', 'wise', 'blot', 'cater', 'romano', 'diminut',

PermissionError: [Errno 13] Permission denied: 'Change14/change2.csv'

In [99]:
from sklearn.feature_extraction.text import CountVectorizer
vect.idf_

vectest = CountVectorizer()
vectest
test = vectest.fit_transform(choose_email_message.iloc[:,0])
testpd = pd.DataFrame(test.toarray(), columns = vectest.vocabulary_)
testpd.sum(axis=1)
testpd


Unnamed: 0,email,load,wise,triatom,lauren,jocular,lifo,perplex,ringlet,romano,...,ena,listbot,counterparti,calger,sitara,jhherbert,kaminski,solarc,clickathorm,atmosph
0,1,6,5,1,6,1,6,1,1,1,...,6,1,1,6,1,6,1,6,1,6


In [21]:
absresult.to_csv("abs_result.csv")

In [18]:
absresult_columns = absresult.columns[0:500]
abs_result_columns = pd.DataFrame(absresult_columns)
abs_result_columns.to_csv("abs_result_columns.csv")

In [26]:

x2result1 = result
x2result1 = np.array(x2result1)
x2result = result
x2result = x2result.multiply(x2result1)

 
sum_number = x2result.sum()/cnt


sum_number = pd.DataFrame(sum_number, columns = ['sum_number'])
sum_number = sum_number.sort_values(by='sum_number', ascending=False, inplace=False)
sum_number
#sum_number = sum_number.loc[var_number['var_number']>0]
#print(sum_number.index[:200])
#print(sum_number.index[:500])

sum_number_pd = pd.DataFrame(sum_number.index[:500])
sum_number_pd.to_csv("x2result.csv")

In [22]:
# ori > 0, ad = 0
ad1 = ad_success_x
ori1 = ori_success

ori2 = ori1.loc[:, (ori1>=0).all(axis=0)]
ori = ori2.loc[:, ~(ori2==0).all(axis=0)]

ad = ad1.loc[:,ori.columns]
ad = ad.loc[:, (ad>=0).all(axis=0)]
ad = ad.loc[:, ~(ad>0).all(axis=0)]
ad

Unnamed: 0,abacha,abe,abei,abel,abhorr,abiiiti,abl,aboard,abraham,abroad,...,youst,youstil,yx,zap,zenith,zggpyfjfvt,zipcod,zit,zone,zw
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.276177,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.027996,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.13319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.030628,0.061257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
ori1_ad0_columns = ad.columns
ori1_ad0_columns = pd.DataFrame(ori1_ad0_columns)
ori1_ad0_columns.to_csv("ori1_ad0_columns.csv")

In [24]:
# ori = 0, ad > 0
ad11 = ad_success_x
ori11 = ori_success

ori21 = ori11.loc[:, (ori11>=0).all(axis=0)]
ori22 = ori21.loc[:, ~(ori21>0).all(axis=0)]

ad22 = ad11.loc[:,ori22.columns]
ad22 = ad22.loc[:, (ad22>=0).all(axis=0)]
ad22 = ad22.loc[:, ~(ad22==0).all(axis=0)]
ad22

Unnamed: 0,aaa,aaldou,abacha,abacu,abacustech,abandon,abb,abbamont,abbott,abdo,...,zorganizowalem,zosta,zrobic,zrobilem,zw,zwlaszcza,zwrocic,zwwyw,zyc,zzncacst
0,0.000456,0.000104,0.0,0.000184,0.001456,0.000152,4.1e-05,9e-05,9.2e-05,5e-05,...,8.3e-05,4.8e-05,0.000112,5.6e-05,0.0,5.6e-05,2e-06,8.4e-05,7e-05,8.4e-05
1,0.000455,0.000104,0.0,0.000184,0.001457,0.000152,4.1e-05,9e-05,9.2e-05,5e-05,...,8.3e-05,4.8e-05,0.000111,5.6e-05,0.0,5.6e-05,2e-06,8.4e-05,7e-05,8.4e-05
2,0.000456,0.000104,0.0,0.000183,0.001452,0.000152,4.1e-05,9e-05,9.3e-05,5e-05,...,8.3e-05,4.8e-05,0.000111,5.6e-05,0.0,5.6e-05,2e-06,8.4e-05,7e-05,8.4e-05
3,0.000456,0.000104,0.0,0.000183,0.001455,0.000153,4.1e-05,9e-05,9.2e-05,5e-05,...,8.3e-05,4.8e-05,0.000111,5.6e-05,0.0,5.6e-05,2e-06,8.4e-05,7e-05,8.4e-05
4,0.000456,0.000104,0.0,0.000183,0.001458,0.000152,4.1e-05,9e-05,9.2e-05,5e-05,...,8.3e-05,4.8e-05,0.000112,5.6e-05,0.0,5.6e-05,2e-06,8.4e-05,7e-05,8.4e-05
5,0.000455,0.000104,0.0,0.000183,0.00145,0.000157,4.1e-05,9e-05,9.2e-05,5e-05,...,8.3e-05,4.8e-05,0.000111,5.6e-05,0.0,5.6e-05,2e-06,8.4e-05,7e-05,8.4e-05
7,0.000456,0.000104,0.0,0.000184,0.001455,0.000153,4.1e-05,9e-05,9.2e-05,5e-05,...,8.3e-05,4.8e-05,0.000112,5.6e-05,0.0,5.6e-05,2e-06,8.4e-05,7e-05,8.4e-05
12,0.000456,0.000104,0.0,0.000183,0.001453,0.000152,4.1e-05,9.1e-05,9.2e-05,5e-05,...,8.3e-05,4.8e-05,0.000112,5.6e-05,0.0,5.6e-05,2e-06,8.4e-05,7e-05,8.4e-05
16,0.000455,0.000104,0.0,0.000183,0.00145,0.000152,4.1e-05,9e-05,9.1e-05,5e-05,...,8.3e-05,4.8e-05,0.000111,5.6e-05,0.0,5.6e-05,2e-06,8.4e-05,7e-05,8.4e-05
17,0.000457,0.000104,0.0,0.000184,0.001453,0.000152,4.1e-05,9e-05,9.2e-05,5e-05,...,8.3e-05,4.8e-05,0.000112,5.6e-05,0.0,5.6e-05,2e-06,8.4e-05,7e-05,8.4e-05


In [25]:
ori0_ad1_columns = ad22.columns
ori0_ad1_columns = pd.DataFrame(ori0_ad1_columns)
ori0_ad1_columns.to_csv("ori0_ad1_columns.csv")

In [20]:
result_var = result
result_var = pd.DataFrame(result, columns = vect.get_feature_names())
var_number = result_var.var()

var_number = pd.DataFrame(var_number, columns = ['var_number'])
var_number = var_number.sort_values(by='var_number', ascending=False, inplace=False)
#var_number
var_number = var_number.loc[var_number['var_number']>0]
print(var_number.index[:200])
print(var_number.index[:500])

var_number_pd = pd.DataFrame(var_number.index[:500])
var_number_pd.to_csv("top100features_var_7.csv")


Index(['remov', 'http', 'softwar', 'money', 'email', 'onlin', 'life', 'medic',
       'best', 'us',
       ...
       'loan', 'newslett', 'lowest', 'client', 'word', 'live', 'grow', 'fear',
       'car', 'half'],
      dtype='object', length=200)
Index(['remov', 'http', 'softwar', 'money', 'email', 'onlin', 'life', 'medic',
       'best', 'us',
       ...
       'agileconcept', 'benefit', 'simpli', 'rr', 'batch', 'width', 'member',
       'serial', 'debt', 'enjoy'],
      dtype='object', length=500)


In [21]:
result2 = result2.loc[:, (result2<=0).all(axis=0)]
result2_columns = result2.loc[:, ~(result2==0).all(axis=0)].columns  # 删了它

In [22]:
result2_columns_pd = pd.DataFrame(result2_columns)
result2_columns_pd.to_csv("columns_success_7.csv")