In [1]:
import numpy as np
import tensorflow as tf 
from keras.models import load_model

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten,Subtract,Reshape
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D,Conv2D,MaxPooling2D,Input,Lambda,GlobalMaxPooling2D
from keras.regularizers import l2
from keras import backend as K
from keras.applications.vgg16 import VGG16
from skimage.io import imsave

from matplotlib.pyplot import imread
from skimage.transform import rescale, resize
import os

# Load precomputed embeddings

In [2]:
# --- Locations ---------------------------------------------------------------
# output_dir holds the precomputed .npy arrays; dataset_path is the dataset root.
output_dir = '../train/'
dataset_path = '../../datasets/VisualPhish/'

# --- Array file names (each is appended to output_dir) ------------------------
train_emb_name = 'whitelist_emb.npy'
train_emb_labels_name = 'whitelist_labels.npy'

phish_emb_name = 'phishing_emb.npy'
phish_emb_labels_name = 'phishing_labels.npy'

phish_train_idx_name = 'train_idx.npy'
phish_test_idx_name = 'test_idx.npy'

# Optional precomputed embeddings of an attacked phishing test set.
# Set use_attack to 1 to evaluate on these instead of the clean test rows.
phish_emb_test_attack = 'X_phish_test_noise_gamma.npy'
use_attack = 0


def _load_from_output_dir(file_name):
    """Load the .npy array called file_name that lives under output_dir."""
    return np.load(output_dir + file_name)


# --- Load everything ----------------------------------------------------------
X_legit_train = _load_from_output_dir(train_emb_name)
y_legit_train = _load_from_output_dir(train_emb_labels_name)

X_phish = _load_from_output_dir(phish_emb_name)
y_phish = _load_from_output_dir(phish_emb_labels_name)

phish_test_idx = _load_from_output_dir(phish_test_idx_name)
phish_train_idx = _load_from_output_dir(phish_train_idx_name)

# --- Split the phishing arrays with the saved row indices ---------------------
X_phish_test, y_phish_test = X_phish[phish_test_idx, :], y_phish[phish_test_idx, :]
X_phish_train, y_phish_train = X_phish[phish_train_idx, :], y_phish[phish_train_idx, :]

# When attacking, the test embeddings are replaced wholesale by the precomputed
# ones; y_phish_test is left untouched.
if use_attack == 1:
    X_phish_test = _load_from_output_dir(phish_emb_test_attack)
    print('Test on: '+phish_emb_test_attack)



In [3]:
label_dict = {}  # reverse lookup: file name -> label (the target directory it was found in)


# Get file names of each example
def read_file_names(data_path, file_name):
    """Collect the file names found under every target directory of a dataset root.

    ``data_path + file_name`` is read as a text file with one target directory
    name per line. For each line, the entries of ``data_path + <line>`` are
    listed in sorted order and appended to the result.

    Side effect: every file name is also recorded in the module-level
    ``label_dict`` with the target directory name as its label.
    NOTE(review): ``label_dict`` is keyed by the bare file name, so a name that
    occurs under more than one target (or in more than one dataset root) keeps
    only the label written last.

    Parameters
    ----------
    data_path : str
        Dataset root; concatenated directly, so it must end with a path separator.
    file_name : str
        Name of the targets list file inside ``data_path``.

    Returns
    -------
    list of str
        File names, grouped by target in file order and sorted within a target.

    Raises
    ------
    OSError
        If the targets file or one of the listed directories cannot be read.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(data_path + file_name, "r") as targets_file:
        targets_list = targets_file.read().splitlines()

    file_names_list = []
    for label in targets_list:
        for name in sorted(os.listdir(data_path + label)):
            file_names_list.append(name)
            label_dict[name] = label
    return file_names_list

# File names of both dataset roots, in the order read_file_names produces them.
legit_file_names = read_file_names(dataset_path + 'trusted_list/', 'targets.txt')
phish_file_names = read_file_names(dataset_path + 'phishing/', 'targets.txt')

# Pick the phishing file names selected by the saved train / test index arrays.
phish_train_file_names = [phish_file_names[k] for k in phish_train_idx]

phish_test_file_names = [phish_file_names[k] for k in phish_test_idx]

def get_label_from_name(name):
    """Return the label stored in ``label_dict`` for the file called ``name``.

    Raises ``KeyError`` when the name was never recorded. (A previous variant,
    left disabled in the original code, parsed a number out of the file name.)
    """
    return label_dict[name]

In [4]:
# L2 distance
def compute_distance_pair(layer1, layer2):
    """Squared L2 distance between two embeddings, divided by their length.

    The divisor is the size of the operands' own last axis rather than the
    width of a module-level training matrix, so the function no longer depends
    on notebook state. For vectors as wide as the training embeddings the
    result is unchanged.

    Parameters
    ----------
    layer1, layer2 : array_like
        Embeddings of equal (or broadcast-compatible) shape.

    Returns
    -------
    numpy floating scalar
        ``sum((layer1 - layer2) ** 2) / dim``.
    """
    diff = np.asarray(layer1) - np.asarray(layer2)
    # A 0-d difference has no axis to read a length from; treat it as length 1.
    dim = diff.shape[-1] if diff.ndim else 1
    return np.sum(diff ** 2) / dim

# Pairwise distance between query image and training
def compute_all_distances(test_matrix, train_matrix=None):
    """Distance from every row of ``test_matrix`` to every training embedding.

    The distance is the sum of squared differences divided by the embedding
    width. Each test row is compared with the whole training matrix in a single
    vectorised NumPy expression instead of a Python loop over every pair.
    Values agree with the per-pair computation up to floating-point rounding.

    Parameters
    ----------
    test_matrix : array_like, 2-D
        One query embedding per row.
    train_matrix : array_like, 2-D, optional
        Training embeddings, one per row. When omitted, the module-level
        ``X_phish_train`` rows followed by the ``X_legit_train`` rows are used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number of test rows, number of training rows)``.
    """
    if train_matrix is None:
        train_matrix = np.concatenate((X_phish_train, X_legit_train))
    train_matrix = np.asarray(train_matrix)
    test_matrix = np.asarray(test_matrix)

    # Row count comes from the matrix itself so it cannot drift from the index array.
    train_size = train_matrix.shape[0]
    emb_dim = train_matrix.shape[1]

    pairwise_distance = np.zeros([test_matrix.shape[0], train_size])
    for i in range(test_matrix.shape[0]):
        # Broadcasting subtracts the query row from every training row at once;
        # going row by row keeps the temporary at (train_size, emb_dim).
        diff = train_matrix - test_matrix[i, :]
        pairwise_distance[i, :] = np.sum(diff ** 2, axis=1) / emb_dim
    return pairwise_distance
# Runs as soon as the cell executes: distances from each phishing test embedding
# (rows) to each training embedding (columns).
pairwise_distance = compute_all_distances(X_phish_test)

# Find Smallest n distances
def find_min_distances(distances, n):
    """Return ``(indices, values)`` of the ``n`` smallest entries of ``distances``.

    ``distances`` is a 1-D NumPy array; both results are ordered from the
    smallest distance upwards.
    """
    order = np.argsort(distances)
    nearest = order[:n]
    return nearest, distances[nearest]

# Find names of examples with min distance
def find_names_min_distances(idx, values):
    """Translate training-set indices into file names.

    Indices below the number of rows in ``X_phish_train`` refer to
    ``phish_train_file_names``; the rest refer to ``legit_file_names`` after
    subtracting that row count.

    Returns a 3-tuple:
      * a comma-separated string of names, each prefixed with ``'Phish: '`` or
        ``'Legit: '``,
      * the list of bare file names in the same order,
      * a comma-separated string of the matching ``values``.
    """
    n_phish = X_phish_train.shape[0]
    tagged_names = []
    only_names = []
    distance_strings = []
    for position in range(idx.shape[0]):
        train_index = idx[position]
        if train_index < n_phish:
            file_name = phish_train_file_names[train_index]
            tagged_names.append('Phish: ' + file_name)
        else:
            file_name = legit_file_names[train_index - n_phish]
            tagged_names.append('Legit: ' + file_name)
        only_names.append(file_name)
        distance_strings.append(str(values[position]))
    return ','.join(tagged_names), only_names, ','.join(distance_strings)

# Find same-category website (matching is correct if it was matched to the same category (e.g. microsoft and outlook ))
# The name lists and the numeric-id lists are parallel: entry k of each describes
# the same parent brand and its sub-brands.
parents_targets = ['microsoft','apple','google','alibaba']
# Kept under its own name; it used to be assigned to sub_targets and then
# immediately overwritten by the id lists below.
sub_targets_names = [['ms_outlook','ms_office','ms_bing','ms_onedrive','ms_skype'],['itunes','icloud'],['google_drive'],['aliexpress']]

parents_targets_idx = [90,12,65,4]
sub_targets = [[150,152,151,149,148],[153,154],[147],[5]]

def check_if_same_category(img_label1,img_label2):
    """Return 1 if the two labels belong to the same brand family, else 0.

    A family is a parent plus its sub-targets. The match rules are:
      * ``img_label1`` is a parent and ``img_label2`` is one of its sub-targets;
      * ``img_label1`` is a sub-target and ``img_label2`` is a sibling
        sub-target (or itself) or the family's parent.

    Every family is checked, including the last one (alibaba / aliexpress),
    which had no sub-target branch before. Labels may be numeric ids or name
    strings.
    NOTE(review): name matching assumes the labels handed in are spelled
    exactly like the strings in ``parents_targets`` / ``sub_targets_names``;
    confirm against the target directory names.
    """
    families = (
        (parents_targets_idx, sub_targets),
        (parents_targets, sub_targets_names),
    )
    for parents, children in families:
        for parent, subs in zip(parents, children):
            if img_label1 == parent:
                if img_label2 in subs:
                    return 1
            elif img_label1 in subs:
                if img_label2 in subs or img_label2 == parent:
                    return 1
    return 0

# Find if target is in the top closest n distances
def check_if_target_in_top(test_file_name,only_names):
    """Look for the first candidate whose label matches the test example.

    A candidate matches when its label equals the test label or when
    ``check_if_same_category`` returns 1 for the pair. Prints a ``***``
    separator and the test file name, plus ``found`` on a match.

    Returns ``(1, rank)`` with the 1-based position of the first match in
    ``only_names``, or ``(0, 0)`` when nothing matches.
    """
    test_label = get_label_from_name(test_file_name)
    print('***')
    print('Test example: '+test_file_name)
    for rank, candidate in enumerate(only_names, start=1):
        candidate_label = get_label_from_name(candidate)
        # Exact label equality is tested first; the category check only runs otherwise.
        if candidate_label == test_label or check_if_same_category(test_label, candidate_label) == 1:
            print('found')
            return 1, rank
    return 0, 0

# Compute correct matches

In [5]:
# Evaluation loop: for every phishing test example, take its n nearest training
# embeddings and count the example as correct when one of them carries a
# matching label (see check_if_target_in_top).
n = 1 #Top-1 match
correct = 0

# Row i of pairwise_distance is paired with phish_test_file_names[i].
for i in range(0,phish_test_idx.shape[0]):
    distances_to_train = pairwise_distance[i,:]
    # np.ravel flattens the row to 1-D before the n smallest entries are picked.
    idx,values = find_min_distances(np.ravel(distances_to_train),n)
    names_min_distance,only_names,min_distances = find_names_min_distances(idx,values)
    # found_idx and min_distances are not used below.
    found,found_idx = check_if_target_in_top(phish_test_file_names[i],only_names)
    print(names_min_distance)
    
    if found == 1:
        correct += 1
        

# NOTE(review): the printed value is correct / total, i.e. a fraction between
# 0 and 1, although the message calls it a percentage. The division raises
# ZeroDivisionError if the test index array is empty.
print("Correct match percentage: " + str(correct/phish_test_idx.shape[0]))

***
Test example: 657f5850f7.png
found
Legit: 657f5850f7.png
***
Test example: 5b4ffaf701.png
found
Legit: 5b4ffaf701.png
***
Test example: d20032afed.png
found
Legit: c44007d408.png
***
Test example: d88ffc754c.png
found
Phish: 9e0fc9ca1a.png
***
Test example: 66265d43d1.png
found
Legit: b20987ebcd.png
***
Test example: 102b42428c.png
found
Phish: a018b75668.png
***
Test example: 10ac09f9bd.png
found
Legit: c391dc241a.png
***
Test example: b78ae1118a.png
found
Legit: 1181851fb8.png
***
Test example: 4289ba2b21.png
Legit: 97b28b4ea5.png
***
Test example: 5e6b903c2a.png
Legit: 67a33d2fca.png
***
Test example: 25d37b7e37.png
found
Legit: 9889b709e3.png
***
Test example: 1828d43ba5.png
found
Legit: 25b243542d.png
***
Test example: ef9c7a91fb.png
found
Phish: 164987d787.png
***
Test example: 7f3eb53cac.png
Phish: 726c28c866.png
***
Test example: 04868a5e93.png
found
Phish: de7a7950cd.png
***
Test example: 61c0c4dcec.png
found
Legit: 7f3158d6e1.png
***
Test example: 91df002ce7.png
found
Leg

***
Test example: 95e4eb7948.png
found
Legit: ab0d109fe6.png
***
Test example: da22a100be.png
found
Phish: 2f8b84fbad.png
***
Test example: 00007e782f.png
found
Legit: 00007e782f.png
***
Test example: 14ac712617.png
found
Legit: 14ac712617.png
***
Test example: d6bbd8eb0d.png
found
Legit: d6bbd8eb0d.png
***
Test example: 732077580e.png
found
Legit: fa733b6604.png
***
Test example: 1f196f3bbb.png
found
Phish: 8a63fab552.png
***
Test example: 34422d4162.png
found
Legit: 34422d4162.png
***
Test example: 29e3b50147.png
found
Legit: 3b0cbde150.png
***
Test example: 3e8c285ba0.png
found
Phish: 8f1de4610d.png
***
Test example: 8429c4f4fd.png
found
Legit: 54185a76e6.png
***
Test example: 512320c5d1.png
Legit: 32904405af.png
***
Test example: 7529f03c42.png
found
Phish: 1143d5eadc.png
***
Test example: 7f00974a6d.png
Legit: 7f19b879a8.png
***
Test example: eab4de12de.png
found
Legit: 3de3fe5ebc.png
***
Test example: 01727b73b3.png
Legit: a52c4c5b75.png
***
Test example: 87b2fbf550.png
found
Leg

***
Test example: 06c681ef92.png
found
Phish: bb5399c1d9.png
***
Test example: f54322680b.png
found
Phish: 39863ae213.png
***
Test example: 8dc03d3375.png
found
Legit: 8dc03d3375.png
***
Test example: ffccf5dbfc.png
found
Legit: 326044a620.png
***
Test example: a62a49d478.png
Legit: 020a9b5ceb.png
***
Test example: 8b094b002f.png
found
Legit: 8b094b002f.png
***
Test example: fe1e090f85.png
found
Phish: dc23339f1a.png
***
Test example: dc384ed3df.png
found
Phish: 6419eecd95.png
***
Test example: c30e9f28c9.png
found
Phish: 3d751f8265.png
***
Test example: 299f1bf0f4.png
found
Phish: f2cfb96dc0.png
***
Test example: 214c5b846a.png
Legit: 4169f2abe9.png
***
Test example: d0cad335ef.png
Legit: 8c87d579ab.png
***
Test example: 59928fc955.png
found
Phish: 0fee0bcfc4.png
***
Test example: 88a3087511.png
found
Legit: c391dc241a.png
***
Test example: 7ca446be53.png
found
Legit: 217015ade0.png
***
Test example: 977c9e6b6c.png
found
Phish: 61cc89bfca.png
***
Test example: 0a89d9a510.png
found
Leg

***
Test example: 6a453e2837.png
Phish: 8cb501f73e.png
***
Test example: 3c38747265.png
Legit: 710430528d.png
***
Test example: d9caa79394.png
found
Legit: 0245fc29a6.png
***
Test example: 8ff337fa31.png
found
Legit: 8ff337fa31.png
***
Test example: 0c4256d1de.png
found
Phish: d6fee6f3e7.png
***
Test example: 984fb1a875.png
Legit: faa9f7b1d6.png
***
Test example: 7076b713dc.png
found
Legit: 0f77017f3d.png
***
Test example: 966b95d353.png
found
Legit: 966b95d353.png
***
Test example: 824e7ff2d2.png
found
Phish: 329d268e85.png
***
Test example: 7f0772311d.png
Phish: cae194e044.png
***
Test example: ddd4e38fa9.png
found
Phish: ce4755b54c.png
***
Test example: b57bdfa34e.png
found
Legit: 64b7b860e6.png
***
Test example: 82521a8e02.png
found
Legit: 82521a8e02.png
***
Test example: c55e16d0d8.png
Legit: 0808bb8145.png
***
Test example: c0a33b6575.png
found
Phish: eb548b68e3.png
***
Test example: 963836e6c8.png
Legit: 862b541067.png
***
Test example: 3e273a6800.png
found
Phish: 525d4c1c1d.png