In [1]:
import numpy as np
import pandas as pd
import glob
import os.path
import sys
from datetime import datetime

# from svecon.HierarchicalGridSearchCV import HierarchicalGridSearchCV
# from svecon.EmptyTransformer import EmptyTransformer

# from sklearn.cross_validation import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

from metric_learn import LMNN, NCA, LFDA, Covariance
from metric_learn import ITML_Supervised, SDML_Supervised, LSML_Supervised, RCA_Supervised

In [2]:
def firstSmallerThan(threshold):
    def firstSmaller(L):
        for i,v in enumerate(L):
            if v<threshold:
                return i,v
        return None,None
    return firstSmaller

def firstHigherThan(threshold):
    def firstHigher(L):
        for i,v in enumerate(L):
            if v>=threshold:
                return i,v
        return None,None
    return firstHigher

def findMalwareAndClean(L):
    findMalware = firstHigherThan(X_train.shape[0]//2)
    findClean =  firstSmallerThan(X_train.shape[0]//2)
    
    im,vm = findMalware(L)
    ic,vc = findClean(L)
    return ic,im

def timestampToStr(t):
    return datetime.fromtimestamp(t).strftime('%Y-%m-%d')

def renameColumns(d):
    def checkDict(c):
        if c in d:
            return d[c]
        return c
    return checkDict

def evaluateResults(label, y_true, y_pred):
    # TrainSize: 68000
    # TestSize: 28000
    # Ratio: 1.5000
    # MinCln: 96
    # MaxMal: 600
    # FP: 292 (0.0209)
    # FN: 1805 (0.1289)
    # TP: 12195 (0.8711)
    # TN: 13708 (0.9791)
    # Precision: 0.9766
    # Recall: 0.8711
    # Accuracy: 0.9251
    # F1 measure: 0.9208

    n = len(y_true)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    accuracy = (tp+tn)/n
    f1 = 2*((precision*recall)/(precision+recall))
    
    print("===== {}".format(label))
    print("FP: {}  ({})".format(fp, fp-292))
    print("FN: {}  ({})".format(fn, fn-1805))
    print("TP: {}  ({})".format(tp, tp-12195))
    print("TN: {}  ({})".format(tn, tn-13708))
    print("Precision: {}  ({})".format( 100*precision, 100*(precision-0.9766) ))
    print("Recall: {}  ({})".format( 100*recall, 100*(recall-0.8711) ))
    print("Accuracy: {}  ({})".format( 100*accuracy, 100*(accuracy-0.9251) ))
    print("F1 measure: {}  ({})".format( 100*f1, 100*(f1-0.9208) ))
    print()
    return {
        'fp': fp,
        'fn': fn,
        'tp': tp,
        'tn': tn,
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy,
        'f1': f1,
    }

def gridSearchKNN(label, X_train, y_train, X_test, y_test, transformer=None):
    return []
    results = []
    
    if transformer:
        X_train_trans = transformer.transform(X_train)
        X_test_trans = transformer.transform(X_test)
    else:
        X_train_trans = X_train
        X_test_trans = X_test
    
    for weight in ['uniform', 'distance']:
        for k in [1,2,4,8,16,32]:
            knn = KNeighborsClassifier(n_neighbors=k, weights=weight, n_jobs=-1)
            knn.fit(X_train_trans, y_train)
            results.append( evaluateResults(label.format(k, weight), y_test, knn.predict(X_test_trans)) )
            results[-1]['label'] = label.format(k, weight)
    return results

def getDist(D,i):
    if i is None:
        return float('inf')
    return D[i]

def gridSearchKNNratio(label, X_train, y_train, X_test, y_test, transformer=None):
    results = []
    
    if transformer:
        X_train_trans = transformer.transform(X_train)
        X_test_trans = transformer.transform(X_test)
    else:
        X_train_trans = X_train
        X_test_trans = X_test
    

    knn = KNeighborsClassifier(n_neighbors=X_train.shape[0]//2, weights='uniform', n_jobs=1)
    knn.fit(X_train_trans, y_train)
    distances, indices = knn.kneighbors(X_test_trans)

    malwareAndCleanIndices = list(map(findMalwareAndClean, indices))
    distancesClosest = [(getDist(D, i[0]),getDist(D, i[1])) for D,i in zip(distances, malwareAndCleanIndices)]

    for ratio in [.25,.5,.75,1,1.25,1.5,1.75,2,2.5,3,3.5,4,5]:
        distanceRatios = [x/y if y>0 else ratio+1 for x,y in distancesClosest]
        y_pred = [0 if x<ratio else 1 for x in distanceRatios]

        results.append( evaluateResults(label.format(ratio), y_test, y_pred))
        results[-1]['label'] = label.format(ratio)
    return results

In [3]:
X_train = pd.read_csv('kovac/train260k_features.csv', sep=',', low_memory=False, index_col=False, dtype=float)
X_test = pd.read_csv('kovac/test90k_features.csv', sep=',', low_memory=False, index_col=False, dtype=float)
X_train.values.shape, X_test.shape

((260000, 99), (90000, 99))

In [4]:
X_train.describe(include='all')

Unnamed: 0,file_length,pe_imgbase,pe_ep_rvao,pe_dir_rva_0,pe_dir_rva_1,pe_dir_rva_2,pe_dir_rva_3,pe_dir_rva_4,pe_dir_rva_5,pe_dir_rva_6,...,dotnet_strings_size,dotnet_ustrings_size,dotnet_blob_size,dotnet_tilde_size,dotnet_stream_cnt,pe_link_maj,pe_link_min,pe_res_types_count,pe_res_ratio_to_size,pe_res_manifest_req_admin
count,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,...,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0
mean,1145662.0,2013257000000000.0,505049.6,305676.8,639527.8,1135777.0,128751.7,641075.4,795387.5,235874.9,...,2646.548,2463.844,1243.963,4121.195,0.622658,7.465869,6.332196,0.849015,16.020285,0.022835
std,8851515.0,1.847337e+17,13638680.0,14527770.0,6492955.0,14166340.0,14318520.0,14996410.0,18152830.0,10064170.0,...,30095.74,62158.09,16281.8,52935.43,1.653254,5.169654,13.360378,2.165276,27.493918,0.149376
min,784.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,65536.0,4194304.0,6075.0,0.0,17481.0,12288.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0
50%,256704.0,4194304.0,30338.0,0.0,76344.0,69632.0,0.0,0.0,24576.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,2.0,0.0
75%,835732.0,268435500.0,149056.0,4336.0,292008.0,380928.0,0.0,0.0,212992.0,6128.0,...,0.0,0.0,0.0,0.0,0.0,9.0,10.0,0.0,16.0,0.0
max,1966969000.0,1.844674e+19,3221379000.0,4294967000.0,1092039000.0,1092096000.0,2975263000.0,3408354000.0,4278190000.0,2975263000.0,...,6655260.0,12302020.0,2443600.0,6193232.0,11.0,255.0,255.0,54.0,100.0,1.0


In [5]:
XTM = X_test.mean(axis=0)
XTS = X_train.std(axis=0)
XTM.shape, XTS.shape

((99,), (99,))

In [6]:
X_train_norm = (X_train - XTM) / XTS
X_test_norm = (X_test - XTM) / XTS
X_train_norm.describe(include='all')

Unnamed: 0,file_length,pe_imgbase,pe_ep_rvao,pe_dir_rva_0,pe_dir_rva_1,pe_dir_rva_2,pe_dir_rva_3,pe_dir_rva_4,pe_dir_rva_5,pe_dir_rva_6,...,dotnet_strings_size,dotnet_ustrings_size,dotnet_blob_size,dotnet_tilde_size,dotnet_stream_cnt,pe_link_maj,pe_link_min,pe_res_types_count,pe_res_ratio_to_size,pe_res_manifest_req_admin
count,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,...,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0,260000.0
mean,-0.00592,0.006029,-0.000731,0.001179,-0.000752,-0.002641,0.000162,0.001075,-0.00123,-1.2e-05,...,0.008771,0.00097,0.00325,0.004258,0.003046,-0.004433,-0.003657,-0.00263,-0.004038,0.002314
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-0.135262,-0.004869,-0.037762,-0.019862,-0.099248,-0.082815,-0.00883,-0.041674,-0.045046,-0.023449,...,-0.079166,-0.038668,-0.073152,-0.073595,-0.37358,-1.448605,-0.47761,-0.394735,-0.586722,-0.150552
25%,-0.127947,-0.004869,-0.037316,-0.019862,-0.096556,-0.081948,-0.00883,-0.041674,-0.045046,-0.023449,...,-0.079166,-0.038668,-0.073152,-0.073595,-0.37358,-0.287986,-0.47761,-0.394735,-0.586722,-0.150552
50%,-0.10635,-0.004869,-0.035537,-0.019862,-0.08749,-0.0779,-0.00883,-0.041674,-0.043692,-0.023449,...,-0.079166,-0.038668,-0.073152,-0.073595,-0.37358,-0.09455,-0.47761,-0.394735,-0.513979,-0.150552
75%,-0.040934,-0.004869,-0.026833,-0.019563,-0.054275,-0.055926,-0.00883,-0.041674,-0.033313,-0.02284,...,-0.079166,-0.038668,-0.073152,-0.073595,-0.37358,0.292323,0.270871,-0.394735,-0.004776,-0.150552
max,222.083054,99.85096,236.156467,295.618516,168.088999,77.00811,207.782459,227.236417,235.631101,295.605684,...,221.057099,197.876299,150.008552,116.922367,6.279964,47.877713,18.608676,24.544345,3.050446,6.54395


In [7]:
y_train = np.concatenate((np.zeros(X_train.shape[0]//2), np.ones(X_train.shape[0]//2)))
y_test = np.concatenate((np.zeros(X_test.shape[0]//2), np.ones(X_test.shape[0]//2)))
y_train.shape, y_test.shape

((260000,), (90000,))

In [8]:
allResults = []

# Euklid

In [9]:
allResults += gridSearchKNN("{}NN-{}", X_train, y_train, X_test, y_test)

In [16]:
allResults += gridSearchKNN("norm + {}NN-{}", X_train_norm, y_train, X_test_norm, y_test)

In [None]:
allResults += gridSearchKNNratio("NN-{}", X_train, y_train, X_test, y_test)

In [None]:
allResults += gridSearchKNNratio("norm + NN-{}", X_train_norm, y_train, X_test_norm, y_test)

# LFDA

In [11]:
subset = .25
train_mask = np.random.choice([True, False], X_train.shape[0], p=[subset, 1-subset])

X_train_sample = X_train[train_mask]
X_train_sample_norm = X_train_norm[train_mask]
y_train_sample = y_train[train_mask]

X_train_sample.shape, X_train_sample_norm.shape

# should be around 70000 * 99

((64823, 99), (64823, 99))

In [None]:
lfda = LFDA()
lfda.fit(X_train_sample.values, y_train_sample)

In [None]:
allResults += gridSearchKNN("LFDA + {}NN-{}", X_train, y_train, X_test, y_test, lfda)

In [None]:
allResults += gridSearchKNNratio("LFDA + NN-{}", X_train, y_train, X_test, y_test, lfda)

# LFDA norm

In [None]:
lfda_norm = LFDA()
lfda_norm.fit(X_train_sample_norm.values, y_train_sample)

In [None]:
allResults += gridSearchKNN("norm + LFDA + {}NN-{}", X_train_norm, y_train, X_test_norm, y_test, lfda)

In [None]:
allResults += gridSearchKNNratio("norm + LFDA + NN-{}", X_train_norm, y_train, X_test_norm, y_test, lfda)

# LMNN

In [16]:
subset = 0.08
train_mask = np.random.choice([True, False], X_train.shape[0], p=[subset, 1-subset])

X_train_sample = X_train[train_mask]
X_train_sample_norm = X_train_norm[train_mask]
y_train_sample = y_train[train_mask]

X_train_sample.shape, X_train_sample_norm.shape

# should be around 20.000 * 99

((20874, 99), (20874, 99))

In [14]:
lmnn = LMNN()
lmnn.fit(X_train_sample.values, y_train_sample)

KeyboardInterrupt: 

In [None]:
allResults += gridSearchKNN("LMNN + {}NN-{}", X_train, y_train, X_test, y_test, lmnn)

In [None]:
allResults += gridSearchKNNratio("LMNN + NN-{}", X_train, y_train, X_test, y_test, lmnn)

# LMNN norm

In [None]:
lmnn_norm = LMNN()
lmnn_norm.fit(X_train_sample_norm.values, y_train_sample)

In [None]:
allResults += gridSearchKNN("norm + LMNN + {}NN-{}", X_train_norm, y_train, X_test_norm, y_test, lmnn)

In [None]:
allResults += gridSearchKNNratio("norm + LMNN + NN-{}", X_train_norm, y_train, X_test_norm, y_test, lmnn)

# NCA

In [18]:
subset = 0.015/4
train_mask = np.random.choice([True, False], X_train.shape[0], p=[subset, 1-subset])

X_train_sample = X_train[train_mask]
X_train_sample_norm = X_train_norm[train_mask]
y_train_sample = y_train[train_mask]

X_train_sample.shape, X_train_sample_norm.shape

((988, 99), (988, 99))

In [None]:
nca = NCA()
nca.fit(X_train_sample.values, y_train_sample)

In [None]:
allResults += gridSearchKNN("NCA + {}NN-{}", X_train, y_train, X_test, y_test, nca)

In [None]:
allResults += gridSearchKNNratio("NCA + NN-{}", X_train, y_train, X_test, y_test, nca)

# NCA norm

In [None]:
nca_norm = NCA()
nca_norm.fit(X_train_sample_norm.values, y_train_sample)

In [None]:
allResults += gridSearchKNN("norm + NCA + {}NN-{}", X_train_norm, y_train, X_test_norm, y_test, nca)

In [None]:
allResults += gridSearchKNNratio("norm + NCA + NN-{}", X_train_norm, y_train, X_test_norm, y_test, nca)

# Results

In [None]:
allResults

In [None]:
resPd = pd.DataFrame(allResults)

In [None]:
resPd

In [None]:
resPd.to_csv('res_mde.csv')