In [None]:
import numpy as np
import pandas as pd
import glob
import os.path
import sys
from datetime import datetime

# from svecon.HierarchicalGridSearchCV import HierarchicalGridSearchCV
# from svecon.EmptyTransformer import EmptyTransformer

# from sklearn.cross_validation import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

from metric_learn import LMNN, NCA, LFDA, Covariance
from metric_learn import ITML_Supervised, SDML_Supervised, LSML_Supervised, RCA_Supervised

In [None]:
def firstSmallerThan(threshold):
    """Build a searcher for the first element strictly below ``threshold``.

    The returned callable accepts an iterable and returns ``(index, value)``
    for the first element with ``value < threshold``, or ``(None, None)`` when
    no element qualifies.
    """
    def firstSmaller(L):
        # Lazily scan so that the search stops at the first match.
        hits = ((pos, item) for pos, item in enumerate(L) if item < threshold)
        return next(hits, (None, None))
    return firstSmaller

def firstHigherThan(threshold):
    """Build a searcher for the first element at or above ``threshold``.

    Despite the name, the comparison is ``>=``: the returned callable yields
    ``(index, value)`` for the first element with ``value >= threshold``, or
    ``(None, None)`` when every element is smaller.
    """
    def firstHigher(L):
        position = 0
        for item in L:
            if not item < threshold and item >= threshold:
                return position, item
            position += 1
        return None, None
    return firstHigher

def findMalwareAndClean(L, boundary=None):
    """Locate the first clean and the first malware entry in a neighbour list.

    ``L`` is a sequence of training-row indices. An index below ``boundary``
    is treated as a clean sample and an index at or above it as malware (the
    notebook's convention of clean rows first, malware rows second).

    Parameters
    ----------
    L : iterable of int
        Training-row indices, e.g. one row of ``kneighbors`` output.
    boundary : int, optional
        First index that counts as malware. Defaults to half the number of
        rows of the notebook-level ``X_train``, which is what the function
        used implicitly before this parameter existed.

    Returns
    -------
    (ic, im) : tuple
        Position in ``L`` of the first clean entry and of the first malware
        entry; either is ``None`` when ``L`` contains no such entry.
    """
    if boundary is None:
        # Backward-compatible default: fall back to the global training set.
        boundary = X_train.shape[0]//2

    ic = im = None
    # Single pass; stop as soon as both kinds have been seen.
    for i, v in enumerate(L):
        if im is None and v >= boundary:
            im = i
        elif ic is None and v < boundary:
            ic = i
        if ic is not None and im is not None:
            break
    return ic, im

def timestampToStr(t):
    """Format a POSIX timestamp as a ``YYYY-MM-DD`` date string.

    The conversion goes through ``datetime.fromtimestamp``, so the date is
    expressed in the local time zone of the machine running the notebook.
    """
    moment = datetime.fromtimestamp(t)
    date_text = moment.strftime('%Y-%m-%d')
    return date_text

def renameColumns(d):
    """Return a lookup function that maps a column name through ``d``.

    Names that are keys of ``d`` are replaced by their mapped value; every
    other name is passed through unchanged.
    """
    def checkDict(c):
        return d[c] if c in d else c
    return checkDict

def evaluateResults(label, y_true, y_pred):
    """Print and return binary-classification metrics for one experiment.

    Each printed line shows the metric followed, in parentheses, by its
    difference from the reference run recorded in the comment block below
    (percentage-style metrics are printed multiplied by 100).

    Parameters
    ----------
    label : str
        Experiment name printed in the header line.
    y_true, y_pred : array-like of {0, 1}
        Ground-truth and predicted labels; class 1 is the positive class.

    Returns
    -------
    dict
        Keys ``fp``, ``fn``, ``tp``, ``tn``, ``precision``, ``recall``,
        ``accuracy`` and ``f1``. A ratio whose denominator is zero is
        reported as ``0.0``.
    """
    # Reference run the deltas are measured against:
    # TrainSize: 68000
    # TestSize: 28000
    # Ratio: 1.5000
    # MinCln: 96
    # MaxMal: 600
    # FP: 292 (0.0209)
    # FN: 1805 (0.1289)
    # TP: 12195 (0.8711)
    # TN: 13708 (0.9791)
    # Precision: 0.9766
    # Recall: 0.8711
    # Accuracy: 0.9251
    # F1 measure: 0.9208

    def safeRatio(numerator, denominator):
        # Avoid nan / RuntimeWarning when a class is never predicted or absent.
        return numerator/denominator if denominator else 0.0

    n = len(y_true)
    # labels=[0, 1] forces a 2x2 matrix; without it a run where only one class
    # appears yields a 1x1 matrix and the four-way unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    precision = safeRatio(tp, tp+fp)
    recall = safeRatio(tp, tp+fn)
    accuracy = safeRatio(tp+tn, n)
    f1 = 2*safeRatio(precision*recall, precision+recall)

    print("===== {}".format(label))
    print("FP: {}  ({})".format(fp, fp-292))
    print("FN: {}  ({})".format(fn, fn-1805))
    print("TP: {}  ({})".format(tp, tp-12195))
    print("TN: {}  ({})".format(tn, tn-13708))
    print("Precision: {}  ({})".format( 100*precision, 100*(precision-0.9766) ))
    print("Recall: {}  ({})".format( 100*recall, 100*(recall-0.8711) ))
    print("Accuracy: {}  ({})".format( 100*accuracy, 100*(accuracy-0.9251) ))
    print("F1 measure: {}  ({})".format( 100*f1, 100*(f1-0.9208) ))
    return {
        'fp': fp,
        'fn': fn,
        'tp': tp,
        'tn': tn,
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy,
        'f1': f1,
    }

def gridSearchKNN(label, X_train, y_train, X_test, y_test, transformer=None):
    """Evaluate k-NN over a fixed grid of ``k`` and neighbour weightings.

    Parameters
    ----------
    label : str
        Format string with two placeholders, filled with ``k`` and the weight
        scheme, e.g. ``"{}NN-{}"``.
    X_train, y_train, X_test, y_test
        Training and test features / labels.
    transformer : object with ``transform``, optional
        Already-fitted transformer applied to both feature sets before k-NN.

    Returns
    -------
    list of dict
        One ``evaluateResults`` dict per grid point, each extended with a
        ``'label'`` entry holding the formatted label.
    """
    # Compare against None explicitly: an estimator that defines __len__
    # may be falsy and would otherwise be silently skipped.
    if transformer is not None:
        X_train_trans = transformer.transform(X_train)
        X_test_trans = transformer.transform(X_test)
    else:
        X_train_trans = X_train
        X_test_trans = X_test

    results = []
    for weight in ['uniform', 'distance']:
        for k in [1,2,4,8,16,32]:
            runLabel = label.format(k, weight)
            knn = KNeighborsClassifier(n_neighbors=k, weights=weight, n_jobs=-1)
            knn.fit(X_train_trans, y_train)
            metrics = evaluateResults(runLabel, y_test, knn.predict(X_test_trans))
            metrics['label'] = runLabel
            results.append(metrics)
    return results

def gridSearchKNNratio(label, X_train, y_train, X_test, y_test, transformer=None):
    """Evaluate a nearest-clean / nearest-malware distance-ratio classifier.

    For every test row the distance to the closest clean training row is
    divided by the distance to the closest malware training row. The row is
    predicted 0 when that ratio is below the threshold and 1 otherwise.
    Training rows with index below ``X_train.shape[0]//2`` are treated as
    clean and the rest as malware, so ``X_train`` must be ordered that way.

    Parameters
    ----------
    label : str
        Format string with one placeholder, filled with the ratio threshold.
    X_train, y_train, X_test, y_test
        Training and test features / labels.
    transformer : object with ``transform``, optional
        Already-fitted transformer applied to both feature sets first.

    Returns
    -------
    list of dict
        One ``evaluateResults`` dict per threshold, each extended with a
        ``'label'`` entry holding the formatted label.
    """
    # Compare against None explicitly: an estimator that defines __len__
    # may be falsy and would otherwise be silently skipped.
    if transformer is not None:
        X_train_trans = transformer.transform(X_train)
        X_test_trans = transformer.transform(X_test)
    else:
        X_train_trans = X_train
        X_test_trans = X_test

    # Derive the clean/malware boundary from the data actually passed in,
    # not from a notebook-level global.
    half = X_train.shape[0]//2
    findMalware = firstHigherThan(half)
    findClean = firstSmallerThan(half)

    knn = KNeighborsClassifier(n_neighbors=half, weights='uniform', n_jobs=-1)
    knn.fit(X_train_trans, y_train)
    # kneighbors returns neighbours ordered by increasing distance, so the
    # first hit of each kind is the nearest one.
    distances, indices = knn.kneighbors(X_test_trans)

    distancesClosest = []
    for D, neighbourIds in zip(distances, indices):
        ic, _ = findClean(neighbourIds)
        im, _ = findMalware(neighbourIds)
        # If all `half` neighbours are of one class, the other class lies
        # beyond every returned distance; inf models that (indexing with
        # None would instead produce an array and break the comparisons).
        distancesClosest.append((
            D[ic] if ic is not None else np.inf,
            D[im] if im is not None else np.inf,
        ))

    results = []
    for ratio in [.25,.5,.75,1,1.25,1.5,1.75,2,2.5]:
        # A zero malware distance means an exact malware match: force the
        # ratio above the threshold so the row is predicted 1.
        distanceRatios = [x/y if y>0 else ratio+1 for x,y in distancesClosest]
        y_pred = [0 if x<ratio else 1 for x in distanceRatios]

        runLabel = label.format(ratio)
        metrics = evaluateResults(runLabel, y_test, y_pred)
        metrics['label'] = runLabel
        results.append(metrics)
    return results

In [None]:
# Read the training and test feature tables with identical parser settings,
# then show their shapes as the cell output.
X_train, X_test = (
    pd.read_csv(csv_path, sep=',', low_memory=False, index_col=False, dtype=float)
    for csv_path in ('kovac/train68k_features.csv', 'kovac/test28k_features.csv')
)
(X_train.values.shape, X_test.shape)

In [None]:
# Display summary statistics for every column of the training features.
X_train.describe(include='all')

In [None]:
# Normalization statistics must come from the training set only: taking the
# mean from X_test leaks test information and is inconsistent with XTS, which
# is already computed on X_train.
XTM = X_train.mean(axis=0)
XTS = X_train.std(axis=0)
XTM.shape, XTS.shape

In [None]:
# Standardize both splits with the same per-column offset (XTM) and scale (XTS),
# then show summary statistics of the normalized training features.
X_train_norm, X_test_norm = (
    (split - XTM) / XTS
    for split in (X_train, X_test)
)
X_train_norm.describe(include='all')

In [None]:
# Labels are positional: the first half of each table is labelled 0 and the
# second half 1.
y_train, y_test = (
    np.concatenate((np.zeros(half), np.ones(half)))
    for half in (X_train.shape[0]//2, X_test.shape[0]//2)
)
(y_train.shape, y_test.shape)

In [None]:
# Accumulates the metric dicts returned by every grid-search call below.
allResults = []

# Euclidean

In [None]:
# k-NN grid on the raw features, no learned transformation.
allResults.extend(gridSearchKNN("{}NN-{}", X_train, y_train, X_test, y_test))

In [None]:
# k-NN grid on the normalized features, no learned transformation.
allResults.extend(gridSearchKNN("norm + {}NN-{}", X_train_norm, y_train, X_test_norm, y_test))

In [None]:
# Distance-ratio classifier on the raw features, no learned transformation.
allResults.extend(gridSearchKNNratio("NN-{}", X_train, y_train, X_test, y_test))

In [None]:
# Distance-ratio classifier on the normalized features, no learned transformation.
allResults.extend(gridSearchKNNratio("norm + NN-{}", X_train_norm, y_train, X_test_norm, y_test))

# LFDA

In [None]:
# Fraction of training rows used to fit LFDA.
subset = 0.75
# Seeded generator: an unseeded draw picks a different subsample on every
# kernel restart, so the LFDA results could not be reproduced.
train_mask = np.random.RandomState(42).choice([True, False], X_train.shape[0], p=[subset, 1-subset])

X_train_sample = X_train[train_mask]
X_train_sample_norm = X_train_norm[train_mask]
y_train_sample = y_train[train_mask]

X_train_sample.shape, X_train_sample_norm.shape

In [None]:
# Fit LFDA on the raw-feature training subsample; `.values` hands the
# underlying NumPy array to metric_learn.
lfda = LFDA()
lfda.fit(X_train_sample.values, y_train_sample)

In [None]:
# k-NN grid on raw features projected through the fitted LFDA transformer.
allResults.extend(gridSearchKNN("LFDA + {}NN-{}", X_train, y_train, X_test, y_test, lfda))

In [None]:
# Distance-ratio classifier on raw features projected through the fitted LFDA transformer.
allResults.extend(gridSearchKNNratio("LFDA + NN-{}", X_train, y_train, X_test, y_test, lfda))

# LFDA norm

In [None]:
# Fit a separate LFDA instance on the normalized training subsample.
lfda_norm = LFDA()
lfda_norm.fit(X_train_sample_norm.values, y_train_sample)

In [None]:
# Normalized features must go through the transformer fitted on normalized
# data (lfda_norm); `lfda` was fitted on the raw features.
allResults += gridSearchKNN("norm + LFDA + {}NN-{}", X_train_norm, y_train, X_test_norm, y_test, lfda_norm)

In [None]:
# Normalized features must go through the transformer fitted on normalized
# data (lfda_norm); `lfda` was fitted on the raw features.
allResults += gridSearchKNNratio("norm + LFDA + NN-{}", X_train_norm, y_train, X_test_norm, y_test, lfda_norm)

# LMNN

In [None]:
# Fraction of training rows used to fit LMNN.
subset = 0.25
# Seeded generator: an unseeded draw picks a different subsample on every
# kernel restart, so the LMNN results could not be reproduced.
train_mask = np.random.RandomState(42).choice([True, False], X_train.shape[0], p=[subset, 1-subset])

X_train_sample = X_train[train_mask]
X_train_sample_norm = X_train_norm[train_mask]
y_train_sample = y_train[train_mask]

X_train_sample.shape, X_train_sample_norm.shape

In [None]:
# Fit LMNN on the raw-feature training subsample; `.values` hands the
# underlying NumPy array to metric_learn.
lmnn = LMNN()
lmnn.fit(X_train_sample.values, y_train_sample)

In [None]:
# k-NN grid on raw features projected through the fitted LMNN transformer.
allResults.extend(gridSearchKNN("LMNN + {}NN-{}", X_train, y_train, X_test, y_test, lmnn))

In [None]:
# Distance-ratio classifier on raw features projected through the fitted LMNN transformer.
allResults.extend(gridSearchKNNratio("LMNN + NN-{}", X_train, y_train, X_test, y_test, lmnn))

# LMNN norm

In [None]:
# Fit a separate LMNN instance on the normalized training subsample.
lmnn_norm = LMNN()
lmnn_norm.fit(X_train_sample_norm.values, y_train_sample)

In [None]:
# Normalized features must go through the transformer fitted on normalized
# data (lmnn_norm); `lmnn` was fitted on the raw features.
allResults += gridSearchKNN("norm + LMNN + {}NN-{}", X_train_norm, y_train, X_test_norm, y_test, lmnn_norm)

In [None]:
# Normalized features must go through the transformer fitted on normalized
# data (lmnn_norm); `lmnn` was fitted on the raw features.
allResults += gridSearchKNNratio("norm + LMNN + NN-{}", X_train_norm, y_train, X_test_norm, y_test, lmnn_norm)

# NCA

In [None]:
# Fraction of training rows used to fit NCA.
subset = 0.015
# Seeded generator: an unseeded draw picks a different subsample on every
# kernel restart, so the NCA results could not be reproduced.
train_mask = np.random.RandomState(42).choice([True, False], X_train.shape[0], p=[subset, 1-subset])

X_train_sample = X_train[train_mask]
X_train_sample_norm = X_train_norm[train_mask]
y_train_sample = y_train[train_mask]

X_train_sample.shape, X_train_sample_norm.shape

In [None]:
# Fit NCA on the raw-feature training subsample; `.values` hands the
# underlying NumPy array to metric_learn.
nca = NCA()
nca.fit(X_train_sample.values, y_train_sample)

In [None]:
# k-NN grid on raw features projected through the fitted NCA transformer.
allResults.extend(gridSearchKNN("NCA + {}NN-{}", X_train, y_train, X_test, y_test, nca))

In [None]:
# Distance-ratio classifier on raw features projected through the fitted NCA transformer.
allResults.extend(gridSearchKNNratio("NCA + NN-{}", X_train, y_train, X_test, y_test, nca))

# NCA norm

In [None]:
# Fit a separate NCA instance on the normalized training subsample.
nca_norm = NCA()
nca_norm.fit(X_train_sample_norm.values, y_train_sample)

In [None]:
# Normalized features must go through the transformer fitted on normalized
# data (nca_norm); `nca` was fitted on the raw features.
allResults += gridSearchKNN("norm + NCA + {}NN-{}", X_train_norm, y_train, X_test_norm, y_test, nca_norm)

In [None]:
# Normalized features must go through the transformer fitted on normalized
# data (nca_norm); `nca` was fitted on the raw features.
allResults += gridSearchKNNratio("norm + NCA + NN-{}", X_train_norm, y_train, X_test_norm, y_test, nca_norm)

# Results

In [None]:
# Show the collected metrics as one table (one row per experiment) instead of
# dumping the raw list of dicts.
pd.DataFrame(allResults)