In [17]:
import lightgbm as lgb
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd
from dataset_builder import DatasetReader

In [18]:
dataset = DatasetReader('dataset')

trojan = dataset.read_data['trojan']
rootkit = dataset.read_data['rootkit']
backdoor = dataset.read_data['backdoor']
benign = dataset.read_data['benign']

X = trojan+rootkit+backdoor+benign
Y = []

for i in range(len(benign)):
    Y.append(0)
for i in range(len(trojan)):
    Y.append(1)
for i in range(len(rootkit)):
    Y.append(2)
for i in range(len(backdoor)):
    Y.append(3)

Data = np.array(X, dtype=np.float32)
Labels = np.asarray(Y, dtype=np.float32)
print("Shape of data: ", Data.shape)
print("Shape of labels: ", Labels.shape)

Shape of data:  (564, 2479)
Shape of labels:  (564,)


In [19]:
X = pd.DataFrame(Data)
Y = pd.DataFrame(Labels)
print(Y[0].unique())
frames = [X, Y]
DATA = pd.concat(frames, axis = 1)
Y = DATA.iloc[:, -1]
print(Y.shape)
X = DATA.iloc[:, :-1]
print(np.unique(Y))
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

[0. 1. 2. 3.]
(564,)
[0. 1. 2. 3.]


In [20]:
d_train = lgb.Dataset(x_train, label = y_train)
params = {}
params['learning_rate'] = 0.05
params['boosting_type'] = 'gbdt'
params['objective'] = 'multiclass'
params['num_class'] = 4
params['metric'] = 'multi_logloss'
params['sub_feature'] = 0.3
params['num_leaves'] = 15
params['min_data'] = 95
params['max_depth'] = 15
params['device'] = 'cpu'

clf = lgb.train(params, d_train, 100)


y_pred = clf.predict(x_test)
best_preds = [np.argmax(line) for line in y_pred]

In [21]:
np.unique(best_preds)

array([0, 1, 2, 3], dtype=int64)

In [22]:
accuracy_score(y_test, best_preds)

0.7079646017699115

In [23]:
clf.save_model('without_benign.mdl')

<lightgbm.basic.Booster at 0x2840852ef98>

In [24]:
lgb.Booster(model_file='model.mdl')

<lightgbm.basic.Booster at 0x284086830b8>

In [25]:
from application import get_pred

In [26]:
data = open('ngrok', 'rb')

In [27]:
f1 = open("samples/backd1.bin", 'rb')
tr1 = f1.read()
f2 = open("samples/tr2.bin", 'rb')
tr2 = f2.read()
f3 = open("samples/s.exe", 'rb')
rt = f3.read()
data = [tr1, tr2]

In [None]:
from datasketch import MinHash, MinHashLSH
import sys
from nltk import ngrams

data = [tr1, tr2]

# Create an MinHashLSH index optimized for Jaccard threshold 0.5,
# that accepts MinHash objects with 128 permutations functions
lsh = MinHashLSH(threshold=0.9, num_perm=128)

# Create MinHash objects
minhashes = {}
for c, i in enumerate(data):
  minhash = MinHash(num_perm=128)
  for d in ngrams(i, 3):
    minhash.update("".join(str(d)).encode('utf-8'))
  lsh.insert(c, minhash)
  minhashes[c] = minhash

for i in range(len(minhashes.keys())):
  result = lsh.query(minhashes[i])
  print("Candidates with Jaccard similarity > 0.5 for input", i, ":", result)

In [13]:
from application import get_pred, LABEL_MAP

In [15]:
data = open('samples/trojan2.bin', 'rb').read()
pred = get_pred(data)
print(pred)
print(LABEL_MAP[pred])

lief error:  This file is not a PE binary
1
1
trojan


In [16]:
data = open('samples/s.exe', 'rb').read()
pred = get_pred(data)
print(pred)
print(LABEL_MAP[pred])

1
1
trojan
