In [None]:
import os
import time
from os import listdir

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm import tqdm

In [None]:
def embed_bytes(byte):
    """Turn a byte value into an 8-element vector with one entry per bit.

    The value is rendered as a zero-padded binary string, most significant
    bit first. Every "1" character maps to +1/16 and every other character
    maps to -1/16.
    """
    magnitude = float(1) / 16
    bits = "{0:08b}".format(byte)
    # Only the first eight characters are ever consulted.
    return np.array([magnitude if bit == "1" else -magnitude for bit in bits[:8]])

In [None]:
# Quick look at the bit embedding for two sample byte values.
byte1, byte2 = 255, 1
for sample_byte in (byte1, byte2):
    print(embed_bytes(sample_byte))

In [None]:
# Each entry pairs a dataset directory with the integer label assigned to every
# file found in it (0 for the "Benign" folder, 1 for the "ADRD" folder).
directories_with_labels = [(r"C:\Users\sridh\Desktop\smallDataset_opseq\malware\Benign", 0), (r"C:\Users\sridh\Desktop\smallDataset_opseq\malware\ADRD", 1)]
list_of_samples = []
labels = []
for dataset_path, label in directories_with_labels:
    # listdir() returns entries in arbitrary order; sorting keeps the sample
    # order stable so the seeded train/validation split is reproducible.
    for file in sorted(listdir(dataset_path)):
        file_path = os.path.join(dataset_path, file)
        # Skip sub-directories: only regular files can be opened and read as samples.
        if not os.path.isfile(file_path):
            continue
        list_of_samples.append(file_path)
        labels.append(label)
def read_file(file_path):
    """Return the complete contents of ``file_path`` as a bytes object."""
    with open(file_path, mode="rb") as handle:
        contents = handle.read()
    return contents




In [None]:
# Build the dense feature tensor X of shape (num_samples, 8, max_size): for
# each file, column i holds the 8-element bit embedding of byte i. Files
# shorter than max_size leave the remaining columns at zero; longer files are
# truncated to their first max_size bytes.
max_size = 15000
num_samples = len(list_of_samples)
X = np.zeros((num_samples, 8, max_size))
Y = np.asarray(labels)
for file_num, file in enumerate(tqdm(list_of_samples)):
    sample_byte_sequence = read_file(file)[:max_size]
    if not sample_byte_sequence:
        # Empty file: its row stays all zeros.
        continue
    byte_values = np.frombuffer(sample_byte_sequence, dtype=np.uint8)
    # unpackbits yields the bits most-significant first, matching the
    # "{0:08b}" ordering used by embed_bytes; result shape is (n_bytes, 8).
    bit_matrix = np.unpackbits(byte_values[:, np.newaxis], axis=1)
    # Map bit 1 -> +1/16 and bit 0 -> -1/16, the same values embed_bytes
    # produces, but for the whole file at once instead of byte by byte.
    X[file_num, :, : len(byte_values)] = (bit_matrix.T * 2.0 - 1.0) / 16
   

In [None]:
# Sanity check: X was allocated as (num_samples, 8, max_size).
print(X.shape)

In [None]:

# Restrict CUDA to the device with index 1. This must be set before CUDA is
# first initialised for it to take effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Use the GPU only when one is actually visible. A hard-coded True makes every
# .cuda() call fail on machines without a usable CUDA device.
use_gpu = torch.cuda.is_available()
use_cpu = 32  # passed to the DataLoaders as num_workers
display_step = 10  # print running training metrics every N batches
test_step = 100  # run a validation pass every N training steps
learning_rate = 0.0001  # Adam learning rate
max_step = 500  # stop once this many training steps have run
batch_size = 128
first_n_byte = 100000  # bytes read per file; also the model's input_length
window_size = 500  # convolution kernel size and stride

In [None]:
# Hold out 20% of the samples for validation; the fixed random_state makes the
# split repeatable for a given sample order.
train, valid, train_label, valid_label = train_test_split(
    list_of_samples, labels, test_size=0.2, random_state=100
)

trainset = pd.DataFrame({"list_of_samples": train, "labels": train_label})
validset = pd.DataFrame({"list_of_samples": valid, "labels": valid_label})

label_path= r"C:/Users/sridh/Desktop/smallDataset_opseq/malware/"

# Persist both splits as header-less CSV files of (path, label) rows.
for split_frame, csv_name in (
    (trainset, "example-train-label.csv"),
    (validset, "example-valid-label.csv"),
):
    split_frame.to_csv(
        label_path + csv_name, index=False, header=False, encoding="utf-8"
    )



def write_pred(test_pred,test_idx,file_path):
    """Write one ``IDX,prediction`` line per sample to ``file_path``.

    ``test_pred`` is a list of per-batch prediction lists; it is flattened and
    paired positionally with ``test_idx``. Each identifier is upper-cased and
    only the first element of each prediction is written.
    """
    flat_predictions = []
    for batch_predictions in test_pred:
        flat_predictions.extend(batch_predictions)

    with open(file_path,'w') as out_file:
        for sample_id, prediction in zip(test_idx, flat_predictions):
            out_file.write(sample_id.upper()+','+str(prediction[0]) + '\n')

# Dataset preparation
class ExeDataset(Dataset):
    """Map-style dataset that yields fixed-length byte-token arrays with labels.

    Args:
        fp_list: file names, appended to ``data_path`` to form each path.
        data_path: string prefix prepended to every entry of ``fp_list``
            (plain concatenation, so include a trailing separator; use ``""``
            when ``fp_list`` already contains full paths).
        label_list: labels aligned positionally with ``fp_list``.
        first_n_byte: number of leading bytes kept per file; also the length
            of every returned token array.
    """

    def __init__(self, fp_list, data_path, label_list, first_n_byte=2000000):
        self.fp_list = fp_list
        self.data_path = data_path
        self.label_list = label_list
        self.first_n_byte = first_n_byte

    def __len__(self):
        return len(self.fp_list)

    def _read_tokens(self, file_path):
        """Read up to ``first_n_byte`` bytes and return them as padded tokens."""
        with open(file_path, 'rb') as f:
            raw = f.read(self.first_n_byte)
        # Every byte value is shifted by +1 so that 0 is free to act as padding.
        tokens = np.zeros(self.first_n_byte, dtype=int)
        tokens[:len(raw)] = np.frombuffer(raw, dtype=np.uint8).astype(int) + 1
        return tokens

    def __getitem__(self, idx):
        """Return ``(tokens, label)`` for sample ``idx`` as two NumPy arrays.

        Raises:
            FileNotFoundError: if neither the listed name nor its lower-cased
                form exists under ``data_path``.
        """
        try:
            tokens = self._read_tokens(self.data_path + self.fp_list[idx])
        except FileNotFoundError:
            # Retry with the lower-cased name. Only a missing file triggers
            # the retry; any other error is raised as-is instead of being
            # hidden behind a second attempt.
            tokens = self._read_tokens(self.data_path + self.fp_list[idx].lower())

        return tokens, np.array([self.label_list[idx]])




# The "list_of_samples" columns already hold full file paths (they were built
# with os.path.join), and ExeDataset forms each path as data_path + name.
# The prefix must therefore be the empty string; passing the list `train`
# made every item lookup fail with a list + str TypeError.
# NOTE(review): with num_workers > 0 on Windows/Jupyter, worker processes may
# be unable to load a Dataset class defined inside the notebook. Confirm, or
# set use_cpu = 0.
trainloader = DataLoader(
    ExeDataset(
        list(trainset["list_of_samples"]), "", list(trainset["labels"]), first_n_byte
    ),
    batch_size=batch_size,
    shuffle=False,
    num_workers=use_cpu,
    pin_memory=True,
)
validloader = DataLoader(
    ExeDataset(
        list(validset["list_of_samples"]), "", list(validset["labels"]), first_n_byte
    ),
    batch_size=batch_size,
    shuffle=False,
    num_workers=use_cpu,
    pin_memory=True,
)



In [None]:
class MalConv(nn.Module):
    """Gated 1-D convolutional classifier over sequences of byte tokens.

    Tokens are embedded into 8 channels. The first four channels feed a value
    convolution and the last four feed a sigmoid-gated convolution; their
    product is max-pooled and passed through two linear layers.

    Args:
        input_length: expected number of tokens per sample; together with
            ``window_size`` it sets the pooling kernel size.
        window_size: kernel size and stride of both convolutions.
    """

    def __init__(self,input_length=2000000,window_size=500):
        super().__init__()

        # 257 token ids mapped to 8-dimensional vectors; id 0 is the padding index.
        self.embed = nn.Embedding(257, 8, padding_idx=0)

        conv_options = dict(kernel_size=window_size, stride=window_size, bias=True)
        self.conv_1 = nn.Conv1d(4, 128, **conv_options)
        self.conv_2 = nn.Conv1d(4, 128, **conv_options)

        self.pooling = nn.MaxPool1d(int(input_length/window_size))

        self.fc_1 = nn.Linear(128,128)
        self.fc_2 = nn.Linear(128,1)

        self.sigmoid = nn.Sigmoid()

    def forward(self,x):
        """Return one raw (pre-sigmoid) score per sample for token batch ``x``."""
        # Embed, then put the channel axis before the sequence axis for Conv1d.
        embedded = self.embed(x).transpose(-1, -2)

        value_channels = embedded.narrow(-2, 0, 4)
        gate_channels = embedded.narrow(-2, 4, 4)
        gated = self.conv_1(value_channels) * self.sigmoid(self.conv_2(gate_channels))

        pooled = self.pooling(gated).view(-1,128)
        return self.fc_2(self.fc_1(pooled))

In [None]:
# Build the model, loss and optimizer. The model emits raw logits, so the loss
# is BCEWithLogitsLoss and `sigmoid` is only applied to obtain probabilities.
malconv = MalConv(input_length=first_n_byte,window_size=window_size)
bce_loss = nn.BCEWithLogitsLoss()
adam_optim = optim.Adam([{'params':malconv.parameters()}],lr=learning_rate)
sigmoid = nn.Sigmoid()

if use_gpu:
    malconv = malconv.cuda()
    bce_loss = bce_loss.cuda()
    sigmoid = sigmoid.cuda()

# Output artifacts. These names were used below but never defined, so the cell
# failed with NameError on a fresh kernel.
# NOTE(review): the file names are placeholders placed next to the label CSVs;
# adjust them to the intended locations.
log_file_path = label_path + "malconv-train-log.csv"
chkpt_acc_path = label_path + "malconv-best-acc.pt"
pred_path = label_path + "malconv-valid-pred.csv"
# validloader does not shuffle, so its predictions follow validset row order.
valid_idx = list(validset["list_of_samples"])

step_msg = 'step-{}-loss-{:.6f}-acc-{:.4f}-time-{:.2f}'
valid_msg = 'step-{}-tr_loss-{:.6f}-tr_acc-{:.4f}-val_loss-{:.6f}-val_acc-{:.4f}'
log_msg = '{}, {:.6f}, {:.4f}, {:.6f}, {:.4f}, {:.2f}'
history = {}
history['tr_loss'] = []
history['tr_acc'] = []


def _to_device(tensor):
    """Move a batch tensor to the GPU when use_gpu is set; otherwise return it as is."""
    return tensor.cuda() if use_gpu else tensor


def _correct_predictions(logits, targets):
    """Return per-sample booleans: does the 0.5-thresholded probability match the target?"""
    probabilities = sigmoid(logits).detach().cpu().numpy()
    return list(targets.cpu().numpy().astype(int)==(probabilities+0.5).astype(int))


# Without this guard an empty training loader would never advance total_step
# and the while-loop below would spin forever.
if len(trainloader) == 0:
    raise ValueError('trainloader has no batches; check the dataset directories')

valid_best_acc = 0.0
total_step = 0
step_cost_time = 0

# The context manager guarantees the log file is closed, even if training fails.
with open(log_file_path,'w') as log:
    log.write('step,tr_loss, tr_acc, val_loss, val_acc, time\n')

    while total_step < max_step:

        # ---- Training: run until the loader is exhausted or test_step is hit ----
        malconv.train()
        for step,batch_data in enumerate(trainloader):
            start = time.time()

            adam_optim.zero_grad()

            # Plain tensors are used directly; the deprecated Variable wrapper is not needed.
            exe_input = _to_device(batch_data[0]).long()
            label = _to_device(batch_data[1]).float()

            pred = malconv(exe_input)
            loss = bce_loss(pred,label)
            loss.backward()
            adam_optim.step()

            # The loss is a 0-dim tensor: .item() extracts the Python float
            # (indexing its NumPy view with [0] raises IndexError).
            history['tr_loss'].append(loss.item())
            history['tr_acc'].extend(_correct_predictions(pred, label))

            step_cost_time = time.time()-start

            if (step+1)%display_step == 0:
                print(step_msg.format(total_step,np.mean(history['tr_loss']),
                                      np.mean(history['tr_acc']),step_cost_time),end='\r',flush=True)
            total_step += 1

            # Pause training for a validation pass every test_step steps.
            if total_step%test_step ==0:
                break

        # ---- Validation: no gradients are needed, so autograd is disabled ----
        history['val_loss'] = []
        history['val_acc'] = []
        history['val_pred'] = []

        malconv.eval()
        with torch.no_grad():
            for val_batch_data in validloader:
                exe_input = _to_device(val_batch_data[0]).long()
                label = _to_device(val_batch_data[1]).float()

                pred = malconv(exe_input)
                loss = bce_loss(pred,label)

                history['val_loss'].append(loss.item())
                history['val_acc'].extend(_correct_predictions(pred, label))
                history['val_pred'].append(list(sigmoid(pred).cpu().numpy()))

        print(log_msg.format(total_step, np.mean(history['tr_loss']), np.mean(history['tr_acc']),
                        np.mean(history['val_loss']), np.mean(history['val_acc']),step_cost_time),
              file=log,flush=True)

        print(valid_msg.format(total_step,np.mean(history['tr_loss']),np.mean(history['tr_acc']),
                               np.mean(history['val_loss']),np.mean(history['val_acc'])))
        # Keep the checkpoint and predictions of the best validation accuracy seen so far.
        if valid_best_acc < np.mean(history['val_acc']):
            valid_best_acc = np.mean(history['val_acc'])
            torch.save(malconv,chkpt_acc_path)
            print('Checkpoint saved at',chkpt_acc_path)
            write_pred(history['val_pred'],valid_idx,pred_path)
            print('Prediction saved at', pred_path)

        # Training metrics are averaged per validation interval, so reset them.
        history['tr_loss'] = []
        history['tr_acc'] = []