In [1]:
import pandas as pd
import glob
import os
import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
df = pd.DataFrame(columns=["sha256","family", "total", "e8", "c3", "cb", "eb", "e9"])

In [3]:
mal_family = ["adware", "crypto_miner", "downloader", "dropper", "file_infector",
              "flooder", "installer", "packed", "ransomware", "spyware", "worm"]


def features_extraction(fileName):
    fileName = fileName.replace("\\", "/")
    sha256 = fileName.split("/")[-1]
    family = fileName.split("/")[1]
    with open(fileName, 'rb') as f:
        fileContents = f.read()
        i=0
        e8 = 0
        c3 = 0
        cb = 0
        eb = 0
        e9 = 0
        for b in fileContents:
            # hexdata = binascii.hexlify(f.read(),"-")
            # e8 = hexdata.count(b"e8")
            # c3 = hexdata.count(b"c3")
            # cb = hexdata.count(b"cb")
            # eb = hexdata.count(b"eb")
            # e9 = hexdata.count(b"e9")
            opcode = "{0:0{1}x}".format(b, 2)
            i += 1
            if opcode == 'e8':
                e8 = e8 + 1
            if opcode == 'c3':
                c3 = c3 + 1
            if opcode == 'cb':
                cb = cb + 1
            if opcode == 'eb':
                eb = eb + 1
            if opcode == 'e9':
                e9 = e9 + 1
    df.loc[df.shape[0]] = [sha256, family, i, e8, c3, cb, eb, e9]


failed_reads = []


def exec_malware_family(current_family):
    fp = glob.glob(f'./{current_family}/**')
    with tqdm.tqdm(desc=f"Reading {current_family}", total=len(fp)) as pbar:
        for fileName in glob.iglob(f'./{current_family}/**', recursive=True):
            if os.path.isfile(fileName):
                features_extraction(fileName)
                pbar.update(1)


def exec_malware_family_con(current_family):
    fp = glob.glob(f'./{current_family}/**')
    with tqdm.tqdm(desc=f"Reading {current_family}", total=len(fp)) as pbar:
        with ThreadPoolExecutor(max_workers=32) as executor:
            futures = {
                executor.submit(features_extraction, file): file for file in fp
            }
            for future in as_completed(futures):
                if future.exception():
                    failed_downloads.append(futures[future])
                pbar.update(1)
                print(df)


In [4]:
for family in mal_family:
    exec_malware_family(family)
# exec_malware_family(mal_family[0])


Reading adware: 100%|████████████████████████████████████████████████████████████████| 997/997 [07:21<00:00,  2.26it/s]
Reading crypto_miner: 100%|██████████████████████████████████████████████████████████| 993/993 [22:34<00:00,  1.36s/it]
Reading downloader: 100%|██████████████████████████████████████████████████████████| 1000/1000 [04:26<00:00,  3.76it/s]
Reading dropper: 100%|███████████████████████████████████████████████████████████████| 998/998 [03:14<00:00,  5.12it/s]
Reading file_infector: 100%|███████████████████████████████████████████████████████| 1000/1000 [02:51<00:00,  5.82it/s]
Reading flooder: 100%|███████████████████████████████████████████████████████████████| 999/999 [02:27<00:00,  6.76it/s]
Reading installer: 100%|█████████████████████████████████████████████████████████████| 974/974 [12:33<00:00,  1.29it/s]
Reading packed: 100%|████████████████████████████████████████████████████████████████| 998/998 [03:42<00:00,  4.48it/s]
Reading ransomware: 100%|███████████████

In [5]:
df.shape

(10958, 8)

In [6]:
df.to_csv("FE_extract_raw_count.csv")

In [7]:
df

Unnamed: 0,sha256,family,total,e8,c3,cb,eb,e9
0,00011e3c72f77fd7cebaf0a59c411a5ad5dd4778d30fa0...,adware,222592,843,843,822,907,866
1,0014f640f54d304a7c34797ae7dc81cde9215ada4a04a6...,adware,198733,750,734,762,829,839
2,005de2971db4c3704264fd0771d3a32255e468e2972dc9...,adware,235432,1029,868,948,968,987
3,00e29c9e76baeeb012be5c9c3d4f8fc1477e70619108dd...,adware,234597,927,887,931,956,1001
4,011518b58379d986a4fad32065db413516b5ad561bab9a...,adware,235425,1036,946,905,878,994
...,...,...,...,...,...,...,...,...
10953,ff4783ffec54af85bd39258d5c6f295e54f6bd6c64da09...,worm,46049,195,192,149,224,177
10954,ff80b4e5c8aebda9ae9f8d069fbb1950ce41c0ff4c5f25...,worm,223208,856,866,884,930,901
10955,ff99673762d6020d4ce5ad9bec2b1044262a9dd575f32b...,worm,223147,893,883,849,907,846
10956,ffa44315ce0bd3439edecfec4ce458b4f6fa4a6df8a6bf...,worm,222817,831,837,901,940,792
