In [69]:
import json
from pathlib import Path

import torch
import numpy as np
import pefile
import pandas as pd
from tqdm import tqdm

from sklearn.metrics import accuracy_score, f1_score

tqdm.pandas()

In [11]:
# Directory layout, anchored at the resolved current working directory.
NOTEBOOKS_DIR = Path().resolve()
PROJECT_DIR = NOTEBOOKS_DIR.parent
DATA_DIR = PROJECT_DIR.joinpath('data')
DATASET_DIR = DATA_DIR.joinpath('pe-machine-learning-dataset')

# Sub-folders of the dataset directory.
SAMPLES_WITH_REPORTS_DIR = DATASET_DIR.joinpath('samples_with_reports')
SAMPLES_PICTURES = DATASET_DIR.joinpath('samples_pictures')
# Only the last path component is created here; DATASET_DIR must already exist.
SAMPLES_PICTURES.mkdir(exist_ok=True)
REPORTS_DIR = DATASET_DIR.joinpath('reports')

# Fixed seed value.
RANDOM_STATE = 741

In [40]:
# Load the evaluation frame from DATA_DIR / 'df_test.parquet'.
df = pd.read_parquet(DATA_DIR / 'df_test.parquet')

In [41]:
# (rows, columns) of the loaded frame.
df.shape

(7398, 7)

In [44]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,filename,sha256,count_malicious,count_suspicious,type_tags,label,label_id
0,230628,8ffd165e17716d50ffd6e3c81297d0e7ac241594d004b7...,0,0,"[executable, windows, win32, pe, pedll]",benign,0
1,50231,2b6a879deb2175b5e2f005e2ecceaaecd2dc7631ef0048...,52,0,"[executable, windows, win32, pe, peexe]",malware,1
2,239024,a9c299434ddb0ccaf57070d91a6a964b5e342c78c4b3ad...,0,0,"[executable, windows, win32, pe, pedll]",benign,0
3,503422,8527d2ac42d3ce800f039c71820442b74c1cd2c4935301...,63,0,"[executable, windows, win32, pe, peexe]",malware,1
4,213954,68cc5730c844adfeb81313cf6195806a88b24c24d979b6...,0,0,"[executable, windows, win32, pe, pedll]",benign,0


In [55]:
# NOTE(review): `engines` is not assigned in any cell visible here — confirm the
# defining cell still exists, otherwise this fails on Restart Kernel -> Run All.
engines

['Bkav',
 'Lionic',
 'Elastic',
 'MicroWorld-eScan',
 'ClamAV',
 'FireEye',
 'CAT-QuickHeal',
 'ALYac',
 'Cylance',
 'Zillya',
 'Sangfor',
 'K7AntiVirus',
 'Alibaba',
 'K7GW',
 'CrowdStrike',
 'BitDefenderTheta',
 'VirIT',
 'Cyren',
 'SymantecMobileInsight',
 'Symantec',
 'tehtris',
 'ESET-NOD32',
 'Zoner',
 'APEX',
 'Paloalto',
 'Cynet',
 'Kaspersky',
 'BitDefender',
 'NANO-Antivirus',
 'SUPERAntiSpyware',
 'Avast',
 'Tencent',
 'Trustlook',
 'TACHYON',
 'Emsisoft',
 'Baidu',
 'F-Secure',
 'DrWeb',
 'VIPRE',
 'TrendMicro',
 'McAfee-GW-Edition',
 'Trapmine',
 'CMC',
 'Sophos',
 'Ikarus',
 'GData',
 'Jiangmin',
 'Webroot',
 'Avira',
 'Antiy-AVL',
 'Gridinsoft',
 'Xcitium',
 'Arcabit',
 'ViRobot',
 'ZoneAlarm',
 'Avast-Mobile',
 'Microsoft',
 'Google',
 'BitDefenderFalx',
 'AhnLab-V3',
 'Acronis',
 'McAfee',
 'MAX',
 'VBA32',
 'Malwarebytes',
 'Panda',
 'TrendMicro-HouseCall',
 'Rising',
 'Yandex',
 'SentinelOne',
 'MaxSecure',
 'Fortinet',
 'AVG',
 'Cybereason',
 'DeepInstinct']

In [73]:
def get_stats(row, engine_name = 'Bkav', reports_dir=None):
    """Pair one engine's verdict for a sample with the sample's ground-truth label.

    Reads ``<reports_dir>/<row.sha256>.json`` and looks up ``engine_name`` in
    ``report['files']['data']['attributes']['last_analysis_results']``.

    Parameters
    ----------
    row
        Object exposing ``sha256`` and ``label_id`` attributes (for example a
        DataFrame row passed by ``df.apply(..., axis=1)``).
    engine_name : str
        Key of the engine inside ``last_analysis_results``.
    reports_dir : path-like, optional
        Directory holding the ``<sha256>.json`` reports. Defaults to the
        module-level ``REPORTS_DIR``.

    Returns
    -------
    pd.Series
        ``category_id`` (0 for 'harmless'/'undetected', 1 for
        'suspicious'/'malicious') and ``label_id`` (copied from ``row``).
        Both entries are NaN when the engine is absent from the report or its
        category is not one of those four verdicts (e.g. 'type-unsupported',
        'timeout', 'confirmed-timeout', 'failure', a missing 'category' key,
        or any unrecognised value), so callers can ``dropna()`` such rows.

    Raises
    ------
    FileNotFoundError
        If the report file does not exist.
    KeyError
        If the report lacks the ``files/data/attributes/last_analysis_results`` path.
    """
    if reports_dir is None:
        reports_dir = REPORTS_DIR

    # JSON is UTF-8; do not depend on the platform's locale default encoding.
    with open(Path(reports_dir) / f"{row.sha256}.json", 'r', encoding='utf-8') as file:
        report = json.load(file)
    stats = report['files']['data']['attributes']['last_analysis_results']

    value = stats.get(engine_name)
    category = value.get('category') if value is not None else None

    # Only real verdicts are scored; everything else maps to None and is
    # reported as NaN instead of raising KeyError in the middle of df.apply.
    category_id = {
        'harmless': 0,
        'undetected': 0,
        'suspicious': 1,
        'malicious': 1,
    }.get(category)
    if category_id is None:
        return pd.Series({"category_id": np.nan, "label_id": np.nan})

    return pd.Series({"category_id": category_id, "label_id": row.label_id})

In [74]:
def make_stats(engine):
    """Print accuracy and macro-F1 of one engine's verdicts against ``label_id``.

    Applies ``get_stats`` to every row of the module-level ``df``, drops rows
    where the engine produced no usable verdict (NaN), and scores the rest.

    Parameters
    ----------
    engine : str
        Engine name forwarded to ``get_stats``.

    Returns
    -------
    None
        Results are printed as 'Engine:', 'Accuracy:' and 'F1 Score:' lines
        followed by a blank line.

    Notes
    -----
    ``get_stats`` opens the sample's report file on every call, so each
    invocation of this function re-reads one report per row of ``df``.
    """
    df_stats = df.apply(lambda x: get_stats(x, engine), axis=1)
    df_stats = df_stats.dropna()

    if df_stats.empty:
        # No row has a usable verdict for this engine. Scoring empty arrays
        # only yields nan plus numpy RuntimeWarnings, so report nan directly.
        acc = f1 = float('nan')
    else:
        acc = accuracy_score(df_stats.label_id, df_stats.category_id)
        f1 = f1_score(df_stats.label_id, df_stats.category_id, average='macro')

    print('Engine:', engine)
    print('Accuracy:', acc)
    print('F1 Score:', f1)
    print()

In [75]:
# Print accuracy / macro-F1 for every name in `engines`, one engine at a time.
for engine in engines:
    make_stats(engine)

Engine: Bkav
Accuracy: 0.8669705170203674
F1 Score: 0.8664531501967926

Engine: Lionic
Accuracy: 0.7742448330683624
F1 Score: 0.766283524407078

Engine: Elastic
Accuracy: 0.9923332481472016
F1 Score: 0.9923324864820803

Engine: MicroWorld-eScan
Accuracy: 0.9841226760754512
F1 Score: 0.9840714238880366

Engine: ClamAV
Accuracy: 0.8171924507507922
F1 Score: 0.815166380216274

Engine: FireEye
Accuracy: 0.9976929530201343
F1 Score: 0.9976751706066529

Engine: CAT-QuickHeal
Accuracy: 0.8978954514596063
F1 Score: 0.8978295852203768

Engine: ALYac
Accuracy: 0.9218528438632425
F1 Score: 0.9210576890285125

Engine: Cylance
Accuracy: 0.9902489987811248
F1 Score: 0.9902434466472997

Engine: Zillya
Accuracy: 0.9309117865085248
F1 Score: 0.9307198748043819

Engine: Sangfor
Accuracy: 0.9811710677382319
F1 Score: 0.9811233327046573

Engine: K7AntiVirus
Accuracy: 0.9801109457448248
F1 Score: 0.9800525203563611

Engine: Alibaba
Accuracy: 0.7977291159772911
F1 Score: 0.7909270432059716

Engine: K7GW
Acc

  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Engine: BitDefenderFalx
Accuracy: nan
F1 Score: nan

Engine: AhnLab-V3
Accuracy: 0.9588381057268722
F1 Score: 0.9588255198379163

Engine: Acronis
Accuracy: 0.7578757875787578
F1 Score: 0.7458574869872918

Engine: McAfee
Accuracy: 0.9956043956043956
F1 Score: 0.9955800769540173

Engine: MAX
Accuracy: 0.9689534301452178
F1 Score: 0.968828681755064

Engine: VBA32
Accuracy: 0.9671061574011146
F1 Score: 0.9670465104054455

Engine: Malwarebytes
Accuracy: 0.8694459681843115
F1 Score: 0.8690072568863809

Engine: Panda
Accuracy: 0.9548045602605864
F1 Score: 0.9547631218472687

Engine: TrendMicro-HouseCall
Accuracy: 0.885344239945467
F1 Score: 0.8852044084815125

Engine: Rising
Accuracy: 0.9542573711766327
F1 Score: 0.954189192261834

Engine: Yandex
Accuracy: 0.9308689390302448
F1 Score: 0.9297605314658648

Engine: SentinelOne
Accuracy: 0.9595660749506904
F1 Score: 0.9593611805140116

Engine: MaxSecure
Accuracy: 0.9015765765765765
F1 Score: 0.8994815186399387

Engine: Fortinet
Accuracy: 0.986037