In [1]:
import json
import hashlib
from pathlib import Path

import pandas as pd

In [2]:
NOTEBOOKS_DIR = Path().resolve()
PROJECT_DIR = NOTEBOOKS_DIR.parent
DATA_DIR = PROJECT_DIR / 'data'
DATASET_DIR = DATA_DIR / 'pe-machine-learning-dataset'
REPORTS_DIR = DATASET_DIR / 'reports'
SAMPLES_DIR = DATASET_DIR / 'samples'
RANDOM_STATE = 741

In [3]:
df = pd.read_parquet(DATA_DIR / 'dataset_with_reports.parquet')

In [4]:
df = df[['id', 'sha256']]

In [5]:
df.rename(columns={"id": "filename"}, inplace=True)

In [6]:
df.reset_index(drop=True, inplace=True)

In [7]:
def get_count_malicious_positives(hash: str):
    with open(REPORTS_DIR / f'{hash}.json', 'r') as file:
        report = json.load(file)
    return report["files"]["data"]["attributes"]["last_analysis_stats"]["malicious"]

def get_count_suspicious_positives(hash: str):
    with open(REPORTS_DIR / f'{hash}.json', 'r') as file:
        report = json.load(file)
    return report["files"]["data"]["attributes"]["last_analysis_stats"]["suspicious"]

def get_type_tags(hash: str):
    with open(REPORTS_DIR / f'{hash}.json', 'r') as file:
        report = json.load(file)
    return report["files"]["data"]["attributes"]["type_tags"]

In [8]:
df["count_malicious"] = df.sha256.map(get_count_malicious_positives)
df["count_suspicious"] = df.sha256.map(get_count_suspicious_positives)
df["type_tags"] = df.sha256.map(get_type_tags)

In [9]:
df['type_tags'].value_counts()

type_tags
[executable, windows, win32, pe, peexe]    26139
[executable, windows, win32, pe, pedll]    13926
[executable, dos, mz]                         34
[]                                            26
[executable, windows, win16, ne, neexe]        1
Name: count, dtype: int64

In [10]:
def filter_type_tags(tags):
    if not tags:
        return False

    if tags[-1] == 'peexe' or tags[-1] == 'pedll':
        return True
    return False

In [11]:
df = df[df['type_tags'].map(filter_type_tags)]

In [12]:
df.head(5)

Unnamed: 0,filename,sha256,count_malicious,count_suspicious,type_tags
0,237355,a1e965951f828e0952907baec3d5f41b22d3e4e617da09...,0,0,"[executable, windows, win32, pe, pedll]"
1,253163,ea0f1228efce715ca69bf231bff7c810c1864d0e0294e1...,0,0,"[executable, windows, win32, pe, pedll]"
2,248898,d63ddebe7cdabc62f32dd3717f5751c1af225236c84c12...,0,0,"[executable, windows, win32, pe, pedll]"
3,235326,9cde91b13f5cbb52f3efc86b1fc9f84c7e3ace5f02272c...,0,0,"[executable, windows, win32, pe, pedll]"
4,233581,992774e2e9245861b6f038544d3791850552e0b9ef5e80...,0,0,"[executable, windows, win32, pe, pedll]"


In [13]:
benign_df = df[(df.count_malicious == 0) & (df.count_suspicious == 0)]
malware_df = df[df.count_malicious > 30]

In [14]:
benign_df["label"] = "benign"
benign_df["label_id"] = 0

malware_df["label"] = "malware"
malware_df["label_id"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_df["label"] = "benign"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_df["label_id"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malware_df["label"] = "malware"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value in

In [15]:
final_df = pd.concat([benign_df, malware_df])

In [16]:
for _ in range(50):
    final_df = final_df.sample(frac=1)

In [17]:
final_df.to_parquet(DATA_DIR / 'labeled_df.parquet')

In [18]:
final_df.shape

(36988, 7)

In [32]:
def sha256_calculation(filename):
    with open(SAMPLES_DIR / str(filename), 'rb') as file:
        file_bytes = file.read()
        sha256_hash = hashlib.sha256(file_bytes).hexdigest()
    return sha256_hash

In [33]:
final_df["current_sha256"] = final_df.filename.map(sha256_calculation)

In [36]:
final_df[final_df["sha256"] == final_df["current_sha256"]]

Unnamed: 0,filename,sha256,count_malicious,count_suspicious,type_tags,label,label_id,current_sha256
22115,7334,02ed12ee1f5aa13d065b1a0989c5d321e0a15027d2ad99...,64,0,"[executable, windows, win32, pe, peexe]",malware,1,02ed12ee1f5aa13d065b1a0989c5d321e0a15027d2ad99...
35724,28648,c39bc1fbd1577dcde67daaee2a935ddcd31bca2b58fe4f...,64,0,"[executable, windows, win32, pe, peexe]",malware,1,c39bc1fbd1577dcde67daaee2a935ddcd31bca2b58fe4f...
30358,257333,fd6cb6e09ce46edf0ff581322074de5a1553a8b7f72022...,0,0,"[executable, windows, win32, pe, pedll]",benign,0,fd6cb6e09ce46edf0ff581322074de5a1553a8b7f72022...
11657,236591,9f5ceb1d0e03a0909f1e569ef63c7d1a03d318a118200c...,0,0,"[executable, windows, win32, pe, peexe]",benign,0,9f5ceb1d0e03a0909f1e569ef63c7d1a03d318a118200c...
21482,250804,df0125b9d29101e0f04f9bdfaa490a9a43e160d3bea4d8...,0,0,"[executable, windows, win32, pe, pedll]",benign,0,df0125b9d29101e0f04f9bdfaa490a9a43e160d3bea4d8...
...,...,...,...,...,...,...,...,...
33355,4570,f5845c202d84c15fa395fbf9d6dc71033b44e39adb953d...,23,0,"[executable, windows, win32, pe, peexe]",malware,1,f5845c202d84c15fa395fbf9d6dc71033b44e39adb953d...
34225,58054,71d3729a4f1b766d5bde6becb0ae525eabfc57a5d68b95...,60,0,"[executable, windows, win32, pe, peexe]",malware,1,71d3729a4f1b766d5bde6becb0ae525eabfc57a5d68b95...
14500,250526,dda990d2db605a41212b2654683b482c000771b6a00752...,0,0,"[executable, windows, win32, pe, pedll]",benign,0,dda990d2db605a41212b2654683b482c000771b6a00752...
11941,191180,25cb196f0da0072b25fa943a1a21e45c42d28230acf980...,0,0,"[executable, windows, win32, pe, pedll]",benign,0,25cb196f0da0072b25fa943a1a21e45c42d28230acf980...


Unnamed: 0,filename,sha256,count_malicious,count_suspicious,type_tags,label,label_id
22115,7334,02ed12ee1f5aa13d065b1a0989c5d321e0a15027d2ad99...,64,0,"[executable, windows, win32, pe, peexe]",malware,1
35724,28648,c39bc1fbd1577dcde67daaee2a935ddcd31bca2b58fe4f...,64,0,"[executable, windows, win32, pe, peexe]",malware,1
7510,4709,e13fb76bbbe0adce0c11a52ff1de9d0b3e5978ded89726...,52,0,"[executable, windows, win32, pe, peexe]",malware,1
24796,31328,e40fafe21f9728b0ce11af6d348a794818a20927c586dc...,51,0,"[executable, windows, win32, pe, peexe]",malware,1
19902,59736,76661a35370821cc4764bdcffb0ac588511b0128daccaa...,63,0,"[executable, windows, win32, pe, peexe]",malware,1
...,...,...,...,...,...,...,...
13800,22772,f2b64629b89e5c2b9721197e4a80e58b57be6b423f619f...,43,0,"[executable, windows, win32, pe, peexe]",malware,1
17811,39394,f585f092f7a03ecd4b6905961fe6ab04b7e98aabc6fc73...,52,0,"[executable, windows, win32, pe, pedll]",malware,1
10265,513852,1e815198f6f295097e85500d7ccf2c4b2ed914adbb083c...,42,0,"[executable, windows, win32, pe, peexe]",malware,1
34225,58054,71d3729a4f1b766d5bde6becb0ae525eabfc57a5d68b95...,60,0,"[executable, windows, win32, pe, peexe]",malware,1


In [51]:
df = pd.read_parquet(DATA_DIR / 'labeled_df.parquet')

In [45]:
df = df[df.type_tags.map(lambda x: x[-1] == 'peexe')]

In [46]:
df.reset_index(drop=True, inplace=True)

In [47]:
df.to_parquet(DATA_DIR / 'labeled_df_only_peexe.parquet')

In [48]:
df

Unnamed: 0,filename,sha256,count_malicious,count_suspicious,type_tags,label,label_id
0,7334,02ed12ee1f5aa13d065b1a0989c5d321e0a15027d2ad99...,64,0,"[executable, windows, win32, pe, peexe]",malware,1
1,28648,c39bc1fbd1577dcde67daaee2a935ddcd31bca2b58fe4f...,64,0,"[executable, windows, win32, pe, peexe]",malware,1
2,236591,9f5ceb1d0e03a0909f1e569ef63c7d1a03d318a118200c...,0,0,"[executable, windows, win32, pe, peexe]",benign,0
3,4709,e13fb76bbbe0adce0c11a52ff1de9d0b3e5978ded89726...,52,0,"[executable, windows, win32, pe, peexe]",malware,1
4,31328,e40fafe21f9728b0ce11af6d348a794818a20927c586dc...,51,0,"[executable, windows, win32, pe, peexe]",malware,1
...,...,...,...,...,...,...,...
25307,22772,f2b64629b89e5c2b9721197e4a80e58b57be6b423f619f...,43,0,"[executable, windows, win32, pe, peexe]",malware,1
25308,513852,1e815198f6f295097e85500d7ccf2c4b2ed914adbb083c...,42,0,"[executable, windows, win32, pe, peexe]",malware,1
25309,4570,f5845c202d84c15fa395fbf9d6dc71033b44e39adb953d...,23,0,"[executable, windows, win32, pe, peexe]",malware,1
25310,58054,71d3729a4f1b766d5bde6becb0ae525eabfc57a5d68b95...,60,0,"[executable, windows, win32, pe, peexe]",malware,1
