In [15]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

In [16]:
# All paths derive from the kernel's current working directory.
# NOTE(review): this assumes the kernel is started from the notebooks folder
# that sits next to `data/` — confirm before running from elsewhere.
NOTEBOOKS_DIR = Path().resolve()
PROJECT_DIR = NOTEBOOKS_DIR.parent
DATA_DIR = PROJECT_DIR / 'data'

# Report files, one sub-folder per class label.
REPORTS_DIR = DATA_DIR / 'reports'
MALWARE_REPORTS_DIR = REPORTS_DIR / 'malware'
BENIGN_REPORTS_DIR = REPORTS_DIR / 'benign'

# Executables, laid out the same way as the reports.
EXE_DIR = DATA_DIR / 'exe'
MALWARE_EXE_DIR = EXE_DIR / 'malware'
# Previously pointed at 'malware' (copy-paste slip), which made the benign and
# malware executable directories the same folder.
BENIGN_EXE_DIR = EXE_DIR / 'benign'

# Seed passed to every train_test_split call so the splits are repeatable.
RANDOM_STATE = 741

In [9]:
# One identifier per entry in each reports folder: the entry name without its
# final suffix (presumably the sample hash, given the 'HASH' column it feeds).
# Directory listings come back in filesystem-dependent order, so the names are
# sorted; otherwise the seeded split further down would assign different
# samples to train/valid/test depending on the machine.
malware_reports = sorted(path.stem for path in MALWARE_REPORTS_DIR.glob('*'))
benign_reports = sorted(path.stem for path in BENIGN_REPORTS_DIR.glob('*'))

In [10]:
def _labelled_frame(hashes, label, label_id):
    """Build a frame with one row per identifier and constant label columns.

    Columns are 'HASH' (the given identifiers), 'LABEL' (the class name) and
    'LABEL_ID' (the integer class id); the two label values are broadcast to
    every row.
    """
    columns = {'HASH': hashes, 'LABEL': label, 'LABEL_ID': label_id}
    return pd.DataFrame(columns)


# Positive class: malware -> 1, negative class: benign -> 0.
df_malware = _labelled_frame(malware_reports, 'malware', 1)
df_benign = _labelled_frame(benign_reports, 'benign', 0)

In [12]:
# Stack both classes into one frame. `ignore_index=True` gives the result a
# fresh 0..n-1 index; without it each class frame keeps its own 0-based labels
# and the combined frame ends up with duplicate index labels.
df = pd.concat([df_malware, df_benign], ignore_index=True)

In [13]:
# Size check of the combined frame: (number of rows, number of columns).
df.shape

(1997, 3)

In [17]:
# Stage 1: hold out 20% of all samples as the test set, keeping the class
# ratio of LABEL_ID the same on both sides.
df_train_valid, df_test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    stratify=df['LABEL_ID'],
    random_state=RANDOM_STATE,
)

# Stage 2: take 10% of the remaining samples as the validation set, again
# stratified by LABEL_ID; what is left is the training set.
df_train, df_valid = train_test_split(
    df_train_valid,
    test_size=0.1,
    shuffle=True,
    stratify=df_train_valid['LABEL_ID'],
    random_state=RANDOM_STATE,
)

In [None]:
# NOTE(review): this cell shows no execution count, and the following cell
# resets the index of df_train together with the other two splits, so this
# line looks redundant — candidate for removal.
df_train.reset_index(drop=True, inplace=True)

In [19]:
# Discard the shuffled row labels inherited from the combined frame so every
# split gets a clean 0..n-1 index. Rebinding to the returned frames (instead
# of `inplace=True`) avoids in-place mutation and a leftover `_df` loop
# variable in the notebook namespace.
df_train, df_valid, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_valid, df_test)
)

In [21]:
# Persist each split as a parquet file directly under DATA_DIR.
split_files = {
    'df_train.parquet': df_train,
    'df_valid.parquet': df_valid,
    'df_test.parquet': df_test,
}
for file_name, frame in split_files.items():
    frame.to_parquet(DATA_DIR / file_name)