# 生成数据集的描述文件
要提取的信息是时间戳信息以及标签信息  
https://www.kaspersky.com/resource-center/threats/trojans 可以查看家族信息  
https://www.kaspersky.com/resource-center/threats/types-of-malware 查看恶意软件类别

In [None]:
# 根据样本去生成csv文件,然后再加载对应的json文件并获取label等信息

import csv
import os
from tqdm import tqdm


# Build one skeleton CSV per sample directory under ../data/ (only directory
# names containing '-' are considered). Each row holds a sample path followed
# by 'none' placeholders that a later step replaces with real metadata.
dirnames = [dirname for dirname in os.listdir('../data/') if '-' in dirname]
columns = ['name', 'first_seen', 'scan_date', 'size', 'trojan', 'virus', 'worm', 'backdoor', 'label']

for dirname in dirnames:
    print('start handle %s' % dirname)
    dirpath = '../data/' + dirname
    # newline='' is required by the csv module: without it the '\r\n' row
    # terminator written by csv.writer is translated again on Windows,
    # producing '\r\r\n' (an empty line after every row).
    with open(dirname + '.csv', 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(columns)
        for file in tqdm(os.listdir(dirpath)):
            # Skip Jupyter's checkpoint folder and the JSON reports; every
            # other entry is treated as a sample.
            if file == '.ipynb_checkpoints' or file.endswith('.json'):
                continue
            filepath = os.path.join(dirpath, file)
            # One placeholder per metadata column (everything after 'name').
            writer.writerow([filepath] + ['none'] * (len(columns) - 1))

In [None]:
# 获取样本的标签信息

import json
import csv
import os
import pandas as pd
from tqdm import tqdm


# Count how many scan results in the JSON report mention each of the four malware categories
def label_count(scan_res, label_all):
    """Count category keyword hits across all scan results.

    Args:
        scan_res: Mapping whose values each carry a ``'result'`` entry
            (a string, or a falsy value when there is no result).
        label_all: Category keywords followed by one trailing entry; the
            trailing entry is not searched for.

    Returns:
        A dict with one hit count per entry of ``label_all``, whose
        ``'label'`` key is then set to the key with the highest count
        (the earliest key wins ties).
    """
    counts = dict.fromkeys(label_all, 0)
    keywords = label_all[:-1]
    for engine in scan_res:
        verdict = scan_res[engine]['result']
        if not verdict:
            continue
        verdict = verdict.lower()
        # Substring match: one verdict may count towards several categories.
        for keyword in keywords:
            if keyword in verdict:
                counts[keyword] += 1
    counts['label'] = max(counts, key=lambda name: counts[name])
    return counts
           

# 判断恶意样本对应的json文件是否存在
def judge_json(path):
    """Return True when the file ``path + '.json'`` exists, else False."""
    return os.path.exists(path + '.json')
    

# Fill each CSV row with the timestamps, size and label counts read from the sample's JSON report
def generate_json(dir_name, columns, label_all):
    """Fill a skeleton CSV with metadata read from each sample's JSON report.

    Reads the CSV at ``dir_name`` and keeps only the rows whose ``name`` path
    has a matching ``<name>.json`` file. For each remaining row it copies
    ``first_seen``, ``scan_date``, ``size`` and the category hit counts from
    the report. Rows whose best category has fewer than 3 hits, or whose
    report cannot be processed, keep the ``'none'`` placeholder label and
    are dropped. The result is written back to ``'./' + dir_name``.

    Args:
        dir_name: Path of the CSV file to update (a file, despite the name).
        columns: Unused; kept so existing callers keep working.
        label_all: Passed through to ``label_count``; the code here reads the
            keys 'trojan', 'virus', 'worm', 'backdoor' and 'label' from the
            result.
    """
    data = pd.read_csv(dir_name)
    data = data[data['name'].apply(judge_json)]
    data = data.reset_index(drop=True)
    for idx in tqdm(data.index):
        json_path = data.at[idx, 'name'] + '.json'
        try:
            # Binary mode lets json.load detect the UTF encoding itself
            # instead of relying on the platform's default text encoding.
            with open(json_path, 'rb') as file:
                info = json.load(file)
            label_res = label_count(info['scans'], label_all)
            hit_counts = [v for k, v in label_res.items() if k != 'label']
            if max(hit_counts) < 3:
                # Too few engines agree on any category; row stays 'none'.
                continue
            # Collect every field first so that a report with a missing key
            # cannot leave the row half-filled.
            row_values = {
                'first_seen': info['first_seen'],
                'scan_date': info['scan_date'],
                'size': info['size'],
                'trojan': label_res['trojan'],
                'virus': label_res['virus'],
                'worm': label_res['worm'],
                'backdoor': label_res['backdoor'],
                'label': label_res['label'],
            }
        except Exception as err:
            # Best effort: report the real failure and move on. Unlike a bare
            # ``except:``, this does not swallow KeyboardInterrupt/SystemExit.
            print('failed to process %s: %r' % (json_path, err))
            continue
        for column, value in row_values.items():
            data.at[idx, column] = value
    # Rows that were never filled in still carry the placeholder label.
    data = data[data['label'] != 'none']
    data = data.reset_index(drop=True)
    data.to_csv('./' + dir_name, index=False)

# 提取json文件里的家族信息以及时间戳信息

# Category keywords to search for; the trailing 'label' entry is the slot
# that receives the winning category.
label_all = ['trojan', 'virus', 'worm', 'backdoor', 'label']
dir_names = list(os.listdir('./'))
columns = ['name', 'scan_date', 'first_seen', 'size', *label_all]
for dir_name in dir_names:
    # Only entries with '-' in their name are processed.
    if '-' not in dir_name:
        continue
    print('start handle ' + dir_name)
    generate_json(dir_name, columns, label_all)

In [None]:
# 加载csv文件, 合并
import os
import pandas as pd

# Merge every per-directory CSV in the working directory (names containing
# '-') into a single DataFrame. The frames are collected first and
# concatenated once: calling pd.concat inside the loop re-copies all rows
# accumulated so far on each iteration, and seeding it with an empty frame
# is deprecated in recent pandas.
merged_columns = ['name', 'first_seen', 'scan_date', 'size',
                  'trojan', 'virus', 'worm', 'backdoor', 'label']
frames = []
for file in os.listdir('./'):
    if '-' in file:
        print('loading ' + file)
        df_file = pd.read_csv(file)
        frames.append(df_file)

if frames:
    df = pd.concat(frames, axis=0, ignore_index=True, sort=False)
    # Keep the declared columns first (and present even if a file lacks
    # them), followed by any extra columns found in the files.
    extra_columns = [col for col in df.columns if col not in merged_columns]
    df = df.reindex(columns=merged_columns + extra_columns)
else:
    df = pd.DataFrame(columns=merged_columns)

df.shape

In [None]:
# Normalize the timestamps: keep only the date part (the first 10 characters, e.g. YYYY-MM-DD)

def handle_time(item):
    """Return at most the first 10 characters of the text before the first space."""
    date_part, _, _ = item.partition(' ')
    return date_part[:10]

# Truncate both timestamp columns with handle_time.
for _time_column in ('scan_date', 'first_seen'):
    df[_time_column] = df[_time_column].apply(handle_time)

In [None]:
# Drop samples first seen before 2017 or after 2019 (keep 2017-2019 inclusive)

def judge(time):
    """Return True when the first four characters of ``time`` lie in '2017'..'2019'.

    The comparison is done on strings, so it relies on the value starting
    with a four-digit year.
    """
    return '2017' <= time[:4] <= '2019'
# Keep only the rows whose first_seen year passes judge().
_first_seen_ok = df['first_seen'].apply(judge)
df_1 = df[_first_seen_ok]


In [None]:
from collections import Counter


# Inspect the filtered set: samples per first-seen year, table shape, and
# samples per label.
Counter(stamp[:4] for stamp in df_1['first_seen'])
df_1.shape
Counter(df_1['label'])

In [None]:
# Write the filtered dataset without the DataFrame index column.
df_1.to_csv('./dataset.csv', index=False)