# 生成数据集的描述文件
要提取的信息是时间戳信息以及标签信息  
https://www.kaspersky.com/resource-center/threats/trojans 可以查看家族信息      
https://www.kaspersky.com/resource-center/threats/types-of-malware 查看恶意软件类别

In [1]:
# 根据样本生成csv文件，后续加载对应的json文件并获取label等信息

import csv
import os
from tqdm import tqdm

# Build one skeleton CSV per dated sample directory found under ../data/
# (directories whose name contains '-'). Each non-JSON file in a directory
# becomes one row: its path in 'name', and the placeholder string 'none' in
# every other column.
dirnames = [
    dirname for dirname in os.listdir('../data/')
    # Skip stray files: os.listdir() below only works on directories.
    if '-' in dirname and os.path.isdir(os.path.join('../data/', dirname))
]
columns = ['name', 'first_seen', 'scan_date', 'size', 'trojan', 'virus', 'worm', 'backdoor', 'label']

for dirname in dirnames:
    print('start handle %s' % dirname)
    dirpath = '../data/' + dirname
    # newline='' lets the csv module emit its own line terminators; without
    # it, platforms that translate '\n' write an empty line after every row.
    with open(dirname + '.csv', 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(columns)
        for file in tqdm(os.listdir(dirpath)):
            # JSON reports and the notebook checkpoint folder are not samples.
            if file == '.ipynb_checkpoints' or file.endswith('.json'):
                continue
            filepath = os.path.join(dirpath, file)
            writer.writerow([filepath] + ['none'] * (len(columns) - 1))

 66%|██████▌   | 25402/38438 [00:00<00:00, 253983.30it/s]start handle 2017-10-20
100%|██████████| 38438/38438 [00:00<00:00, 252606.68it/s]start handle 2019-11-29

100%|██████████| 47142/47142 [00:00<00:00, 260437.27it/s]
  0%|          | 0/134575 [00:00<?, ?it/s]start handle 2018-03-14
100%|██████████| 134575/134575 [00:00<00:00, 326613.07it/s]
 61%|██████▏   | 26244/42799 [00:00<00:00, 262437.75it/s]start handle 2018-07-17
100%|██████████| 42799/42799 [00:00<00:00, 267031.24it/s]
 35%|███▌      | 24939/71104 [00:00<00:00, 249365.27it/s]start handle 2019-05-10
100%|██████████| 71104/71104 [00:00<00:00, 228103.79it/s]
100%|██████████| 13762/13762 [00:00<00:00, 150384.32it/s]
start handle 2017-11-20
start handle 2018-06-19
100%|██████████| 154001/154001 [00:00<00:00, 339403.89it/s]


In [2]:
# 获取样本的标签信息

import json
import csv
import os
import pandas as pd 
from tqdm import tqdm

def label_count(scan_res, label_all):
    """Count how many scanner results mention each malware category.

    Args:
        scan_res: Mapping of scanner name -> entry with a 'result' value.
            Entries whose 'result' is empty or None are ignored.
        label_all: Category names to look for, followed by one trailing
            slot name; the last element is never searched for.

    Returns:
        A dict with one counter per name in ``label_all`` plus the key
        'label', which holds the name with the highest counter (the earliest
        name in ``label_all`` wins a tie, including the all-zero case).
    """
    categories = label_all[:-1]
    tally = dict.fromkeys(label_all, 0)

    for engine in scan_res:
        verdict = scan_res[engine]['result']
        if not verdict:
            continue
        # Matching is a case-insensitive substring test on the verdict text.
        verdict = verdict.lower()
        for category in categories:
            if category in verdict:
                tally[category] += 1

    tally['label'] = max(tally, key=tally.get)
    return tally


def judge_json(path):
    """Return True when a file named ``path + '.json'`` exists, else False."""
    json_path = path + '.json'
    return os.path.exists(json_path)


def generate_json(dir_name, columns, label_all, min_votes=3):
    """Fill a skeleton sample CSV from each sample's JSON report, in place.

    Reads the CSV at ``dir_name``, keeps only the rows whose ``name`` path has
    a matching ``<name>.json`` file, and copies ``first_seen``, ``scan_date``,
    ``size``, the four category counters and the winning label from that
    report into the row. Rows whose report cannot be read or parsed, or whose
    most frequent category has fewer than ``min_votes`` matches, keep the
    placeholder label 'none' and are dropped. The result is written back to
    ``dir_name``, overwriting the input file.

    Args:
        dir_name: Path of the CSV file to process (a file, despite the name).
        columns: Unused; kept so existing callers keep working.
        label_all: Category names followed by 'label', as expected by
            ``label_count``.
        min_votes: Minimum count the most frequent category must reach for
            the sample to be kept.
    """
    data = pd.read_csv(dir_name)
    data = data[data['name'].apply(judge_json)]
    print(dir_name + " contain ", len(data), "json file")
    data = data.reset_index(drop=True)

    for idx in tqdm(range(len(data))):
        json_path = data.at[idx, 'name'] + '.json'
        try:
            with open(json_path, 'r', encoding='utf-8') as file:
                info = json.load(file)
            label_res = label_count(info['scans'], label_all)
            votes = max(v for k, v in label_res.items() if k != 'label')
            if votes < min_votes:
                continue
            # Collect every value first so a missing key never leaves a
            # half-filled row behind.
            row = {key: info[key] for key in ('first_seen', 'scan_date', 'size')}
            row.update(
                (key, label_res[key])
                for key in ('trojan', 'virus', 'worm', 'backdoor', 'label')
            )
        except Exception as err:
            # Best effort: a broken report only costs that one sample. Catching
            # Exception (not a bare except) keeps Ctrl-C working, and the
            # message names the real failure instead of a fixed string.
            print('skip %s (%s: %s)' % (json_path, type(err).__name__, err))
            continue
        for key, value in row.items():
            data.at[idx, key] = value

    # Rows that were skipped above still carry the placeholder label.
    data = data[data['label'] != 'none']
    data = data.reset_index(drop=True)
    print("after json file extracted, we acquire ", len(data), " available data items")
    data.to_csv(dir_name, index=False)


# Extract label (malware category) and timestamp information from the JSON
# reports for every per-directory CSV in the working directory.

label_all = ['trojan', 'virus', 'worm', 'backdoor', 'label']
dir_names = os.listdir('./')
columns = ['name', 'scan_date', 'first_seen', 'size'] + label_all
for dir_name in dir_names:
    # Only the dated CSV files (e.g. 2017-10-20.csv) are inputs; other files
    # that merely contain a '-' in their name must not be parsed as CSV.
    if '-' in dir_name and dir_name.endswith('.csv'):
        print('start handle ' + dir_name)
        generate_json(dir_name, columns, label_all)

0it [00:00, ?it/s]start handle 2019-11-29.csv
2019-11-29.csv contain  0 json file
after json file extracted, we acquire  0  available data items
start handle 2017-10-20.csv

  1%|          | 235/19219 [00:00<00:08, 2345.44it/s]2017-10-20.csv contain  19219 json file
100%|██████████| 19219/19219 [02:05<00:00, 153.34it/s]
after json file extracted, we acquire  18469  available data items
start handle 2018-07-17.csv
0it [00:00, ?it/s]2018-07-17.csv contain  0 json file
after json file extracted, we acquire  0  available data items
start handle 2018-03-14.csv

  0%|          | 6/56429 [00:00<16:53, 55.69it/s]2018-03-14.csv contain  56429 json file
 35%|███▍      | 19498/56429 [05:26<10:57, 56.14it/s]JSONDecodeError
 66%|██████▌   | 37270/56429 [10:42<04:53, 65.28it/s]JSONDecodeError
100%|██████████| 56429/56429 [16:29<00:00, 57.01it/s]
after json file extracted, we acquire  55160  available data items
start handle 2018-06-19.csv
  0%|          | 9/52056 [00:00<09:42, 89.37it/s]2018-06-19.c

- 2021年6月18日实验记录  
2017-10-20.csv contain 19219 json file  
after json file extracted, we acquire  18469  available data items  
2017-11-20.csv contain  6881 json file  
after json file extracted, we acquire  6878  available data items  
2018-03-14.csv contain  56429 json file  
after json file extracted, we acquire  55160  available data items  
2018-06-19.csv contain  52056 json file  
after json file extracted, we acquire  48997  available data items  
2019-05-10.csv contain  35552 json file  
after json file extracted, we acquire  34420  available data items

In [3]:
# 加载csv文件
import os 
import pandas as pd 
from collections import Counter

# Load every dated CSV (e.g. 2017-10-20.csv) from the working directory and
# stack them into one frame, printing the label distribution and shape of
# each file and of the combined result.
frames = []
for file in os.listdir('./'):
    if '-' in file and file.endswith('.csv'):
        print('loading ' + file)
        df_file = pd.read_csv(file)
        print(Counter(df_file.label))
        print(df_file.shape)
        frames.append(df_file)

if frames:
    # One concat at the end instead of one per file: growing a frame inside
    # the loop re-copies all previously loaded rows on every iteration.
    df = pd.concat(frames, axis=0, ignore_index=True)
else:
    # Nothing to load: keep the expected (empty) schema.
    df = pd.DataFrame(columns=['name', 'first_seen', 'scan_date', 'size', 'trojan',
                               'virus', 'worm', 'backdoor', 'label'])
print(Counter(df.label))
print(df.shape)

loading 2017-10-20.csv
Counter({'trojan': 15126, 'virus': 1840, 'worm': 873, 'backdoor': 630})
(18469, 9)
loading 2018-03-14.csv
Counter({'trojan': 38687, 'virus': 11456, 'worm': 4244, 'backdoor': 773})
(55160, 9)
loading 2018-06-19.csv
Counter({'trojan': 37361, 'virus': 8070, 'worm': 2993, 'backdoor': 573})
(48997, 9)
loading 2017-11-20.csv
Counter({'trojan': 6870, 'worm': 4, 'virus': 3, 'backdoor': 1})
(6878, 9)
loading 2019-05-10.csv
Counter({'trojan': 22575, 'virus': 7040, 'worm': 3881, 'backdoor': 924})
(34420, 9)


- 2021年6月18日实验记录  
loading 2017-10-20.csv  
Counter({'trojan': 15126, 'virus': 1840, 'worm': 873, 'backdoor': 630})  
(18469, 9)  
loading 2018-03-14.csv  
Counter({'trojan': 38687, 'virus': 11456, 'worm': 4244, 'backdoor': 773})  
(55160, 9)  
loading 2018-06-19.csv  
Counter({'trojan': 37361, 'virus': 8070, 'worm': 2993, 'backdoor': 573})  
(48997, 9)  
loading 2017-11-20.csv  
Counter({'trojan': 6870, 'worm': 4, 'virus': 3, 'backdoor': 1})  
(6878, 9)  
loading 2019-05-10.csv  
Counter({'trojan': 22575, 'virus': 7040, 'worm': 3881, 'backdoor': 924})  
(34420, 9)   
总计：  
Counter({'trojan': 120619, 'virus': 28409, 'worm': 11995, 'backdoor': 2901})   
(163924, 9)

In [5]:
def handle_time(item):
    """Reduce a timestamp string to its date part.

    Takes the text before the first space and keeps at most its first
    10 characters (e.g. '2017-05-11 08:30:00' -> '2017-05-11').
    """
    date_part = item.split(' ', 1)[0]
    return date_part[:10]

# Normalise both timestamp columns with handle_time, then preview the result.
for time_col in ('scan_date', 'first_seen'):
    df[time_col] = df[time_col].apply(handle_time)
df.head(5)

Unnamed: 0,name,first_seen,scan_date,size,trojan,virus,worm,backdoor,label
0,../data/2017-10-20/270b645ecae6ff84c246c82d960...,2017-05-11,2017-10-20,1269584,6,3,0,0,trojan
1,../data/2017-10-20/12ddae06e09b87e9148ccb0be49...,2017-10-20,2017-10-20,4575744,7,1,0,0,trojan
2,../data/2017-10-20/ccc7664e5afdec3bf84ecc1878b...,2017-09-20,2017-10-20,2017667,24,0,3,1,trojan
3,../data/2017-10-20/821c9c2ac136f72cd064dcd47c4...,2017-10-20,2017-10-20,24064,18,0,0,15,trojan
4,../data/2017-10-20/75a3f077ca0341cb0143ea4484f...,2017-09-20,2017-10-20,880669,29,1,3,0,trojan


In [7]:
def judge(time):
    """Return True when the 4-character year prefix of ``time`` is 2017-2019.

    The prefix is compared as a string, so anything that sorts before '2017'
    or after '2019' is rejected.
    """
    year = time[:4]
    return '2017' <= year <= '2019'
# Keep only the samples whose first_seen passes judge(), report the size,
# per-year and per-label counts, and save the filtered set as dataset.csv.
keep_mask = df['first_seen'].apply(judge)
df_1 = df[keep_mask]

print(df_1.shape)
first_seen_years = df_1['first_seen'].apply(lambda x: x[:4])
print(Counter(first_seen_years))
print(Counter(df_1['label']))
df_1.to_csv('./dataset.csv', index=False)

(154568, 9)
Counter({'2018': 102100, '2019': 33412, '2017': 19056})
Counter({'trojan': 111633, 'virus': 28197, 'worm': 11922, 'backdoor': 2816})


- 2021年6月18日实验记录  
时间戳保留至日期（YYYY-MM-DD），剔除2017年之前及2019年之后的样本后  
数量： 154568  
按年份统计： Counter({'2018': 102100, '2019': 33412, '2017': 19056})  
按样本种类统计： Counter({'trojan': 111633, 'virus': 28197, 'worm': 11922, 'backdoor': 2816})

In [27]:
# # 将有用的样本复制到新的文件夹中，用于反汇编提取操作码

# import shutil 
# from tqdm import tqdm

# num = 0
# for idx, item in tqdm(enumerate(df_1['name'])):
#     path_obj = '../data/dataset/' + item[19:]
#     shutil.copyfile(item, path_obj)



In [8]:
# Split the filtered samples into one frame per label, then print the four
# class sizes followed by each class's first-seen year distribution.

label_col = df_1['label']
df_worm = df_1[label_col == 'worm']
df_trojan = df_1[label_col == 'trojan']
df_virus = df_1[label_col == 'virus']
df_backdoor = df_1[label_col == 'backdoor']

print((len(df_worm), len(df_trojan), len(df_virus), len(df_backdoor)))
for class_frame in (df_worm, df_trojan, df_virus, df_backdoor):
    print(Counter(class_frame['first_seen'].apply(lambda x: x[:4])))

(11922, 111633, 28197, 2816)
Counter({'2018': 7204, '2019': 3859, '2017': 859})
Counter({'2018': 74210, '2019': 21675, '2017': 15748})
Counter({'2018': 19363, '2019': 7000, '2017': 1834})
Counter({'2018': 1323, '2019': 878, '2017': 615})


- 2021年6月18日实验记录  
取样前统计    
worm统计：Counter({'2018': 7204, '2019': 3859, '2017': 859})  
trojan统计:Counter({'2018': 74210, '2019': 21675, '2017': 15748})  
virus统计：Counter({'2018': 19363, '2019': 7000, '2017': 1834})  
backdoor统计：Counter({'2018': 1323, '2019': 878, '2017': 615})

In [12]:
# Down-sample virus, trojan and worm per first-seen year by keeping the
# smallest files, then merge them with the untouched backdoor samples and
# save the result as dataset_handled.csv.
# NOTE: "smallest files first" is a provisional sampling strategy.

def _first_seen_year(frame):
    """Return the 4-character year prefix of every 'first_seen' value."""
    return frame['first_seen'].apply(lambda x: x[:4])


def _sample_smallest_per_year(frame, quotas):
    """Keep, for each year in ``quotas``, the rows with the smallest 'size'.

    Args:
        frame: DataFrame with 'first_seen' and 'size' columns.
        quotas: Mapping of year string -> maximum number of rows to keep for
            that year; None keeps every row of the year.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, holding the selected rows
        year by year in the order the years appear in ``quotas``.
    """
    years = _first_seen_year(frame)
    parts = []
    for year, limit in quotas.items():
        ordered = frame[years == year].sort_values(by='size', ascending=True)
        parts.append(ordered.reset_index(drop=True)[:limit])
    return pd.concat(parts, axis=0, ignore_index=True)


# Down-sample virus
df_virus = _sample_smallest_per_year(df_virus, {'2017': 1000, '2018': 1600, '2019': 1000})
print("virus取样后统计：", Counter(_first_seen_year(df_virus)))

# Down-sample trojan
df_trojan = _sample_smallest_per_year(df_trojan, {'2017': 1000, '2018': 1600, '2019': 1000})
print("trojan取样后统计：", Counter(_first_seen_year(df_trojan)))

# Down-sample worm; the 2017 limit is deliberately disabled, so every 2017
# worm sample is kept.
df_worm = _sample_smallest_per_year(df_worm, {'2017': None, '2018': 1600, '2019': 1000})
print("worm取样后统计：", Counter(_first_seen_year(df_worm)))

# Merge all four classes and write the combined CSV
df_2 = pd.concat([df_backdoor, df_trojan, df_virus, df_worm], axis=0, ignore_index=True)

df_2.to_csv('./dataset_handled.csv', index=False)
print(len(df_2))
print("合并后按年份进行统计：", Counter(_first_seen_year(df_2)))
print("合并后按类型统计：", Counter(df_2['label']))

virus取样后统计： Counter({'2018': 1600, '2017': 1000, '2019': 1000})
trojan取样后统计： Counter({'2018': 1600, '2017': 1000, '2019': 1000})
worm取样后统计： Counter({'2018': 1600, '2019': 1000, '2017': 859})
13475
合并后按年份进行统计： Counter({'2018': 6123, '2019': 3878, '2017': 3474})
合并后按类型统计： Counter({'trojan': 3600, 'virus': 3600, 'worm': 3459, 'backdoor': 2816})


- 2021年6月18日实验记录  
以backdoor的数量为参照进行取样  
virus取样后统计： Counter({'2018': 1600, '2017': 1000, '2019': 1000})  
trojan取样后统计： Counter({'2018': 1600, '2017': 1000, '2019': 1000})  
worm取样后统计： Counter({'2018': 1600, '2019': 1000, '2017': 859})   
取样后总数量： 13475  
合并后按年份进行统计： Counter({'2018': 6123, '2019': 3878, '2017': 3474})  
合并后按类型统计： Counter({'trojan': 3600, 'virus': 3600, 'worm': 3459, 'backdoor': 2816})


In [42]:
import os
import shutil
from collections import Counter

import pandas as pd
from tqdm import tqdm

# NOTE(review): this reloads 'dataset.csv' (the full filtered set), not the
# down-sampled 'dataset_handled.csv' - confirm that is the intended input.
df_2 = pd.read_csv('dataset.csv')

# Copy every listed sample into ../dataset/, keeping its sub-directory.
# Iterating the column directly lets tqdm show the total.
for item in tqdm(df_2['name']):
    # Sample paths look like '../data/<dir>/<file>'; item[7:] drops the
    # leading '../data' so the rest can be re-rooted under '../dataset'.
    path_obj = '../dataset' + item[7:]
    # copyfile does not create missing directories.
    os.makedirs(os.path.dirname(path_obj), exist_ok=True)
    shutil.copyfile(item, path_obj)

# Per-month distribution of the samples first seen in 2018
df_2018 = df_2[df_2['first_seen'].apply(lambda x: x[:4]) == '2018']
Counter(df_2018['first_seen'].apply(lambda x: x[:7]))