# 任务目标

# 调包区

In [1]:
import gc
import math
import os
import random

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

# Silence every Python warning for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# matplotlib text rendering: use the "Songti SC" font family (presumably so
# that Chinese labels render - confirm the font is installed on the machine)
# and draw minus signs with the ASCII hyphen instead of the Unicode minus.
plt.rcParams.update({
    "font.family": "Songti SC",
    "axes.unicode_minus": False,
})

# Register pandas' formatters and converters with matplotlib.
pd.plotting.register_matplotlib_converters()

# pandas display: show at most 10 rows, never hide columns, and print floats
# with 5 digits of precision.
pd.options.display.max_rows = 10
pd.options.display.max_columns = None
pd.options.display.precision = 5

# Display the value of every bare expression in a cell, not only the last one.
# Later cells rely on this to show several results at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'  # one of: 'all', 'last', 'last_expr', 'none', 'last_expr_or_assign'

# 打开少量文件进行预览

In [2]:
# Quick preview of the very large CSV files: parse only the first 10 rows of
# each so the cell returns immediately instead of waiting for a full load.
path = './fulldata'

# nrows=10 stops parsing after ten rows and releases the file right away, so
# no open reader is left behind. The results get their own names so they
# cannot be mistaken for the fully loaded train_data / test_data frames.
train_preview = pd.read_csv(f'{path}/security_train.csv', nrows=10)
test_preview = pd.read_csv(f'{path}/security_test.csv', nrows=10)
train_preview
test_preview


Unnamed: 0,file_id,label,api,tid,index
0,1,5,LdrLoadDll,2488,0
1,1,5,LdrGetProcedureAddress,2488,1
2,1,5,LdrGetProcedureAddress,2488,2
3,1,5,LdrGetProcedureAddress,2488,3
4,1,5,LdrGetProcedureAddress,2488,4
5,1,5,LdrGetProcedureAddress,2488,5
6,1,5,LdrGetProcedureAddress,2488,6
7,1,5,LdrGetProcedureAddress,2488,7
8,1,5,LdrGetProcedureAddress,2488,8
9,1,5,LdrGetProcedureAddress,2488,9


Unnamed: 0,file_id,api,tid,index
0,1,RegOpenKeyExA,2332,0
1,1,CopyFileA,2332,1
2,1,OpenSCManagerA,2332,2
3,1,CreateServiceA,2332,3
4,1,RegOpenKeyExA,2468,0
5,1,CopyFileA,2468,1
6,1,OpenSCManagerA,2468,2
7,1,CreateServiceA,2468,3
8,1,StartServiceA,2468,4
9,1,NtCreateThreadEx,2468,5


# 批量打开全部文件

In [3]:
def read_csv_chunk(file_path, chunksize=100000):
    """Read a CSV file chunk by chunk and return it as one DataFrame.

    Reading in chunks lets the progress bar advance once per chunk while a
    large file is being parsed.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv`` as its first argument.
    chunksize : int, default 100000
        Number of rows parsed per chunk.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated row-wise, keeping the row index that
        ``read_csv`` assigned to each chunk.
    """
    reader = pd.read_csv(file_path, chunksize=chunksize)
    try:
        # One progress-bar tick per parsed chunk.
        df_list = [part_df for part_df in tqdm(reader)]
    finally:
        # Release the underlying file even when parsing fails part-way.
        reader.close()
    print("打开完成！")

    # Stitch the chunks back together row-wise.
    df = pd.concat(df_list, axis=0)

    # Drop the per-chunk frames and run a collection pass before returning.
    del df_list
    gc.collect()
    return df


# Directory that holds the full-size CSV files.
path = './fulldata'

train_data = read_csv_chunk(f'{path}/security_train.csv') # load the full training set
test_data = read_csv_chunk(f'{path}/security_test.csv') # load the full test set

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


打开完成！


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…


打开完成！


# 进行简单的数据观察

In [4]:
# Missing-value count per column, then summary statistics with one row per
# numeric column.
train_data.isna().sum()
train_data.describe().transpose()


file_id    0
label      0
api        0
tid        0
index      0
dtype: int64

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
file_id,89806693.0,7078.77015,3998.79413,1.0,3637.0,7161.0,10551.0,13887.0
label,89806693.0,3.86284,2.39378,0.0,2.0,5.0,5.0,7.0
tid,89806693.0,2533.02802,699.57977,100.0,2356.0,2564.0,2776.0,20896.0
index,89806693.0,1547.52081,1412.24899,0.0,349.0,1085.0,2503.0,5000.0


In [5]:
# Missing-value count per column, then summary statistics with one row per
# numeric column.
test_data.isna().sum()
test_data.describe().transpose()

file_id    0
api        0
tid        0
index      0
dtype: int64

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
file_id,79288375.0,6446.97137,3712.19238,1.0,3208.0,6387.0,9610.0,12955.0
tid,79288375.0,2491.91428,582.45997,100.0,2360.0,2556.0,2752.0,9196.0
index,79288375.0,1584.81483,1411.1161,0.0,390.0,1131.0,2547.0,5000.0


# 进行数据切分

In [6]:
def get_demo_data(df, test_size=0.02, random_state=42):
    """Return all rows of ``df`` that belong to a random subset of its files.

    Sampling is done on distinct ``file_id`` values, not on rows, so every
    selected file keeps all of its rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``file_id`` column.
    test_size : float, default 0.02
        Fraction of the distinct ``file_id`` values to keep, in (0, 1]. The
        resulting count is rounded up.
    random_state : int or None, default 42
        Seed for the sampler, so re-running the notebook selects the same
        files. Pass ``None`` to draw a different sample on every call.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``file_id`` was selected, with their
        original index labels.

    Raises
    ------
    ValueError
        If ``test_size`` is not in (0, 1].
    """
    if not 0 < test_size <= 1:
        raise ValueError(f"test_size must be in (0, 1], got {test_size!r}")

    file_id_list = df['file_id'].unique()

    # Number of files to keep: the requested fraction, rounded up.
    n_demo = math.ceil(test_size * len(file_id_list))

    # Draw the files without replacement from a seeded generator.
    rng = np.random.default_rng(random_state)
    demo_id = rng.choice(file_id_list, size=n_demo, replace=False)
    print("demo的file_id长度:", demo_id.shape[0])

    # Keep every row of the selected files and report the subset's size.
    demo_df = df[df['file_id'].isin(demo_id)]
    demo_df.info()

    return demo_df


# Draw a small per-file sample from the training set and, separately, from
# the test set.
train_demo_data = get_demo_data(train_data)
test_demo_data = get_demo_data(test_data)



demo的file_id长度: 278
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1930016 entries, 1697900 to 89798401
Data columns (total 5 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   file_id  int64 
 1   label    int64 
 2   api      object
 3   tid      int64 
 4   index    int64 
dtypes: int64(4), object(1)
memory usage: 88.3+ MB
demo的file_id长度: 260
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1855807 entries, 1308695 to 79201751
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   file_id  int64 
 1   api      object
 2   tid      int64 
 3   index    int64 
dtypes: int64(3), object(1)
memory usage: 70.8+ MB


# 保存DEMO数据

In [7]:
# Write the sampled subsets to ./sampledata under the same file names as the
# full data set.
path = './sampledata'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(path, exist_ok=True)

# index=False leaves the row index out of the CSV files.
train_demo_data.to_csv(f'{path}/security_train.csv', index=False)
test_demo_data.to_csv(f'{path}/security_test.csv', index=False)