In [1]:
%%time
import pandas as pd
import pickle
with open('./datasets/train.pkl', 'rb') as f:
    train = pickle.load(f)
    
with open('./datasets/test.pkl', 'rb') as f:
    test = pickle.load(f)

Wall time: 7.68 s


In [2]:
# 构造新特征
def get_feature(df):
    df_file = df.groupby('file_id')
    if 'label' in df.columns:
        df1 = df.drop_duplicates(subset = ['file_id', 'label'], keep='first')
    else:
        df1 = df.drop_duplicates(subset = ['file_id'], keep='first')
    df1 = df1.sort_values('file_id')
    
    # 提取多个特征 统计特征
    features = ['tid', 'index']
    for f in features:
        df1[f + '_count'] = df_file[f].count().values
        df1[f + '_nuinque'] = df_file[f].nunique().values
        df1[f + '_min'] = df_file[f].min().values
        df1[f + '_max'] = df_file[f].max().values
        df1[f + '_median'] = df_file[f].median().values
        df1[f + '_std'] = df_file[f].std().values
    return df1

In [3]:
# 从df种提取api_sequence
def get_apis(df):
    # 按照file_id进行分组
    group_fileid = df.groupby('file_id')
    
    # 统计file_id 和对应的 api_sequence
    file_api = {}
    
    # 计算每个file_id的api_sequence
    for file_id, file_group in group_fileid:
        # 针对file_id 按照线程tid 和 顺序index进行排序
        result = file_group.sort_values(['tid', 'index'], ascending=True)
        # 得到api的调用序列
        api_sequence = ' '.join(result['api'])
        #print(api_sequence)
        file_api[file_id] = api_sequence
    return file_api

train_apis = get_apis(train)
len(train_apis)

13887

In [4]:
%%time
test_apis = get_apis(test)
len(test_apis)

Wall time: 29.9 s


13887

In [5]:
import pickle
with open('./datasets/df_train.pkl', 'rb') as file:
    df_train = pickle.load(file)
    
with open('./datasets/df_test.pkl', 'rb') as file:
    df_test = pickle.load(file)
df_train

Unnamed: 0,file_id,label,api,tid,index,tid_count,tid_nuinque,tid_min,tid_max,tid_median,tid_std,index_count,index_nuinque,index_min,index_max,index_median,index_std
0,1,5,LdrLoadDll,2488,0,6786,11,2488,2812,2488.0,83.881299,6786,5001,0,5000,1607.5,1510.694221
6786,2,2,GetSystemTimeAsFileTime,2320,0,816,4,2320,2604,2480.0,101.506783,816,204,0,203,101.5,58.925137
7602,3,0,NtAllocateVirtualMemory,2208,0,463,1,2208,2208,2208.0,0.000000,463,463,0,462,231.0,133.800847
8065,4,0,GetSystemTimeAsFileTime,2284,0,2046,9,2284,2980,2340.0,150.460506,2046,1028,0,1027,511.0,295.407885
10111,5,0,SetErrorMode,2500,0,10002,6,2500,2676,2596.0,49.556301,10002,5001,0,5000,2500.0,1443.736493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89620181,13883,2,GetSystemTimeAsFileTime,100,0,178221,998,100,6568,3392.0,1405.045515,178221,5001,0,5000,47.0,1008.636040
89798402,13884,5,GetSystemTimeAsFileTime,2592,0,1319,2,2592,2748,2592.0,4.295386,1319,1319,0,1318,659.0,380.906813
89799721,13885,0,NtAllocateVirtualMemory,2240,0,1033,3,2240,2744,2240.0,33.152020,1033,1033,0,1032,516.0,298.345717
89800754,13886,1,GetSystemTimeAsFileTime,2324,0,5316,10,2324,2836,2600.0,154.796790,5316,2503,0,2502,1165.5,755.545651


In [6]:
df_train['api']

0                        LdrLoadDll
6786        GetSystemTimeAsFileTime
7602        NtAllocateVirtualMemory
8065        GetSystemTimeAsFileTime
10111                  SetErrorMode
                     ...           
89620181    GetSystemTimeAsFileTime
89798402    GetSystemTimeAsFileTime
89799721    NtAllocateVirtualMemory
89800754    GetSystemTimeAsFileTime
89806070                 LdrLoadDll
Name: api, Length: 13887, dtype: object

In [7]:
df_train.drop(['api', 'tid', 'index'], axis=1, inplace=True)
df_test.drop(['api', 'tid', 'index'], axis=1, inplace=True)

In [8]:

temp = pd.DataFrame.from_dict(train_apis, orient='index', columns=['api'])
temp = temp.reset_index().rename(columns={'index': 'file_id'})
df_train = df_train.merge(temp, on='file_id', how='left')
df_train

Unnamed: 0,file_id,label,tid_count,tid_nuinque,tid_min,tid_max,tid_median,tid_std,index_count,index_nuinque,index_min,index_max,index_median,index_std,api
0,1,5,6786,11,2488,2812,2488.0,83.881299,6786,5001,0,5000,1607.5,1510.694221,LdrLoadDll LdrGetProcedureAddress LdrGetProced...
1,2,2,816,4,2320,2604,2480.0,101.506783,816,204,0,203,101.5,58.925137,GetSystemTimeAsFileTime SetUnhandledExceptionF...
2,3,0,463,1,2208,2208,2208.0,0.000000,463,463,0,462,231.0,133.800847,NtAllocateVirtualMemory SetErrorMode LdrLoadDl...
3,4,0,2046,9,2284,2980,2340.0,150.460506,2046,1028,0,1027,511.0,295.407885,GetSystemTimeAsFileTime GetSystemTimeAsFileTim...
4,5,0,10002,6,2500,2676,2596.0,49.556301,10002,5001,0,5000,2500.0,1443.736493,SetErrorMode LdrGetDllHandle LdrGetProcedureAd...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13882,13883,2,178221,998,100,6568,3392.0,1405.045515,178221,5001,0,5000,47.0,1008.636040,GetSystemTimeAsFileTime GetSystemTimeAsFileTim...
13883,13884,5,1319,2,2592,2748,2592.0,4.295386,1319,1319,0,1318,659.0,380.906813,GetSystemTimeAsFileTime GetSystemDirectoryW Nt...
13884,13885,0,1033,3,2240,2744,2240.0,33.152020,1033,1033,0,1032,516.0,298.345717,NtAllocateVirtualMemory NtFreeVirtualMemory Nt...
13885,13886,1,5316,10,2324,2836,2600.0,154.796790,5316,2503,0,2502,1165.5,755.545651,GetSystemTimeAsFileTime GetSystemTimeAsFileTim...


In [9]:
temp = pd.DataFrame.from_dict(test_apis, orient='index', columns=['api'])
temp = temp.reset_index().rename(columns={'index': 'file_id'})
df_test = df_test.merge(temp, on='file_id', how='left')
df_test

Unnamed: 0,file_id,tid_count,tid_nuinque,tid_min,tid_max,tid_median,tid_std,index_count,index_nuinque,index_min,index_max,index_median,index_std,api
0,1,97,4,2332,2568,2544.0,57.218548,97,31,0,30,14.0,9.210466,LdrLoadDll LdrGetProcedureAddress LdrGetProced...
1,2,1361,7,2472,2748,2524.0,104.399149,1361,681,0,680,340.0,196.515744,GetSystemTimeAsFileTime SetUnhandledExceptionF...
2,3,16,1,2344,2344,2344.0,0.000000,16,16,0,15,7.5,4.760952,NtAllocateVirtualMemory SetErrorMode LdrLoadDl...
3,4,193,3,2452,2584,2452.0,50.951508,193,193,0,192,96.0,55.858452,GetSystemTimeAsFileTime GetSystemTimeAsFileTim...
4,5,803,3,2332,2780,2376.0,201.826813,803,268,0,267,133.0,77.317048,SetErrorMode LdrGetDllHandle LdrGetProcedureAd...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12950,12951,289,6,2644,2980,2776.0,75.402526,289,145,0,144,72.0,41.786414,LdrLoadDll LdrGetProcedureAddress LdrGetProced...
12951,12952,112,1,2264,2264,2264.0,0.000000,112,112,0,111,55.5,32.475632,GetSystemTimeAsFileTime NtAllocateVirtualMemor...
12952,12953,5095,15,2324,2884,2708.0,196.695730,5095,1464,0,1463,454.0,393.605016,LdrLoadDll LdrGetProcedureAddress LdrGetProced...
12953,12954,2951,4,2424,2700,2680.0,126.124152,2951,1445,0,1444,555.0,397.358069,LdrGetDllHandle LdrGetProcedureAddress LdrGetP...


In [10]:
df_all = pd.concat([df_train, df_test], axis=0)
df_all

Unnamed: 0,file_id,label,tid_count,tid_nuinque,tid_min,tid_max,tid_median,tid_std,index_count,index_nuinque,index_min,index_max,index_median,index_std,api
0,1,5.0,6786,11,2488,2812,2488.0,83.881299,6786,5001,0,5000,1607.5,1510.694221,LdrLoadDll LdrGetProcedureAddress LdrGetProced...
1,2,2.0,816,4,2320,2604,2480.0,101.506783,816,204,0,203,101.5,58.925137,GetSystemTimeAsFileTime SetUnhandledExceptionF...
2,3,0.0,463,1,2208,2208,2208.0,0.000000,463,463,0,462,231.0,133.800847,NtAllocateVirtualMemory SetErrorMode LdrLoadDl...
3,4,0.0,2046,9,2284,2980,2340.0,150.460506,2046,1028,0,1027,511.0,295.407885,GetSystemTimeAsFileTime GetSystemTimeAsFileTim...
4,5,0.0,10002,6,2500,2676,2596.0,49.556301,10002,5001,0,5000,2500.0,1443.736493,SetErrorMode LdrGetDllHandle LdrGetProcedureAd...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12950,12951,,289,6,2644,2980,2776.0,75.402526,289,145,0,144,72.0,41.786414,LdrLoadDll LdrGetProcedureAddress LdrGetProced...
12951,12952,,112,1,2264,2264,2264.0,0.000000,112,112,0,111,55.5,32.475632,GetSystemTimeAsFileTime NtAllocateVirtualMemor...
12952,12953,,5095,15,2324,2884,2708.0,196.695730,5095,1464,0,1463,454.0,393.605016,LdrLoadDll LdrGetProcedureAddress LdrGetProced...
12953,12954,,2951,4,2424,2700,2680.0,126.124152,2951,1445,0,1444,555.0,397.358069,LdrGetDllHandle LdrGetProcedureAddress LdrGetP...


In [11]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer

# 使用1-3元语法（1元语法 + 2元语法 + 3元语法）
# 将min_df = 0.1 => TFIDF特征数大幅减少 => 速度快\n# 也可以设置 max_df = 0.8\n# 也可以用 PCA降维

vec = TfidfVectorizer(ngram_range=(1, 3), min_df=0.1, max_df = 0.8)
api_features = vec.fit_transform(df_all['api'])
api_features


Wall time: 3min 28s


<26842x1511 sparse matrix of type '<class 'numpy.float64'>'
	with 8618512 stored elements in Compressed Sparse Row format>

In [12]:
api_features

<26842x1511 sparse matrix of type '<class 'numpy.float64'>'
	with 8618512 stored elements in Compressed Sparse Row format>

In [13]:
df_apis = pd.DataFrame(api_features.toarray(), columns=vec.get_feature_names())
df_apis.to_pickle('./df_apis.pkl')
df_apis

Unnamed: 0,__exception__,__exception__ ntterminateprocess,closesocket,cocreateinstance,cocreateinstance cocreateinstance,cocreateinstance ldrloaddll,cocreateinstance ldrloaddll ldrgetprocedureaddress,cocreateinstance ntallocatevirtualmemory,cocreateinstance ntallocatevirtualmemory ntallocatevirtualmemory,cocreateinstance ntopenkey,...,thread32next ntclose __exception__,thread32next ntclose ldrgetdllhandle,thread32next thread32next,thread32next thread32next ntclose,thread32next thread32next thread32next,uuidcreate,uuidcreate ntopenthread,uuidcreate ntopenthread ldrunloaddll,writeconsolew,wsastartup
0,0.000324,0.000440,0.000456,0.004928,0.002036,0.000896,0.0009,0.0,0.0,0.001044,...,0.000497,0.000493,0.518241,0.001010,0.517259,0.000784,0.000481,0.000484,0.0,0.004217
1,0.000000,0.000000,0.060404,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.042962
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,0.000000,0.000000,0.000000,0.000718,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26837,0.000351,0.000477,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000538,0.000534,0.577225,0.001094,0.576162,0.000000,0.000000,0.000000,0.0,0.000351
26838,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
26839,0.000358,0.000487,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000550,0.000546,0.577205,0.001117,0.576120,0.000000,0.000000,0.000000,0.0,0.000000
26840,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [14]:
df_train_apis = df_apis[df_apis.index <= 13886]
df_test_apis = df_apis[df_apis.index > 13886]
df_test_apis

Unnamed: 0,__exception__,__exception__ ntterminateprocess,closesocket,cocreateinstance,cocreateinstance cocreateinstance,cocreateinstance ldrloaddll,cocreateinstance ldrloaddll ldrgetprocedureaddress,cocreateinstance ntallocatevirtualmemory,cocreateinstance ntallocatevirtualmemory ntallocatevirtualmemory,cocreateinstance ntopenkey,...,thread32next ntclose __exception__,thread32next ntclose ldrgetdllhandle,thread32next thread32next,thread32next thread32next ntclose,thread32next thread32next thread32next,uuidcreate,uuidcreate ntopenthread,uuidcreate ntopenthread ldrunloaddll,writeconsolew,wsastartup
13887,0.000324,0.000440,0.000456,0.004928,0.002036,0.000896,0.0009,0.0,0.0,0.001044,...,0.000497,0.000493,0.518241,0.001010,0.517259,0.000784,0.000481,0.000484,0.0,0.004217
13888,0.000000,0.000000,0.060404,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.042962
13889,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
13890,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
13891,0.000000,0.000000,0.000000,0.000718,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26837,0.000351,0.000477,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000538,0.000534,0.577225,0.001094,0.576162,0.000000,0.000000,0.000000,0.0,0.000351
26838,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
26839,0.000358,0.000487,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000550,0.000546,0.577205,0.001117,0.576120,0.000000,0.000000,0.000000,0.0,0.000000
26840,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [15]:

df_test_apis.index = range(len(df_test_apis))
df_test_apis

Unnamed: 0,__exception__,__exception__ ntterminateprocess,closesocket,cocreateinstance,cocreateinstance cocreateinstance,cocreateinstance ldrloaddll,cocreateinstance ldrloaddll ldrgetprocedureaddress,cocreateinstance ntallocatevirtualmemory,cocreateinstance ntallocatevirtualmemory ntallocatevirtualmemory,cocreateinstance ntopenkey,...,thread32next ntclose __exception__,thread32next ntclose ldrgetdllhandle,thread32next thread32next,thread32next thread32next ntclose,thread32next thread32next thread32next,uuidcreate,uuidcreate ntopenthread,uuidcreate ntopenthread ldrunloaddll,writeconsolew,wsastartup
0,0.000324,0.000440,0.000456,0.004928,0.002036,0.000896,0.0009,0.0,0.0,0.001044,...,0.000497,0.000493,0.518241,0.001010,0.517259,0.000784,0.000481,0.000484,0.0,0.004217
1,0.000000,0.000000,0.060404,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.042962
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,0.000000,0.000000,0.000000,0.000718,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12950,0.000351,0.000477,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000538,0.000534,0.577225,0.001094,0.576162,0.000000,0.000000,0.000000,0.0,0.000351
12951,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
12952,0.000358,0.000487,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000550,0.000546,0.577205,0.001117,0.576120,0.000000,0.000000,0.000000,0.0,0.000000
12953,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [16]:
# 将tfidf特征 与原特征进行合并
df_train = df_train.merge(df_train_apis, left_index=True, right_index=True)
df_test = df_test.merge(df_test_apis, left_index=True, right_index=True)


In [17]:
df_test

Unnamed: 0,file_id,tid_count,tid_nuinque,tid_min,tid_max,tid_median,tid_std,index_count,index_nuinque,index_min,...,thread32next ntclose __exception__,thread32next ntclose ldrgetdllhandle,thread32next thread32next,thread32next thread32next ntclose,thread32next thread32next thread32next,uuidcreate,uuidcreate ntopenthread,uuidcreate ntopenthread ldrunloaddll,writeconsolew,wsastartup
0,1,97,4,2332,2568,2544.0,57.218548,97,31,0,...,0.000497,0.000493,0.518241,0.001010,0.517259,0.000784,0.000481,0.000484,0.0,0.004217
1,2,1361,7,2472,2748,2524.0,104.399149,1361,681,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.042962
2,3,16,1,2344,2344,2344.0,0.000000,16,16,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
3,4,193,3,2452,2584,2452.0,50.951508,193,193,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,5,803,3,2332,2780,2376.0,201.826813,803,268,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12950,12951,289,6,2644,2980,2776.0,75.402526,289,145,0,...,0.000538,0.000534,0.577225,0.001094,0.576162,0.000000,0.000000,0.000000,0.0,0.000351
12951,12952,112,1,2264,2264,2264.0,0.000000,112,112,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
12952,12953,5095,15,2324,2884,2708.0,196.695730,5095,1464,0,...,0.000550,0.000546,0.577205,0.001117,0.576120,0.000000,0.000000,0.000000,0.0,0.000000
12953,12954,2951,4,2424,2700,2680.0,126.124152,2951,1445,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [18]:
df_train

Unnamed: 0,file_id,label,tid_count,tid_nuinque,tid_min,tid_max,tid_median,tid_std,index_count,index_nuinque,...,thread32next ntclose __exception__,thread32next ntclose ldrgetdllhandle,thread32next thread32next,thread32next thread32next ntclose,thread32next thread32next thread32next,uuidcreate,uuidcreate ntopenthread,uuidcreate ntopenthread ldrunloaddll,writeconsolew,wsastartup
0,1,5,6786,11,2488,2812,2488.0,83.881299,6786,5001,...,0.000497,0.000493,0.518241,0.00101,0.517259,0.000784,0.000481,0.000484,0.000000,0.004217
1,2,2,816,4,2320,2604,2480.0,101.506783,816,204,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.042962
2,3,0,463,1,2208,2208,2208.0,0.000000,463,463,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,4,0,2046,9,2284,2980,2340.0,150.460506,2046,1028,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,5,0,10002,6,2500,2676,2596.0,49.556301,10002,5001,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13882,13883,2,178221,998,100,6568,3392.0,1405.045515,178221,5001,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000952,0.000000,0.000000,0.001258,0.000000
13883,13884,5,1319,2,2592,2748,2592.0,4.295386,1319,1319,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.001388,0.000000,0.000000,0.735495,0.000000
13884,13885,0,1033,3,2240,2744,2240.0,33.152020,1033,1033,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
13885,13886,1,5316,10,2324,2836,2600.0,154.796790,5316,2503,...,0.000000,0.000000,0.000000,0.00000,0.000000,0.002535,0.000000,0.000000,0.000000,0.000000


In [19]:
df_train.to_pickle('./df_train2.pkl')
df_test.to_pickle('./df_test2.pkl')

In [20]:
# 查看某个变量的资源使用
import sys
sys.getsizeof(df_train) / 1024 /1024 # M

1513.5490188598633

In [21]:
#print(df_train.dtypes.values)
#df_train.select_dtypes(include='O')
df_train.drop('api', axis=1, inplace=True)
df_test.drop('api', axis=1, inplace=True)

In [22]:
df_train[df_train.columns[:15]]

Unnamed: 0,file_id,label,tid_count,tid_nuinque,tid_min,tid_max,tid_median,tid_std,index_count,index_nuinque,index_min,index_max,index_median,index_std,__exception__
0,1,5,6786,11,2488,2812,2488.0,83.881299,6786,5001,0,5000,1607.5,1510.694221,0.000324
1,2,2,816,4,2320,2604,2480.0,101.506783,816,204,0,203,101.5,58.925137,0.000000
2,3,0,463,1,2208,2208,2208.0,0.000000,463,463,0,462,231.0,133.800847,0.000000
3,4,0,2046,9,2284,2980,2340.0,150.460506,2046,1028,0,1027,511.0,295.407885,0.000000
4,5,0,10002,6,2500,2676,2596.0,49.556301,10002,5001,0,5000,2500.0,1443.736493,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13882,13883,2,178221,998,100,6568,3392.0,1405.045515,178221,5001,0,5000,47.0,1008.636040,0.000000
13883,13884,5,1319,2,2592,2748,2592.0,4.295386,1319,1319,0,1318,659.0,380.906813,0.000000
13884,13885,0,1033,3,2240,2744,2240.0,33.152020,1033,1033,0,1032,516.0,298.345717,0.000000
13885,13886,1,5316,10,2324,2836,2600.0,154.796790,5316,2503,0,2502,1165.5,755.545651,0.002094


In [23]:
%%time
import lightgbm as lgb
clf = lgb.LGBMClassifier(
    num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='multiclass',
    max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2021,
    n_estimators=2000, subsample=1, colsample_bytree=1
)
clf.fit(df_train.drop('label', axis=1), df_train['label'])


Wall time: 17min 24s


LGBMClassifier(colsample_bytree=1, learning_rate=0.005, min_child_samples=3,
               n_estimators=2000, objective='multiclass', random_state=2021,
               reg_alpha=0.25, reg_lambda=0.25, subsample=1)

In [30]:
result = clf.predict_proba(df_test)
result
result_lgb = pd.DataFrame(result, columns=['prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7'])
result_lgb['file_id'] = df_test['file_id'].values
result_lgb

Unnamed: 0,prob0,prob1,prob2,prob3,prob4,prob5,prob6,prob7,file_id
0,0.000118,0.000006,0.000018,0.000011,0.000002,0.999742,0.000019,0.000084,1
1,0.000064,0.000004,0.999531,0.000006,0.000001,0.000017,0.000017,0.000360,2
2,0.995603,0.000048,0.000185,0.000226,0.000030,0.000977,0.000071,0.002860,3
3,0.996608,0.000014,0.002648,0.000021,0.000007,0.000052,0.000101,0.000549,4
4,0.998940,0.000035,0.000176,0.000040,0.000007,0.000205,0.000052,0.000545,5
...,...,...,...,...,...,...,...,...,...
12950,0.000091,0.000007,0.000044,0.000029,0.000004,0.999290,0.000033,0.000503,12951
12951,0.997572,0.000006,0.001952,0.000061,0.000008,0.000159,0.000020,0.000223,12952
12952,0.000212,0.000043,0.000812,0.000128,0.000017,0.995303,0.000321,0.003165,12953
12953,0.996817,0.000120,0.000268,0.000143,0.000019,0.001066,0.000115,0.001451,12954


In [31]:
columns = ['file_id', 'prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7']
result_lgb.to_csv('./baseline_lgb_2000_tfidf.csv', index=False, columns=columns)

In [32]:
import xgboost as xgb
model_xgb = xgb.XGBClassifier(
            max_depth=9, learning_rate=0.005, n_estimators=2000, 
            objective='multi:softprob', tree_method='gpu_hist', 
            subsample=0.8, colsample_bytree=0.8, 
            min_child_samples=3, eval_metric='logloss', reg_lambda=0.5)
model_xgb.fit(df_train.drop('label', axis=1), df_train['label'])



Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8,
              enable_categorical=False, eval_metric='logloss', gamma=0,
              gpu_id=0, importance_type=None, interaction_constraints='',
              learning_rate=0.005, max_delta_step=0, max_depth=9,
              min_child_samples=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=2000, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=0.5,
              scale_pos_weight=None, subsample=0.8, tree_method='gpu_hist',
              validate_parameters=1, ...)

In [33]:
result_xgb = model_xgb.predict_proba(df_test)
result_xgb = pd.DataFrame(result_xgb, columns=['prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7'])
result_xgb['file_id'] = df_test['file_id'].values
result_xgb

Unnamed: 0,prob0,prob1,prob2,prob3,prob4,prob5,prob6,prob7,file_id
0,0.000355,0.000052,0.000125,0.000058,0.000041,0.998967,0.000107,0.000296,1
1,0.000324,0.000071,0.998072,0.000093,0.000058,0.000184,0.000111,0.001087,2
2,0.992375,0.000365,0.000610,0.000441,0.000231,0.002195,0.000454,0.003329,3
3,0.994906,0.000172,0.003564,0.000164,0.000112,0.000192,0.000250,0.000640,4
4,0.997760,0.000153,0.000449,0.000147,0.000083,0.000361,0.000214,0.000833,5
...,...,...,...,...,...,...,...,...,...
12950,0.000286,0.000089,0.000263,0.000157,0.000095,0.998233,0.000206,0.000671,12951
12951,0.993361,0.000143,0.004119,0.000399,0.000120,0.000731,0.000384,0.000742,12952
12952,0.000425,0.000174,0.000867,0.000339,0.000228,0.995728,0.000631,0.001608,12953
12953,0.987301,0.000712,0.001584,0.001119,0.000588,0.003236,0.000961,0.004500,12954


In [34]:
columns = ['file_id', 'prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7']
result_xgb.to_csv('./baseline_xgb_2000_tfidf.csv', index=False, columns=columns)

In [35]:
# 对两个模型的结果 进行加权平均
result = result_lgb.copy()
weight_lgb, weight_xgb = 0.5, 0.5
result['prob0'] = result['prob0'] * weight_lgb + result_xgb['prob0'] * weight_xgb
result['prob1'] = result['prob1'] * weight_lgb + result_xgb['prob1'] * weight_xgb
result['prob2'] = result['prob2'] * weight_lgb + result_xgb['prob2'] * weight_xgb
result['prob3'] = result['prob3'] * weight_lgb + result_xgb['prob3'] * weight_xgb
result['prob4'] = result['prob4'] * weight_lgb + result_xgb['prob4'] * weight_xgb
result['prob5'] = result['prob5'] * weight_lgb + result_xgb['prob5'] * weight_xgb
result['prob6'] = result['prob6'] * weight_lgb + result_xgb['prob6'] * weight_xgb
result['prob7'] = result['prob7'] * weight_lgb + result_xgb['prob7'] * weight_xgb

columns = ['file_id', 'prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7']
result.to_csv('./baseline_lgb_xgb_2000_tfidf.csv', index=False, columns=columns)

In [36]:
%%time
import lightgbm as lgb
clf = lgb.LGBMClassifier(
    num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='multiclass',
    max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2021,
    n_estimators=10000, subsample=1, colsample_bytree=1
)
clf.fit(df_train.drop('label', axis=1), df_train['label'])
result = clf.predict_proba(df_test)
result
result_lgb = pd.DataFrame(result, columns=['prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7'])
result_lgb['file_id'] = df_test['file_id'].values
columns = ['file_id', 'prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7']
result_lgb.to_csv('./baseline_lgb_10000_tfidf.csv', index=False, columns=columns)

Wall time: 50min 31s


In [37]:
import xgboost as xgb
model_xgb = xgb.XGBClassifier(
            max_depth=9, learning_rate=0.005, n_estimators=10000, 
            objective='multi:softprob', tree_method='gpu_hist', 
            subsample=0.8, colsample_bytree=0.8, 
            min_child_samples=3, eval_metric='logloss', reg_lambda=0.5)
model_xgb.fit(df_train.drop('label', axis=1), df_train['label'])
result_xgb = model_xgb.predict_proba(df_test)
result_xgb = pd.DataFrame(result_xgb, columns=['prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7'])
result_xgb['file_id'] = df_test['file_id'].values
result_xgb

columns = ['file_id', 'prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7']
result_xgb.to_csv('./baseline_xgb_10000_tfidf.csv', index=False, columns=columns)





Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [38]:
# 对两个模型的结果 进行加权平均
result = result_lgb.copy()
weight_lgb, weight_xgb = 0.5, 0.5
result['prob0'] = result['prob0'] * weight_lgb + result_xgb['prob0'] * weight_xgb
result['prob1'] = result['prob1'] * weight_lgb + result_xgb['prob1'] * weight_xgb
result['prob2'] = result['prob2'] * weight_lgb + result_xgb['prob2'] * weight_xgb
result['prob3'] = result['prob3'] * weight_lgb + result_xgb['prob3'] * weight_xgb
result['prob4'] = result['prob4'] * weight_lgb + result_xgb['prob4'] * weight_xgb
result['prob5'] = result['prob5'] * weight_lgb + result_xgb['prob5'] * weight_xgb
result['prob6'] = result['prob6'] * weight_lgb + result_xgb['prob6'] * weight_xgb
result['prob7'] = result['prob7'] * weight_lgb + result_xgb['prob7'] * weight_xgb

columns = ['file_id', 'prob0','prob1','prob2','prob3','prob4','prob5','prob6','prob7']
result.to_csv('./baseline_lgb_xgb_10000_tfidf.csv', index=False, columns=columns)