In [1]:
import pandas as pd

def get_data(filename):
    result = []
    chunk_index = 0
    for df in pd.read_csv(open(filename, 'r'), chunksize = 1000000):
        result.append(df)
        chunk_index += 1
    results = pd.concat(result, ignore_index = True, axis = 0)
    return results

In [2]:
%%time
train = get_data('./datasets/security_train.csv')

Wall time: 32.5 s


In [3]:
%%time
test = get_data('./datasets/security_test.csv')

Wall time: 25.8 s


In [4]:
%%time
import pickle
with open('./datasets/train.pkl', 'wb') as f:
    pickle.dump(train, f)
with open('./datasets/test.pkl', 'wb') as f:
    pickle.dump(test, f)

Wall time: 8.97 s


In [5]:
%%time
import pickle
with open('./datasets/train.pkl', 'rb') as f:
    train = pickle.load(f)
with open('./datasets/test.pkl', 'rb') as f:
    test = pickle.load(f)

Wall time: 7.19 s


In [6]:
train.shape

(89806693, 5)

In [7]:
#train
import sys
sys.getsizeof(train)/1024/1024

8887.502380371094

In [8]:
## api encode / labelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df_all = pd.concat([train, test])
df_all['api'] = le.fit_transform(df_all['api'])

train['api'] = df_all[df_all['label'].notnull()]['api']
test['api'] = df_all[df_all['label'].isnull()]['api']

In [10]:
#train
import sys
sys.getsizeof(train)/1024/1024

3083.268253326416

In [11]:
import os
import psutil
mem = psutil.virtual_memory()
print('总内存', mem.total/1024/1024)
print('已使用内存', mem.used/1024/1024)
print('空闲内存', mem.free/1024/1024)
print('使用占比', mem.percent)
print('当前线程PID', os.getpid())
print('当前线程使用：',  psutil.Process(os.getpid()).memory_info().rss/1024/1024)

#CPU 
print('CPU counts', psutil.cpu_count())
print('cpu ratio', psutil.cpu_percent())


总内存 65468.93359375
已使用内存 23178.8046875
空闲内存 42290.12890625
使用占比 35.4
当前线程PID 3872
当前线程使用： 13724.51953125
CPU counts 8
cpu ratio 15.9


In [12]:
import gc
del df_all
gc.collect()

0

In [13]:
import os
import psutil
mem = psutil.virtual_memory()
print('总内存', mem.total/1024/1024)
print('已使用内存', mem.used/1024/1024)
print('空闲内存', mem.free/1024/1024)
print('使用占比', mem.percent)
print('当前线程PID', os.getpid())
print('当前线程使用：',  psutil.Process(os.getpid()).memory_info().rss/1024/1024)

#CPU 
print('CPU counts', psutil.cpu_count())
print('cpu ratio', psutil.cpu_percent())


总内存 65468.93359375
已使用内存 16072.3671875
空闲内存 49396.56640625
使用占比 24.5
当前线程PID 3872
当前线程使用： 6629.01171875
CPU counts 8
cpu ratio 17.8


In [15]:
# 构造新特征

def get_feature(df):
    df_file = df.groupby('file_id')
    if 'label' in df.columns:
        df1 = df.drop_duplicates(subset = ['file_id', 'label'], keep='first')
    else:
        df1 = df.drop_duplicates(subset = ['file_id'], keep='first')
    df1 = df1.sort_values('file_id')
    
    # 提取多个特征 统计特征
    features = ['api', 'tid', 'index']
    for f in features:
        df1[f + '_count'] = df_file[f].count().values
        df1[f + '_nuinque'] = df_file[f].nunique().values
        df1[f + '_min'] = df_file[f].min().values
        df1[f + '_max'] = df_file[f].max().values
        df1[f + '_median'] = df_file[f].median().values
        df1[f + '_std'] = df_file[f].std().values
    return df1


In [16]:
df_train = get_feature(train)
df_train

Unnamed: 0,file_id,label,api,tid,index,api_count,api_nuinque,api_min,api_max,api_median,...,tid_min,tid_max,tid_median,tid_std,index_count,index_nuinque,index_min,index_max,index_median,index_std
0,1,5,135,2488,0,6786,116,6,298,172.0,...,2488,2812,2488.0,83.881299,6786,5001,0,5000,1607.5,1510.694221
6786,2,2,95,2320,0,816,30,89,298,153.0,...,2320,2604,2480.0,101.506783,816,204,0,203,101.5,58.925137
7602,3,0,151,2208,0,463,42,9,258,151.0,...,2208,2208,2208.0,0.000000,463,463,0,462,231.0,133.800847
8065,4,0,95,2284,0,2046,51,9,257,151.0,...,2284,2980,2340.0,150.460506,2046,1028,0,1027,511.0,295.407885
10111,5,0,249,2500,0,10002,65,6,254,227.0,...,2500,2676,2596.0,49.556301,10002,5001,0,5000,2500.0,1443.736493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89620181,13883,2,95,100,0,178221,71,6,279,152.0,...,100,6568,3392.0,1405.045515,178221,5001,0,5000,47.0,1008.636040
89798402,13884,5,95,2592,0,1319,39,6,279,139.0,...,2592,2748,2592.0,4.295386,1319,1319,0,1318,659.0,380.906813
89799721,13885,0,151,2240,0,1033,71,8,259,170.0,...,2240,2744,2240.0,33.152020,1033,1033,0,1032,516.0,298.345717
89800754,13886,1,95,2324,0,5316,80,9,281,167.0,...,2324,2836,2600.0,154.796790,5316,2503,0,2502,1165.5,755.545651


In [17]:
df_test = get_feature(test)
df_test

Unnamed: 0,file_id,api,tid,index,api_count,api_nuinque,api_min,api_max,api_median,api_std,...,tid_min,tid_max,tid_median,tid_std,index_count,index_nuinque,index_min,index_max,index_median,index_std
0,1,226,2332,0,97,15,13,262,152.0,50.338954,...,2332,2568,2544.0,57.218548,97,31,0,30,14.0,9.210466
97,2,226,2472,0,1361,40,6,261,138.0,49.557795,...,2472,2748,2524.0,104.399149,1361,681,0,680,340.0,196.515744
1458,3,95,2344,0,16,9,16,257,134.0,75.152179,...,2344,2344,2344.0,0.000000,16,16,0,15,7.5,4.760952
1474,4,135,2452,0,193,34,13,262,170.0,45.377632,...,2452,2584,2452.0,50.951508,193,193,0,192,96.0,55.858452
1667,5,95,2332,0,803,34,16,261,153.0,55.316733,...,2332,2780,2376.0,201.826813,803,268,0,267,133.0,77.317048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79277890,12951,151,2644,0,289,37,9,269,151.0,63.718398,...,2644,2980,2776.0,75.402526,289,145,0,144,72.0,41.786414
79278179,12952,151,2264,0,112,28,56,261,152.0,42.920708,...,2264,2264,2264.0,0.000000,112,112,0,111,55.5,32.475632
79278291,12953,135,2324,0,5095,72,6,286,214.0,63.838795,...,2324,2884,2708.0,196.695730,5095,1464,0,1463,454.0,393.605016
79283386,12954,135,2424,0,2951,65,9,298,139.0,69.088685,...,2424,2700,2680.0,126.124152,2951,1445,0,1444,555.0,397.358069


In [18]:
# 0.72
import lightgbm as lgb

clf = lgb.LGBMClassifier(
    num_leaves=2**5 - 1, reg_alpha=0.25, reg_lambda= 0.25, objective='multiclass', max_depth=-1, learning_rate=0.005, min_child_samples=3,
    random_state=2021, n_estimators=10000, subsample=1, colsample_bytree=1
)

clf.fit(df_train.drop('label', axis=1), df_train['label'])

# result = ensemble_model(clf, features, labels, test_fea, cate_features)
result = clf.predict_proba(df_test)
result = pd.DataFrame(result, columns = ['prob0', 'prob1','prob2','prob3','prob4','prob5','prob6','prob7'])
result['file_id'] = df_test['file_id'].values
columns = ['file_id', 'prob0', 'prob1','prob2','prob3','prob4','prob5','prob6','prob7']
result.to_csv('v1 nl25 ne10000.csv', index = False, columns = columns)


In [19]:
%%time

# 0.683331

import xgboost as xgb

model_xgb = xgb.XGBClassifier(
    max_depth=9, learning_rate=0.005, n_estimators=2000, 
    objective='multi:softprob', tree_method='gpu_hist', 
    subsample=0.8, colsample_bytree=0.8, 
    min_child_samples=3, eval_metric='logloss', reg_lambda=0.5
)


model_xgb = model_xgb.fit(df_train.drop('label', axis=1), df_train['label'])

# result = ensemble_model(clf, features, labels, test_fea, cate_features)
result = model_xgb.predict_proba(df_test)
result = pd.DataFrame(result, columns = ['prob0', 'prob1','prob2','prob3','prob4','prob5','prob6','prob7'])
result['file_id'] = df_test['file_id'].values
columns = ['file_id', 'prob0', 'prob1','prob2','prob3','prob4','prob5','prob6','prob7']
result.to_csv('v1 xgb.csv', index = False, columns = columns)




Parameters: { "min_child_samples" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Wall time: 3min 1s


In [20]:
result1 = pd.read_csv('./v1 nl25 ne10000.csv')
result2 = pd.read_csv('./v1 xgb.csv')

weight1 = 0.3
result['prob0'] = result1['prob0'] * weight1  + result2['prob0'] * (1-weight1)
result['prob1'] = result1['prob1'] * weight1  + result2['prob1'] * (1-weight1)
result['prob2'] = result1['prob2'] * weight1  + result2['prob2'] * (1-weight1)
result['prob3'] = result1['prob3'] * weight1  + result2['prob3'] * (1-weight1)
result['prob4'] = result1['prob4'] * weight1  + result2['prob4'] * (1-weight1)
result['prob5'] = result1['prob5'] * weight1  + result2['prob5'] * (1-weight1)
result['prob6'] = result1['prob6'] * weight1  + result2['prob6'] * (1-weight1)
result['prob7'] = result1['prob7'] * weight1  + result2['prob7'] * (1-weight1)



In [21]:
columns = ['file_id', 'prob0', 'prob1','prob2','prob3','prob4','prob5','prob6','prob7']

result.to_csv('merge.csv', index=False, columns=columns)