In [9]:
import pandas as pd

def get_data(filename):
    """Load a CSV file into a single DataFrame, reading it in chunks.

    The file is read in chunks of 1,000,000 rows; the chunks are then
    concatenated row-wise and given a fresh 0..n-1 index.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Every row of the file, in file order, with a new integer index.
    """
    # Open the file in a `with` block so the handle is closed as soon as the
    # last chunk has been consumed (it used to be left open).
    with open(filename, 'r') as handle:
        chunks = list(pd.read_csv(handle, chunksize=1000000))
    return pd.concat(chunks, ignore_index=True, axis=0)

In [10]:
%%time
# Read the training CSV into `train` with the chunked loader `get_data`.
train = get_data('./datasets/security_train.csv')

Wall time: 32.6 s


In [None]:
%%time
# Read the test CSV into `test` with the chunked loader `get_data`.
test = get_data('./datasets/security_test.csv')

In [None]:
%%time
import pickle

# Serialise the `train` and `test` frames to pickle files under ./datasets/.
for pkl_path, frame in (('./datasets/train.pkl', train), ('./datasets/test.pkl', test)):
    with open(pkl_path, 'wb') as f:
        pickle.dump(frame, f)

In [None]:
%%time
import pickle
# Restore the `train` and `test` frames from the pickle files under ./datasets/.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load pickles that come from a trusted source.
with open('./datasets/train.pkl', 'rb') as f:
    train = pickle.load(f)
with open('./datasets/test.pkl', 'rb') as f:
    test = pickle.load(f)

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Size of `train` as reported by sys.getsizeof, converted from bytes to MiB.
import sys
sys.getsizeof(train)/1024/1024

In [None]:
# Encode the `api` column as integer codes with a LabelEncoder.
# The encoder is fitted on train and test stacked together, so both frames
# share a single value-to-code mapping.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df_all = pd.concat([train, test])
df_all['api'] = le.fit_transform(df_all['api'])

# Rows that carry a label are written back to `train`, rows without one to
# `test`; the assignment relies on pandas aligning the values on the index.
train['api'] = df_all.loc[df_all['label'].notnull(), 'api']
test['api'] = df_all.loc[df_all['label'].isnull(), 'api']

In [None]:
# Display the `train` frame.
train

In [None]:
# Size of `train` as reported by sys.getsizeof, converted from bytes to MiB.
import sys
sys.getsizeof(train)/1024/1024

In [None]:
import os
import psutil

# Divisor that turns a byte count into MiB.
MIB = 1024 * 1024

# System-wide memory figures, followed by this process's resident set size.
mem = psutil.virtual_memory()
pid = os.getpid()
print('总内存', mem.total / MIB)
print('已使用内存', mem.used / MIB)
print('空闲内存', mem.free / MIB)
print('使用占比', mem.percent)
print('当前线程PID', pid)
print('当前线程使用：', psutil.Process(pid).memory_info().rss / MIB)

# CPU count and current utilisation as reported by psutil.
print('CPU counts', psutil.cpu_count())
print('cpu ratio', psutil.cpu_percent())


In [None]:
import gc
# Drop the name bound to the stacked train+test frame, then run a
# garbage-collection pass.
del df_all
gc.collect()

In [None]:
import os
import psutil

# Divisor that turns a byte count into MiB.
MIB = 1024 * 1024

# System-wide memory figures, followed by this process's resident set size.
mem = psutil.virtual_memory()
pid = os.getpid()
print('总内存', mem.total / MIB)
print('已使用内存', mem.used / MIB)
print('空闲内存', mem.free / MIB)
print('使用占比', mem.percent)
print('当前线程PID', pid)
print('当前线程使用：', psutil.Process(pid).memory_info().rss / MIB)

# CPU count and current utilisation as reported by psutil.
print('CPU counts', psutil.cpu_count())
print('cpu ratio', psutil.cpu_percent())


In [None]:
# First rows of `train`.
train.head()

In [None]:
# Build new features

def get_feature(df):
    """Aggregate `df` by `file_id` and attach per-file summary statistics.

    For each of the columns `api`, `tid` and `index` the per-`file_id`
    count, number of distinct values, min, max, median and standard deviation
    are computed and added as `<column>_count`, `<column>_nuinque`,
    `<column>_min`, `<column>_max`, `<column>_median` and `<column>_std`.
    (The `_nuinque` spelling is kept so existing column names do not change.)

    The returned rows are the first occurrence of every (`file_id`, `label`)
    pair -- or of every `file_id` when `df` has no `label` column -- sorted by
    `file_id`. The original columns of those rows are kept unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `file_id`, `api`, `tid` and `index`; `label` is optional.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated rows with the 18 statistic columns appended. The
        row index is inherited from `df`.
    """
    features = ['api', 'tid', 'index']
    # Aggregation name -> column suffix, in the order the columns are emitted.
    suffix_of = {
        'count': '_count',
        'nunique': '_nuinque',
        'min': '_min',
        'max': '_max',
        'median': '_median',
        'std': '_std',
    }

    # One group-by pass for every (column, statistic) pair; the result has a
    # two-level column index that is flattened to `<column><suffix>`.
    stats = df.groupby('file_id')[features].agg(list(suffix_of))
    stats.columns = [col + suffix_of[stat] for col, stat in stats.columns]

    key_cols = ['file_id', 'label'] if 'label' in df.columns else ['file_id']
    df1 = df.drop_duplicates(subset=key_cols, keep='first').sort_values('file_id')

    # Attach the statistics by `file_id` key rather than by row position, so
    # the result stays correct even when the de-duplicated rows and the groups
    # do not line up one-to-one (e.g. one file_id appearing with two labels).
    return df1.join(stats, on='file_id')


In [None]:
# Build the per-file feature frame for the training data and display it.
df_train = get_feature(train)
df_train

In [None]:
# Build the per-file feature frame for the test data and display it.
df_test = get_feature(test)
df_test

In [None]:
# 0.72  (author's note -- presumably the score obtained with this setup)
import lightgbm as lgb

# Multiclass LightGBM model: many trees with a small learning rate.
clf = lgb.LGBMClassifier(
    objective='multiclass',
    n_estimators=10000,
    learning_rate=0.005,
    num_leaves=2**5 - 1,
    max_depth=-1,
    min_child_samples=3,
    subsample=1,
    colsample_bytree=1,
    reg_alpha=0.25,
    reg_lambda=0.25,
    random_state=2021,
)

# Train on every column of df_train except the target.
clf.fit(df_train.drop(columns='label'), df_train['label'])

# Submission layout: file id followed by the eight class probabilities.
columns = ['file_id'] + ['prob%d' % i for i in range(8)]

result = pd.DataFrame(clf.predict_proba(df_test), columns=columns[1:])
result['file_id'] = df_test['file_id'].values
result.to_csv('v1 nl25 ne10000.csv', index=False, columns=columns)


In [None]:
%%time

# 0.683331  (author's note -- presumably the score obtained with this setup)

import xgboost as xgb

# Multiclass XGBoost model.
# - `min_child_samples` was removed: it is a LightGBM parameter that XGBoost
#   does not define, so it had no effect on the model. XGBoost's analogue is
#   `min_child_weight`; add it explicitly if a leaf-size constraint is wanted.
# - `eval_metric` is now 'mlogloss', the multiclass log loss that matches the
#   'multi:softprob' objective ('logloss' is the binary metric). No eval_set
#   is passed to fit(), so this does not change the fitted model.
# NOTE(review): 'gpu_hist' needs a GPU-enabled XGBoost build; newer releases
# reportedly prefer tree_method='hist' with device='cuda' -- confirm against
# the installed version before changing it.
model_xgb = xgb.XGBClassifier(
    max_depth=9, learning_rate=0.005, n_estimators=2000,
    objective='multi:softprob', tree_method='gpu_hist',
    subsample=0.8, colsample_bytree=0.8,
    eval_metric='mlogloss', reg_lambda=0.5
)

# Train on every column of df_train except the target.
model_xgb = model_xgb.fit(df_train.drop('label', axis=1), df_train['label'])

# Submission layout: file id followed by the eight class probabilities.
columns = ['file_id', 'prob0', 'prob1', 'prob2', 'prob3', 'prob4', 'prob5', 'prob6', 'prob7']

result = model_xgb.predict_proba(df_test)
result = pd.DataFrame(result, columns=columns[1:])
result['file_id'] = df_test['file_id'].values
result.to_csv('v1 xgb.csv', index=False, columns=columns)


In [None]:
# Blend the two saved submissions: per-class weighted average of the
# probabilities, with `weight1` on the first file and `1 - weight1` on the second.
result1 = pd.read_csv('./v1 nl25 ne10000.csv')
result2 = pd.read_csv('./v1 xgb.csv')

# The average is taken row by row, so both files must list the same file ids
# in the same order.
if not result1['file_id'].equals(result2['file_id']):
    raise ValueError('file_id columns of the two submissions do not match')

weight1 = 0.3

# Build `result` from the file ids read from disk instead of mutating whatever
# `result` frame an earlier cell happened to leave in the kernel; this cell
# now only depends on the two CSV files.
result = result1[['file_id']].copy()
for prob_col in ['prob%d' % i for i in range(8)]:
    result[prob_col] = result1[prob_col] * weight1 + result2[prob_col] * (1 - weight1)



In [None]:
# Submission layout: file id followed by the eight class probabilities.
columns = ['file_id'] + ['prob{}'.format(i) for i in range(8)]

# Write the blended predictions without the row index.
result.to_csv('merge.csv', columns=columns, index=False)