## 导入所需要的库

In [1]:
import pandas as pd
import numpy as np
import lightgbm
from sklearn.model_selection import train_test_split

## 读取文件路径

In [2]:
# Import the path constants this notebook uses by name, so it is clear
# where each one comes from and the namespace is not polluted by a star import.
from data import failure_tag_data_path, kernel_log_data_path, model_data_path

## 特征工程

In [3]:
# Load the raw kernel log records from the CSV at the configured path.
org_kernel_log = pd.read_csv(filepath_or_buffer=kernel_log_data_path)

In [4]:
# Bucket every record into a 5-minute window by rounding its timestamp up
# to the next 5-minute boundary.
collect_time_5min = pd.to_datetime(org_kernel_log['collect_time']).dt.ceil("5min")
org_kernel_log['collect_time'] = collect_time_5min

In [5]:
# Collapse to one row per (serial_number, 5-minute window), summing the
# remaining columns within each group.
group_min = org_kernel_log.groupby(by=['serial_number', 'collect_time'], as_index=False).sum()

In [6]:
# Load the failure tag records from the CSV at the configured path.
failure_tag = pd.read_csv(filepath_or_buffer=failure_tag_data_path)

In [7]:
# Join the aggregated log rows with the failure times into a single table.
# The left join keeps log rows whose serial number has no failure record;
# their failure_time is left missing.
failure_tag['failure_time'] = pd.to_datetime(failure_tag['failure_time'])
failure_times = failure_tag[['serial_number', 'failure_time']]
merged_data = group_min.merge(failure_times, how='left', on=['serial_number'])

In [8]:
# Seconds from each log window to the failure time (missing when the row
# has no failure_time).
time_to_failure = merged_data['failure_time'] - merged_data['collect_time']
merged_data['failure_dis'] = time_to_failure.dt.total_seconds()

# Remove every serial number whose smallest failure_dis is greater than 1200.
# A group whose minimum is NaN compares False here and is therefore kept.
remove_id = [
    row_idx
    for _, sn_rows in merged_data.groupby('serial_number', as_index=False)
    if np.min(sn_rows['failure_dis'].values) > 1200
    for row_idx in sn_rows.index
]
org_size = len(merged_data)
merged_data = merged_data.drop(index=remove_id).reset_index(drop=True)
new_size = len(merged_data)
print("filter: %d -> %d" % (org_size, new_size))

# Multi-class label from 240-second bands of failure_dis:
#   1: < 240, 2: [240, 480), 3: [480, 720), 4: [720, 960), 5: [960, 1200),
#   0: everything else (>= 1200 or missing).
# np.select takes the first matching condition, so the cumulative upper
# bounds below reproduce the half-open bands.
upper_bounds = (240, 480, 720, 960, 1200)
label = np.select(
    [(merged_data['failure_dis'] < bound).to_numpy() for bound in upper_bounds],
    list(range(1, len(upper_bounds) + 1)),
    default=0,
)

# Append the label as the last column and discard the helper distance column.
merged_data = merged_data.assign(failure_tag=label).drop(columns='failure_dis')

filter: 490469 -> 474004


In [9]:
# Keep only the model inputs and the label: the identifier, timestamp and
# manufacturer/vendor columns are dropped.
feature_data = merged_data.drop(
    columns=['serial_number', 'collect_time', 'manufacturer', 'vendor', 'failure_time']
)

# Down-sample the negative class (failure_tag == 0) to 10% and keep every
# non-zero row. The fixed seed makes the draw repeatable across re-runs.
is_negative = feature_data['failure_tag'] == 0
sample_0 = feature_data[is_negative].sample(frac=0.1, random_state=42)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and keeps the same row order and original index.
sample = pd.concat([sample_0, feature_data[~is_negative]])

In [10]:
# Split into training and held-out sets (5% held out).
# The label column is removed by name rather than by position, so the split
# does not silently depend on failure_tag being the last column.
# The fixed seed makes the split repeatable across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    sample.drop(columns='failure_tag'),
    sample['failure_tag'],
    test_size=0.05,
    random_state=42,
)

In [11]:
# Wrap the training split in a LightGBM Dataset.
train_dataset = lightgbm.Dataset(data=x_train, label=y_train)
# Wrap the held-out split the same way; it is passed as the validation set
# when training.
test_dataset = lightgbm.Dataset(data=x_test, label=y_test)

## 训练模型

In [12]:
# Multi-class training configuration, evaluated with the multi-class error rate.
# NOTE(review): num_class is 7, but the labels built earlier in this notebook
# only take the values 0-5 (six classes) — confirm whether 7 is intentional
# before changing it, since it determines the width of the model's output.
params = dict(
    objective='multiclass',
    metric='multi_error',
    num_class=7,
    verbose=-1,
)

# Train with the held-out set as validation data; stop once the validation
# metric has not improved for 10 consecutive rounds.
stop_when_stalled = lightgbm.early_stopping(stopping_rounds=10)
model = lightgbm.train(
    params=params,
    train_set=train_dataset,
    valid_sets=[test_dataset],
    callbacks=[stop_when_stalled],
)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[12]	valid_0's multi_error: 0.0243902


### 将模型保存为文件

In [13]:
# Write the trained booster to the configured model path.
model.save_model(filename=model_data_path)

<lightgbm.basic.Booster at 0x116815e4b20>