In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
X = pd.read_csv('independent_variable.csv')
X.head()

Unnamed: 0,Sulfur_content_of_raw_material,RON_of_raw_material,Saturated_hydrocarbon,olefin,Bromine,density,Substitute_coke,Substitute_S,Regenerated_coke,Regenerated_S,...,S-ZORB.CAL_1.CANGLIANG.PV,S-ZORB.FT_1006.DACA.PV,S-ZORB.FT_5204.DACA.PV,S-ZORB.FT_1006.TOTALIZERA.PV,S-ZORB.FT_5204.TOTALIZERA.PV,S-ZORB.FT_1503.DACA.PV,S-ZORB.FT_1503.TOTALIZERA.PV,S-ZORB.FT_1504.DACA.PV,S-ZORB.FT_1504.TOTALIZERA.PV,S-ZORB.PC_1001A.PV
0,188.0,90.6,53.23,24.4,61.49,726.09,2.32,7.3,1.84,5.98,...,2.0454,6368.747,233.3108,83086802.0,832503.795,2216.4094,39063124.5,1840.1447,39608757.0,0.3533
1,169.0,90.5,52.3,26.4,61.88,731.3,2.37,7.34,0.55,4.38,...,2.026,6360.6453,242.3692,82318954.0,803462.665,2370.5874,38810581.5,1641.7326,39389299.0,0.3545
2,177.0,90.7,52.3,26.31,61.72,729.61,2.43,7.27,1.89,5.82,...,1.9647,6504.9649,233.0769,82012004.0,791925.055,2326.4654,38693812.0,1600.6758,39312616.5,0.3502
3,159.0,90.4,52.3,26.1,61.33,725.4,3.08,7.35,0.98,4.67,...,2.039,6506.825,238.3499,81231373.5,762863.81,2495.2236,38410862.5,1563.7122,39120204.5,0.3539
4,173.0,89.6,52.24,26.67,61.33,725.43,2.45,6.58,0.83,4.52,...,1.9869,6560.2423,236.5762,80915707.5,751362.3,2807.7891,38283000.0,1554.3574,39045953.5,0.3581


In [9]:
y = pd.read_csv('dependent_variable.csv').values.reshape(-1)

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
param = {'boosting_type':'gbdt',
         'objective' : 'regression', #任务类型
         'metric' : 'mse', #评估指标
         'learning_rate' : 0.01, #学习率
         'max_depth' : 20, #树的最大深度
         'feature_fraction':0.8, #设置在每次迭代中使用特征的比例
         'bagging_fraction': 0.8, #样本采样比例
         'bagging_freq': 8, #bagging的次数
         'lambda_l1': 0.3, #L1正则
          'lambda_l2': 0, #L2正则
        }
trn_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_valid, label=y_valid)
model = lgb.train(param,trn_data,valid_sets=[trn_data,val_data],\
                  num_boost_round = 100000,early_stopping_rounds=1000,verbose_eval=25)

Training until validation scores don't improve for 1000 rounds
[25]	training's l2: 0.653367	valid_1's l2: 0.567413
[50]	training's l2: 0.433529	valid_1's l2: 0.379083
[75]	training's l2: 0.300202	valid_1's l2: 0.267159
[100]	training's l2: 0.216995	valid_1's l2: 0.196481
[125]	training's l2: 0.156562	valid_1's l2: 0.147873
[150]	training's l2: 0.118572	valid_1's l2: 0.116816
[175]	training's l2: 0.0946077	valid_1's l2: 0.0974379
[200]	training's l2: 0.0771785	valid_1's l2: 0.0840825
[225]	training's l2: 0.0656658	valid_1's l2: 0.0756431
[250]	training's l2: 0.057517	valid_1's l2: 0.0700732
[275]	training's l2: 0.0512845	valid_1's l2: 0.0669741
[300]	training's l2: 0.0469252	valid_1's l2: 0.0645268
[325]	training's l2: 0.0435855	valid_1's l2: 0.0632908
[350]	training's l2: 0.0407063	valid_1's l2: 0.0622546
[375]	training's l2: 0.0378894	valid_1's l2: 0.0606776
[400]	training's l2: 0.0354574	valid_1's l2: 0.0597145
[425]	training's l2: 0.033168	valid_1's l2: 0.0588564
[450]	training's l2

In [36]:
from sklearn.model_selection import KFold

## 通过交叉验证方法计算各特征的重要

In [52]:
def get_feature_importance(cv,x,y,param):
    folds = KFold(n_splits=cv,shuffle=True,random_state=15)
    oof = np.zeros(X_train.shape[0])
    feature_names = x.columns
    feature_importance = pd.DataFrame({
        'column': feature_names,
        'importance': np.zeros(len(feature_names)),
    })
    for fold_,(trn_idx,val_idx) in enumerate(folds.split(x,y)):
        print('fold {}'.format(fold_))
        trn_data = lgb.Dataset(x.iloc[trn_idx,:],
                               label=y[trn_idx],
                               )
        val_data = lgb.Dataset(x.iloc[val_idx,:],
                               label=y[val_idx],
                               )
        model = lgb.train(param,trn_data,valid_sets=[trn_data,val_data],\
                  num_boost_round = 100000,early_stopping_rounds=1000,verbose_eval=25)
        feature_importance['importance'] += model.feature_importance()
    feature_importance['importance'] /= cv
    return feature_importance

In [None]:
param = {'boosting_type':'gbdt',
         'objective' : 'regression', #任务类型
         'metric' : 'mse', #评估指标
         'learning_rate' : 0.01, #学习率
         'max_depth' : 20, #树的最大深度
         'feature_fraction':0.8, #设置在每次迭代中使用特征的比例
         'bagging_fraction': 0.8, #样本采样比例
         'bagging_freq': 8, #bagging的次数
         'lambda_l1': 0.3, #L1正则
          'lambda_l2': 0, #L2正则
        }

In [53]:
feature_importance = get_feature_importance(5,X_train,y_train,param)

fold 0
Training until validation scores don't improve for 1000 rounds
[25]	training's l2: 0.61506	valid_1's l2: 0.957861
[50]	training's l2: 0.417935	valid_1's l2: 0.682142
[75]	training's l2: 0.294822	valid_1's l2: 0.514004
[100]	training's l2: 0.217287	valid_1's l2: 0.402598
[125]	training's l2: 0.165185	valid_1's l2: 0.324817
[150]	training's l2: 0.131481	valid_1's l2: 0.270882
[175]	training's l2: 0.108024	valid_1's l2: 0.232383
[200]	training's l2: 0.0919636	valid_1's l2: 0.20534
[225]	training's l2: 0.0807134	valid_1's l2: 0.184547
[250]	training's l2: 0.0714235	valid_1's l2: 0.168263
[275]	training's l2: 0.0647425	valid_1's l2: 0.157094
[300]	training's l2: 0.0593374	valid_1's l2: 0.148553
[325]	training's l2: 0.054878	valid_1's l2: 0.141631
[350]	training's l2: 0.0511723	valid_1's l2: 0.135954
[375]	training's l2: 0.048006	valid_1's l2: 0.133024
[400]	training's l2: 0.0452098	valid_1's l2: 0.131582
[425]	training's l2: 0.042988	valid_1's l2: 0.129163
[450]	training's l2: 0.0407

In [55]:
feature_importance.sort_values(by='importance',ascending=False).values

array([['RON_of_raw_material', 861.2],
       ['density', 151.8],
       ['S-ZORB.FT_9301.PV', 133.0],
       ['S-ZORB.TE_5008.DACA', 116.2],
       ['S-ZORB.AT_1001.DACA', 114.0],
       ['S-ZORB.PDT_2606.DACA', 112.6],
       ['S-ZORB.FT_1003.PV', 93.0],
       ['S-ZORB.LT_1002.DACA', 88.8],
       ['S-ZORB.FT_1504.DACA.PV', 86.8],
       ['S-ZORB.TE_1105.PV', 84.8],
       ['S-ZORB.AT-0009.DACA.PV', 81.0],
       ['S-ZORB.PT_1604.DACA', 80.0],
       ['S-ZORB.FT_3702.DACA', 78.8],
       ['S-ZORB.LT_9001.DACA', 78.4],
       ['S-ZORB.FC_1203.PV', 77.6],
       ['S-ZORB.FT_2431.DACA', 75.6],
       ['S-ZORB.TE_5102.PV', 75.2],
       ['S-ZORB.FC_2702.DACA', 72.4],
       ['S-ZORB.FT_1001.PV', 71.8],
       ['S-ZORB.FC_1102.PV', 69.6],
       ['S-ZORB.FC_5001.DACA', 68.6],
       ['S-ZORB.FC_1201.PV', 67.4],
       ['S-ZORB.PC_1001A.PV', 67.2],
       ['S-ZORB.AT-0011.DACA.PV', 67.2],
       ['S-ZORB.CAL_H2.PV', 65.4],
       ['S-ZORB.TE_5202.PV', 65.4],
       ['Substitute_S', 65.4],

In [57]:
feature_importances = get_feature_importance(10,X_train,y_train,param)

fold 0
Training until validation scores don't improve for 1000 rounds
[25]	training's l2: 0.634143	valid_1's l2: 0.953833
[50]	training's l2: 0.425587	valid_1's l2: 0.70587
[75]	training's l2: 0.295418	valid_1's l2: 0.544858
[100]	training's l2: 0.212112	valid_1's l2: 0.433296
[125]	training's l2: 0.155098	valid_1's l2: 0.343604
[150]	training's l2: 0.12006	valid_1's l2: 0.288937
[175]	training's l2: 0.0970554	valid_1's l2: 0.254851
[200]	training's l2: 0.0808049	valid_1's l2: 0.225746
[225]	training's l2: 0.0690904	valid_1's l2: 0.201965
[250]	training's l2: 0.0604772	valid_1's l2: 0.180016
[275]	training's l2: 0.0549816	valid_1's l2: 0.168844
[300]	training's l2: 0.0504276	valid_1's l2: 0.160748
[325]	training's l2: 0.046484	valid_1's l2: 0.158205
[350]	training's l2: 0.0432415	valid_1's l2: 0.151281
[375]	training's l2: 0.0398983	valid_1's l2: 0.143997
[400]	training's l2: 0.0371471	valid_1's l2: 0.140179
[425]	training's l2: 0.0351035	valid_1's l2: 0.137066
[450]	training's l2: 0.0

In [58]:
feature_importance.sort_values(by='importance',ascending=False)

Unnamed: 0,column,importance
1,RON_of_raw_material,861.2
5,density,151.8
32,S-ZORB.FT_9301.PV,133.0
305,S-ZORB.TE_5008.DACA,116.2
190,S-ZORB.AT_1001.DACA,114.0
...,...,...
118,S-ZORB.FT_9102.TOTAL,0.0
119,S-ZORB.FT_1001.TOTAL,0.0
111,S-ZORB.FT_9403.TOTAL,0.0
96,S-ZORB.FT_1004.TOTAL,0.0


In [125]:
df.iloc[:30,:]

Unnamed: 0,column,importance
1,RON_of_raw_material,861.2
5,density,151.8
32,S-ZORB.FT_9301.PV,133.0
305,S-ZORB.TE_5008.DACA,116.2
190,S-ZORB.AT_1001.DACA,114.0
232,S-ZORB.PDT_2606.DACA,112.6
55,S-ZORB.FT_1003.PV,93.0
171,S-ZORB.LT_1002.DACA,88.8
351,S-ZORB.FT_1504.DACA.PV,86.8
62,S-ZORB.TE_1105.PV,84.8


In [131]:
dfs = feature_importances.sort_values(by='importance',ascending=False)
dfs.iloc[:30,:9]

Unnamed: 0,column,importance
1,RON_of_raw_material,860.9
351,S-ZORB.FT_1504.DACA.PV,60.8
232,S-ZORB.PDT_2606.DACA,53.6
32,S-ZORB.FT_9301.PV,48.8
83,S-ZORB.TC_2801.PV,41.9
80,S-ZORB.FC_1203.PV,41.3
171,S-ZORB.LT_1002.DACA,41.2
170,S-ZORB.SIS_TE_2802,37.4
55,S-ZORB.FT_1003.PV,37.2
227,S-ZORB.FT_3702.DACA,35.7


In [152]:
columns_1 = df.iloc[:30,:]['column'].values
columns_2 = dfs.iloc[:30,:]['column'].values

In [153]:
list(set(columns_1).intersection(set(columns_2)))

['S-ZORB.PC_1001A.PV',
 'S-ZORB.FC_1201.PV',
 'S-ZORB.PT_1604.DACA',
 'S-ZORB.FT_1001.PV',
 'S-ZORB.FT_3702.DACA',
 'S-ZORB.LT_1002.DACA',
 'S-ZORB.TE_1501.DACA',
 'S-ZORB.TE_5008.DACA',
 'RON_of_raw_material',
 'S-ZORB.TE_1105.PV',
 'S-ZORB.FT_1504.DACA.PV',
 'S-ZORB.PDT_2606.DACA',
 'S-ZORB.FC_1203.PV',
 'S-ZORB.TE_5202.PV',
 'S-ZORB.FC_5001.DACA',
 'S-ZORB.FT_1003.PV',
 'S-ZORB.FT_9301.PV']

In [None]:
['S-ZORB.LT_1002.DACA',
 'RON_of_raw_material',
 'S-ZORB.FT_1504.DACA.PV',
 'S-ZORB.PDT_2606.DACA',
 'S-ZORB.FT_1003.PV',
 'S-ZORB.FT_9301.PV']

In [154]:
x = X_train[['S-ZORB.PC_1001A.PV',
 'S-ZORB.FC_1201.PV',
 'S-ZORB.PT_1604.DACA',
 'S-ZORB.FT_1001.PV',
 'S-ZORB.FT_3702.DACA',
 'S-ZORB.LT_1002.DACA',
 'S-ZORB.TE_1501.DACA',
 'S-ZORB.TE_5008.DACA',
 'RON_of_raw_material',
 'S-ZORB.TE_1105.PV',
 'S-ZORB.FT_1504.DACA.PV',
 'S-ZORB.PDT_2606.DACA',
 'S-ZORB.FC_1203.PV',
 'S-ZORB.TE_5202.PV',
 'S-ZORB.FC_5001.DACA',
 'S-ZORB.FT_1003.PV',
 'S-ZORB.FT_9301.PV']]
x1 = X_valid[['S-ZORB.PC_1001A.PV',
 'S-ZORB.FC_1201.PV',
 'S-ZORB.PT_1604.DACA',
 'S-ZORB.FT_1001.PV',
 'S-ZORB.FT_3702.DACA',
 'S-ZORB.LT_1002.DACA',
 'S-ZORB.TE_1501.DACA',
 'S-ZORB.TE_5008.DACA',
 'RON_of_raw_material',
 'S-ZORB.TE_1105.PV',
 'S-ZORB.FT_1504.DACA.PV',
 'S-ZORB.PDT_2606.DACA',
 'S-ZORB.FC_1203.PV',
 'S-ZORB.TE_5202.PV',
 'S-ZORB.FC_5001.DACA',
 'S-ZORB.FT_1003.PV',
 'S-ZORB.FT_9301.PV']]
trn_data = lgb.Dataset(x, label=y_train)
val_data = lgb.Dataset(x1, label=y_valid)
model = lgb.train(param,trn_data,valid_sets=[trn_data,val_data],\
                  num_boost_round = 100000,early_stopping_rounds=1000,verbose_eval=25)

Training until validation scores don't improve for 1000 rounds
[25]	training's l2: 0.655256	valid_1's l2: 0.563855
[50]	training's l2: 0.448924	valid_1's l2: 0.388061
[75]	training's l2: 0.315777	valid_1's l2: 0.276304
[100]	training's l2: 0.225206	valid_1's l2: 0.199105
[125]	training's l2: 0.162195	valid_1's l2: 0.148396
[150]	training's l2: 0.125184	valid_1's l2: 0.119751
[175]	training's l2: 0.099887	valid_1's l2: 0.0984244
[200]	training's l2: 0.0831099	valid_1's l2: 0.0861992
[225]	training's l2: 0.0714436	valid_1's l2: 0.0778355
[250]	training's l2: 0.0633596	valid_1's l2: 0.0719615
[275]	training's l2: 0.0571631	valid_1's l2: 0.0678692
[300]	training's l2: 0.0530895	valid_1's l2: 0.0665469
[325]	training's l2: 0.049793	valid_1's l2: 0.0643434
[350]	training's l2: 0.046853	valid_1's l2: 0.0630545
[375]	training's l2: 0.044133	valid_1's l2: 0.062164
[400]	training's l2: 0.0418425	valid_1's l2: 0.0609886
[425]	training's l2: 0.0395311	valid_1's l2: 0.0600771
[450]	training's l2: 0

In [129]:
def get_feature(feature_importance,x_train,y_train,x_val,y_val):
    importance = feature_importance['importance'].values
    Loss = []
    for i in range(1,31):
        column = feature_importance['column'][:i].values
        a = x_train[columns]
        b = x_val[columns]
        trn_data = lgb.Dataset(a, label=y_train)
        val_data = lgb.Dataset(b, label=y_val)
        model = lgb.train(param,trn_data,valid_sets=[trn_data,val_data],\
                  num_boost_round = 100000,early_stopping_rounds=1000,verbose_eval=25)
        loss = list(model.best_score['valid_1'].items())[0][1]
        Loss.append(loss)
    return index

In [122]:
column = feature_importance['column'][:17].values
a = X_train[columns]
b = X_valid[columns]
trn_data = lgb.Dataset(a, label=y_train)
val_data = lgb.Dataset(b, label=y_valid)
model = lgb.train(param,trn_data,valid_sets=[val_data],\
                  num_boost_round = 100000,early_stopping_rounds=1000,verbose_eval=25)

Training until validation scores don't improve for 1000 rounds
[25]	valid_0's l2: 0.58738
[50]	valid_0's l2: 0.412365
[75]	valid_0's l2: 0.296564
[100]	valid_0's l2: 0.218539
[125]	valid_0's l2: 0.168525
[150]	valid_0's l2: 0.135905
[175]	valid_0's l2: 0.115203
[200]	valid_0's l2: 0.101726
[225]	valid_0's l2: 0.0913761
[250]	valid_0's l2: 0.0840688
[275]	valid_0's l2: 0.0795088
[300]	valid_0's l2: 0.0770386
[325]	valid_0's l2: 0.0749748
[350]	valid_0's l2: 0.0732486
[375]	valid_0's l2: 0.0718352
[400]	valid_0's l2: 0.070946
[425]	valid_0's l2: 0.0697162
[450]	valid_0's l2: 0.0686863
[475]	valid_0's l2: 0.0677853
[500]	valid_0's l2: 0.0676794
[525]	valid_0's l2: 0.0674591
[550]	valid_0's l2: 0.0674169
[575]	valid_0's l2: 0.0665503
[600]	valid_0's l2: 0.0657651
[625]	valid_0's l2: 0.0651051
[650]	valid_0's l2: 0.0650964
[675]	valid_0's l2: 0.0648784
[700]	valid_0's l2: 0.0648222
[725]	valid_0's l2: 0.0644033
[750]	valid_0's l2: 0.0645886
[775]	valid_0's l2: 0.0643832
[800]	valid_0's l2: 

In [118]:
column

array(['Sulfur_content_of_raw_material', 'RON_of_raw_material',
       'Saturated_hydrocarbon', 'olefin'], dtype=object)

In [111]:
index = get_feature(feature_importance,X_train,y_train,X_valid,y_valid)

Training until validation scores don't improve for 1000 rounds
[25]	training's l2: 0.670927	valid_1's l2: 0.58738
[50]	training's l2: 0.462914	valid_1's l2: 0.412365
[75]	training's l2: 0.324266	valid_1's l2: 0.296564
[100]	training's l2: 0.232438	valid_1's l2: 0.218539
[125]	training's l2: 0.170285	valid_1's l2: 0.168525
[150]	training's l2: 0.130602	valid_1's l2: 0.135905
[175]	training's l2: 0.104982	valid_1's l2: 0.115203
[200]	training's l2: 0.088014	valid_1's l2: 0.101726
[225]	training's l2: 0.075542	valid_1's l2: 0.0913761
[250]	training's l2: 0.0666617	valid_1's l2: 0.0840688
[275]	training's l2: 0.0598698	valid_1's l2: 0.0795088
[300]	training's l2: 0.0551357	valid_1's l2: 0.0770386
[325]	training's l2: 0.051238	valid_1's l2: 0.0749748
[350]	training's l2: 0.048011	valid_1's l2: 0.0732486
[375]	training's l2: 0.0452335	valid_1's l2: 0.0718352
[400]	training's l2: 0.0426275	valid_1's l2: 0.070946
[425]	training's l2: 0.0403484	valid_1's l2: 0.0697162
[450]	training's l2: 0.038

In [112]:
index

1

In [113]:
list(model.best_score['valid_1'].items())[0][1]

0.06281392841050108