In [1]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import gc

import warnings
warnings.simplefilter('ignore')

In [2]:
train = pd.read_csv('train_dataset.csv')
test = pd.read_csv('evaluation_public.csv')

train['isnull'] = train.iloc[:,1:].isnull().mean(axis = 1)
test['isnull'] = test.iloc[:,1:].isnull().mean(axis = 1)

data = pd.concat([train, test]).reset_index(drop=True)

data['time'] = pd.to_datetime(data['time'])
data['hour'] = data['time'].dt.hour
data['minute'] = data['time'].dt.minute
data['weekday'] = data['time'].dt.weekday
data['ts'] = data['hour']*60 + data['minute']

In [3]:
train = data.iloc[:140480].reset_index(drop = True)
test = data.iloc[140480:].reset_index(drop = True)

## 对比测试集每行样本的特征缺失比，选取特征缺失比率相识的
train = train[train['isnull']<0.15].reset_index(drop=True)

train['Label1_log'] = np.log1p(train['Label1'])
train['Label2_log'] = np.log1p(train['Label2'])


## 通过特征重要性、对抗验证、特征相关性，剔除了这7个特征，可能剔除错了也可能没剔除干净
feas = [f for f in train.columns if f not in ['time', 'Label1', 'Label2','Label1_log','Label2_log', 
                                              'B_QY_ORP','JS_TN',
                                              'CS_SW','MCCS_NH4','N_HYC_JS_DO','MCCS_NO3','JS_SW',
                                             ]]

train = train.dropna(subset=['Label1', 'Label2']).reset_index(drop=True)

x_train = train[feas]
x_test = test[feas]

In [6]:
def lgb_model(clf, train_x, train_y, test_x):
    folds = 10
    seed = 2222
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []
    test_pre = []
    Feass = pd.DataFrame()

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        fea = pd.DataFrame()

        params = {
            'learning_rate': 0.03,
            'boosting_type': 'gbdt',
            'objective': 'mse',
            'metric': 'mse',
            'verbose': -1,
            'seed': 2020,
            'n_jobs': -1,
            'max_depth':-1,
            'min_child_weight': 4,
            'num_leaves': 2 ** 4,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 4,
        }

        model = clf.train(params, train_matrix, num_boost_round=20000, valid_sets=[train_matrix, valid_matrix], 
                          categorical_feature =[] ,verbose_eval=1000,early_stopping_rounds=200)
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        test_pre.append(test_pred)
        fea['feas'] = train_x.columns.tolist()
        fea['sorce'] = model.feature_importance()
        Feass = pd.concat([Feass,fea],axis = 0)
        print(list(sorted(zip(train_x.columns.tolist(), model.feature_importance()), key=lambda x: x[1], reverse=True))[:20])
            
            
        train[valid_index] = val_pred
        test = test_pred
        cv_scores.append(np.sqrt(mean_squared_error(np.expm1(val_y), np.expm1(val_pred))))
#         cv_scores.append(mean_squared_error(val_y, val_pred))
  
        print(cv_scores)
    test = sum(test_pre) / folds
    print("s_scotrainre_list:" , cv_scores)
    print("s_score_mean:" , np.mean(cv_scores))
    print("s_score_std:", np.std(cv_scores))

    return train, test, Feass, np.mean(cv_scores)

In [7]:
for idx, i in enumerate([[x_train, train['Label1_log'], x_test], [x_train, train['Label2_log'], x_test],]):
    locals()[f'lgb_train{idx}'], locals()[f'lgb_test{idx}'], \
    locals()[f'Feass{idx}'],     locals()[f'scores{idx}'] = lgb_model(lgb, i[0], i[1], i[2])

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[1000]	training's l2: 0.00742455	valid_1's l2: 0.0154584
[2000]	training's l2: 0.00504084	valid_1's l2: 0.0144517
[3000]	training's l2: 0.00378112	valid_1's l2: 0.0138582
[4000]	training's l2: 0.00299196	valid_1's l2: 0.0135563
[5000]	training's l2: 0.00243645	valid_1's l2: 0.0134311
[6000]	training's l2: 0.0020204	valid_1's l2: 0.0133132
[7000]	training's l2: 0.00169659	valid_1's l2: 0.0131875
Early stopping, best iteration is:
[7369]	training's l2: 0.00159041	valid_1's l2: 0.0131449
[('ts', 7765), ('N_QY_ORP', 6969), ('N_HYC_DO', 6785), ('B_HYC_DO', 6448), ('B_HYC_MLSS', 5879), ('N_HYC_MLSS', 5675), ('N_HYC_NH4', 5593), ('CS_COD', 5421), ('B_CS_MQ_SSLL', 5020), ('JS_COD', 5016), ('JS_LL', 4954), ('CS_TN', 4919), ('CS_NH3', 4898), ('B_HYC_XD', 4744), ('B_HYC_JS_DO', 4683), ('N_HYC_XD', 4513), ('JS_NH3', 4430), ('minute', 4196), ('CS_LL', 4118), ('N_

[1000]	training's l2: 0.0076503	valid_1's l2: 0.00938698
[2000]	training's l2: 0.00512838	valid_1's l2: 0.00860978
[3000]	training's l2: 0.00384481	valid_1's l2: 0.00815838
[4000]	training's l2: 0.00304582	valid_1's l2: 0.00790186
[5000]	training's l2: 0.00247294	valid_1's l2: 0.00771709
[6000]	training's l2: 0.00205327	valid_1's l2: 0.00756572
[7000]	training's l2: 0.00172318	valid_1's l2: 0.00751117
Early stopping, best iteration is:
[6876]	training's l2: 0.00175941	valid_1's l2: 0.00749819
[('ts', 7343), ('N_QY_ORP', 6677), ('N_HYC_DO', 6245), ('B_HYC_DO', 5957), ('B_HYC_MLSS', 5567), ('N_HYC_MLSS', 5349), ('CS_COD', 5345), ('N_HYC_NH4', 5313), ('JS_COD', 4715), ('JS_LL', 4524), ('B_CS_MQ_SSLL', 4523), ('CS_TN', 4489), ('B_HYC_XD', 4474), ('JS_NH3', 4381), ('CS_NH3', 4307), ('B_HYC_JS_DO', 4229), ('minute', 4151), ('N_HYC_XD', 4115), ('CS_LL', 3732), ('N_CS_MQ_SSLL', 3471)]
[1552.3658769487379, 1388.0937554803734, 1354.9401447481623, 1302.2242757212068, 1333.4989974261428, 1143.6966

[1000]	training's l2: 0.00739726	valid_1's l2: 0.0096588
[2000]	training's l2: 0.00501277	valid_1's l2: 0.00860538
[3000]	training's l2: 0.00378498	valid_1's l2: 0.00814354
[4000]	training's l2: 0.00299881	valid_1's l2: 0.00784134
Early stopping, best iteration is:
[4461]	training's l2: 0.00272305	valid_1's l2: 0.00776025
[('ts', 4725), ('N_QY_ORP', 4723), ('N_HYC_DO', 4493), ('CS_COD', 4029), ('B_HYC_DO', 3606), ('B_HYC_MLSS', 3546), ('N_HYC_MLSS', 3480), ('N_HYC_NH4', 3447), ('JS_COD', 3151), ('JS_NH3', 3147), ('CS_NH3', 3104), ('CS_TN', 3027), ('B_CS_MQ_SSLL', 2837), ('N_HYC_XD', 2799), ('JS_LL', 2775), ('B_HYC_XD', 2595), ('minute', 2502), ('B_HYC_JS_DO', 2426), ('CS_LL', 2079), ('N_CS_MQ_SSLL', 1778)]
[1291.2217039848586, 1087.7955111528925, 1163.5123361279268, 1071.96424301152, 1121.2998302260548]
************************************ 6 ************************************
Training until validation scores don't improve for 200 rounds
[1000]	training's l2: 0.00735536	valid_1's l2: 

In [8]:
# 特征重要性
Feass0.groupby(['feas'])['sorce'].mean().sort_values(ascending=False).reset_index(),\
Feass1.groupby(['feas'])['sorce'].mean().sort_values(ascending=False).reset_index()

(            feas   sorce
 0             ts  8497.6
 1       N_QY_ORP  7950.9
 2       N_HYC_DO  7659.3
 3       B_HYC_DO  7434.1
 4     B_HYC_MLSS  6867.1
 5      N_HYC_NH4  6542.4
 6     N_HYC_MLSS  6524.3
 7         CS_COD  6202.8
 8   B_CS_MQ_SSLL  5801.9
 9          JS_LL  5752.0
 10      B_HYC_XD  5702.0
 11   B_HYC_JS_DO  5666.5
 12        JS_COD  5542.2
 13      N_HYC_XD  5451.4
 14         CS_TN  5392.0
 15        CS_NH3  5290.4
 16        minute  5206.7
 17         CS_LL  5089.1
 18        JS_NH3  5017.5
 19  N_CS_MQ_SSLL  4714.4
 20     B_HYC_NH4  2026.7
 21       weekday  1641.7
 22          hour  1484.7
 23        isnull    34.8,
             feas   sorce
 0       N_HYC_DO  7837.2
 1             ts  7802.2
 2       N_QY_ORP  7557.0
 3       B_HYC_DO  6935.7
 4     N_HYC_MLSS  6562.5
 5     B_HYC_MLSS  6461.3
 6      N_HYC_NH4  6393.7
 7         CS_COD  5816.7
 8   B_CS_MQ_SSLL  5719.0
 9          JS_LL  5641.3
 10   B_HYC_JS_DO  5574.0
 11      B_HYC_XD  5572.4
 12      N_

In [9]:
# 线下得分

1/(1+(scores0+scores1)/2)*1000

0.8030430881469699

In [10]:
sub = pd.read_csv('sample_submission.csv')
sub['Label1'] = np.expm1(lgb_test0)
sub['Label2'] = np.expm1(lgb_test1)
sub.to_csv('0928_lgb_base_0.8030430881469699.csv', index=False)