In [1]:
import getpass
# Any login name other than the author's local account ("Kaihua") is treated as the Kaggle runtime.
KAGGLE = getpass.getuser() != "Kaihua"

import os, copy, zipfile, re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import networkx as nx
from functools import reduce
from sklearn.metrics import mean_squared_error

In [2]:
import warnings
# Install a blanket "ignore" filter so that no warnings are shown by later cells.
# NOTE(review): this also hides deprecation / convergence warnings from pandas and LightGBM.
warnings.filterwarnings("ignore")

In [3]:
import sys
# Environment-dependent locations: where helper modules are importable from and
# where the parquet inputs live.
if KAGGLE:
    sys.path.append("/kaggle/input/2023zhuxiaoshifengxiang")
    DATA_PATH = '/kaggle/input/2023zhuxiaoshifengxiang'
else:
    sys.path.append("../code集合/models")
    sys.path.append("../code集合/feature_extraction")
    DATA_PATH = 'data'


# Global seed reused by the LightGBM parameter dict.
SEED = 2023
MODEL_SAVE_PATH = 'model'
RESULT_SAVE_PATH = 'results'
# Create the output directories if they are missing. `exist_ok=True` avoids the
# check-then-create race of a separate `os.path.exists` test and makes the cell
# safe to re-run.
for _out_dir in (MODEL_SAVE_PATH, RESULT_SAVE_PATH):
    os.makedirs(_out_dir, exist_ok=True)
# Names of the columns LightGBM should treat as categorical; filled in by later cells.
cat_feats = []

In [4]:
# Read the two parquet splits from DATA_PATH.
train_path = os.path.join(DATA_PATH, 'track3_train.parquet')
test_path = os.path.join(DATA_PATH, 'track3_a.parquet')
train_data = pd.read_parquet(train_path)
test_data = pd.read_parquet(test_path)

# Tag each row with its origin so the stacked frame can be split again later.
train_data['type'] = 'train'
test_data['type'] = 'test'
data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

# Break the underscore-separated ID into its three parts and attach them as columns.
id_parts = data['ID'].str.split('_', expand=True)
id_parts = id_parts.rename(columns={0:'station',1:'sample',2:'time'})
data = pd.concat([data, id_parts], axis=1)

print(f'train_data: {train_data.shape}')
print(f'test_data: {test_data.shape}')
print(f'all: {data.shape}')

# Sanity checks shown as the cell output: number of samples whose row count is
# not 672, and the total number of missing values in the training frame.
(data['sample'].value_counts() != 672).sum(), train_data.isna().sum().sum()

train_data: (791616, 77)
test_data: (216384, 74)
all: (1008000, 80)


(0, 0)

In [5]:
# Columns to be predicted; one model set is trained per entry.
TARGET_FEATS = [
    'wdir_2min',
    'spd_2min',
    'spd_inst_max',
]

In [6]:
# Remove problematic target values: any target reading at or above the threshold
# is replaced with NaN so it is excluded from training later on.
# NOTE(review): 199999.0 looks like a missing-value sentinel — confirm against the data spec.
INVALID_TARGET_THRESHOLD = 199999.0
for target_col in TARGET_FEATS:
    data[target_col] = data[target_col].mask(data[target_col] >= INVALID_TARGET_THRESHOLD)

In [7]:
# Convert the three ID components to integers.
# The vectorized `.str` accessor replaces the per-row `apply(lambda ...)`, and the
# `astype(str)` round-trip makes the cell idempotent: re-running it on columns
# that are already integers no longer fails on `int.split`.
data['station'] = data['station'].astype(str).str.split('D').str[-1].astype('int64')
# Register the station id as categorical exactly once, even if the cell is re-run.
if 'station' not in cat_feats:
    cat_feats.append('station')

data['sample'] = data['sample'].astype(str).str.split('Sample').str[-1].astype('int64')
data['time'] = data['time'].astype(int)

In [8]:
# Model inputs are every column except the targets and the bookkeeping columns.
non_feature_cols = set(TARGET_FEATS) | {'ID', 'sample', 'type'}
feats = [col for col in data.columns if col not in non_feature_cols]
print(f'feats: {len(feats)}, {feats}')
print(cat_feats)

feats: 74, ['100u', '100v', '10u', '10v', '2d', '2t', 'cape', 'capes', 'cp', 'deg0l', 'lcc', 'msl', 'skt', 'sp', 'sst', 'tcc', 'd_L1000', 'q_L1000', 'r_L1000', 't_L1000', 'u_L1000', 'v_L1000', 'w_L1000', 'd_L950', 'q_L950', 'r_L950', 't_L950', 'u_L950', 'v_L950', 'w_L950', 'd_L925', 'q_L925', 'r_L925', 't_L925', 'u_L925', 'v_L925', 'w_L925', 'd_L900', 'q_L900', 'r_L900', 't_L900', 'u_L900', 'v_L900', 'w_L900', 'd_L850', 'q_L850', 'r_L850', 't_L850', 'u_L850', 'v_L850', 'w_L850', 'd_L700', 'q_L700', 'r_L700', 't_L700', 'u_L700', 'v_L700', 'w_L700', 'd_L500', 'q_L500', 'r_L500', 't_L500', 'u_L500', 'v_L500', 'w_L500', 'd_L200', 'q_L200', 'r_L200', 't_L200', 'u_L200', 'v_L200', 'w_L200', 'station', 'time']
['station']


In [9]:
# Split the processed frame back into its two origins using the 'type' tag.
is_train_row = data['type'] == 'train'
is_test_row = data['type'] == 'test'
train_data = data[is_train_row].reset_index(drop=True)
test_data = data[is_test_row].reset_index(drop=True)

In [10]:
# LightGBM training parameters used for every target.
# Disabled alternatives kept from earlier experiments: objective 'mape',
# max_depth=6, num_leaves=2**4 or 31, lambda_l1=0.5, lambda_l2=0.5.
lgb_params = dict(
    boosting_type='gbdt',
    objective='mse',
    metric='rmse',
    num_leaves=64,
    min_data_in_leaf=20,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    bagging_freq=5,
    learning_rate=0.01,
    n_jobs=-1,
    verbose=-1,
    device_type="cpu",
    # All random components share the notebook-wide seed.
    feature_fraction_seed=SEED,
    bagging_seed=SEED,
    seed=SEED,
)


In [11]:
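# Hourly forecasts are rounded to 1 decimal place.
# The 24-hour maximum wind is the maximum of the hourly mean wind speeds after rounding them to 1 decimal place.
# The 24-hour extreme wind is the maximum of the hourly extreme wind speeds after rounding them to 1 decimal place.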


In [12]:
# Pick the parameter set for the model family trained in this notebook.
task_name = "lgb"
params_by_task = {"lgb": lgb_params}
task_params = params_by_task[task_name]

In [13]:
from sklearn.model_selection import GroupKFold
import lightgbm as lgb

# Per-target results, keyed by target column name.
train_oof = {}
test_pred = {}
feats_importance = {}

# Number of cross-validation folds; test predictions and importances are averaged over them.
fold_num = 5

for item_target in TARGET_FEATS:
    # Keep only the training rows that have a label for this target.
    has_label = train_data[item_target].notna()

    train_ids = train_data.loc[has_label, 'ID'].reset_index(drop=True)
    train_x = train_data.loc[has_label, feats].reset_index(drop=True)
    train_y = train_data.loc[has_label, item_target].reset_index(drop=True)
    # Folds are grouped by 'sample' so that rows of one sample never straddle train and validation.
    group_x = train_data.loc[has_label, 'sample'].reset_index(drop=True)
    testA_x = test_data[feats].reset_index(drop=True)
    print(train_x.shape, testA_x.shape)

    item_oof = np.zeros(train_x.shape[0])
    item_pred = np.zeros(testA_x.shape[0])
    item_importance = 0

    splitter = GroupKFold(n_splits=fold_num)
    for fold, (train_idx, val_idx) in enumerate(splitter.split(train_x, groups=group_x)):
        print('-----------', fold)
        fold_train_set = lgb.Dataset(
            train_x.loc[train_idx],
            train_y.loc[train_idx],
            categorical_feature=cat_feats,
        )
        fold_val_set = lgb.Dataset(
            train_x.loc[val_idx],
            train_y.loc[val_idx],
            categorical_feature=cat_feats,
        )
        model = lgb.train(
            task_params,
            fold_train_set,
            num_boost_round=10_000,
            valid_sets=[fold_train_set, fold_val_set],
            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(5_000)],
        )

        # Out-of-fold predictions fill only this fold's validation rows;
        # test predictions and gain importances accumulate the fold average.
        item_oof[val_idx] += model.predict(train_x.loc[val_idx])
        item_pred += model.predict(testA_x) / fold_num
        item_importance += model.feature_importance(importance_type='gain') / fold_num

    train_oof[item_target] = pd.DataFrame({
        "ID": train_ids,
        f"{item_target}_true": train_y,
        f"{item_target}_pred": item_oof,
    })
    test_pred[item_target] = pd.DataFrame({
        "ID": test_data['ID'],
        f"{item_target}": item_pred,
    })
    feats_importance[item_target] = pd.DataFrame({
        'name': feats,
        'importance': item_importance,
    })

(770742, 74) (216384, 74)
----------- 0
Training until validation scores don't improve for 100 rounds
[5000]	training's rmse: 59.232	valid_1's rmse: 65.9939
[10000]	training's rmse: 52.6263	valid_1's rmse: 63.5982
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 52.6263	valid_1's rmse: 63.5982
----------- 1
Training until validation scores don't improve for 100 rounds
[5000]	training's rmse: 59.0363	valid_1's rmse: 66.3904
[10000]	training's rmse: 52.3619	valid_1's rmse: 63.9672
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 52.3619	valid_1's rmse: 63.9672
----------- 2
Training until validation scores don't improve for 100 rounds
[5000]	training's rmse: 59.081	valid_1's rmse: 66.5326
[10000]	training's rmse: 52.4349	valid_1's rmse: 64.0907
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 52.4349	valid_1's rmse: 64.0907
----------- 3
Training until validation scores don't improve for 100 rounds
[5000]	training's 

In [14]:
def _outer_join_on_id(left, right):
    """Outer-join two prediction frames on their shared 'ID' column."""
    return pd.merge(left, right, on='ID', how='outer')


# Collapse the per-target frames into one wide frame each (one row per ID).
train_oof = reduce(_outer_join_on_id, train_oof.values())
test_pred = reduce(_outer_join_on_id, test_pred.values())
train_oof.shape, test_pred.shape

((782462, 7), (216384, 4))

In [15]:
# Tag interpolated into the names of the CSV files written by the save cell.
# NOTE(review): presumably meant to carry a validation score; it is a fixed 0 here — confirm.
score_str = 0

In [16]:
# for item_key in feats_importance:
#     item_importance = feats_importance[item_key]
#     item_importance['mean'] = item_importance[[item for item in item_importance.columns if item.startswith('fold')]].mean(1)
#     item_importance['std'] = item_importance[[item for item in item_importance.columns if item.startswith('fold')]].std(1)
#     item_importance = item_importance.sort_values(['mean', 'std'], ascending=False)
#     item_importance.to_csv(os.path.join(RESULT_SAVE_PATH, f'feats_importance_{item_key}_{score_str}.csv'), index=False)
    

In [17]:
# Write the out-of-fold and test predictions as CSV files under RESULT_SAVE_PATH.
oof_csv_path = os.path.join(RESULT_SAVE_PATH, f'lgb_oof_{score_str}.csv')
pred_csv_path = os.path.join(RESULT_SAVE_PATH, f'lgb_pre_{score_str}.csv')
train_oof.to_csv(oof_csv_path, index=False)
test_pred.to_csv(pred_csv_path, index=False)