In [None]:
import gc
import os
from glob import glob

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from scipy import stats
from sklearn.model_selection import TimeSeriesSplit
from tqdm import tqdm

np.random.seed(42)

In [None]:
def pearsonr(preds: np.ndarray, dset: "lgb.Dataset"):
    """
    Custom evaluation metric for LightGBM: Pearson correlation between the
    model predictions and the labels stored in the dataset.

    Args:
        preds: 1d-array with the model predictions
        dset: LightGBM dataset with the labels (read through ``get_label()``)
    Returns:
        Tuple ``(metric_name, value, is_higher_better)``; the name is always
        ``'pearsonr'`` and the flag is always ``True``.
    """
    labels = dset.get_label()
    corr = stats.pearsonr(preds, labels)[0]
    # The correlation is undefined (NaN) when either input is constant.
    # Report 0.0 in that case so the tracked metric always stays finite.
    if not np.isfinite(corr):
        corr = 0.0
    return 'pearsonr', float(corr), True

def reduce_mem_usage(df, verbose=False):
    """
    Utility function to reduce the memory usage of pandas dataframes by
    downcasting numeric columns.

    * int16 / int32 / int64 columns are cast to the narrowest signed integer
      type whose (inclusive) range contains the column's min and max.
    * float64 columns are cast to float32 when their min and max fit in the
      float32 range. This lowers precision.
    * Every other column (including float16 / float32, which are already
      small) is left untouched, so no column is ever widened.

    The dataframe is modified in place and also returned.

    Parameters
    ----------
    df: pandas.DataFrame
    verbose: Boolean
        When True, print the final memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)  # narrowest first
    downcastable = {'int16', 'int32', 'int64', 'float64'}
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = str(df[col].dtypes)
        if col_type not in downcastable:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.startswith('int'):
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly -128..127 fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # NaN or infinite extrema fail this test, so such columns stay float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## Read data

In [None]:
# Column layout: 300 feature columns f_0 .. f_299 plus the label column.
feat_names = [f"f_{i}" for i in range(300)]
col_names = feat_names + ["target"]
data_dir = "../input/ubiquant-market-prediction"  # dataset directory

# Load the training data. A cached pickle is preferred; otherwise the CSV is
# parsed once, downcast to save memory, and cached as a pickle for next time.
# NOTE(review): the pickle is written into data_dir itself — confirm that
# directory is writable in the environment this notebook runs in.
# NOTE(review): unpickling can execute arbitrary code; only load a train.pkl
# that was produced by this notebook.
if not os.path.exists(f"{data_dir}/train.pkl"):
    train_data = pd.read_csv(f"{data_dir}/train.csv", usecols=col_names)
    train_data = reduce_mem_usage(train_data, verbose=True)
    train_data.to_pickle(f"{data_dir}/train.pkl")
    gc.collect()
else:
    train_data = pd.read_pickle(f"{data_dir}/train.pkl")
train_data.head()

In [None]:
# Optimization: build one train/validation split for the hyper-parameter search.
# Only the LAST fold of a 5-fold time-series split is used (largest training
# window, most recent validation window), so it is taken directly instead of
# materialising the frames of every fold.
# NOTE(review): this treats row order as time order — confirm train_data is
# sorted chronologically.
tscv = TimeSeriesSplit(5)
trn_ind, val_ind = list(tscv.split(train_data))[-1]

# The split yields positional indices, hence .iloc rather than label-based .loc.
train_df = train_data.iloc[trn_ind, :]
valid_df = train_data.iloc[val_ind, :]

print(f"train_df.shape:{train_df.shape}; valid_df.shape:{valid_df.shape}")

In [None]:
# Wrap both splits as LightGBM datasets.
# free_raw_data=False keeps the raw feature data attached to each Dataset
# (presumably so the Dataset can be rebuilt when max_bin changes between
# Optuna trials — confirm against the LightGBM docs).
train_dset = lgb.Dataset(
    train_df[feat_names],
    label=train_df["target"].values,
    free_raw_data=False,
)

valid_dset = lgb.Dataset(
    valid_df[feat_names],
    label=valid_df["target"].values,
    free_raw_data=False,
)

# Drop the notebook-level references to the dataframes; from here on only the
# two Dataset objects are used.
del train_data, train_df, valid_df
gc.collect()

In [None]:
def objective(trial):
    """
    Optuna objective: train one LightGBM DART model and return its Pearson
    correlation on the validation dataset.

    Tuned per trial: ``learning_rate`` (log scale), ``num_leaves``,
    ``max_bin`` and ``feature_fraction``. Every other parameter is fixed.
    Reads the module-level ``train_dset``, ``valid_dset`` and ``pearsonr``.
    As a side effect, the split- and gain-based feature importances of the
    trained model are plotted.

    Args:
        trial: Optuna trial used to sample the hyper-parameters
    Returns:
        The ``pearsonr`` entry that LightGBM recorded for ``valid_1`` (the
        second entry of ``valid_sets``) in ``model.best_score``.
    """
    model_params = {
        # DART boosting: "Dropouts meet Multiple Additive Regression Trees",
        # https://arxiv.org/abs/1505.01866
        'boosting': 'dart',
        'linear_tree': True,  # fit linear models in the leaves
        'objective': 'mse',  # mean squared error loss
        'metric': 'rmse',  # built-in metric logged next to the custom pearsonr
        'learning_rate': trial.suggest_float("learning_rate", 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int("num_leaves", 10, 64),  # max leaves per tree
        'max_bin': trial.suggest_int("max_bin", 200, 600),  # max number of histogram bins
        'force_col_wise': True,  # force column-wise histogram building
        'bagging_freq': 1,  # bagging every iteration ...
        'seed': 42,  # random seed
        'verbosity': -1,  # silence LightGBM's own logging
        'first_metric_only': False,  # keep every evaluation metric
        'bin_construct_sample_cnt': 100000000,  # rows sampled to construct the bins
        'feature_pre_filter': False,  # do not pre-filter features
        'bagging_fraction': 1.0,  # ... but with fraction 1.0 no rows are dropped
        'drop_rate': 0.05,  # DART: fraction of trees dropped
        # Grid 0.05, 0.15, ..., 0.45. The previous upper bound 0.5 is not
        # reachable from 0.05 in steps of 0.1, and Optuna rounded it down to 0.45.
        'feature_fraction': trial.suggest_float("feature_fraction", 0.05, 0.45, step=0.1),
        'lambda_l1': 3.2608153782775893,  # L1 regularisation
        'lambda_l2': 24.65715474841406,  # L2 regularisation
        'linear_lambda': 15.831719022196562,  # regularisation of the leaf linear models
        'max_drop': 5,  # DART: max trees dropped in one boosting iteration
        'min_data_in_leaf': 2200,  # minimum number of samples per leaf
        'num_iterations': 1900,  # number of boosting iterations
        'path_smooth': 4.714076496843463,  # smoothing that helps leaves with few samples
        'skip_drop': 0.65,  # DART: probability of skipping the dropout step
    }

    log_callback = lgb.log_evaluation(period=20)  # log metrics every 20 iterations

    model = lgb.train(
        params=model_params,
        train_set=train_dset,
        valid_sets=[train_dset, valid_dset],
        feval=pearsonr,  # custom metric tracked next to rmse
        callbacks=[log_callback],
    )

    # Plot the top-30 features by split count and by gain, then close both
    # figures so they do not pile up across trials.
    importance_axes = [
        lgb.plot_importance(model, figsize=(8, 15), importance_type=kind, max_num_features=30)
        for kind in ("split", "gain")
    ]
    plt.show()
    for ax in importance_axes:
        plt.close(ax.figure)

    # NOTE(review): no early-stopping callback is configured, so this is
    # presumably the score of the final iteration — confirm.
    return model.best_score["valid_1"]["pearsonr"]

In [None]:
# Maximise the validation Pearson correlation over 100 trials.
# The sampler is seeded so the sequence of sampled hyper-parameters is
# reproducible, matching the fixed seed (42) used for the model itself.
# NOTE(review): TPESampler is assumed to be the sampler create_study used by
# default — confirm for the installed Optuna version.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=100)