In [None]:
import gc
from glob import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
# The LightGBM seeds are set separately through the training parameters.
np.random.seed(2)

In [None]:
def pearsonr(preds: np.ndarray, dset: lgb.Dataset) -> tuple:
    """
    Custom LightGBM evaluation function: Pearson correlation between the
    model predictions and the labels of the evaluated dataset.

    Args:
        preds: 1d-array with the model predictions
        dset: LightGBM dataset with the labels
    Returns:
        Tuple ``(metric_name, metric_value, is_higher_better)`` where the
        name is ``'pearsonr'``, the value is the correlation coefficient and
        the flag is ``True`` (a larger correlation is better).
    """
    labels = dset.get_label()  # labels stored in the lgb.Dataset
    # stats.pearsonr returns (coefficient, p-value); only the coefficient is reported.
    correlation = stats.pearsonr(preds, labels)[0]
    return 'pearsonr', correlation, True

def reduce_mem_usage(df, verbose=False):
    """
    Utility function to reduce the memory usage of pandas dataframes
    by downcasting numeric columns.

    The dataframe is modified IN PLACE and the same object is returned.

    * Integer columns (int16 / int32 / int64) are cast to the narrowest of
      int8 / int16 / int32 / int64 whose range contains the column's minimum
      and maximum (range limits included).
    * Float columns (float16 / float32 / float64) are cast to float32 when
      their minimum and maximum fit in the float32 range, otherwise to
      float64. Casting float64 to float32 reduces precision.
    * Columns of any other dtype are left untouched.

    Parameters
    ----------
    df: pandas.Dataframe
        Frame whose columns are downcast in place.
    verbose: Boolean
        When true, print the final memory footprint and the relative reduction.

    Returns
    -------
    pandas.Dataframe
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate integer types, narrowest first: the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_limits = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max still fits.
                # If min/max are NaN (e.g. an empty column) no candidate matches
                # and the column is left as it is.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif float32_limits.min <= c_min and c_max <= float32_limits.max:
            df[col] = df[col].astype(np.float32)
        else:
            # Out-of-range, infinite or all-NaN columns end up as float64.
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

***
## load models

In [None]:
# glob() returns matches in arbitrary, filesystem-dependent order. The loaded
# boosters are later paired positionally with a list of seeds, so sort the
# paths to make that pairing the same on every run.
files = sorted(glob("./lgbm-seed*.txt"))
display(files)

# Load the previously trained models, one booster per model file.
boosters_lgbm_linear_dart = [lgb.Booster(model_file=fn) for fn in files]
display(boosters_lgbm_linear_dart)

***
## model finetuning on newest data

In [None]:
# Columns to load: the 300 feature columns f_0 ... f_299 plus the label column.
feat_names = ["f_%d" % idx for idx in range(300)]
col_names = [*feat_names, "target"]

# Read only the needed columns of the supplemental data, then downcast the
# numeric columns to shrink the frame's memory footprint.
supp_data = reduce_mem_usage(
    pd.read_csv(
        "../input/ubiquant-market-prediction/supplemental_train.csv",
        usecols=col_names,
    ),
    verbose=True,
)
gc.collect()

In [None]:
# Build the LightGBM training set from the supplemental data.
train_features = supp_data[feat_names]      # feature columns
train_labels = supp_data["target"].values   # labels as a NumPy array
train_dset = lgb.Dataset(
    data=train_features,
    label=train_labels,
    free_raw_data=False,  # keep the raw data attached to the Dataset
)

# Drop the notebook-level references that are no longer needed.
del supp_data, train_features, train_labels
gc.collect()

In [None]:
# LightGBM training parameters, grouped by purpose.
model_params = {
    # --- task ---
    'objective': 'mse',             # mean squared error objective
    'metric': 'rmse',               # built-in metric: root mean squared error
    'first_metric_only': False,     # do not restrict evaluation logic to the first metric
    'verbosity': -1,                # silence LightGBM's own logging
    'seed': 42,                     # default random seed

    # --- boosting scheme ---
    'boosting': 'dart',             # DART boosting
    'linear_tree': True,            # fit linear models in the leaves
    'learning_rate': 0.05,          # learning rate
    'num_iterations': 450,          # number of boosting iterations
    'drop_rate': 0.05,              # DART: dropout rate
    'max_drop': 20,                 # DART: max number of trees dropped in one iteration
    'skip_drop': 0.65,              # DART: probability of skipping the dropout step

    # --- tree shape ---
    'num_leaves': 32,               # maximum number of leaves per tree
    'min_data_in_leaf': 2200,       # minimum number of samples in a leaf
    'path_smooth': 4.714076496843463,  # node smoothing; helps against overfitting leaves with few samples

    # --- regularisation ---
    'lambda_l1': 3.2608153782775893,      # L1 regularisation
    'lambda_l2': 24.65715474841406,       # L2 regularisation
    'linear_lambda': 15.831719022196562,  # regularisation of the linear leaf models

    # --- sampling ---
    'feature_fraction': 0.15000000000000002,  # fraction of features sampled per tree
    'bagging_fraction': 1.0,        # fraction of rows used for bagging (1.0 = all rows)
    'bagging_freq': 1,              # perform bagging every k iterations

    # --- histogram / dataset construction ---
    'max_bin': 511,                 # maximum number of histogram bins
    'bin_construct_sample_cnt': 100000000,  # number of samples used to construct the bins
    'feature_pre_filter': False,    # disable feature pre-filtering
    'force_col_wise': True,         # force column-wise histogram building
}

In [None]:
seeds = [2, 7, 11, 19, 23]   # one random seed per base model
finetuned_models = []        # collects the fine-tuned boosters

# Print the evaluation results every 20 iterations.
log_callback = lgb.log_evaluation(period=20)

# Pair each seed with one base model (zip stops at the shorter of the two)
# and continue training that model on the new data.
for seed, base_model in zip(seeds, boosters_lgbm_linear_dart):
    # Copy of the shared parameters with this run's seed.
    seeded_params = {**model_params, "seed": seed}

    model = lgb.train(
        params=seeded_params,
        train_set=train_dset,
        valid_sets=[train_dset],     # the metric is tracked on the training data itself
        feval=pearsonr,              # custom Pearson-correlation metric
        callbacks=[log_callback],
        init_model=base_model,       # continue training from the base model
    )
    finetuned_models += [model]

    # Feature importance of the fine-tuned model: by split count, then by gain.
    for importance_type in ("split", "gain"):
        lgb.plot_importance(
            model,
            figsize=(8, 15),
            importance_type=importance_type,
            max_num_features=30,
        )
    plt.show()

In [None]:
# Only the fine-tuned models are used from here on; drop the notebook-level
# references to the base boosters and the training set, then collect garbage.
del boosters_lgbm_linear_dart
del train_dset
gc.collect()

***
## inference

In [None]:
# Set up the competition environment and the test-batch iterator consumed by
# the inference loop below.
# NOTE(review): presumably the Kaggle time-series API, where make_env() can
# only be called once per session — confirm before re-running this cell.
import ubiquant # import the ubiquant module
env = ubiquant.make_env()  
iter_test = env.iter_test()

In [None]:
def predict(boosters, dataframe):
    """
    Average the predictions of several models on the same rows.

    Args:
        boosters: iterable of trained models exposing ``predict`` and
            ``current_iteration`` (e.g. ``lgb.Booster``)
        dataframe: pandas DataFrame containing the columns f_0 ... f_299;
            any other columns are ignored
    Returns:
        Element-wise mean of the per-model predictions
    Raises:
        ValueError: if ``boosters`` is empty (there is nothing to average,
            and ``np.mean`` would otherwise silently return NaN)
    """
    boosters = list(boosters)
    if not boosters:
        raise ValueError("predict() needs at least one booster to average over")

    features = [f"f_{i}" for i in range(300)]  # the 300 feature columns
    # Select the feature columns once instead of once per model.
    feature_frame = dataframe[features]

    preds = [
        model.predict(
            feature_frame,
            start_iteration=0,                        # start from the first iteration
            num_iteration=model.current_iteration(),  # use every iteration of the model
        )
        for model in boosters
    ]
    return np.mean(preds, axis=0)  # mean over the models

In [None]:
# Walk through the test batches handed out by the competition environment.
for test_df, sample_prediction_df in iter_test:
    # Fill the submission frame with the ensemble's averaged prediction.
    sample_prediction_df['target'] = predict(finetuned_models, test_df)
    env.predict(sample_prediction_df)  # submit the predictions for this batch
    display(sample_prediction_df)

***