In [1]:
import pandas as pd#导入csv文件的库
import numpy as np#进行矩阵运算的库
#model lgb回归模型,日志评估,早停防止过拟合
from  lightgbm import LGBMRegressor,log_evaluation,early_stopping
#使用普通的k折交叉验证
from sklearn.model_selection import KFold

In [2]:
import random#提供了一些用于生成随机数的函数
class Config():
    seed=2024
    path="/kaggle/input/2024-wind-power/2024海力风电出力/"
    target='出力(MW)'
    num_folds=12
#两个基本信息的csv文件可以变成这个字典 位置和装机容量的关系
pos2mw={"f1":48,'f2':280,'f3':48,'f4':88,'f5':48}
#设置随机种子,保证模型可以复现
def seed_everything(seed):
    np.random.seed(seed)#numpy的随机种子
    random.seed(seed)#python内置的随机种子
seed_everything(seed=Config.seed)

In [3]:
def feature_engineer(df):
    df['站点装机容量']=df['站点编号'].apply(lambda x: pos2mw[x])

    #对时间这列做一系列特征工程,待填充.
    df['时间'] = pd.to_datetime(df['时间'])
    # 提取年、月、日、小时和分钟
    df['year'] = df['时间'].dt.year
    df['month'] = df['时间'].dt.month
#     df['date'] = df['时间'].dt.day
    df['hour'] = df['时间'].dt.hour
    df['minute'] = df['时间'].dt.minute

    df['比大气压']=df['气压(Pa）'] / df['温度（K）']
    #将温度转换成摄氏度
    df['温度（K）']=df['温度（K）']-273.15
    #相对湿度的取值范围为0.01~0.99
    df['相对湿度（%）']=df['相对湿度（%）'].apply(lambda x:max(1,x))
    df['相对湿度（%）']=df['相对湿度（%）'].apply(lambda x:min(99,x))

    #气象数据的特征工程
    #露点温度（°C） = 温度（°C） - (100 - 相对湿度（%）) / 5
    df['露点温度']=df['温度（K）']-(100-df['相对湿度（%）'])/5

    #风速和风向的特征工程
    df['sin_100m风速（100m/s）']=df['100m风速（100m/s）']*np.sin(np.pi*df['100m风向（°)']/180)
    df['cos_100m风速（100m/s）']=df['100m风速（100m/s）']*np.cos(np.pi*df['100m风向（°)']/180)

    df['sin_10米风速（10m/s）']=df['10米风速（10m/s）']*np.sin(np.pi*df['10米风向（°)']/180)
    df['cos_10米风速（10m/s）']=df['10米风速（10m/s）']*np.cos(np.pi*df['10米风向（°)']/180)
    
    df['100m风向（°)类别']=(df['100m风向（°)']+1)//90
    df['10米风向（°)类别']=(df['10米风向（°)']+1)//90
    df['10米风向（°)_100m风向（°)']=(df['100m风向（°)类别']==df['10米风向（°)类别'])
    
    for col in ['100m风向（°)类别','10米风向（°)类别']:
        unique_value=df[col].unique()
        for value in unique_value:
            df[col+"_"+str(value)]=(df[col]==value)
            
    #由于是时序数据,如果有缺失值,这个值肯定和前面一个数据是最接近的.
    df.fillna(method='ffill',inplace=True)
    
    df.drop(['时间'],axis=1,inplace=True)
    
    return df

In [4]:
train_df=pd.read_csv(Config.path+"trainA.csv",encoding='gbk')
print(f"len(train_df):{len(train_df)}")
#将缺失值替换为np.nan,并转数据类型为float.
train_df[Config.target]=train_df[Config.target].replace('<NULL>', np.nan).astype(float)
#用线性插值的方式填充缺失值
train_df[Config.target]=train_df[Config.target].interpolate()
train_df=feature_engineer(train_df)
train_df.head()

len(train_df):231840


  df.fillna(method='ffill',inplace=True)


Unnamed: 0,站点编号,气压(Pa）,相对湿度（%）,云量,10米风速（10m/s）,10米风向（°),温度（K）,辐照强度（J/m2）,降水（m）,100m风速（100m/s）,...,100m风向（°)类别_0.0,100m风向（°)类别_4.0,100m风向（°)类别_3.0,100m风向（°)类别_1.0,100m风向（°)类别_2.0,10米风向（°)类别_0.0,10米风向（°)类别_3.0,10米风向（°)类别_4.0,10米风向（°)类别_1.0,10米风向（°)类别_2.0
0,f1,102249.6094,74.8513,0.007812,7.7041,26.5195,12.9195,0.0,8e-06,9.082,...,True,False,False,False,False,True,False,False,False,False
1,f1,102252.0355,74.753,0.000924,7.771,23.5766,12.7147,0.0,8e-06,9.1374,...,True,False,False,False,False,True,False,False,False,False
2,f1,102248.59,74.4995,0.003009,7.8272,21.5451,12.5435,0.0,8e-06,9.1856,...,True,False,False,False,False,True,False,False,False,False
3,f1,102240.4725,74.1432,0.011402,7.8637,20.2394,12.4012,0.0,8e-06,9.2158,...,True,False,False,False,False,True,False,False,False,False
4,f1,102228.8828,73.7366,0.023438,7.8781,19.487,12.283,0.0,8e-06,9.2237,...,True,False,False,False,False,True,False,False,False,False


In [5]:
test_df=pd.read_csv(Config.path+"testA.csv",encoding='gbk')
print(f"len(test_df):{len(test_df)}")
test_df=feature_engineer(test_df)
test_df.head()

len(test_df):44160


  df.fillna(method='ffill',inplace=True)


Unnamed: 0,站点编号,气压(Pa）,相对湿度（%）,云量,10米风速（10m/s）,10米风向（°),温度（K）,辐照强度（J/m2）,降水（m）,100m风速（100m/s）,...,100m风向（°)类别_0.0,100m风向（°)类别_4.0,100m风向（°)类别_3.0,100m风向（°)类别_1.0,100m风向（°)类别_2.0,10米风向（°)类别_0.0,10米风向（°)类别_4.0,10米风向（°)类别_3.0,10米风向（°)类别_1.0,10米风向（°)类别_2.0
0,f1,101309.625,84.3487,0.67969,4.7181,18.4066,16.8791,0.0,0.003588,5.5467,...,True,False,False,False,False,True,False,False,False,False
1,f1,101303.259,84.48,0.65654,4.7575,18.2344,16.853,0.0,0.000739,5.5886,...,True,False,False,False,False,True,False,False,False,False
2,f1,101291.8681,84.1823,0.61713,4.7772,16.0279,16.8286,0.0,0.000641,5.6422,...,True,False,False,False,False,True,False,False,False,False
3,f1,101277.4571,83.5966,0.58752,4.7852,12.8178,16.7995,0.0,0.002015,5.6978,...,True,False,False,False,False,True,False,False,False,False
4,f1,101262.0313,82.8641,0.59375,4.7795,9.6014,16.7596,0.0,0.003588,5.7344,...,True,False,False,False,False,True,False,False,False,False


In [6]:
# #计算两组变量的皮尔逊相关系数
# def pearson_corr(x1,x2):
#     """
#     x1,x2:np.array
#     """
#     mean_x1=np.mean(x1)
#     mean_x2=np.mean(x2)
#     std_x1=np.std(x1)
#     std_x2=np.std(x2)
#     pearson=np.mean((x1-mean_x1)*(x2-mean_x2))/(std_x1*std_x2)
#     return pearson
# drop_cols=[]
# for col in train_df.drop([Config.target],axis=1).columns:
#     pearson=pearson_corr(train_df[col].values,train_df[Config.target].values)
#     print(f"col:{col},pearson_corr:{pearson}")
#     if abs(pearson)<=0.01:#如果基本上没有什么相关性的特征那就直接drop好了
#         drop_cols+=[col]
drop_cols=['相对湿度（%）', 'year']
print(f"drop_cols:{drop_cols}")
print(f"total_feature_counts:{len(test_df.columns)}")

drop_cols:['相对湿度（%）', 'year']
total_feature_counts:35


In [7]:
print(f"gap feature")
total_df=pd.concat((train_df,test_df),axis=0)
gaps=[1,2,4,7,15,30,50,80]
for gap in gaps:
    for col in ['气压(Pa）', '相对湿度（%）', '云量', '10米风速（10m/s）', '10米风向（°)', '温度（K）',
           '辐照强度（J/m2）', '降水（m）', '100m风速（100m/s）', '100m风向（°)']:
        total_df[col+f"_shift{gap}"]=total_df[col].groupby(total_df['站点编号']).shift(gap)
        total_df[col+f"_gap{gap}"]=total_df[col+f"_shift{gap}"]-total_df[col]
        total_df.drop([col+f"_shift{gap}"],axis=1,inplace=True)
    
print("one hot encoder")
for col in ['站点编号']:
    unique_value=total_df[col].unique()
    for value in unique_value:
        total_df[col+"_"+str(value)]=(total_df[col]==value)
    total_df.drop([col],axis=1,inplace=True)
train_df=total_df[:len(train_df)]
test_df=total_df[len(train_df):]
train_df.dropna(inplace=True)
test_df.head()

gap feature


  total_df[col+f"_gap{gap}"]=total_df[col+f"_shift{gap}"]-total_df[col]
  total_df[col+f"_shift{gap}"]=total_df[col].groupby(total_df['站点编号']).shift(gap)
  total_df[col+f"_gap{gap}"]=total_df[col+f"_shift{gap}"]-total_df[col]
  total_df[col+f"_shift{gap}"]=total_df[col].groupby(total_df['站点编号']).shift(gap)
  total_df[col+f"_gap{gap}"]=total_df[col+f"_shift{gap}"]-total_df[col]
  total_df[col+f"_shift{gap}"]=total_df[col].groupby(total_df['站点编号']).shift(gap)
  total_df[col+f"_gap{gap}"]=total_df[col+f"_shift{gap}"]-total_df[col]
  total_df[col+f"_shift{gap}"]=total_df[col].groupby(total_df['站点编号']).shift(gap)
  total_df[col+f"_gap{gap}"]=total_df[col+f"_shift{gap}"]-total_df[col]
  total_df[col+f"_shift{gap}"]=total_df[col].groupby(total_df['站点编号']).shift(gap)
  total_df[col+f"_gap{gap}"]=total_df[col+f"_shift{gap}"]-total_df[col]
  total_df[col+f"_shift{gap}"]=total_df[col].groupby(total_df['站点编号']).shift(gap)
  total_df[col+f"_gap{gap}"]=total_df[col+f"_shift{gap}"]-total_df[col]
  to

one hot encoder


  total_df[col+"_"+str(value)]=(total_df[col]==value)
  total_df[col+"_"+str(value)]=(total_df[col]==value)
  total_df[col+"_"+str(value)]=(total_df[col]==value)
  total_df[col+"_"+str(value)]=(total_df[col]==value)
  total_df[col+"_"+str(value)]=(total_df[col]==value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.dropna(inplace=True)


Unnamed: 0,气压(Pa）,相对湿度（%）,云量,10米风速（10m/s）,10米风向（°),温度（K）,辐照强度（J/m2）,降水（m）,100m风速（100m/s）,100m风向（°),...,温度（K）_gap80,辐照强度（J/m2）_gap80,降水（m）_gap80,100m风速（100m/s）_gap80,100m风向（°)_gap80,站点编号_f1,站点编号_f2,站点编号_f3,站点编号_f4,站点编号_f5
0,101309.625,84.3487,0.67969,4.7181,18.4066,16.8791,0.0,0.003588,5.5467,20.4047,...,-0.5495,0.0,0.029327,4.1642,26.7224,True,False,False,False,False
1,101303.259,84.48,0.65654,4.7575,18.2344,16.853,0.0,0.000739,5.5886,20.1468,...,-0.5239,0.0,0.032183,4.1283,26.2132,True,False,False,False,False
2,101291.8681,84.1823,0.61713,4.7772,16.0279,16.8286,0.0,0.000641,5.6422,17.9834,...,-0.4943,0.0,0.032281,4.1263,27.5775,True,False,False,False,False
3,101277.4571,83.5966,0.58752,4.7852,12.8178,16.7995,0.0,0.002015,5.6978,14.9278,...,-0.457,0.0,0.030903,4.1251,29.7502,True,False,False,False,False
4,101262.0313,82.8641,0.59375,4.7795,9.6014,16.7596,0.0,0.003588,5.7344,11.924,...,-0.4092,0.0,0.029327,4.1039,31.7221,True,False,False,False,False


In [8]:
#对train_feats做交叉验证,然后用
def RMSE(y_true,y_pred):
    return  np.sqrt(np.mean((y_true-y_pred)**2))
def metric(y_true,y_pred):
    rmses=[]
    for i in range(0,len(y_true),len(y_true)//5):
        rmse=RMSE(y_true[i:i+len(y_true)//5],y_pred[i:i+len(y_true)//5])
        rmses.append(rmse)
    return np.mean(np.array(rmses))
#训练数据选择的是2022年1月到2023年1月
train_feats=train_df[((train_df['year']==2022))|((train_df['year']==2023)&(train_df['month']==1))]
#将表格数据打乱,不按照时间顺序排列
train_feats = train_feats.sample(frac=1).reset_index(drop=True)
#验证集选择的是2023年2月到2023年4月
valid_feats=train_df[(train_df['year']==2023)&(train_df['month']>1)]
#年份和相对湿度基本和target无关
train_feats.drop(drop_cols,axis=1,inplace=True)
valid_feats.drop(drop_cols,axis=1,inplace=True)

# import optuna#自动超参数优化软件框架

# def objective(trial):
#     lgb_params = {
#         "verbosity": -1,'objective': 'regression',
#         'metric': 'rmse','boosting_type': 'gbdt',
#         'random_state': Config.seed,
#         'n_estimators': trial.suggest_int('n_estimators', 50, 200),
#         'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
#         'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),#对数分布的建议值
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),#浮点数
#         'subsample': trial.suggest_float('subsample', 0.5, 1),
#         'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.5, log=True),
#         'num_leaves' : trial.suggest_int('num_leaves', 8, 64),#整数
#         'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
#     }
#     X=train_feats.drop([Config.target],axis=1).copy()
#     y=train_feats[Config.target].copy()
#     test_X=valid_feats.drop([Config.target],axis=1).values.copy()
#     test_y=valid_feats[Config.target].values.copy()
#     test_preds=np.zeros((Config.num_folds,len(test_X)))
#     # 初始化 KFold
#     kf = KFold(n_splits=Config.num_folds, shuffle=True,random_state=Config.seed)
#     # 进行 k 折交叉验证
#     for fold, (train_index, valid_index) in (enumerate(kf.split(X))):
#         X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
#         y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

#         model=LGBMRegressor(**lgb_params)
#         model.fit(X_train,y_train)
#         test_preds[fold]=model.predict(test_X)
#     test_preds=test_preds.mean(axis=0)
#     mean_rmse=metric(test_y,test_preds)
#     return mean_rmse
# #创建的研究命名,找最小值.
# study = optuna.create_study(direction='minimize', study_name='Optimize boosting hyperparameters')
# #目标函数,尝试的次数
# study.optimize(objective, n_trials=50)
# lgb_params=study.best_trial.params
#Best is trial 29 with value: 15.058260259234075.
lgb_params= {'n_estimators': 75, 'reg_alpha': 0.022825982577566684, 
             'reg_lambda': 5.284325352952156, 'colsample_bytree': 0.8286196779453388,
             'subsample': 0.8853286861359038, 'learning_rate': 0.2484233791090533,
             'num_leaves': 37, 'min_child_samples': 44, 
             'objective': 'regression', 'metric': 'rmse',
             'boosting_type': ('gbdt',), 'random_state': 2024}
lgb_params['objective']='regression'
lgb_params['metric']='rmse'
lgb_params['boosting_type']='gbdt',
lgb_params['random_state']=Config.seed
#输出最佳的参数
print('lgb_params=', lgb_params)
# print('Best rmse: ', study.best_value)

lgb_params= {'n_estimators': 75, 'reg_alpha': 0.022825982577566684, 'reg_lambda': 5.284325352952156, 'colsample_bytree': 0.8286196779453388, 'subsample': 0.8853286861359038, 'learning_rate': 0.2484233791090533, 'num_leaves': 37, 'min_child_samples': 44, 'objective': 'regression', 'metric': 'rmse', 'boosting_type': ('gbdt',), 'random_state': 2024}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_feats.drop(drop_cols,axis=1,inplace=True)


In [9]:
train_df.drop(drop_cols,axis=1,inplace=True)
test_df.drop(drop_cols,axis=1,inplace=True)

X=train_df.drop([Config.target],axis=1).copy()
y=train_df[Config.target].copy()
test_X=test_df.drop([Config.target],axis=1).values
test_preds=np.zeros((Config.num_folds,len(test_X)))
# 初始化 KFold
kf = KFold(n_splits=Config.num_folds, shuffle=True,random_state=Config.seed)
# 进行 k 折交叉验证
for fold, (train_index, valid_index) in (enumerate(kf.split(X))):

    print(f"fold:{fold}")

    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    model=LGBMRegressor(**lgb_params)
    model.fit(X_train,y_train,eval_set=[(X_valid,y_valid)],callbacks=[log_evaluation(100)])
    test_preds[fold]=model.predict(test_X)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(drop_cols,axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.drop(drop_cols,axis=1,inplace=True)


fold:0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.268821 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212153, number of used features: 117
[LightGBM] [Info] Start training from score 36.552821
fold:1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.533869 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212153, number of used features: 117
[LightGBM] [Info] Start training from score 36.539215
fold:2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.533666 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24311
[LightGBM] [Info] Number of data points in the train set: 212153, number of used features:

In [10]:
submission=pd.read_csv(Config.path+"sample_submission.csv")  
test_preds=test_preds.mean(axis=0)
submission['出力(MW)']=test_preds
submission.to_csv("baseline.csv",index=None)
submission.head()

Unnamed: 0,站点编号,时间,出力(MW)
0,f1,2023-5-1 0:00,5.498896
1,f1,2023-5-1 0:15,4.65327
2,f1,2023-5-1 0:30,4.889664
3,f1,2023-5-1 0:45,5.991598
4,f1,2023-5-1 1:00,6.471863
