# Training

In [40]:
from lightgbm import LGBMRegressor 
import pandas as pd
from sklearn import linear_model
import pickle
import xgboost as xgb  


In [41]:
train_df = pd.read_csv('./train.csv').set_index(['time_id', 'stock_id'])

# 填充缺失数据
train_df.fillna(0, inplace=True)
print(train_df)

                         0         1         2         3         4         5  \
time_id stock_id                                                               
0       0        -0.209346 -1.010500 -0.287233 -0.600063 -0.579602  3.176390   
        1        -1.090990 -1.467420 -0.756335 -0.305982 -0.770341  0.167490   
        2        -0.172040 -0.359719  0.829011  0.323263 -0.086078 -0.853505   
        3        -0.089830  0.999348  0.130448  0.441553  0.366906 -2.294330   
        4         0.231559 -0.169006 -0.376518  0.985890 -0.505112 -1.006770   
...                    ...       ...       ...       ...       ...       ...   
727     2511      2.382260  0.966825  1.178910  1.184540  0.234592 -0.727670   
        2512     -0.343863  0.531484  2.455260 -1.387990  0.565996  0.487475   
        2513     -0.777487 -0.577208  1.499070 -1.236990 -1.455700 -1.187930   
        2534      2.311920  2.827400  0.207041  0.211622  1.645400 -1.621990   
        2539     -2.244430 -0.197322 -0.

In [45]:
# 创建线性回归模型
reg = linear_model.LinearRegression()

lgbm_model = LGBMRegressor(boosting_type='gbdt',  
                      objective='regression',  
                      metric='rmse',  
                      num_leaves=60,  
                      learning_rate=0.05,  
                      feature_fraction=0.9,  
                      bagging_fraction=0.8,  
                      bagging_freq=5,  
                      verbose=0) 

xgb_model = xgb.XGBRegressor(  
    objective='reg:squarederror',  
    max_depth=6,  
    learning_rate=0.1,  
    n_estimators=100,  
    verbosity=1,  
    random_state=42  
)

In [88]:
# 将数据集划分为特征和目标变量
X = train_df.iloc[:,:-1].values
print(X)
y = train_df.iloc[:,-1].values

# 训练模型
lgbm_model.fit(X, y)
xgb_model.fit(X, y)

[[-0.209346  -1.0105    -0.287233  ... -0.243538   1.70851    1.56062  ]
 [-1.09099   -1.46742   -0.756335  ... -0.531805  -0.297925  -0.554371 ]
 [-0.17204   -0.359719   0.829011  ... -0.805768   0.0232186 -0.358794 ]
 ...
 [-0.777487  -0.577208   1.49907   ...  1.15729    1.73198    1.33202  ]
 [ 2.31192    2.8274     0.207041  ... -0.287656  -1.70743    1.27572  ]
 [-2.24443   -0.197322  -0.732056  ... -0.0139914 -1.57233   -1.71468  ]]


In [52]:
# 保存模型
with open('lgbm_model.pkl', 'wb') as f:
    pickle.dump(lgbm_model, f)
    
with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

## Train linear layer

In [89]:
y_lgbm = lgbm_model.predict(X)
y_xgb = xgb_model.predict(X)



In [76]:
train_df1 = pd.read_csv('./train.csv')
train_df1.head()

Unnamed: 0,time_id,stock_id,0,1,2,3,4,5,6,7,...,291,292,293,294,295,296,297,298,299,label
0,0,0,-0.209346,-1.0105,-0.287233,-0.600063,-0.579602,3.17639,0.916548,-0.233095,...,0.332558,1.21496,0.601278,-0.537854,1.05396,0.440747,-0.243538,1.70851,1.56062,-0.023618
1,0,1,-1.09099,-1.46742,-0.756335,-0.305982,-0.770341,0.16749,0.745434,-0.210874,...,-0.365191,-0.132949,-0.165515,-0.170072,-0.463976,0.429221,-0.531805,-0.297925,-0.554371,-0.016899
2,0,2,-0.17204,-0.359719,0.829011,0.323263,-0.086078,-0.853505,-0.770341,-0.521315,...,-0.168552,-0.56664,0.084729,-0.506098,-0.56576,-0.691749,-0.805768,0.023219,-0.358794,-0.004793
3,0,3,-0.08983,0.999348,0.130448,0.441553,0.366906,-2.29433,1.0564,-0.445677,...,-1.19484,0.698424,0.430864,-0.907378,0.57988,-0.145048,0.307333,-0.63299,0.038204,0.042915
4,0,4,0.231559,-0.169006,-0.376518,0.98589,-0.505112,-1.00677,0.308333,-0.220056,...,0.407966,-1.10216,0.532668,0.0457,-0.64493,0.38125,-0.258222,-0.053198,0.170072,0.017077


In [90]:
import pandas as pd
import numpy as np

train_df1 = pd.read_csv('./train.csv')
train_df1.head()
print(selected_columns)
y_lgbm=pd.Series(y_lgbm, name='y_lgbm') 
y_xgb=pd.Series(y_xgb, name='y_xgb') 
df = pd.concat([train_df1[['time_id', 'stock_id']], y_lgbm,y_xgb,train_df1[["label"]]], axis=1)  
print(len(y_lgbm))

print(df.info()) 

         time_id  stock_id
0              0         0
1              0         1
2              0         2
3              0         3
4              0         4
...          ...       ...
1284582      727      2511
1284583      727      2512
1284584      727      2513
1284585      727      2534
1284586      727      2539

[1284587 rows x 2 columns]
1284587
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1284587 entries, 0 to 1284586
Data columns (total 5 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   time_id   1284587 non-null  int64  
 1   stock_id  1284587 non-null  int64  
 2   y_lgbm    1284587 non-null  float64
 3   y_xgb     1284587 non-null  float32
 4   label     1284587 non-null  float64
dtypes: float32(1), float64(2), int64(2)
memory usage: 44.1 MB
None


In [91]:
X = df.iloc[:,:-1].values
print(X)
y = df.iloc[:,-1].values

# 训练模型
reg.fit(X,y)

[[ 0.00000000e+00  0.00000000e+00  7.66380652e-03  5.88802667e-03]
 [ 0.00000000e+00  1.00000000e+00  1.42757764e-03  8.09455058e-04]
 [ 0.00000000e+00  2.00000000e+00  1.09450188e-03  7.18510244e-04]
 ...
 [ 7.27000000e+02  2.51300000e+03 -3.22541878e-03 -2.94190529e-03]
 [ 7.27000000e+02  2.53400000e+03  1.01281052e-02  9.91014671e-03]
 [ 7.27000000e+02  2.53900000e+03  3.78144617e-05 -7.24847661e-04]]


# Rank_ic

In [58]:
import pandas as pd
import argparse

def rank_ic(result_path, label_path):

    test_label = pd.read_csv(label_path).set_index(['time_id', 'stock_id'])
    pred = pd.read_csv(result_path).set_index(['time_id', 'stock_id'])
    result = pd.concat([pred, test_label], axis=1)

    rank = result.groupby('time_id').apply(lambda df: (df['pred'].rank()).corr(df['label'].rank())).mean()
    return rank


# Run

In [None]:
import pandas as pd
import pickle


In [49]:
#加载测试数据
test_df = pd.read_csv('./test.csv').set_index(['time_id', 'stock_id'])
test_df.fillna(0, inplace=True)
X_test = test_df.values

In [53]:
# 读取模型

with open('lgbm_model.pkl', 'rb') as f:
    lgbm_model = pickle.load(f)
with open('xgb_model.pkl', 'rb') as f:
    xgb_model = pickle.load(f)

In [70]:
y_pred_lgbm = lgbm_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)




In [92]:

import pandas as pd
import numpy as np

test_df1 = pd.read_csv('./test.csv')
test_df1.head()
print(selected_columns)
y_lgbm=pd.Series(y_pred_lgbm, name='y_lgbm') 
y_xgb=pd.Series(y_pred_xgb, name='y_xgb') 
df = pd.concat([test_df1[['time_id', 'stock_id']], y_lgbm,y_xgb], axis=1)  
print(len(y_lgbm))

print(df.info()) 

X = df.iloc[:,:-1].values
print(X)

# 训练模型


y_pred = reg.predict(X)
#保存结果
result = pd.DataFrame(y_pred, index = test_df.index, columns=['pred'])

result.to_csv('./result.csv')


         time_id  stock_id
0              0         0
1              0         1
2              0         2
3              0         3
4              0         4
...          ...       ...
1284582      727      2511
1284583      727      2512
1284584      727      2513
1284585      727      2534
1284586      727      2539

[1284587 rows x 2 columns]
175817
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175817 entries, 0 to 175816
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   time_id   175817 non-null  int64  
 1   stock_id  175817 non-null  int64  
 2   y_lgbm    175817 non-null  float64
 3   y_xgb     175817 non-null  float32
dtypes: float32(1), float64(1), int64(2)
memory usage: 4.7 MB
None
[[ 7.47000000e+02  0.00000000e+00 -1.37814146e-03]
 [ 7.47000000e+02  1.00000000e+00  1.92634537e-03]
 [ 7.47000000e+02  2.00000000e+00  3.58563920e-03]
 ...
 [ 8.44000000e+02  2.53400000e+03  2.01757057e-03]
 [ 8.44000000e+

ValueError: X has 3 features, but LinearRegression is expecting 4 features as input.

In [71]:
#计算rank_ic
def rank_ic(result_path, label_path):

    test_label = pd.read_csv(label_path).set_index(['time_id', 'stock_id'])
    pred = pd.read_csv(result_path).set_index(['time_id', 'stock_id'])
    result = pd.concat([pred, test_label], axis=1)

    rank = result.groupby('time_id').apply(lambda df: (df['pred'].rank()).corr(df['label'].rank())).mean()
    return rank
rank = rank_ic('./result.csv',"./test_label.csv")
print('rank_ic: ', rank)

rank_ic:  0.09978426234730715
