In [1]:
import pandas as pd
import pickle
import lightgbm as lgb
import glob
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
import time

Reduce memory usage by downcasting numeric columns to narrower dtypes

In [2]:
# reduce memory usage
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column is left untouched. Integer columns are cast to
    the first of int8/int16/int32/int64 whose [min, max] range contains the
    column's observed min and max (bounds inclusive). Float columns are cast to
    float16 or float32 by the same range test, otherwise to float64.

    Note: the float test only looks at the value range, not at precision, so a
    column that fits the float16 range is stored as float16 and may lose
    significant digits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When True, print the memory usage after optimisation and the
        percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to a dtype's min/max still fits it.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range test failed for every narrower type (this includes
                # NaN/inf extremes, for which the comparisons are False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Guard the percentage against a zero starting size.
        if start_mem:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
# Read the CSV into df_500 and report its shape, bracketed by progress banners.
print('========LOADING DATA========')
df_500 = pd.read_csv(
    filepath_or_buffer='pct1_cal/modified_alter_alphas_066_labels_500.csv',
)
print(df_500.shape)
print('========COMPLETE LOADING DATA========')

(569537, 293)


In [None]:
# For every result file matching pct1_cal/500*, print the mean over trade dates
# of the per-date Pearson correlation between the 'y' and 'y_pred' columns.
filenames = glob.glob('pct1_cal' + '/500*')
for file in filenames:
    df = pd.read_csv(file)
    # One groupby pass instead of re-filtering the whole frame once per date.
    daily_ic = [
        day_df['y'].corr(day_df['y_pred'])
        for _, day_df in df.groupby('tradeDate')
    ]
    # An undefined (NaN) daily correlation still propagates into the mean;
    # a file without any rows yields NaN instead of a ZeroDivisionError.
    IC = float(np.mean(daily_ic)) if daily_ic else float('nan')
    # file[9:-4] drops the 9-character 'pct1_cal/' prefix and the last four
    # characters (presumably the '.csv' extension — confirm for new files).
    print('{}: {}'.format(file[9:-4], IC))
# 500_Alter_066_full_XGBoostR-openclose_pct1_rank-r-25--99: 0.030344625575579553
# 500_Alter_066_full_DecisionTreeR-openclose_pct1_rank-r-25--99: 0.01931140387478804
# 500_Alter_066_full_LGBMRegressor-openclose_pct1_rank-r-25--99: 0.03376780851376303
# 500_Alter_066_full_RidgeR-openclose_pct1_rank-r-25--99: 0.03796098294064469

In [None]:
# Build the feature list f_x: every name in the pickled f_x_036_222 list that
# is not also present in the pickled f_alphas_222 collection.
f_index = ['ticker', 'tradeDate']
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# `with` closes each file handle as soon as its content has been read.
with open('pct5_cal/f_x_036_222', 'rb') as fh:
    f_x_036_222 = pickle.load(fh)
with open('pct5_cal/f_alphas_222', 'rb') as fh:
    f_222 = pickle.load(fh)
f_x = [x for x in f_x_036_222 if x not in f_222]

In [None]:
# Label column name(s), kept as a list so it can be concatenated with the
# f_index and f_x column lists when selecting columns.
f_y = ['PCT5_rank']

In [None]:
# Select the index, feature and label columns and write them to CSV.
# NOTE(review): `out` is not defined in any visible cell — this cell fails on a
# fresh kernel unless `out` is created first; confirm where it comes from.
df_036 = out[f_index + f_x + f_y]
# to_csv returns None when given a path, so its result must not be assigned
# back to df_036 (that would discard the frame).
df_036.to_csv('pct5_cal/modified_alter_alphas_036.csv', index=False)

In [None]:
# Display the (rows, columns) shape of df_500.
df_500.shape

In [None]:
# Rolling-window experiment: for each model in model_list, repeatedly fit on a
# window of trade dates, predict the following window, shift both windows
# forward by `update` dates, and finally store all predictions plus the pooled
# correlation between 'y' and 'y_pred'.
dates = df_500.tradeDate.sort_values().unique()
epoch_ts = list(dates)

f_index = ['ticker', 'tradeDate']
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("pct1_cal/f_x_066", "rb") as fh:
    f_x = pickle.load(fh)
label_list = ['PCT5_rank', 'PCT2_rank', 'openclose_pct1_rank', 'askbid_pct1_rank']
f_y = label_list[2]

if_pcas = ['', 'pca']  # 'pca' or empty string: whether to apply PCA
if_pca = if_pcas[0]  # 'pca' or empty string
pca_components_list = [0.99, 0.95, 0.90, 0.85, 0.80]
pca_components = pca_components_list[0]
num_leaves_list = [10, 15, 20, 25, 30]
num_leaves = num_leaves_list[3]
depth_list = [3, 4, 5, 6, 7, 8, 9, 10]
depth = depth_list[3]

model_list = ['RidgeR', 'DecisionTreeR', 'XGBoostR', 'LGBMRegressor', 'RandomForestR']
result = {}

for model_name in model_list:
    print('======== LEN_TRAIN {} ========'.format(f_y))
    target_types = ['r', 'c']  # problem type: 'r' = regression, 'c' = classification
    target_type = target_types[0]

    result_name = '500_Alter_066_full_{}-{}-{}-{}-{}-{}'.format(model_name, f_y, target_type, num_leaves, if_pca, int(100*pca_components))
    print(result_name)

    update = 22  # number of trade dates the windows advance per epoch
    train_si = epoch_ts.index('2017-01-03')  # included. '2017-01-03'
    train_ei = epoch_ts.index('2019-01-02')  # excluded. '2018-12-28'
    test_si = epoch_ts.index('2019-01-02')  # included. '2019-01-02'
    test_ei = epoch_ts.index('2019-02-01')  # excluded. '2019-01-31'
    test_fi = len(epoch_ts) - 1  # excluded.

    # Number of epochs (loop iterations); uses `update` rather than a second
    # hard-coded 22 so the step size is defined in one place.
    num_epoch = round((test_fi - test_ei) / update)
    epoch_range = range(0, num_epoch + 1)

    start = time.time()
    # Per-epoch result frames are collected in a list and concatenated once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call.
    epoch_results = []
    for epoch in epoch_range:
        print('----- EPOCH {}------'.format(epoch))
        update_n = epoch * update
        # Train dates for this epoch.
        epoch_t_train = epoch_ts[train_si + update_n : train_ei + update_n]
        # Test dates for this epoch.
        epoch_t_test = epoch_ts[test_si + update_n : test_ei + update_n]
        # Vectorised membership test instead of a per-row Python lambda.
        df_train = df_500[df_500.tradeDate.isin(epoch_t_train)].reset_index(drop=True)
        df_test = df_500[df_500.tradeDate.isin(epoch_t_test)].reset_index(drop=True)
        print('预测时间：', epoch_t_test)
        print('数据大小：', df_train.shape, df_test.shape)

        # Build x, optionally projecting through a PCA fitted on the train window only.
        if if_pca == 'pca':
            from sklearn.decomposition import PCA
            pca = PCA(n_components=pca_components)
            pca.fit(df_train[f_x])
            x_train = pca.transform(df_train[f_x])
            x_test = pca.transform(df_test[f_x])
        else:
            x_train = df_train[f_x].values
            x_test = df_test[f_x].values
        print('处理后x：', x_train.shape, x_test.shape)

        # Build y.
        y_train = df_train[f_y].copy()
        y_test = df_test[f_y].copy()
        print('处理后y：', y_train.shape, y_test.shape)

        # NOTE(review): DecisionTreeRegressor(splitter='random') and
        # RandomForestRegressor are created without random_state, so their
        # results are expected to vary between runs — confirm whether a seed
        # is wanted.
        if model_name == 'RidgeR':  # earlier note: alpha: 200 (code uses alpha=1 — confirm)
            from sklearn.linear_model import Ridge
            model = Ridge(alpha=1)
            model.fit(x_train, y_train)
        elif model_name == 'DecisionTreeR':  # 'splitter': ['random'], 'criterion': ['friedman_mse'], 'max_depth': [6], 'min_samples_leaf': [41], 'min_impurity_decrease': [0.5]
            from sklearn.tree import DecisionTreeRegressor
            model = DecisionTreeRegressor(splitter='random', criterion='friedman_mse', max_depth=6, min_samples_leaf=41, min_impurity_decrease=0.5)
            model.fit(x_train, y_train)
        elif model_name == 'RandomForestR':  # 'n_estimators': [400], 'max_depth': [9], 'max_features': [29]
            from sklearn.ensemble import RandomForestRegressor
            model = RandomForestRegressor(n_estimators=400, max_depth=9, max_features=29)
            model.fit(x_train, y_train)
        elif model_name == 'XGBoostR':  # earlier note: 'n_estimators': [20], 'max_depth': [3], 'max_features': [10], 'subsample': [1.0] (code uses max_depth=6 — confirm)
            from xgboost import XGBRegressor
            model = XGBRegressor(n_estimators=20, max_depth=6, subsample=1.0)
            model.fit(x_train, y_train)
        elif model_name == 'LGBMRegressor':
            model = lgb.LGBMRegressor(learning_rate=0.09, num_leaves=num_leaves, max_depth=depth)
            # The test window is passed in eval_set alongside the train window;
            # no early-stopping argument is given in this call.
            model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)], eval_metric='l2')
        else:
            # Without this guard an unknown name would silently reuse the model
            # left over from the previous iteration (or raise NameError).
            raise ValueError('Unknown model_name: {}'.format(model_name))

        y_pred = model.predict(x_test)

        # Collect this epoch's predictions next to the true labels.
        print('get result')
        df_result = df_test[f_index].copy()
        df_result['y'] = y_test
        df_result['y_pred'] = y_pred
        epoch_results.append(df_result)

    print(f'耗时:{time.time() - start}')
    print('sort values')
    df_result_all = pd.concat(epoch_results).sort_values(by=['ticker', 'tradeDate']).reset_index(drop=True)
    # Pooled correlation over all predictions of this model (not a per-date mean).
    IC = df_result_all[['y', 'y_pred']].corr().iloc[0,1]
    result[model_name] = IC
    print('store data')
    df_result_all.to_csv('pct1_cal/{}.csv'.format(result_name), index=False)
    print('======== COMPLETE {} ========'.format(model_name))