In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import seaborn as sns
import lightgbm as lgb

In [2]:
#vanilla data
data_van = pd.read_parquet('./features_ready_220507.parquet.gzip')
data_van = data_van.drop(['Open', 'High', 'Low', 'Close', 'Volume','AdjustmentFactor','ExpectedDividend','SupervisionFlag','CumFactor',
                       'SMA50','SMA10','k','EMA12','EMA26','SMA14UP','SMA14DOWN','Lower_line','Upper_line',
                       'SD10','MACD','EMA9','tot_volume','ind_volume'], axis=1)
print(data_van.columns)

Index(['Date', 'SecuritiesCode', 'Target', 'Mo', 'Tu', 'We', 'Th', 'Fr',
       'DividendRatio', 'SMA50_Close_diff', 'SMA10_Close_diff', 'Volume_share',
       'UL_Close-diff', 'LL_Close-diff', 'RSI', 'Stochastic', 'EMA_diff',
       'BeforeDiv', 'AfterDiv', 'Day_number', 'Close_lag_1', 'Gap',
       'Open_lag_1', 'Close_lag_2', 'Open_lag_2', 'Close_lag_3', 'Open_lag_3',
       'Candle', 'Target_lag_2', 'Target_lag_3', 'Target_lag_4'],
      dtype='object')


In [3]:
#logged data
data_log = pd.read_parquet('./log_features_220526.parquet.gzip')
data_log = data_log.drop(['Open', 'High', 'Low', 'Close', 'Volume','AdjustmentFactor','ExpectedDividend','SupervisionFlag','CumFactor',
                       'SMA50','SMA10','k','EMA12','EMA26','SMA14UP','SMA14DOWN','Lower_line','Upper_line',
                       'SD10','MACD','EMA9','tot_volume','ind_volume'], axis=1)
print(data_log.columns)

Index(['Date', 'SecuritiesCode', 'Target', 'Mo', 'Tu', 'We', 'Th', 'Fr',
       'DividendRatio', 'SMA50_Close_diff', 'SMA10_Close_diff', 'Volume_share',
       'UL_Close-diff', 'LL_Close-diff', 'RSI', 'Stochastic', 'EMA_diff',
       'BeforeDiv', 'AfterDiv', 'Day_number', 'Close_lag_1', 'Gap',
       'Open_lag_1', 'Close_lag_2', 'Open_lag_2', 'Close_lag_3', 'Open_lag_3',
       'Candle', 'Target_lag_2', 'Target_lag_3', 'Target_lag_4'],
      dtype='object')


In [4]:
#normalized data
def normalize(df):
    norm = (df-df.min()) / (df.max()-df.min())
    norm['Date'] = df['Date']
    norm['SecuritiesCode'] = df['SecuritiesCode']
    norm['Target'] = df['Target']
    return norm
data_n_van = normalize(data_van)
data_n_log = normalize(data_log)

In [5]:
#function for generating the train/test for linreg for dec 6 and 7th only
def train_test_split(df):
    X_train = df[-df['Date'].isin(['2021-12-06', '2021-12-07'])]
    X_train = X_train.fillna(0)
    y_train = X_train['Target']
    X_train = X_train.drop(['Date', 'SecuritiesCode', 'Target'], axis=1)

    X_test = df[df['Date'].isin(['2021-12-06'])]
    X_test = X_test.fillna(0)
    X_sub = X_test[['Date', 'SecuritiesCode']].reset_index()
    X_test = X_test.drop(['Date', 'SecuritiesCode', 'Target'], axis=1)
    
    return X_train, y_train, X_test, X_sub

In [7]:
Xtr_van, ytr_van, Xt_van, Xs_van = train_test_split(data_van)
Xtr_log, ytr_log, Xt_log, Xs_log = train_test_split(data_log)
Xtr_nlog, ytr_nlog, Xt_nlog, Xs_nlog = train_test_split(data_n_log)
Xtr_nvan, ytr_nvan, Xt_nvan, Xs_nvan = train_test_split(data_n_van)

In [8]:
test_df = pd.read_csv('../files/supplemental_files/stock_prices.csv', index_col=0, parse_dates=[1])

In [9]:
def top_calc(row):
    pred_rank = row['Predicted_Rank']
    if pred_rank < 200:
        return (2- pred_rank / 200) * row['Predicted_Target'] / 1.5
def bot_calc(row):
    pred_rank = row['Predicted_Rank']
    if pred_rank >=1800:
        return (2- np.abs(pred_rank - 1999) / 200) * row['Predicted_Target'] / 1.5
#calc function
def calculate(X_train, y_train, X_test, X_sub, test_list):
    reg = LinearRegression().fit(X_train, y_train)
    reg_score = reg.score(X_train, y_train)
    
    y = pd.DataFrame(reg.predict(X_test), columns = ['Predicted_Target'])
    
    result = pd.concat([X_sub, y], axis=1)
    result['Predicted_Rank'] = result['Predicted_Target'].rank()
    
    test_list = test_list[test_list['Date']=='2021-12-06'][['SecuritiesCode', 'Target']]
    test_list['Real_Rank'] = test_list['Target'].rank()
    test_list.rename(columns ={'Target':'True_value'}, inplace = True)
    
    score_calc = result.merge(test_list, how='left', on='SecuritiesCode')
    
    score_calc['Top_result'] = score_calc.apply(top_calc, axis=1)
    score_calc['Bot_result'] = score_calc.apply(bot_calc, axis=1)

    sharpe = score_calc['Top_result'].sum() - score_calc['Bot_result'].sum()
    
    return reg_score, sharpe
    

In [10]:
R_van, sharpe_van = calculate(Xtr_van, ytr_van, Xt_van, Xs_van, test_df)
R_log, sharpe_log = calculate(Xtr_log, ytr_log, Xt_log, Xs_log, test_df)
R_nvan, sharpe_nvan = calculate(Xtr_nvan, ytr_nvan, Xt_nvan, Xs_nvan, test_df)
R_nlog, sharpe_nlog = calculate(Xtr_nlog, ytr_nlog, Xt_nlog, Xs_nlog, test_df)
print('Vanilla: ', R_van, sharpe_van)
print('Logged:', R_log, sharpe_log)
print('Noramilized Vanilla: ', R_nvan, sharpe_nvan)
print('Normalized Logged: ', R_nlog, sharpe_nlog )

Vanilla:  0.005798878123100426 -0.4833767916782674
Logged: 0.004155084343604454 -0.5118210463474194
Noramilized Vanilla:  0.005722803946326072 -0.43185766537363324
Normalized Logged:  0.004181906270685598 -0.4827561928378418


In [22]:
#train-validation test split
X_train = data_van[-data_van['Date'].isin(['2021-12-03', '2021-12-06', '2021-12-07'])]
#X_train.fillna(0, inplace=True)
y_train = X_train['Target']
X_train = X_train.drop(['Target'], axis=1)
X_train['Date'] = X_train['Date'].astype('int64')  #.astype('str').replace('-','')#.astype('int')

X_val = data_van[data_van['Date'].isin(['2021-12-03'])]
#X_test.fillna(0, inplace = True)
#X_sub = X_test[['Date', 'SecuritiesCode']].reset_index()
y_val = X_val['Target']
X_val = X_val.drop(['Target'], axis=1)
X_val['Date'] = X_val['Date'].astype('int64') #.replace('-','').astype('int')

X_test = data_van[data_van['Date'].isin(['2021-12-06'])]
#X_test.fillna(0, inplace = True)
X_sub = X_test[['Date', 'SecuritiesCode']].reset_index()
X_test = X_test.drop(['Target'], axis=1)
X_test['Date'] = X_test['Date'].astype('int64')

In [23]:
#training
def build_lgb_model(params, X_train, X_val, y_train, y_val, cat_features):
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)
    model = lgb.train(params=params, train_set=lgb_train, valid_sets=(lgb_train, lgb_val), verbose_eval=50,
                     categorical_feature=cat_features)
    return model

In [24]:
params = {
    'objective': 'rmse',
    'metric': 'rmse'#,
    #'num_leaves': 1023,
    #'min_data_in_leaf':10,
    #'feature_fraction':0.7,
    #'learning_rate': 0.01,
    #'num_rounds': 1000,
    #'early_stopping_rounds': 30,
    #'seed': 1
}
#designating the categorical features which should be focused on
cat_features = ['SecuritiesCode']#,'SecuritiesCode']

lgb_model = build_lgb_model(params, X_train, X_val, y_train, y_val, cat_features)

New categorical_feature is ['SecuritiesCode']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7310
[LightGBM] [Info] Number of data points in the train set: 2330531, number of used features: 28
[LightGBM] [Info] Start training from score 0.000422




[50]	training's rmse: 0.0219109	valid_1's rmse: 0.0318206
[100]	training's rmse: 0.021525	valid_1's rmse: 0.0313573


In [26]:
y_tree = pd.DataFrame(lgb_model.predict(X_test), columns = ['Predicted_Target'])
result_tree = pd.concat([X_sub, y_tree], axis=1)
result_tree['Predicted_Rank'] = result_tree['Predicted_Target'].rank()
    
test_list_tree = test_df[test_df['Date']=='2021-12-06'][['SecuritiesCode', 'Target']]
test_list_tree['Real_Rank'] = test_list_tree['Target'].rank()
test_list_tree.rename(columns ={'Target':'True_value'}, inplace = True)
    
score_calc_tree = result_tree.merge(test_list_tree, how='left', on='SecuritiesCode')
    
score_calc_tree['Top_result'] = score_calc_tree.apply(top_calc, axis=1)
score_calc_tree['Bot_result'] = score_calc_tree.apply(bot_calc, axis=1)

sharpe_tree = score_calc_tree['Top_result'].sum() - score_calc_tree['Bot_result'].sum()

print(sharpe_tree)

-1.3918581193859425
