In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
import sys
import numpy as np
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

from importlib import reload
sys.path.append("D:/DST/")
import cathay_db as db
import utils as ut
import financial_statement as fs

reload(ut)
reload(fs)
import matplotlib.pyplot as plt
import datetime
import os

sys.path.append("./src")

from toolbox import print_progress_bar

# Notebook-wide pandas display settings: show up to 200 rows and 100 columns,
# and print floats with four fixed decimals instead of scientific notation.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 100
pd.set_option("display.float_format", '{:.4f}'.format)

import json

# 1. Read data from feather file

In [2]:
# # following is data from tej
# df_adjusted_price = pd.read_feather('data/tej_adjusted_price.feather')

# # following is data from cmoney
# df_price = pd.read_feather('data/cmoney_price.feather')
# df_company_info = pd.read_feather('data/cmoney_company_info.feather')

# ticker_list = np.sort(pd.unique(df_company_info['股票代號']))

# df_price = ut.cmoney_data_clean_up(df_price)
# df_price = deepcopy(df_price[df_price['股票代號'].isin(ticker_list)])

# df_adjusted_price = ut.tej_data_clean_up(df_adjusted_price)
# df_adjusted_price = deepcopy(df_adjusted_price[df_adjusted_price['股票代號'].isin(ticker_list)])

In [3]:
# Read the precomputed factor table and keep only rows dated 2004-01-01 or later.
df_factor_all = pd.read_feather('data/df_factor_all.feather')
from_2004_onward = df_factor_all['date'] >= '2004-01-01'
df_factor_all = df_factor_all.loc[from_2004_onward]

In [4]:
def _cross_sectional_zscore(values):
    """Standardize one group of values: subtract the group mean, divide by the group std."""
    return (values - values.mean()) / values.std()


# Rows must be in date order within each ticker because the target uses a row offset.
df_factor_all = df_factor_all.sort_values(['ticker', 'date'])

# Target y: per-ticker percentage change with periods=-62, i.e. every price is compared
# with the price 62 rows later for the same ticker (price[t] / price[t + 62] - 1).
# NOTE(review): this is not the forward return price[t + 62] / price[t] - 1; its sign is
# opposite to the forward return. Confirm that downstream consumers expect this definition.
df_factor_all['y'] = (
    df_factor_all
    .groupby('ticker', as_index=False, group_keys=False)['price']
    .pct_change(-62)
)

# norm_y: y standardized across all rows that share the same date (grouping is by date only).
df_factor_all['norm_y'] = (
    df_factor_all
    .groupby('date', as_index=False, group_keys=False)['y']
    .apply(_cross_sectional_zscore)
)

# Cap infinite values anywhere in the table at +/-9999.
# NOTE(review): this runs after the normalization above, so an infinite y would already
# have affected that date's mean/std before being capped -- confirm this order is intended.
df_factor_all = df_factor_all.replace([np.inf], 9999)
df_factor_all = df_factor_all.replace([-np.inf], -9999)

In [5]:
def get_rebalance_date(date_list, start_date, end_date, freq='Q'):
    """Build the list of rebalance dates between ``start_date`` and ``end_date``.

    Nominal dates are ``start_date`` plus 1, 2, 3, ... periods of ``freq``. They are
    always computed from ``start_date``, so snapping one date does not shift the
    following ones. A nominal date that is contained in ``date_list`` is used as is;
    otherwise it is replaced by the first element of ``date_list`` (in iteration
    order) that is later than it. If no later element exists, that period adds
    nothing to the result.

    Parameters
    ----------
    date_list : iterable of datetime-like
        Dates that are available in the data. It is scanned front to back, so it
        should be sorted ascending for "first later element" to mean "next
        available date".
    start_date : pd.Timestamp
        First rebalance date. It is always the first element of the result and is
        not checked against ``date_list``.
    end_date : pd.Timestamp
        Generation stops once the most recent rebalance date is no longer earlier
        than ``end_date``; the last returned date can therefore be on or after it.
    freq : {'Q', 'M', 'Y'}, default 'Q'
        Step between nominal dates: three months, one month, or one year.

    Returns
    -------
    list of pd.Timestamp
        Rebalance dates in generation order, starting with ``start_date``.

    Raises
    ------
    ValueError
        If ``freq`` is not one of 'Q', 'M', 'Y'. Without this check the loop below
        would never advance and therefore never terminate.
    """
    offset_for_step = {
        'Q': lambda step: pd.DateOffset(months=3 * step),
        'M': lambda step: pd.DateOffset(months=1 * step),
        'Y': lambda step: pd.DateOffset(years=1 * step),
    }
    if freq not in offset_for_step:
        raise ValueError(
            f"Unsupported freq {freq!r}; expected one of {sorted(offset_for_step)}"
        )
    make_offset = offset_for_step[freq]

    rebalance_dates = [start_date]
    rebalance_date = start_date
    i = 0
    while rebalance_date < end_date:
        # Always offset from start_date so earlier snapping never accumulates drift.
        rebalance_date = start_date + make_offset(i + 1)

        if rebalance_date in date_list:
            rebalance_dates.append(rebalance_date)
        else:
            # Nominal date is not available: fall forward to the first later date.
            for date in date_list:
                if date > rebalance_date:
                    # Change np.datetime64 to pd.Timestamp so every element has one type.
                    rebalance_date = pd.Timestamp(date)
                    rebalance_dates.append(rebalance_date)
                    break
        i += 1
    return rebalance_dates

In [6]:
# Distinct dates present in the factor table, in ascending order.
date_list = np.sort(df_factor_all['date'].unique())

In [7]:
# Quarterly rebalance schedule anchored on 2004-03-15 and generated up to 2023-03-15.
schedule_start = pd.to_datetime('2004-03-15')
schedule_end = pd.to_datetime('2023-3-15')
rebalance_date_lst = get_rebalance_date(date_list, schedule_start, schedule_end, freq='Q')

In [8]:
# Feature columns = every column of the factor table except the identifiers, the raw
# price, the two targets, and three further columns excluded here
# (ppe_qoq, ppe_yoy, excess_return).
non_feature_columns = (
    'date',
    'ticker',
    'price',
    'y',
    'norm_y',
    'ppe_qoq',
    'ppe_yoy',
    'excess_return',
)

factor_columns = df_factor_all.columns.tolist()
for column_name in non_feature_columns:
    # list.remove raises ValueError when an expected column is missing from the table.
    factor_columns.remove(column_name)
print(factor_columns)

['asset_qoq', 'asset_yoy', 'ni_qoq', 'ni_yoy', 'roe', 'roe_yoy', 'roe_4q_sum', 'roe_4q_sum_yoy', 'tobins_q', 'ocf / asset', '20_d_return', '40_d_return', '60_d_return', 'dividend_1Y_sum_yield', 'dividend_2Y_sum_yield', 'dividend_3Y_sum_yield', 'last_dividend_yield']


In [9]:
# Regression target, then keep only the rows in which every factor column and the
# target are non-missing.
target_cols = ['norm_y']
required_columns = [*factor_columns, *target_cols]
df_factor_all = df_factor_all.dropna(subset=required_columns, how='any')

In [10]:
# Number of rebalance dates in the schedule (the start date itself is included).
len(rebalance_date_lst)

77

In [11]:
# Index of the first rebalance date a model is trained for; the training loop uses
# the rebalance dates before the current index (rebalance_date_lst[0:i]) as its window.
start_index = 4

# Show the rebalance date of the first model.
print(rebalance_date_lst[start_index])

2005-03-15 00:00:00


In [12]:
# Name the run folder after the current date and time so each run gets its own folder.
folder_name = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

model_save_path = f'D:/Projects/0_HIDIV/models/{folder_name}'

# Create the folder that receives one saved model per rebalance date.
os.makedirs(model_save_path, exist_ok=True)

# Hyper-parameter grid for the grid search. It does not depend on the loop variable,
# so it is defined once here; it also stays defined when the loop runs zero times.
param_search = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    n_estimators=[5, 10, 50, 100, 200],
)

total = len(rebalance_date_lst) - start_index  # Total number of iterations

print_progress_bar(0, total, prefix = 'Progress:', suffix = 'Complete', length = 50)

for i in range(start_index, len(rebalance_date_lst)):
    # Expanding window: all rebalance dates before the current one.
    train_date = rebalance_date_lst[0:i]
    rebalance_date = rebalance_date_lst[i]

    # The target is built from a price 62 rows away in time, so a training date is only
    # usable if it lies at least 62 days before the rebalance date.
    # NOTE(review): this cutoff is 62 calendar days, while the target offset is 62 rows
    # of the price table -- confirm that the two are meant to match.
    data_valid_date = rebalance_date - pd.DateOffset(days=62)
    valid_train_date = [date for date in train_date if date <= data_valid_date]

    # Train on the filtered dates. Previously the unfiltered train_date was used here,
    # which left valid_train_date unused and the cutoff above without any effect.
    df_factor_all_train = df_factor_all[df_factor_all['date'].isin(valid_train_date)]
    if df_factor_all_train.empty:
        raise ValueError(
            f"No training rows available for rebalance date {rebalance_date}"
        )

    # Train the stock model: grid search over param_search, refit on the best setting.
    model_stock = XGBRegressor()
    gscv = GridSearchCV(
        model_stock, param_search, refit=True, scoring="neg_root_mean_squared_error"
    )

    gscv.fit(df_factor_all_train[factor_columns], df_factor_all_train[target_cols])

    # File name is the rebalance date as YYYYMMDD; a separate name keeps rebalance_date
    # a Timestamp instead of rebinding it to a string.
    model_file_tag = rebalance_date.strftime("%Y%m%d")
    gscv.best_estimator_.save_model(f"{model_save_path}/{model_file_tag}.json")

    # Update Progress Bar
    print_progress_bar(i-start_index+1, total, prefix = 'Progress:', suffix = 'Complete', length = 50)

print_progress_bar(total, total, prefix = 'Progress:', suffix = 'Complete', length = 50)


Progress: |--------------------------------------------------| 0.0% Complete

Progress: |██████████████████████████████████████████████████| 100.0% Complete
Progress: |██████████████████████████████████████████████████| 100.0% Complete


In [13]:

# Create data/model/{folder_name} when it does not exist yet.
run_output_dir = f'data/model/{folder_name}'
if not os.path.exists(run_output_dir):
    os.makedirs(run_output_dir)

In [43]:
# Render every rebalance date as a YYYYMMDD string so the list can be written to JSON.
rebalance_date_lst_str = list(
    map(lambda rebalance_timestamp: rebalance_timestamp.strftime("%Y%m%d"), rebalance_date_lst)
)

In [44]:
# Persist the run configuration next to the data snapshot so the run can be traced later.
with open(f'data/model/{folder_name}/setting.json', 'w') as settings_file:
    run_settings = {
        'start_index': start_index,
        'rebalance_date_lst': rebalance_date_lst_str,
        'factor_columns': factor_columns,
        'target_cols': target_cols,
        'model_save_path': model_save_path,
        # hyper-parameter grid used by the grid search
        'param_search': param_search,
    }
    json.dump(run_settings, settings_file)

In [45]:
# Snapshot of the table used in this run, stored in the run's data folder.
factor_snapshot_path = f'data/model/{folder_name}/df_factor_all.feather'

# Replace the index left over from filtering/sorting with a fresh 0..n-1 index before
# writing (presumably because to_feather expects a default index -- see pandas docs).
df_factor_all = df_factor_all.reset_index(drop=True)
df_factor_all.to_feather(factor_snapshot_path)