In [1]:
import sys, os
import numpy as np
import pandas as pd
import qlib

In [2]:
from pathlib import Path
scripts_dir = Path("/data/students/huzb/qlib/scripts")
print(scripts_dir.joinpath("get_data.py"))
assert scripts_dir.joinpath("get_data.py").exists()

/data/students/huzb/qlib/scripts/get_data.py


In [3]:
if not scripts_dir.joinpath("get_data.py").exists():
    # download get_data.py script
    scripts_dir = Path("~/tmp/qlib_code/scripts").expanduser().resolve()
    scripts_dir.mkdir(parents=True, exist_ok=True)
    import requests
    with requests.get("https://raw.githubusercontent.com/microsoft/qlib/main/scripts/get_data.py") as resp:
        with open(scripts_dir.joinpath("get_data.py"), "wb") as fp:
            fp.write(resp.content)

In [4]:
from qlib.constant import REG_CN
from qlib.utils import exists_qlib_data, init_instance_by_config
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.utils import flatten_dict

In [5]:
provider_uri = "/data/students/huzb/qlib/qlib_data/cn_data"  # target_dir
# if not exists_qlib_data(provider_uri):
#     print(f"Qlib data is not found in {provider_uri}")
#     sys.path.append(str(scripts_dir))
#     from get_data import GetData
#     GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
qlib.init(provider_uri=provider_uri, region=REG_CN)

[18248:MainThread](2022-08-29 15:12:50,755) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[18248:MainThread](2022-08-29 15:12:50,762) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[18248:MainThread](2022-08-29 15:12:50,763) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/students/huzb/qlib/qlib_data/cn_data')}


In [6]:
market = "csi300"
benchmark = "SH000300"

In [7]:
from qlib.data import D
from qlib.data.filter import ExpressionDFilter
from qlib.data.filter import NameDFilter

In [8]:
instruments = D.instruments(market='csi300')
fields = ['$close', '(Ref($close, -1)-$close)/$close', '(Ref($close, -2)-Ref($close, -1))/Ref($close, -1)', '(Ref($close, -3)-Ref($close, -2))/Ref($close, -2)', '(Ref($close, -4)-Ref($close, -3))/Ref($close, -3)', '(Ref($close, -5)-Ref($close, -4))/Ref($close, -4)', '(Ref($close, -6)-Ref($close, -5))/Ref($close, -5)', '(Ref($close, -7)-Ref($close, -6))/Ref($close, -6)']
f_d = D.features(instruments, fields, start_time='2008-01-01', end_time='2020-08-01', freq='day')
df = f_d
df.index = df.index.get_level_values('datetime')
print(df.index.min(), df.index.max())

start_time = pd.to_datetime(df.index.min())
end_time = pd.to_datetime(df.index.max())
print(start_time.strftime('%Y-%m-%d'), end_time.strftime('%Y-%m-%d'))

2008-01-02 00:00:00 2020-07-31 00:00:00
2008-01-02 2020-07-31


In [9]:
experiment_name="online_srv"

In [10]:
###################################
# train model
###################################
data_handler_config = {
    "start_time": start_time, # 
    "end_time": end_time,
    "fit_start_time": start_time,
    "fit_end_time": "2014-12-31",
    "instruments": market,
    "infer_processors": [
      {
        "class": "RobustZScoreNorm",
        "kwargs": {
          "fields_group": "feature",
          "clip_outlier": True
        }
      },
      {
        "class": "Fillna",
        "kwargs": {
          "fields_group": "feature"
        }
      }
    ],
    "learn_processors": [
      {
        "class": "DropnaLabel"
      },
      {
        "class": "CSRankNorm",
        "kwargs": {
          "fields_group": "label"
        }
      }
    ],
    "label": [
      "Ref($close, -2) / Ref($close, -1) - 1"
    ]
}

task = {   
    "model": {
        "class": "LSTM",
        "module_path": "qlib.contrib.model.pytorch_lstm",
        "kwargs": {
            "d_feat": 6,
            "hidden_size": 64,
            "num_layers": 2,
            "dropout": 0.0,
            "dec_dropout": 0.0,
            "n_epochs": 200,
            "lr": 1e-3,
            "early_stop": 20,
            "batch_size": 800,
            "metric": "loss",
            "loss": "mse",
            "optimizer": "adam",
            "GPU": 0
        },
    },
    "dataset": {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": {
                "class": "Alpha360",
                "module_path": "qlib.contrib.data.handler",
                "kwargs": data_handler_config,
            },
            "segments": {
                "train": (start_time, "2014-12-31"),
                "valid": ("2015-01-01", "2016-12-31"),
                "test": ("2017-01-01", "2020-08-01"),
            },
        },
    },
}

# model initiaiton
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])

[18248:MainThread](2022-08-29 15:12:58,182) INFO - qlib.LSTM - [pytorch_lstm.py:58] - LSTM pytorch version...
[18248:MainThread](2022-08-29 15:12:58,213) INFO - qlib.LSTM - [pytorch_lstm.py:75] - LSTM parameters setting:
d_feat : 6
hidden_size : 64
num_layers : 2
dropout : 0.0
n_epochs : 200
lr : 0.001
metric : loss
batch_size : 800
early_stop : 20
optimizer : adam
loss_type : mse
visible_GPU : 0
use_GPU : True
seed : None
[18248:MainThread](2022-08-29 15:13:34,958) INFO - qlib.timer - [log.py:117] - Time cost: 33.701s | Loading data Done
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
[18248:MainThread](2022-08-29 15:16:28,517) INFO - qlib.timer - [log.py:117] - Time cost: 169.442s | RobustZScoreNorm Done
[18248:MainThread](2022-08-29 15:16:30,348) INFO - qlib.timer - [log.py:117] - Time cost: 1.827s | Fillna Done
[18248:MainThread](2022-08-29 15:16:32,736) INFO - qlib.timer - [log.py:117] - Time cost: 1.396s | DropnaLabel Done
A value is trying to be set on a c

In [11]:
# start exp to train model

experiment_id = 'cn_backtest'
# experiment_name: Optional[Text] = None,
# recorder_id: Optional[Text] = None,

# with R.start(experiment_name=experiment_name, experimen
# t_id=experiment_id):
with R.start(experiment_name=experiment_name):
    R.log_params(**flatten_dict(task))
    model.fit(dataset)
    R.save_objects(trained_model=model)
    rid = R.get_recorder().id
    # prediction
    recorder = R.get_recorder()
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

[18248:MainThread](2022-08-29 15:16:34,746) INFO - qlib.workflow - [expm.py:315] - <mlflow.tracking.client.MlflowClient object at 0x7f4af01efd60>
[18248:MainThread](2022-08-29 15:16:34,813) INFO - qlib.workflow - [exp.py:257] - Experiment 1 starts running ...
[18248:MainThread](2022-08-29 15:16:35,537) INFO - qlib.workflow - [recorder.py:295] - Recorder 33917683e4b04dd1b6510d63a2b12e2d starts running under Experiment 1 ...
[18248:MainThread](2022-08-29 15:16:39,393) INFO - qlib.LSTM - [pytorch_lstm.py:236] - training...
[18248:MainThread](2022-08-29 15:16:39,396) INFO - qlib.LSTM - [pytorch_lstm.py:240] - Epoch0:
[18248:MainThread](2022-08-29 15:16:39,398) INFO - qlib.LSTM - [pytorch_lstm.py:241] - training...
[18248:MainThread](2022-08-29 15:16:48,593) INFO - qlib.LSTM - [pytorch_lstm.py:243] - evaluating...
[18248:MainThread](2022-08-29 15:16:53,704) INFO - qlib.LSTM - [pytorch_lstm.py:246] - train -0.992868, valid -0.994185
[18248:MainThread](2022-08-29 15:16:53,710) INFO - qlib.LST

'The following are prediction results of the LSTM model.'
                          score
datetime   instrument          
2017-01-03 SH600000    0.055336
           SH600008    0.089107
           SH600009    0.160764
           SH600010    0.006614
           SH600015    0.006592


[18248:MainThread](2022-08-29 15:23:05,204) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | waiting `async_log` Done


## WeekTopkDropoutStrategy

In [15]:
###################################
# prediction, backtest & analysis
###################################
port_analysis_config = {
    "executor": {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
        },
    },
    "strategy": {
        "class": "WeekTopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "model": model,
            "dataset": dataset,
            "topk": 50,
            "n_drop": 5,
        },
    },
    "backtest": {
        "start_time": "2017-01-01",
        "end_time": "2020-09-01",
        "account": 100000000,
        "benchmark": benchmark,
        "exchange_kwargs": {
            "freq": "day",
            "limit_threshold": 0.095,
            "deal_price": "close",
            "open_cost": 0.0005,
            "close_cost": 0.0015,
            "min_cost": 5,
        },
    },
}

# backtest and analysis
with R.start(experiment_name="backtest_analysis"):
    recorder = R.get_recorder(recorder_id=rid, experiment_name="online_srv")
    model = recorder.load_object("trained_model")

    # prediction
    recorder = R.get_recorder()
    ba_rid = recorder.id
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # backtest & analysis
    par = PortAnaRecord(recorder, port_analysis_config, "day")
    par.generate()

[18248:MainThread](2022-08-29 15:31:29,933) INFO - qlib.workflow - [expm.py:315] - <mlflow.tracking.client.MlflowClient object at 0x7f490019fa60>
[18248:MainThread](2022-08-29 15:31:29,940) INFO - qlib.workflow - [exp.py:257] - Experiment 2 starts running ...
[18248:MainThread](2022-08-29 15:31:29,969) INFO - qlib.workflow - [recorder.py:295] - Recorder b38bbad3319b4d9e9faa4b7908e40f4d starts running under Experiment 2 ...
  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
[18248:MainThread](2022-08-29 15:31:32,995) INFO - qlib.workflow - [record_temp.py:194] - Signal record 'pred.pkl' has been saved as the artifact of the Experiment 2
[18248:MainThread](2022-08-29 15:31:33,107) INFO - qlib.backtest caller - [__init__.py:94] - Create new exchange


'The following are prediction results of the LSTM model.'
                          score
datetime   instrument          
2017-01-03 SH600000    0.055336
           SH600008    0.089107
           SH600009    0.160764
           SH600010    0.006614
           SH600015    0.006592


  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


backtest loop:   0%|          | 0/893 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wha.loc[:, 'A_Week_HighAndLow'] = wha.apply(lambda x: x.sum(), axis=1).copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wha.loc[:, 'A_Week_HighAndLow'] = wha.apply(lambda x: x.sum(), axis=1).copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wha.loc[:, 'A_Week_HighAndLow'] = wha.apply(lambd

'The following are analysis results of benchmark return(1day).'
                       risk
mean               0.000501
std                0.012264
annualized_return  0.119337
information_ratio  0.630762
max_drawdown      -0.370479
'The following are analysis results of the excess return without cost(1day).'
                        risk
mean                0.005718
std                 0.006427
annualized_return   1.360770
information_ratio  13.724126
max_drawdown       -0.018622
'The following are analysis results of the excess return with cost(1day).'
                        risk
mean                0.005533
std                 0.006436
annualized_return   1.316792
information_ratio  13.261860
max_drawdown       -0.018744
'The following are analysis results of indicators(1day).'
     value
ffr    1.0
pa     0.0
pos    0.0


In [16]:
from qlib.contrib.report import analysis_model, analysis_position
from qlib.data import D
recorder = R.get_recorder(recorder_id=ba_rid, experiment_name="online_srv")
print(recorder)
pred_df = recorder.load_object("pred.pkl")
pred_df_dates = pred_df.index.get_level_values(level='datetime')
report_normal_df = recorder.load_object("portfolio_analysis/report_normal_1day.pkl")
positions = recorder.load_object("portfolio_analysis/positions_normal_1day.pkl")
analysis_df = recorder.load_object("portfolio_analysis/port_analysis_1day.pkl")

{'class': 'Recorder', 'id': 'b38bbad3319b4d9e9faa4b7908e40f4d', 'name': 'mlflow_recorder', 'experiment_id': '1', 'start_time': '2022-08-29 15:31:29', 'end_time': '2022-08-29 17:01:05', 'status': 'FINISHED'}


In [17]:
report_normal_df

Unnamed: 0_level_0,account,return,total_turnover,turnover,total_cost,cost,value,cash,bench
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-03,9.995250e+07,0.000000,9.500000e+07,0.950000,4.750000e+04,0.000475,9.500000e+07,4.952500e+06,0.009713
2017-01-04,1.004872e+08,0.005562,1.183374e+08,0.233485,6.873084e+04,0.000212,9.976903e+07,7.181268e+05,0.007803
2017-01-05,1.007598e+08,0.002913,1.373533e+08,0.189237,8.873994e+04,0.000199,9.807525e+07,2.684594e+06,-0.000154
2017-01-06,1.004981e+08,-0.002396,1.586933e+08,0.211791,1.090527e+05,0.000202,9.988835e+07,6.097961e+05,-0.005974
2017-01-09,1.011633e+08,0.006832,1.801266e+08,0.213270,1.304715e+05,0.000213,1.006039e+08,5.594298e+05,0.004848
...,...,...,...,...,...,...,...,...,...
2020-08-26,1.831463e+10,-0.003194,4.006014e+11,0.168769,4.007921e+08,0.000169,1.823395e+10,8.068819e+07,-0.011722
2020-08-27,1.861602e+10,0.016640,4.039614e+11,0.183460,4.041571e+08,0.000184,1.852870e+10,8.732058e+07,0.005359
2020-08-28,1.923813e+10,0.033626,4.078270e+11,0.207649,4.080311e+08,0.000208,1.913781e+10,1.003203e+08,0.023866
2020-08-31,1.938411e+10,0.007752,4.110003e+11,0.164949,4.111974e+08,0.000165,1.930103e+10,8.308161e+07,-0.005790


In [18]:
###################################
# prediction, backtest & analysis
###################################
port_analysis_config = {
    "executor": {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
        },
    },
    "strategy": {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "model": model,
            "dataset": dataset,
            "topk": 50,
            "n_drop": 5,
        },
    },
    "backtest": {
        "start_time": "2017-01-01",
        "end_time": "2020-09-01",
        "account": 100000000,
        "benchmark": benchmark,
        "exchange_kwargs": {
            "freq": "day",
            "limit_threshold": 0.095,
            "deal_price": "close",
            "open_cost": 0.0005,
            "close_cost": 0.0015,
            "min_cost": 5,
        },
    },
}

# backtest and analysis
with R.start(experiment_name="backtest_analysis"):
    recorder = R.get_recorder(recorder_id=rid, experiment_name="online_srv")
    model = recorder.load_object("trained_model")

    # prediction
    recorder = R.get_recorder()
    ba_rid = recorder.id
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # backtest & analysis
    par = PortAnaRecord(recorder, port_analysis_config, "day")
    par.generate()

[18248:MainThread](2022-08-29 17:02:34,764) INFO - qlib.workflow - [expm.py:315] - <mlflow.tracking.client.MlflowClient object at 0x7f4ade909850>
[18248:MainThread](2022-08-29 17:02:34,773) INFO - qlib.workflow - [exp.py:257] - Experiment 2 starts running ...
[18248:MainThread](2022-08-29 17:02:34,796) INFO - qlib.workflow - [recorder.py:295] - Recorder 8e22a47e4dde45c49529f1c901b5ec3b starts running under Experiment 2 ...

RNN module weights are not part of single contiguous chunk of memory. This means they need to be compacted at every call, possibly greatly increasing memory usage. To compact weights again call flatten_parameters(). (Triggered internally at  /opt/conda/conda-bld/pytorch_1646755853042/work/aten/src/ATen/native/cudnn/RNN.cpp:926.)

[18248:MainThread](2022-08-29 17:02:37,864) INFO - qlib.workflow - [record_temp.py:194] - Signal record 'pred.pkl' has been saved as the artifact of the Experiment 2
[18248:MainThread](2022-08-29 17:02:37,989) INFO - qlib.backtest caller - 

'The following are prediction results of the LSTM model.'
                          score
datetime   instrument          
2017-01-03 SH600000    0.055336
           SH600008    0.089107
           SH600009    0.160764
           SH600010    0.006614
           SH600015    0.006592



`model` `dataset` is deprecated; use `signal`.


RNN module weights are not part of single contiguous chunk of memory. This means they need to be compacted at every call, possibly greatly increasing memory usage. To compact weights again call flatten_parameters(). (Triggered internally at  /opt/conda/conda-bld/pytorch_1646755853042/work/aten/src/ATen/native/cudnn/RNN.cpp:926.)



backtest loop:   0%|          | 0/893 [00:00<?, ?it/s]


Mean of empty slice


Mean of empty slice


Mean of empty slice


Mean of empty slice

[18248:MainThread](2022-08-29 17:03:18,653) INFO - qlib.workflow - [record_temp.py:499] - Portfolio analysis record 'port_analysis_1day.pkl' has been saved as the artifact of the Experiment 2
[18248:MainThread](2022-08-29 17:03:18,664) INFO - qlib.workflow - [record_temp.py:524] - Indicator analysis record 'indicator_analysis_1day.pkl' has been saved as the artifact of the Experiment 2
[18248:MainThread](2022-08-29 17:03:18,728) INFO - qlib.timer - [log.py:117] - Time cost: 0.007s | waiting `async_log` Done


'The following are analysis results of benchmark return(1day).'
                       risk
mean               0.000501
std                0.012264
annualized_return  0.119337
information_ratio  0.630762
max_drawdown      -0.370479
'The following are analysis results of the excess return without cost(1day).'
                       risk
mean               0.000325
std                0.004321
annualized_return  0.077240
information_ratio  1.158636
max_drawdown      -0.048673
'The following are analysis results of the excess return with cost(1day).'
                       risk
mean               0.000154
std                0.004320
annualized_return  0.036699
information_ratio  0.550636
max_drawdown      -0.063305
'The following are analysis results of indicators(1day).'
     value
ffr    1.0
pa     0.0
pos    0.0


In [19]:
from qlib.contrib.report import analysis_model, analysis_position
from qlib.data import D
recorder = R.get_recorder(recorder_id=ba_rid, experiment_name="online_srv")
print(recorder)
pred_df = recorder.load_object("pred.pkl")
pred_df_dates = pred_df.index.get_level_values(level='datetime')
report_normal_df = recorder.load_object("portfolio_analysis/report_normal_1day.pkl")
positions = recorder.load_object("portfolio_analysis/positions_normal_1day.pkl")
analysis_df = recorder.load_object("portfolio_analysis/port_analysis_1day.pkl")

{'class': 'Recorder', 'id': '8e22a47e4dde45c49529f1c901b5ec3b', 'name': 'mlflow_recorder', 'experiment_id': '1', 'start_time': '2022-08-29 17:02:34', 'end_time': '2022-08-29 17:03:18', 'status': 'FINISHED'}


In [20]:
report_normal_df

Unnamed: 0_level_0,account,return,total_turnover,turnover,total_cost,cost,value,cash,bench
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-03,1.000000e+08,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,1.000000e+08,0.009713
2017-01-04,9.995250e+07,0.000000,9.500000e+07,0.950000,4.750000e+04,0.000475,9.500000e+07,4.952500e+06,0.007803
2017-01-05,1.000751e+08,0.001439,1.183669e+08,0.233780,6.876071e+04,0.000213,9.935625e+07,7.188750e+05,-0.000154
2017-01-06,9.991693e+07,-0.001388,1.377707e+08,0.193893,8.807013e+04,0.000193,9.940623e+07,5.106998e+05,-0.005974
2017-01-09,1.005785e+08,0.006833,1.589241e+08,0.211709,1.092536e+05,0.000212,1.000287e+08,5.498069e+05,0.004848
...,...,...,...,...,...,...,...,...,...
2020-08-26,1.664047e+08,-0.013812,1.959187e+10,0.000000,1.955393e+07,0.000000,1.618351e+08,4.569598e+06,-0.011722
2020-08-27,1.667445e+08,0.002042,1.959187e+10,0.000000,1.955393e+07,0.000000,1.621750e+08,4.569598e+06,0.005359
2020-08-28,1.694931e+08,0.016484,1.959187e+10,0.000000,1.955393e+07,0.000000,1.649235e+08,4.569598e+06,0.023866
2020-08-31,1.683187e+08,-0.006929,1.959187e+10,0.000000,1.955393e+07,0.000000,1.637491e+08,4.569598e+06,-0.005790
