In [2]:
import sys, os
import numpy as np
import pandas as pd
import qlib

In [3]:
from pathlib import Path
scripts_dir = Path("/data/students/huzb/qlib/scripts")
print(scripts_dir.joinpath("get_data.py"))
assert scripts_dir.joinpath("get_data.py").exists()

/data/students/huzb/qlib/scripts/get_data.py


In [4]:
if not scripts_dir.joinpath("get_data.py").exists():
    # download get_data.py script
    scripts_dir = Path("~/tmp/qlib_code/scripts").expanduser().resolve()
    scripts_dir.mkdir(parents=True, exist_ok=True)
    import requests
    with requests.get("https://raw.githubusercontent.com/microsoft/qlib/main/scripts/get_data.py") as resp:
        with open(scripts_dir.joinpath("get_data.py"), "wb") as fp:
            fp.write(resp.content)

In [5]:
from qlib.constant import REG_CN
from qlib.utils import exists_qlib_data, init_instance_by_config
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.utils import flatten_dict

#### 出现ModuleNotFoundError: No module named ‘qlib.data._libs.rolling’
在项目根目录运行python setup.py build_ext --inplace解决

In [6]:
provider_uri = "/data/students/huzb/qlib/qlib_data/cn_data"  # target_dir
# if not exists_qlib_data(provider_uri):
#     print(f"Qlib data is not found in {provider_uri}")
#     sys.path.append(str(scripts_dir))
#     from get_data import GetData
#     GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
qlib.init(provider_uri=provider_uri, region=REG_CN)

[15417:MainThread](2022-08-04 16:06:50,702) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[15417:MainThread](2022-08-04 16:06:50,709) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[15417:MainThread](2022-08-04 16:06:50,711) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/students/huzb/qlib/qlib_data/cn_data')}


In [7]:
market = "csi300"
benchmark = "SH000300"

In [8]:
from qlib.data import D
from qlib.data.filter import ExpressionDFilter
from qlib.data.filter import NameDFilter

In [9]:
instruments = D.instruments(market='csi300')
fields = ['$close', '(Ref($close, -1)-$close)/$close', '(Ref($close, -2)-Ref($close, -1))/Ref($close, -1)', '(Ref($close, -3)-Ref($close, -2))/Ref($close, -2)', '(Ref($close, -4)-Ref($close, -3))/Ref($close, -3)', '(Ref($close, -5)-Ref($close, -4))/Ref($close, -4)', '(Ref($close, -6)-Ref($close, -5))/Ref($close, -5)', '(Ref($close, -7)-Ref($close, -6))/Ref($close, -6)']
f_d = D.features(instruments, fields, start_time='2008-01-01', end_time='2020-08-01', freq='day')
df = f_d
df.index = df.index.get_level_values('datetime')
print(df.index.min(), df.index.max())

start_time = pd.to_datetime(df.index.min())
end_time = pd.to_datetime(df.index.max())
print(start_time.strftime('%Y-%m-%d'), end_time.strftime('%Y-%m-%d'))

2008-01-02 00:00:00 2020-07-31 00:00:00
2008-01-02 2020-07-31


In [10]:
experiment_name="online_srv"

In [11]:
###################################
# train model
###################################
data_handler_config = {
    "start_time": start_time, # 
    "end_time": end_time,
    "fit_start_time": start_time,
    "fit_end_time": "2014-12-31",
    "instruments": market,
    "infer_processors": [
      {
        "class": "RobustZScoreNorm",
        "kwargs": {
          "fields_group": "feature",
          "clip_outlier": True
        }
      },
      {
        "class": "Fillna",
        "kwargs": {
          "fields_group": "feature"
        }
      }
    ],
    "learn_processors": [
      {
        "class": "DropnaLabel"
      },
      {
        "class": "CSRankNorm",
        "kwargs": {
          "fields_group": "label"
        }
      }
    ],
    "label": [
      "Ref($close, -2) / Ref($close, -1) - 1"
    ]
}

task = {   
    "model": {
        "class": "LSTM",
        "module_path": "qlib.contrib.model.pytorch_lstm",
        "kwargs": {
            "d_feat": 6,
            "hidden_size": 64,
            "num_layers": 2,
            "dropout": 0.0,
            "dec_dropout": 0.0,
            "n_epochs": 200,
            "lr": 1e-3,
            "early_stop": 20,
            "batch_size": 800,
            "metric": "loss",
            "loss": "mse",
            "optimizer": "adam",
            "GPU": 0
        },
    },
    "dataset": {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": {
                "class": "Alpha360",
                "module_path": "qlib.contrib.data.handler",
                "kwargs": data_handler_config,
            },
            "segments": {
                "train": (start_time, "2014-12-31"),
                "valid": ("2015-01-01", "2016-12-31"),
                "test": ("2017-01-01", "2020-08-01"),
            },
        },
    },
}

# model initiaiton
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])

[15417:MainThread](2022-08-04 16:06:52,930) INFO - qlib.LSTM - [pytorch_lstm.py:58] - LSTM pytorch version...
[15417:MainThread](2022-08-04 16:06:52,961) INFO - qlib.LSTM - [pytorch_lstm.py:75] - LSTM parameters setting:
d_feat : 6
hidden_size : 64
num_layers : 2
dropout : 0.0
n_epochs : 200
lr : 0.001
metric : loss
batch_size : 800
early_stop : 20
optimizer : adam
loss_type : mse
visible_GPU : 0
use_GPU : True
seed : None
[15417:MainThread](2022-08-04 16:07:22,644) INFO - qlib.timer - [log.py:117] - Time cost: 27.276s | Loading data Done
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
[15417:MainThread](2022-08-04 16:10:20,395) INFO - qlib.timer - [log.py:117] - Time cost: 174.974s | RobustZScoreNorm Done
[15417:MainThread](2022-08-04 16:10:21,404) INFO - qlib.timer - [log.py:117] - Time cost: 1.006s | Fillna Done
[15417:MainThread](2022-08-04 16:10:22,759) INFO - qlib.timer - [log.py:117] - Time cost: 0.710s | DropnaLabel Done
A value is trying to be set on a c

In [12]:
# start exp to train model

experiment_id = 'cn_backtest'
# experiment_name: Optional[Text] = None,
# recorder_id: Optional[Text] = None,

# with R.start(experiment_name=experiment_name, experimen
# t_id=experiment_id):
with R.start(experiment_name=experiment_name):
    R.log_params(**flatten_dict(task))
    model.fit(dataset)
    R.save_objects(trained_model=model)
    rid = R.get_recorder().id
    # prediction
    recorder = R.get_recorder()
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

[15417:MainThread](2022-08-04 16:10:23,958) INFO - qlib.workflow - [expm.py:315] - <mlflow.tracking.client.MlflowClient object at 0x7fbf01dd9a90>
[15417:MainThread](2022-08-04 16:10:23,982) INFO - qlib.workflow - [exp.py:257] - Experiment 1 starts running ...
[15417:MainThread](2022-08-04 16:10:24,275) INFO - qlib.workflow - [recorder.py:295] - Recorder f0b96802b1c64d838a1a30dd75d12e5a starts running under Experiment 1 ...
[15417:MainThread](2022-08-04 16:10:27,224) INFO - qlib.LSTM - [pytorch_lstm.py:236] - training...
[15417:MainThread](2022-08-04 16:10:27,226) INFO - qlib.LSTM - [pytorch_lstm.py:240] - Epoch0:
[15417:MainThread](2022-08-04 16:10:27,228) INFO - qlib.LSTM - [pytorch_lstm.py:241] - training...
[15417:MainThread](2022-08-04 16:10:33,822) INFO - qlib.LSTM - [pytorch_lstm.py:243] - evaluating...
[15417:MainThread](2022-08-04 16:10:36,277) INFO - qlib.LSTM - [pytorch_lstm.py:246] - train -0.993119, valid -0.994897
[15417:MainThread](2022-08-04 16:10:36,281) INFO - qlib.LST

'The following are prediction results of the LSTM model.'
                          score
datetime   instrument          
2017-01-03 SH600000    0.005775
           SH600008    0.065544
           SH600009    0.071585
           SH600010    0.007785
           SH600015   -0.002417


[15417:MainThread](2022-08-04 16:14:36,682) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | waiting `async_log` Done


In [13]:
p = model.predict(dataset)

In [14]:
p[4030:4050]

datetime    instrument
2017-01-20  SH601258     -0.019314
            SH601288      0.072039
            SH601318      0.067020
            SH601328      0.031324
            SH601333      0.026123
            SH601336      0.008868
            SH601377      0.004349
            SH601390      0.016718
            SH601398      0.067412
            SH601555      0.006407
            SH601600     -0.129601
            SH601601      0.060140
            SH601607     -0.121302
            SH601608     -0.074401
            SH601611     -0.118289
            SH601618     -0.004763
            SH601628      0.025532
            SH601633      0.054720
            SH601668     -0.038952
            SH601669     -0.036105
dtype: float32

In [15]:
sr.list()

['pred.pkl', 'label.pkl']

In [16]:
sr.load('pred.pkl')

Unnamed: 0_level_0,Unnamed: 1_level_0,score
datetime,instrument,Unnamed: 2_level_1
2017-01-03,SH600000,0.005775
2017-01-03,SH600008,0.065544
2017-01-03,SH600009,0.071585
2017-01-03,SH600010,0.007785
2017-01-03,SH600015,-0.002417
...,...,...
2020-07-31,SZ300413,-0.033918
2020-07-31,SZ300433,-0.192951
2020-07-31,SZ300498,-0.013752
2020-07-31,SZ300601,-0.326905


In [17]:
sr.load('label.pkl')

Unnamed: 0_level_0,Unnamed: 1_level_0,"Ref($close, -2) / Ref($close, -1) - 1"
datetime,instrument,Unnamed: 2_level_1
2017-01-03,SH600000,-0.001831
2017-01-03,SH600008,-0.002398
2017-01-03,SH600009,0.001493
2017-01-03,SH600010,0.003520
2017-01-03,SH600015,-0.007142
...,...,...
2020-07-31,SZ300413,-0.037566
2020-07-31,SZ300433,-0.031677
2020-07-31,SZ300498,-0.006531
2020-07-31,SZ300601,0.090264


In [18]:
pr = PortAnaRecord(recorder)

In [19]:
pr.list()

['report_normal_1day.pkl',
 'positions_normal_1day.pkl',
 'port_analysis_1day.pkl',
 'indicator_analysis_1day.pkl']

## TopkDropoutStrategy

In [20]:
###################################
# prediction, backtest & analysis
###################################
port_analysis_config = {
    "executor": {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
        },
    },
    "strategy": {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "model": model,
            "dataset": dataset,
            "topk": 50,
            "n_drop": 5,
        },
    },
    "backtest": {
        "start_time": "2017-01-01",
        "end_time": "2020-08-01",
        "account": 100000000,
        "benchmark": benchmark,
        "exchange_kwargs": {
            "freq": "day",
            "limit_threshold": 0.095,
            "deal_price": "close",
            "open_cost": 0.0005,
            "close_cost": 0.0015,
            "min_cost": 5,
        },
    },
}

# backtest and analysis
with R.start(experiment_name="backtest_analysis"):
    recorder = R.get_recorder(recorder_id=rid, experiment_name="online_srv")
    model = recorder.load_object("trained_model")

    # prediction
    recorder = R.get_recorder()
    ba_rid = recorder.id
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # backtest & analysis
    par = PortAnaRecord(recorder, port_analysis_config, "day")
    par.generate()

[15417:MainThread](2022-08-04 16:14:38,651) INFO - qlib.workflow - [expm.py:315] - <mlflow.tracking.client.MlflowClient object at 0x7fbf01e55610>
[15417:MainThread](2022-08-04 16:14:38,656) INFO - qlib.workflow - [exp.py:257] - Experiment 2 starts running ...
[15417:MainThread](2022-08-04 16:14:38,665) INFO - qlib.workflow - [recorder.py:295] - Recorder eaabb02d26874f7c8375750017e822e3 starts running under Experiment 2 ...
  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
[15417:MainThread](2022-08-04 16:14:40,520) INFO - qlib.workflow - [record_temp.py:194] - Signal record 'pred.pkl' has been saved as the artifact of the Experiment 2
[15417:MainThread](2022-08-04 16:14:40,607) INFO - qlib.backtest caller - [__init__.py:94] - Create new exchange


'The following are prediction results of the LSTM model.'
                          score
datetime   instrument          
2017-01-03 SH600000    0.005775
           SH600008    0.065544
           SH600009    0.071585
           SH600010    0.007785
           SH600015   -0.002417




backtest loop:   0%|          | 0/871 [00:00<?, ?it/s]

  return np.nanmean(self.data)
  return np.nanmean(self.data)
  return np.nanmean(self.data)
  return np.nanmean(self.data)
[15417:MainThread](2022-08-04 16:15:09,746) INFO - qlib.workflow - [record_temp.py:499] - Portfolio analysis record 'port_analysis_1day.pkl' has been saved as the artifact of the Experiment 2
[15417:MainThread](2022-08-04 16:15:09,767) INFO - qlib.workflow - [record_temp.py:524] - Indicator analysis record 'indicator_analysis_1day.pkl' has been saved as the artifact of the Experiment 2
[15417:MainThread](2022-08-04 16:15:09,814) INFO - qlib.timer - [log.py:117] - Time cost: 0.014s | waiting `async_log` Done


'The following are analysis results of benchmark return(1day).'
                       risk
mean               0.000477
std                0.012295
annualized_return  0.113561
information_ratio  0.598699
max_drawdown      -0.370479
'The following are analysis results of the excess return without cost(1day).'
                       risk
mean               0.000456
std                0.004701
annualized_return  0.108480
information_ratio  1.495731
max_drawdown      -0.066153
'The following are analysis results of the excess return with cost(1day).'
                       risk
mean               0.000286
std                0.004700
annualized_return  0.067982
information_ratio  0.937501
max_drawdown      -0.070388
'The following are analysis results of indicators(1day).'
     value
ffr    1.0
pa     0.0
pos    0.0


In [21]:
from qlib.contrib.report import analysis_model, analysis_position
from qlib.data import D
recorder = R.get_recorder(recorder_id=ba_rid, experiment_name="online_srv")
print(recorder)
pred_df = recorder.load_object("pred.pkl")
pred_df_dates = pred_df.index.get_level_values(level='datetime')
report_normal_df = recorder.load_object("portfolio_analysis/report_normal_1day.pkl")
positions = recorder.load_object("portfolio_analysis/positions_normal_1day.pkl")
analysis_df = recorder.load_object("portfolio_analysis/port_analysis_1day.pkl")

{'class': 'Recorder', 'id': 'eaabb02d26874f7c8375750017e822e3', 'name': 'mlflow_recorder', 'experiment_id': '1', 'start_time': '2022-08-04 16:14:38', 'end_time': '2022-08-04 16:15:09', 'status': 'FINISHED'}


In [22]:
pred_df

Unnamed: 0_level_0,Unnamed: 1_level_0,score
datetime,instrument,Unnamed: 2_level_1
2017-01-03,SH600000,0.005775
2017-01-03,SH600008,0.065544
2017-01-03,SH600009,0.071585
2017-01-03,SH600010,0.007785
2017-01-03,SH600015,-0.002417
...,...,...
2020-07-31,SZ300413,-0.033918
2020-07-31,SZ300433,-0.192951
2020-07-31,SZ300498,-0.013752
2020-07-31,SZ300601,-0.326905


In [23]:
pred_df_dates

DatetimeIndex(['2017-01-03', '2017-01-03', '2017-01-03', '2017-01-03',
               '2017-01-03', '2017-01-03', '2017-01-03', '2017-01-03',
               '2017-01-03', '2017-01-03',
               ...
               '2020-07-31', '2020-07-31', '2020-07-31', '2020-07-31',
               '2020-07-31', '2020-07-31', '2020-07-31', '2020-07-31',
               '2020-07-31', '2020-07-31'],
              dtype='datetime64[ns]', name='datetime', length=261300, freq=None)

In [24]:
report_normal_df

Unnamed: 0_level_0,account,return,total_turnover,turnover,total_cost,cost,value,cash,bench
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-01-03,1.000000e+08,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,1.000000e+08,0.009713
2017-01-04,9.995250e+07,0.000000,9.500000e+07,0.950000,4.750000e+04,0.000475,9.500000e+07,4.952500e+06,0.007803
2017-01-05,1.002487e+08,0.003176,1.182989e+08,0.233099,6.869177e+04,0.000212,9.953156e+07,7.171484e+05,-0.000154
2017-01-06,9.985647e+07,-0.003718,1.378832e+08,0.195357,8.818488e+04,0.000194,9.934124e+07,5.152356e+05,-0.005974
2017-01-09,1.004493e+08,0.006130,1.572267e+08,0.193713,1.075324e+05,0.000194,9.994528e+07,5.039922e+05,0.004848
...,...,...,...,...,...,...,...,...,...
2020-07-27,1.757429e+08,-0.000589,1.895012e+10,0.215214,1.891006e+07,0.000216,1.747605e+08,9.824198e+05,0.005074
2020-07-28,1.765433e+08,0.004729,1.898089e+10,0.175120,1.894077e+07,0.000175,1.757373e+08,8.060007e+05,0.008791
2020-07-29,1.802717e+08,0.021326,1.901737e+10,0.206604,1.897733e+07,0.000207,1.793256e+08,9.461304e+05,0.024243
2020-07-30,1.785436e+08,-0.009399,1.904925e+10,0.176878,1.901099e+07,0.000187,1.740772e+08,4.466404e+06,-0.004886


In [25]:
analysis_df

Unnamed: 0,Unnamed: 1,risk
excess_return_without_cost,mean,0.000456
excess_return_without_cost,std,0.004701
excess_return_without_cost,annualized_return,0.10848
excess_return_without_cost,information_ratio,1.495731
excess_return_without_cost,max_drawdown,-0.066153
excess_return_with_cost,mean,0.000286
excess_return_with_cost,std,0.0047
excess_return_with_cost,annualized_return,0.067982
excess_return_with_cost,information_ratio,0.937501
excess_return_with_cost,max_drawdown,-0.070388
