In [1]:
import sys, os
import numpy as np
import pandas as pd
import qlib

In [2]:
from pathlib import Path
scripts_dir = Path("D:/py/qlib/scripts")
print(scripts_dir.joinpath("get_data.py"))
assert scripts_dir.joinpath("get_data.py").exists()

D:\py\qlib\scripts\get_data.py


In [3]:
if not scripts_dir.joinpath("get_data.py").exists():
    # download get_data.py script
    scripts_dir = Path("~/tmp/qlib_code/scripts").expanduser().resolve()
    scripts_dir.mkdir(parents=True, exist_ok=True)
    import requests
    with requests.get("https://raw.githubusercontent.com/microsoft/qlib/main/scripts/get_data.py") as resp:
        with open(scripts_dir.joinpath("get_data.py"), "wb") as fp:
            fp.write(resp.content)

In [4]:
from qlib.constant import REG_CN
from qlib.utils import exists_qlib_data, init_instance_by_config
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.utils import flatten_dict

In [5]:
provider_uri = "D:/py/qlib/.qlib/qlib_data/cn_data"  # target_dir
# if not exists_qlib_data(provider_uri):
#     print(f"Qlib data is not found in {provider_uri}")
#     sys.path.append(str(scripts_dir))
#     from get_data import GetData
#     GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
qlib.init(provider_uri=provider_uri, region=REG_CN)

[6848:MainThread](2022-07-25 14:59:28,936) INFO - qlib.Initialization - [config.py:403] - default_conf: client.
[6848:MainThread](2022-07-25 14:59:28,940) INFO - qlib.Initialization - [__init__.py:73] - qlib successfully initialized based on client settings.
[6848:MainThread](2022-07-25 14:59:28,941) INFO - qlib.Initialization - [__init__.py:75] - data_path={'__DEFAULT_FREQ': WindowsPath('D:/py/qlib/.qlib/qlib_data/cn_data')}


In [6]:
market = "all"
benchmark = "SH000300"

In [7]:
from qlib.data import D
from qlib.data.filter import ExpressionDFilter
from qlib.data.filter import NameDFilter

In [8]:
instruments = D.instruments(market='all')
fields = ['$close', '(Ref($close, -1)-$close)/$close', '(Ref($close, -2)-Ref($close, -1))/Ref($close, -1)', '(Ref($close, -3)-Ref($close, -2))/Ref($close, -2)', '(Ref($close, -4)-Ref($close, -3))/Ref($close, -3)', '(Ref($close, -5)-Ref($close, -4))/Ref($close, -4)', '(Ref($close, -6)-Ref($close, -5))/Ref($close, -5)', '(Ref($close, -7)-Ref($close, -6))/Ref($close, -6)']
f_d = D.features(instruments, fields, start_time='2021-01-04', end_time='2021-06-11', freq='day')
df = f_d
df.index = df.index.get_level_values('datetime')
print(df.index.min(), df.index.max())

start_time = pd.to_datetime(df.index.min())
end_time = pd.to_datetime(df.index.max())
print(start_time.strftime('%Y-%m-%d'), end_time.strftime('%Y-%m-%d'))

2021-01-04 00:00:00 2021-06-11 00:00:00
2021-01-04 2021-06-11


In [11]:
experiment_name="online_srv"

In [12]:
###################################
# train model
###################################
# '2017-01-04', end_time='2022-02-28'
data_handler_config = {
    # "start_time": "2017-01-04",
    "start_time": start_time, # 
    "end_time": end_time,
    "fit_start_time": "2021-01-04",
    "fit_end_time": "2021-04-30",
    "instruments": market,
    "infer_processors": [
      {
        "class": "RobustZScoreNorm",
        "kwargs": {
          "fields_group": "feature",
          "clip_outlier": True
        }
      },
      {
        "class": "Fillna",
        "kwargs": {
          "fields_group": "feature"
        }
      }
    ],
    "learn_processors": [
      {
        "class": "DropnaLabel"
      },
      {
        "class": "CSRankNorm",
        "kwargs": {
          "fields_group": "label"
        }
      }
    ],
    "label": [
      "Ref($close, -2) / Ref($close, -1) - 1"
    ]
}

task = {   
    "model": {
        "class": "LSTM",
        "module_path": "qlib.contrib.model.pytorch_lstm",
        "kwargs": {
            "d_feat": 6,
            "hidden_size": 64,
            "num_layers": 2,
            "dropout": 0.1,
            "dec_dropout": 0.0,
            "n_epochs": 15,
            "lr": 1e-5,
            "early_stop": 3,
            "batch_size": 800,
            "metric": "loss",
            "loss": "mse",
            "optimizer": "adam",
            "GPU": 0
        },
    },
    "dataset": {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": {
                "class": "Alpha360",
                "module_path": "qlib.contrib.data.handler",
                "kwargs": data_handler_config,
            },
            "segments": {
                "train": (start_time, "2021-04-30"),
                "valid": ("2021-05-01", "2021-05-19"),
                "test": ("2021-05-20", "2021-06-11"),
            },
        },
    },
}

# model initiaiton
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])

[6848:MainThread](2022-07-25 15:03:15,572) INFO - qlib.LSTM - [pytorch_lstm.py:58] - LSTM pytorch version...
[6848:MainThread](2022-07-25 15:03:15,573) INFO - qlib.LSTM - [pytorch_lstm.py:104] - LSTM parameters setting:
d_feat : 6
hidden_size : 64
num_layers : 2
dropout : 0.1
n_epochs : 15
lr : 1e-05
metric : loss
batch_size : 800
early_stop : 3
optimizer : adam
loss_type : mse
visible_GPU : 0
use_GPU : False
seed : None
  if idx.is_monotonic_increasing and not (isinstance(idx, pd.MultiIndex) and not idx.is_lexsorted()):
[6848:MainThread](2022-07-25 15:12:06,729) INFO - qlib.timer - [log.py:113] - Time cost: 531.150s | Loading data Done
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
[6848:MainThread](2022-07-25 15:12:16,265) INFO - qlib.timer - [log.py:113] - Time cost: 8.645s | RobustZScoreNorm Done
[6848:MainThread](2022-07-25 15:12:16,658) INFO - qlib.timer - [log.py:113] - Time cost: 0.392s | Fillna Done
[6848:MainThread](2022-07-25 15:12:17,306) INFO - qlib

In [13]:
# start exp to train model

experiment_id = 'cn_backtest'
# experiment_name: Optional[Text] = None,
# recorder_id: Optional[Text] = None,

# with R.start(experiment_name=experiment_name, experimen
# t_id=experiment_id):
with R.start(experiment_name=experiment_name):
    R.log_params(**flatten_dict(task))
    model.fit(dataset)
    R.save_objects(trained_model=model)
    rid = R.get_recorder().id
    # prediction
    recorder = R.get_recorder()
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

[6848:MainThread](2022-07-25 15:12:17,849) INFO - qlib.workflow - [expm.py:318] - <mlflow.tracking.client.MlflowClient object at 0x000002600DD65648>
[6848:MainThread](2022-07-25 15:12:17,881) INFO - qlib.workflow - [exp.py:257] - Experiment 1 starts running ...
[6848:MainThread](2022-07-25 15:12:18,129) INFO - qlib.workflow - [recorder.py:293] - Recorder 097522e0f3de4c0a925e0e31e5173db3 starts running under Experiment 1 ...
[6848:MainThread](2022-07-25 15:12:19,578) INFO - qlib.LSTM - [pytorch_lstm.py:236] - training...
[6848:MainThread](2022-07-25 15:12:19,580) INFO - qlib.LSTM - [pytorch_lstm.py:240] - Epoch0:
[6848:MainThread](2022-07-25 15:12:19,581) INFO - qlib.LSTM - [pytorch_lstm.py:241] - training...
[6848:MainThread](2022-07-25 15:17:04,110) INFO - qlib.LSTM - [pytorch_lstm.py:243] - evaluating...
[6848:MainThread](2022-07-25 15:19:17,009) INFO - qlib.LSTM - [pytorch_lstm.py:246] - train -0.996146, valid -0.996042
[6848:MainThread](2022-07-25 15:19:17,013) INFO - qlib.LSTM - [

'The following are prediction results of the LSTM model.'
                          score
datetime   instrument          
2021-05-20 SH000300    0.037091
           SH000903    0.040627
           SH600000    0.047179
           SH600004    0.039283
           SH600006   -0.069650


[6848:MainThread](2022-07-25 16:12:53,513) INFO - qlib.timer - [log.py:113] - Time cost: 0.000s | waiting `async_log` Done


In [14]:
###################################
# prediction, backtest & analysis
###################################
port_analysis_config = {
    "executor": {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
        },
    },
    "strategy": {
        "class": "WeekTopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "model": model,
            "dataset": dataset,
            "topk": 50,
            "n_drop": 5,
        },
    },
    "backtest": {
        "start_time": "2021-05-20",
        "end_time": "2021-06-01",
        "account": 100000000,
        "benchmark": benchmark,
        "exchange_kwargs": {
            "freq": "day",
            "limit_threshold": 0.095,
            "deal_price": "close",
            "open_cost": 0.0005,
            "close_cost": 0.0015,
            "min_cost": 5,
        },
    },
}

# backtest and analysis
with R.start(experiment_name="backtest_analysis"):
    recorder = R.get_recorder(recorder_id=rid, experiment_name="online_srv")
    model = recorder.load_object("trained_model")

    # prediction
    recorder = R.get_recorder()
    ba_rid = recorder.id
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # backtest & analysis
    par = PortAnaRecord(recorder, port_analysis_config, "day")
    par.generate()

[6848:MainThread](2022-07-25 16:12:53,574) INFO - qlib.workflow - [expm.py:318] - <mlflow.tracking.client.MlflowClient object at 0x000002602D0DA148>
[6848:MainThread](2022-07-25 16:12:53,590) INFO - qlib.workflow - [exp.py:257] - Experiment 2 starts running ...
[6848:MainThread](2022-07-25 16:12:53,616) INFO - qlib.workflow - [recorder.py:293] - Recorder 5f617d77783a44f8b054240194e2b13a starts running under Experiment 2 ...
[6848:MainThread](2022-07-25 16:13:17,037) INFO - qlib.workflow - [record_temp.py:194] - Signal record 'pred.pkl' has been saved as the artifact of the Experiment 2


'The following are prediction results of the LSTM model.'
                          score
datetime   instrument          
2021-05-20 SH000300    0.037091
           SH000903    0.040627
           SH600000    0.047179
           SH600004    0.039283
           SH600006   -0.069650


[6848:MainThread](2022-07-25 16:13:17,078) INFO - qlib.backtest caller - [__init__.py:82] - Create new exchange


backtest loop:   0%|          | 0/9 [00:00<?, ?it/s]

  if idx.is_monotonic_increasing and not (isinstance(idx, pd.MultiIndex) and not idx.is_lexsorted()):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wha['A_Week_HighAndLow'] = wha.apply(lambda x: x.sum(), axis=1)
[6848:MainThread](2022-07-25 16:17:28,364) INFO - qlib.workflow - [record_temp.py:499] - Portfolio analysis record 'port_analysis_1day.pkl' has been saved as the artifact of the Experiment 2


'The following are analysis results of benchmark return(1day).'
                       risk
mean               0.003644
std                0.011369
annualized_return  0.867258
information_ratio  4.944487
max_drawdown      -0.010076
'The following are analysis results of the excess return without cost(1day).'
                        risk
mean                0.029945
std                 0.024833
annualized_return   7.126985
information_ratio  18.602869
max_drawdown        0.000000
'The following are analysis results of the excess return with cost(1day).'
                        risk
mean                0.029782
std                 0.024904
annualized_return   7.088120
information_ratio  18.448753
max_drawdown        0.000000


[6848:MainThread](2022-07-25 16:17:28,390) INFO - qlib.workflow - [record_temp.py:524] - Indicator analysis record 'indicator_analysis_1day.pkl' has been saved as the artifact of the Experiment 2


'The following are analysis results of indicators(1day).'
     value
ffr    1.0
pa     0.0
pos    0.0


[6848:MainThread](2022-07-25 16:17:28,444) INFO - qlib.timer - [log.py:113] - Time cost: 0.040s | waiting `async_log` Done


In [15]:
from qlib.contrib.report import analysis_model, analysis_position
from qlib.data import D
recorder = R.get_recorder(recorder_id=ba_rid, experiment_name="online_srv")
print(recorder)
pred_df = recorder.load_object("pred.pkl")
pred_df_dates = pred_df.index.get_level_values(level='datetime')
report_normal_df = recorder.load_object("portfolio_analysis/report_normal_1day.pkl")
positions = recorder.load_object("portfolio_analysis/positions_normal_1day.pkl")
analysis_df = recorder.load_object("portfolio_analysis/port_analysis_1day.pkl")

{'class': 'Recorder', 'id': '5f617d77783a44f8b054240194e2b13a', 'name': 'mlflow_recorder', 'experiment_id': '1', 'start_time': '2022-07-25 16:12:53', 'end_time': '2022-07-25 16:17:28', 'status': 'FINISHED'}


In [16]:
report_normal_df

Unnamed: 0_level_0,account,return,total_turnover,turnover,total_cost,cost,value,cash,bench
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-05-20,99956320.0,2.4665500000000002e-17,87365070.0,0.873651,43682.534709,0.000437,87365070.0,12591250.0,0.002734
2021-05-21,99826400.0,-0.00108394,111144400.0,0.237898,65255.487381,0.000216,91669570.0,8156830.0,-0.010076
2021-05-24,102776800.0,0.0296375,119487600.0,0.083576,73439.649656,8.2e-05,94946080.0,7830742.0,0.004176
2021-05-25,106216200.0,0.03361199,138223200.0,0.182295,88608.195368,0.000148,105534800.0,681365.2,0.031595
2021-05-26,108450800.0,0.02118614,152602900.0,0.135382,104343.490887,0.000148,105074000.0,3376792.0,0.000397
2021-05-27,113386800.0,0.04565754,169550000.0,0.156265,119870.500629,0.000143,112865700.0,521173.8,0.003315
2021-05-28,119522600.0,0.05423392,183244200.0,0.120774,133491.584131,0.00012,119161300.0,361350.1,-0.003211
2021-05-31,127486500.0,0.06676373,199070100.0,0.132409,149350.768199,0.000133,127074500.0,412058.9,0.00197
2021-06-01,134148100.0,0.05229643,204718200.0,0.044303,154873.115627,4.3e-05,133993100.0,155041.6,0.001896
