In [1]:
import sys, os
import numpy as np
import pandas as pd
import qlib

In [2]:
from pathlib import Path
# Expected location of qlib's helper scripts in the local checkout.
# NOTE: this is a machine-specific absolute path; adjust it to your clone.
scripts_dir = Path("D:/py/qlib/scripts")
print(scripts_dir.joinpath("get_data.py"))
# Deliberately no `assert ...exists()` here: a hard assertion would stop a
# Restart & Run All before the following cell, whose whole purpose is to
# download get_data.py when it is missing locally.
if not scripts_dir.joinpath("get_data.py").exists():
    print("get_data.py not found locally; it will be downloaded on demand")

D:\py\qlib\scripts\get_data.py


In [3]:
# Fallback: when the local checkout does not provide get_data.py, fetch it
# from the upstream qlib repository into a scratch directory and point
# `scripts_dir` at that directory instead.
if not scripts_dir.joinpath("get_data.py").exists():
    # download get_data.py script
    import requests  # only needed for this fallback path

    scripts_dir = Path("~/tmp/qlib_code/scripts").expanduser().resolve()
    scripts_dir.mkdir(parents=True, exist_ok=True)
    get_data_url = "https://raw.githubusercontent.com/microsoft/qlib/main/scripts/get_data.py"
    # A timeout keeps the notebook from hanging forever on a dead connection.
    with requests.get(get_data_url, timeout=30) as resp:
        # Fail loudly on 4xx/5xx instead of saving an error page as get_data.py.
        resp.raise_for_status()
        with open(scripts_dir.joinpath("get_data.py"), "wb") as fp:
            fp.write(resp.content)

In [4]:
from qlib.constant import REG_CN
from qlib.utils import exists_qlib_data, init_instance_by_config
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.utils import flatten_dict

In [5]:
# Directory holding the prepared CN dataset (machine-specific absolute path).
provider_uri = "D:/py/qlib/.qlib/qlib_data/cn_data"  # target_dir

# The automatic data bootstrap is disabled in this notebook. To restore it,
# run the following before qlib.init whenever exists_qlib_data(provider_uri)
# is false:
#   sys.path.append(str(scripts_dir))
#   from get_data import GetData
#   GetData().qlib_data(target_dir=provider_uri, region=REG_CN)

# Initialise qlib against the local dataset using the CN region settings.
qlib.init(region=REG_CN, provider_uri=provider_uri)

[6848:MainThread](2022-07-25 14:59:28,936) INFO - qlib.Initialization - [config.py:403] - default_conf: client.
[6848:MainThread](2022-07-25 14:59:28,940) INFO - qlib.Initialization - [__init__.py:73] - qlib successfully initialized based on client settings.
[6848:MainThread](2022-07-25 14:59:28,941) INFO - qlib.Initialization - [__init__.py:75] - data_path={'__DEFAULT_FREQ': WindowsPath('D:/py/qlib/.qlib/qlib_data/cn_data')}


In [6]:
# `market` is the instrument universe handed to the data handler config;
# `benchmark` is the benchmark code handed to the backtest config.
market, benchmark = "all", "SH000300"

In [7]:
from qlib.data import D
from qlib.data.filter import ExpressionDFilter
from qlib.data.filter import NameDFilter

In [8]:
# Probe the data provider to find the first/last dates that are actually
# available inside the requested window; these bound the handler config.

# Use the shared `market` setting rather than a second hard-coded 'all'.
instruments = D.instruments(market=market)

# Raw close plus seven step-to-step relative changes built from
# Ref($close, -k) and Ref($close, -(k-1)); for k == 1 the earlier term is
# plain $close. (Negative Ref offsets presumably address future bars --
# confirm against qlib's operator docs.)
fields = ['$close']
for k in range(1, 8):
    later = f'Ref($close, -{k})'
    earlier = '$close' if k == 1 else f'Ref($close, -{k - 1})'
    fields.append(f'({later}-{earlier})/{earlier}')

f_d = D.features(instruments, fields, start_time='2021-01-04', end_time='2021-06-11', freq='day')
if f_d.empty:
    # Without this guard min()/max() yield NaT and strftime below fails
    # with a far less helpful error.
    raise ValueError("D.features returned no rows for 2021-01-04..2021-06-11; check provider_uri and market")

# Copy before re-indexing: `df = f_d` would only alias the frame, so assigning
# df.index would also strip the instrument level from f_d.
df = f_d.copy()
df.index = f_d.index.get_level_values('datetime')
print(df.index.min(), df.index.max())

start_time = pd.to_datetime(df.index.min())
end_time = pd.to_datetime(df.index.max())
print(start_time.strftime('%Y-%m-%d'), end_time.strftime('%Y-%m-%d'))

2021-01-04 00:00:00 2021-06-11 00:00:00
2021-01-04 2021-06-11


In [9]:
# Experiment under which the training run is recorded (passed to R.start).
experiment_name = "online_srv"

In [10]:
###################################
# train model
###################################

# Processors listed under "infer_processors" in the handler config.
_infer_processors = [
    {"class": "RobustZScoreNorm", "kwargs": {"fields_group": "feature", "clip_outlier": True}},
    {"class": "Fillna", "kwargs": {"fields_group": "feature"}},
]

# Processors listed under "learn_processors" in the handler config.
_learn_processors = [
    {"class": "DropnaLabel"},
    {"class": "CSRankNorm", "kwargs": {"fields_group": "label"}},
]

# Handler window comes from the dates discovered in the feature probe;
# the fit window for the processors is fixed.
data_handler_config = {
    "start_time": start_time,
    "end_time": end_time,
    "fit_start_time": "2021-01-04",
    "fit_end_time": "2021-04-30",
    "instruments": market,
    "infer_processors": _infer_processors,
    "learn_processors": _learn_processors,
    "label": ["Ref($close, -2) / Ref($close, -1) - 1"],
}

# LSTM hyper-parameters. "GPU": 0 requests device index 0; the recorded run
# log of this notebook reports `use_GPU : False`.
_lstm_kwargs = {
    "d_feat": 6,
    "hidden_size": 64,
    "num_layers": 2,
    "dropout": 0.1,
    "dec_dropout": 0.0,
    "n_epochs": 15,
    "lr": 1e-5,
    "early_stop": 3,
    "batch_size": 800,
    "metric": "loss",
    "loss": "mse",
    "optimizer": "adam",
    "GPU": 0,
}

# Train / validation / test date ranges for the dataset.
_segments = {
    "train": (start_time, "2021-04-30"),
    "valid": ("2021-05-01", "2021-05-19"),
    "test": ("2021-05-20", "2021-06-11"),
}

_alpha360_handler = {
    "class": "Alpha360",
    "module_path": "qlib.contrib.data.handler",
    "kwargs": data_handler_config,
}

task = {
    "model": {
        "class": "LSTM",
        "module_path": "qlib.contrib.model.pytorch_lstm",
        "kwargs": _lstm_kwargs,
    },
    "dataset": {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": _alpha360_handler,
            "segments": _segments,
        },
    },
}

# model initialization
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])

[6848:MainThread](2022-07-25 14:59:59,692) INFO - qlib.LSTM - [pytorch_lstm.py:58] - LSTM pytorch version...
[6848:MainThread](2022-07-25 14:59:59,694) INFO - qlib.LSTM - [pytorch_lstm.py:104] - LSTM parameters setting:
d_feat : 6
hidden_size : 64
num_layers : 2
dropout : 0.1
n_epochs : 15
lr : 1e-05
metric : loss
batch_size : 800
early_stop : 3
optimizer : adam
loss_type : mse
visible_GPU : 0
use_GPU : False
seed : None


In [None]:
# Train the model inside a recorded experiment run, persist it, and store
# the predictions on the same recorder.

# NOTE(review): experiment_id is defined but not passed to R.start below.
experiment_id = 'cn_backtest'

with R.start(experiment_name=experiment_name):
    R.log_params(**flatten_dict(task))
    model.fit(dataset)
    R.save_objects(trained_model=model)

    # Fetch the active recorder once; its id is reused by the backtest cell
    # to reload the trained model.
    recorder = R.get_recorder()
    rid = recorder.id

    # prediction
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

In [None]:
###################################
# prediction, backtest & analysis
###################################
# Executor, strategy and backtest settings consumed by PortAnaRecord below.
port_analysis_config = {
    "executor": {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
        },
    },
    "strategy": {
        # NOTE(review): WeekTopkDropoutStrategy must exist in the installed
        # qlib.contrib.strategy.signal_strategy -- confirm for your qlib build.
        "class": "WeekTopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "model": model,
            "dataset": dataset,
            "topk": 50,
            "n_drop": 5,
        },
    },
    "backtest": {
        "start_time": "2021-05-20",
        "end_time": "2021-06-01",
        "account": 100000000,
        "benchmark": benchmark,
        "exchange_kwargs": {
            "freq": "day",
            "limit_threshold": 0.095,
            "deal_price": "close",
            "open_cost": 0.0005,
            "close_cost": 0.0015,
            "min_cost": 5,
        },
    },
}

# backtest and analysis
with R.start(experiment_name="backtest_analysis"):
    # Reload the trained model from the training run. The lookup uses the
    # same `experiment_name` variable the training cell passed to R.start,
    # so renaming the experiment in one place cannot break this cell.
    recorder = R.get_recorder(recorder_id=rid, experiment_name=experiment_name)
    model = recorder.load_object("trained_model")

    # prediction
    # From here on `recorder` is the active "backtest_analysis" recorder;
    # its id is kept in `ba_rid` for loading the results afterwards.
    recorder = R.get_recorder()
    ba_rid = recorder.id
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # backtest & analysis
    par = PortAnaRecord(recorder, port_analysis_config, "day")
    par.generate()

In [None]:
from qlib.contrib.report import analysis_model, analysis_position
from qlib.data import D
# Load the artifacts written by the backtest run. `ba_rid` is the id of the
# recorder created under R.start(experiment_name="backtest_analysis"), so it
# has to be looked up in that experiment, not in the training experiment.
recorder = R.get_recorder(recorder_id=ba_rid, experiment_name="backtest_analysis")
print(recorder)
# Predictions stored by SignalRecord.
pred_df = recorder.load_object("pred.pkl")
pred_df_dates = pred_df.index.get_level_values(level='datetime')
# Portfolio report, positions and analysis stored by PortAnaRecord.
report_normal_df = recorder.load_object("portfolio_analysis/report_normal_1day.pkl")
positions = recorder.load_object("portfolio_analysis/positions_normal_1day.pkl")
analysis_df = recorder.load_object("portfolio_analysis/port_analysis_1day.pkl")