In [None]:
# if you are running this code in Google Colab, please make sure to run the following code first
# Environment setup cell: installs packages with pip and installs qlib from a git checkout.
# The pip installs below are unpinned, so the resolved versions depend on when the cell is run.
!pip install numpy
!pip install --upgrade  cython
!pip install --upgrade scipy
!pip install mlflow

# Clone the qlib sources inside the Python 3.10 dist-packages directory and install from that checkout.
# NOTE(review): the path is hard-coded to python3.10 — confirm it matches the runtime's Python version.
%cd /usr/local/lib/python3.10/dist-packages
!git clone https://github.com/microsoft/qlib.git
%cd qlib
!pip install --upgrade jupyter-client
# NOTE(review): qlib is installed twice here (pip, then setup.py) — presumably one is redundant; confirm.
!pip install .
!python setup.py install
# Switch back to the home directory; the following cells run with it as the working directory.
%cd ~
# Additional packages installed last.
# NOTE(review): presumably needed by the analysis/plotting cells — confirm.
!pip install statsmodels plotly

In [None]:
import os

repo_url = 'https://github.com/wyulan0724/alpha-modeling.git'
repo_name = repo_url.split('/')[-1].replace('.git', '')

# Clone the repository
print(f"Cloning repository: {repo_url}")
!git clone {repo_url}

# Get the base path where the repository was cloned
base_repo_path = os.path.join(os.getcwd(), repo_name)

# Define the relative paths within the repository for the folder and the file
tw_data_relative_path = os.path.join('data', 'tw_data')
parquet_file_relative_path = os.path.join('data', 'precomputed_features', 'tw50_alpha101_label_1d.parquet')

# Construct the full absolute paths
tw_data_folder_path = os.path.join(base_repo_path, tw_data_relative_path)
parquet_file_path = os.path.join(base_repo_path, parquet_file_relative_path)

In [None]:
import pandas as pd
import numpy as np

import qlib
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord, SigAnaRecord
from qlib.workflow import R
from qlib.utils import init_instance_by_config, flatten_dict
from qlib.contrib.report import analysis_model, analysis_position
from qlib.data.dataset.handler import DataHandlerLP
from qlib.data.dataset.loader import StaticDataLoader

In [None]:
# Universe and benchmark identifiers
market = "tw50"  # Taiwan 50 index constituents
benchmark = "0050"  # Taiwan 50 ETF as benchmark

# Initialize QLib against the Taiwan market data folder resolved earlier
qlib_init_kwargs = dict(
    provider_uri=tw_data_folder_path,    # data directory inside the cloned repository
    region="tw",
    dataset_cache="SimpleDatasetCache",  # cache setting for dataset
)
qlib.init(**qlib_init_kwargs)

In [None]:
# Date boundaries (inclusive strings) of the train / validation / test segments
DEFAULT_TRAIN_START, DEFAULT_TRAIN_END = "2014-12-31", "2021-12-31"
DEFAULT_VALID_START, DEFAULT_VALID_END = "2022-01-01", "2022-12-31"
DEFAULT_TEST_START, DEFAULT_TEST_END = "2023-01-01", "2024-12-27"

# Processor configuration passed to the handler as `learn_processors`
DEFAULT_LEARN_PROCESSORS = [
    {"class": "DropnaLabel"},
]

# Z-score normalization of the "feature" group; its fit window is the training range
_zscore_on_train = {
    "class": "ZScoreNorm",
    "kwargs": {
        "fields_group": "feature",
        "fit_start_time": DEFAULT_TRAIN_START,
        "fit_end_time": DEFAULT_TRAIN_END,
    },
}
# Fillna processor with no extra arguments
_fill_missing = {"class": "Fillna", "kwargs": {}}

# Processor configuration passed to the handler as `infer_processors` (order matters)
DEFAULT_INFER_PROCESSORS = [_zscore_on_train, _fill_missing]

# Data loader configured with the precomputed parquet file path
static_loader = StaticDataLoader(config=parquet_file_path)

# Keyword arguments handed to the DataHandlerLP entry of the dataset config:
# the handler spans from the start of training to the end of the test period.
handler_kwargs = dict(
    start_time=DEFAULT_TRAIN_START,
    end_time=DEFAULT_TEST_END,
    data_loader=static_loader,
    learn_processors=DEFAULT_LEARN_PROCESSORS,
    infer_processors=DEFAULT_INFER_PROCESSORS,
)

# Model section of the task: XGBModel and its hyper-parameters
model_config = {
    "class": "XGBModel",
    "module_path": "qlib.contrib.model.xgboost",
    "kwargs": {
        "eval_metric": "rmse",
        "learning_rate": 0.0015542077094361038,
        "max_depth": 9,
        "subsample": 0.8884543702381469,
        "colsample_bytree": 0.6888809148265227,
        "reg_alpha": 0.004201822811194576,
        "reg_lambda": 0.0034218571819520298,
    },
}

# Dataset section of the task: a DatasetH wrapping a DataHandlerLP, split into
# train / valid / test segments by the default date boundaries
dataset_config = {
    "class": "DatasetH",
    "module_path": "qlib.data.dataset",
    "kwargs": {
        "handler": {
            "class": "DataHandlerLP",
            "module_path": "qlib.data.dataset.handler",
            "kwargs": handler_kwargs,
        },
        "segments": {
            "train": (DEFAULT_TRAIN_START, DEFAULT_TRAIN_END),
            "valid": (DEFAULT_VALID_START, DEFAULT_VALID_END),
            "test": (DEFAULT_TEST_START, DEFAULT_TEST_END),
        },
    },
}

# Full training task configuration
task = {"model": model_config, "dataset": dataset_config}

# Instantiate the model first, then the dataset, from their configurations
model, dataset = (
    init_instance_by_config(task[section]) for section in ("model", "dataset")
)

# Backtest / portfolio analysis configuration consumed by PortAnaRecord.
port_analysis_config = {
    # Executor stepping once per day and producing portfolio metrics
    "executor": {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
        },
    },
    # Top-k strategy with dropout, driven by the model and dataset built from `task`
    "strategy": {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            # NOTE(review): newer qlib releases appear to prefer `signal=(model, dataset)`
            # over separate `model`/`dataset` arguments — confirm against the installed version.
            "model": model,
            "dataset": dataset,
            "topk": 10,
            "n_drop": 2,
        },
    },
    "backtest": {
        # Reuse the test-segment boundaries so the backtest window cannot drift
        # away from the period the model is evaluated on.
        "start_time": DEFAULT_TEST_START,
        "end_time": DEFAULT_TEST_END,
        "account": 10000000,
        "benchmark": benchmark,
        "exchange_kwargs": {
            "freq": "day",
            "limit_threshold": 0.1,
            "deal_price": "close",
            # NOTE(review): presumably a 0.1425% fee on buys and the same fee plus a
            # 0.3% transaction tax on sells — confirm against the intended cost model.
            "open_cost": 0.001425,
            "close_cost": 0.004425,
            "min_cost": 20,
            "trade_unit": 1000,
        },
    },
}

In [None]:
# Optional: Examine the dataset before and after preprocessing
def examine_data_segment(data_dict, segment_name, data_type_name):
    """Print the NaN count, infinity count and shape of one data segment.

    Args:
        data_dict: The pandas DataFrame or Series to inspect (despite the
            parameter name, not a dict). Any other value, including None,
            is reported as unavailable.
        segment_name: Segment label shown in the header (e.g. "train"); it is
            capitalized for display.
        data_type_name: Free-text description printed before the segment name.

    Returns:
        None. The summary is written to standard output.
    """
    print(f"--- {data_type_name} {segment_name.capitalize()} ---")
    if not isinstance(data_dict, (pd.DataFrame, pd.Series)):
        print("Data not available or not a DataFrame/Series.")
        return

    nan_count = data_dict.isna().sum().sum()
    print(f"NaNs: {nan_count}")

    # np.isinf raises TypeError on object/string data, so only numeric columns
    # are checked; non-numeric columns cannot hold infinities anyway.
    frame = data_dict.to_frame() if isinstance(data_dict, pd.Series) else data_dict
    numeric_values = frame.select_dtypes(include=[np.number]).to_numpy(
        dtype=float, na_value=np.nan)
    inf_count = int(np.isinf(numeric_values).sum())
    print(f"Infs: {inf_count}")

    print(f"Shape: {data_dict.shape}")


# --- 1. Raw Data Examination ---
# Inspect every segment as loaded with the raw data key (DK_R).
print("="*10 + " Raw Data Examination " + "="*10)
segments = ["train", "valid", "test"]
col_set = ["feature", "label"]

for segment in segments:
    try:
        raw_data = dataset.prepare(segment, col_set=col_set, data_key=DataHandlerLP.DK_R)
        # Report the feature group first, then the label group
        for column_group, title in (("feature", "Raw Feature"), ("label", "Raw Label")):
            examine_data_segment(raw_data.get(column_group), segment, title)
    except Exception as e:
        # Best-effort inspection: report the failure and continue with the next segment
        print(f"Error loading raw data for segment '{segment}': {e}")
    print("-" * 30)


# --- 2. Preprocessed Data Examination ---
# Train and valid segments are inspected with the learning data key (DK_L).
print("\n" + "="*10 + " Preprocessed Data Examination " + "="*10)
processed_data = {}

for segment in ["train", "valid"]:
    try:
        learn_view = dataset.prepare(segment, col_set=col_set, data_key=DataHandlerLP.DK_L)
        processed_data[segment] = learn_view
        examine_data_segment(learn_view.get('feature'), segment, "Processed Feature")
        examine_data_segment(learn_view.get('label'), segment, "Processed Label")
    except Exception as e:
        # Best-effort inspection: report the failure and continue with the next segment
        print(f"Error loading preprocessed data for segment '{segment}': {e}")
    print("-" * 30)

# Test segment: features are taken with the inference key (DK_I), labels with
# the learning key (DK_L), so the two groups are prepared separately.
segment = "test"
try:
    test_views = {'feature': None, 'label': None}
    processed_data[segment] = test_views

    test_views['feature'] = dataset.prepare(
        segment, col_set=["feature"], data_key=DataHandlerLP.DK_I)
    examine_data_segment(test_views['feature'], segment, "Processed Feature (DK_I)")

    test_views['label'] = dataset.prepare(
        segment, col_set=["label"], data_key=DataHandlerLP.DK_L)
    examine_data_segment(test_views['label'], segment, "Processed Label (DK_L)")
except Exception as e:
    # Best-effort inspection: report the failure instead of aborting the cell
    print(f"Error loading preprocessed data for segment '{segment}': {e}")
print("-" * 30)

In [None]:
# Run the whole train / predict / analyse pipeline inside one QLib experiment
experiment_name = f"{market}_{task['model']['class']}_Alpha101"
with R.start(experiment_name=experiment_name):
    # Record the flattened task configuration as run parameters
    print("Logging model parameters")
    flat_task_params = flatten_dict(task)
    R.log_params(**flat_task_params)

    # Fit the model on the dataset
    print("Training model")
    model.fit(dataset)

    # Persist the trained model, then keep the active recorder and its id
    # for the record generators below and for later cells
    R.save_objects(**{"params.pkl": model})
    recorder = R.get_recorder()
    rid = recorder.id

    # Prediction records
    print("Generating predictions")
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # Signal analysis records: IC, ICIR, Rank IC and Rank ICIR
    print("Performing signal analysis")
    sar = SigAnaRecord(recorder)
    sar.generate()

    # Backtest and portfolio analysis records at daily frequency
    par = PortAnaRecord(recorder, port_analysis_config, "day")
    par.generate()

[Analysis: Evaluation & Results Analysis](https://qlib.readthedocs.io/en/latest/component/report.html)

In [None]:
# Model Performance

# Load the prediction and label objects stored in the recorder
pred_df = recorder.load_object("pred.pkl")
label_df = recorder.load_object("label.pkl")
label_df.columns = ["label"]

# Put label and prediction side by side, aligned on the label index
pred_label = pd.concat([label_df, pred_df],
                       axis=1, sort=True).reindex(label_df.index)

# If the index carries more than two levels, remove the outermost one.
# `DataFrame.drop(level=0)` without labels raises ValueError; `droplevel`
# is the call that actually removes an index level.
if pred_label.index.nlevels > 2:
    pred_label = pred_label.droplevel(0)

# Visualize model performance
analysis_model.model_performance_graph(pred_label)

In [None]:
# Backtest Results

# Look the recorder up again by id and experiment name, then show it
experiment_name = f"{market}_{task['model']['class']}_Alpha101"
recorder = R.get_recorder(recorder_id=rid, experiment_name=experiment_name)
print(recorder)

# Load the prediction and the portfolio-analysis objects stored in the recorder
pred_df = recorder.load_object("pred.pkl")
report_normal_df = recorder.load_object(
    "portfolio_analysis/report_normal_1day.pkl")
positions = recorder.load_object(
    "portfolio_analysis/positions_normal_1day.pkl")
analysis_df = recorder.load_object(
    "portfolio_analysis/port_analysis_1day.pkl")

# Plot the backtest report
analysis_position.report_graph(report_normal_df)

In [None]:
# Plot the risk analysis graph from the portfolio analysis table and the
# backtest report loaded from the recorder in the previous cell.
analysis_position.risk_analysis_graph(analysis_df, report_normal_df)