In [None]:
# Environment bootstrap for Google Colab: run this cell first on a fresh Colab
# runtime. None of the packages installed below are version-pinned.
!pip install numpy
!pip install --upgrade  cython
!pip install --upgrade scipy
!pip install mlflow

# Build qlib from source. The clone is placed inside the Python 3.10
# dist-packages directory, so this path has to be adjusted if the runtime uses
# a different Python version.
%cd /usr/local/lib/python3.10/dist-packages
!git clone https://github.com/microsoft/qlib.git
%cd qlib
!pip install --upgrade jupyter-client
# NOTE(review): the next two commands both look like they install the package
# from the cloned checkout -- confirm whether both are still required.
!pip install .
!python setup.py install
# Go back to the home directory before installing the remaining extras.
%cd ~
!pip install statsmodels plotly

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

import qlib
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord, SigAnaRecord
from qlib.workflow import R
from qlib.utils import init_instance_by_config, flatten_dict
from qlib.contrib.report import analysis_model, analysis_position
from qlib.data.dataset.handler import DataHandlerLP

In [None]:
# Universe and benchmark identifiers used by the rest of the notebook
market = "tw50"      # Taiwan 50 index constituents
benchmark = "0050"   # Taiwan 50 ETF as benchmark

# Start QLib against the Taiwan market data bundle
qlib_init_options = {
    "provider_uri": "data/tw_data",         # Set your data path here
    "region": "tw",
    "dataset_cache": "SimpleDatasetCache",  # Cache setting for dataset
}
qlib.init(**qlib_init_options)

In [None]:
from qlib.data import D

# Time range to inspect (10-year period)
start_time, end_time = "2014-12-31", "2024-12-30"

# OHLCV fields to pull
fields = ["$open", "$high", "$low", "$close", "$volume"]

# The "all" instrument universe of the configured data provider
instruments = D.instruments("all")

# Load the historical price/volume frame
df = D.features(instruments, fields, start_time=start_time, end_time=end_time)

# Data-quality checks: per-column counts of nulls, zeros and negatives
null_counts = df.isna().sum()
zero_counts = df.eq(0).sum()
negative_counts = df.lt(0).sum()

print(f"null counts in each column:\n{null_counts}")
print(f"zero counts in each column:\n{zero_counts}")
print(f"negative counts in each column:\n{negative_counts}")

null counts in each column:
$open      0
$high      0
$low       0
$close     0
$volume    0
dtype: int64
zero counts in each column:
$open        0
$high        0
$low         0
$close       0
$volume    327
dtype: int64
negative counts in each column:
$open      0
$high      0
$low       0
$close     0
$volume    0
dtype: int64


In [None]:
# Prediction target: next-day returns
custom_label = ["Ref($close, -1)/$close - 1"]

# Date boundaries shared by the handler config and the dataset segments.
# The handler's fit window is the same span as the training segment.
TRAIN_START, TRAIN_END = "2014-12-31", "2021-12-31"
VALID_START, VALID_END = "2022-01-01", "2022-12-31"
TEST_START, TEST_END = "2023-01-01", "2024-12-27"

# Processors for the learning (training) data
custom_learn_processors = [
    {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}},
    {"class": "DropnaLabel"},
]

# Processors for the inference data.
# NOTE(review): neither processor names a fields_group. If that makes them
# cover the label columns as well as the features, missing labels could be
# filled before DropnaLabel ever sees them -- confirm against the qlib
# processor documentation.
custom_infer_processors = [
    {"class": processor_name, "kwargs": {}}
    for processor_name in ("ZScoreNorm", "Fillna")
]

data_handler_config = dict(
    start_time=TRAIN_START,
    end_time=TEST_END,
    fit_start_time=TRAIN_START,
    fit_end_time=TRAIN_END,
    instruments=market,
    label=custom_label,
    learn_processors=custom_learn_processors,
    infer_processors=custom_infer_processors,
)

# XGBoost hyper-parameters
xgb_params = {
    "eval_metric": "rmse",
    "learning_rate": 0.06424642669823001,
    "max_depth": 5,
    "subsample": 0.8466671252590435,
    "colsample_bytree": 0.962838237653785,
    "reg_alpha": 0.49091134749293946,
    "reg_lambda": 0.4299807160698682,
}

model_config = {
    "class": "XGBModel",
    "module_path": "qlib.contrib.model.xgboost",
    "kwargs": xgb_params,
}

handler_config = {
    "class": "Alpha158",
    "module_path": "qlib.contrib.data.handler",
    "kwargs": data_handler_config,
}

dataset_config = {
    "class": "DatasetH",
    "module_path": "qlib.data.dataset",
    "kwargs": {
        "handler": handler_config,
        "segments": {
            "train": (TRAIN_START, TRAIN_END),
            "valid": (VALID_START, VALID_END),
            "test": (TEST_START, TEST_END),
        },
    },
}

# Full training task: model configuration plus dataset configuration
task = {"model": model_config, "dataset": dataset_config}

# Instantiate the model and the dataset from their configurations
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])

In [None]:
# Optional: Examine the dataset before and after preprocessing
def examine_data_segment(data_dict, segment_name, data_type_name):
    """Print a short health summary (NaNs, infs, shape) for one data segment.

    Args:
        data_dict: The data to inspect. Despite the name, a pandas DataFrame
            or Series is expected; anything else (including None) is reported
            as unavailable.
        segment_name: Segment label such as "train"; printed capitalized in
            the header line.
        data_type_name: Free-text description printed in the header line.

    Returns:
        None. Everything is written to stdout.
    """
    print(f"--- {data_type_name} {segment_name.capitalize()} ---")
    if not isinstance(data_dict, (pd.DataFrame, pd.Series)):
        print("Data not available or not a DataFrame/Series.")
        return

    nan_count = data_dict.isna().sum().sum()
    print(f"NaNs: {nan_count}")

    # np.isinf raises TypeError on object/string data, so the infinity check
    # is limited to the numeric columns, converted to a plain float array.
    frame = data_dict.to_frame() if isinstance(data_dict, pd.Series) else data_dict
    numeric_values = frame.select_dtypes(include="number").to_numpy(
        dtype="float64", na_value=np.nan
    )
    inf_count = np.isinf(numeric_values).sum()
    print(f"Infs: {inf_count}")

    print(f"Shape: {data_dict.shape}")


# Shared decorations for the printed report
rule = "=" * 10
divider = "-" * 30

# --- 1. Raw Data Examination ---
print(rule + " Raw Data Examination " + rule)
segments = ["train", "valid", "test"]
col_set = ["feature", "label"]

for segment in segments:
    try:
        raw_data = dataset.prepare(segment, col_set=col_set, data_key=DataHandlerLP.DK_R)
        # Inspect the feature group first, then the label group
        for group, title in (("feature", "Raw Feature"), ("label", "Raw Label")):
            examine_data_segment(raw_data.get(group), segment, title)
    except Exception as e:
        print(f"Error loading raw data for segment '{segment}': {e}")
    print(divider)


# --- 2. Preprocessed Data Examination ---
print("\n" + rule + " Preprocessed Data Examination " + rule)
processed_data = {}

# Train and valid are both read through the learning data key
for segment in ("train", "valid"):
    try:
        learn_view = dataset.prepare(segment, col_set=col_set, data_key=DataHandlerLP.DK_L)
        processed_data[segment] = learn_view
        examine_data_segment(learn_view.get('feature'), segment, "Processed Feature")
        examine_data_segment(learn_view.get('label'), segment, "Processed Label")
    except Exception as e:
        print(f"Error loading preprocessed data for segment '{segment}': {e}")
    print(divider)

# Test: features are read through the inference key, labels through the learning key
segment = "test"
try:
    test_view = {'feature': None, 'label': None}
    processed_data[segment] = test_view
    for group, data_key, title in (
        ("feature", DataHandlerLP.DK_I, "Processed Feature (DK_I)"),
        ("label", DataHandlerLP.DK_L, "Processed Label (DK_L)"),
    ):
        test_view[group] = dataset.prepare(segment, col_set=[group], data_key=data_key)
        examine_data_segment(test_view[group], segment, title)
except Exception as e:
    print(f"Error loading preprocessed data for segment '{segment}': {e}")
print(divider)

In [None]:
# Pieces of the experiment name
experiment_type = "TRAIN"
feature_set = "Alpha158"
experiment_name = "_".join(
    [experiment_type, market, task["model"]["class"], feature_set]
)

# Everything below runs inside one QLib recorder session
with R.start(experiment_name=experiment_name):
    # Record the flattened task configuration for traceability
    print("Logging model parameters")
    R.log_params(**flatten_dict(task))

    # Fit the model on the dataset
    print("Training model")
    model.fit(dataset)

    # Persist the trained model and keep a handle on the active recorder
    R.save_objects(**{"params.pkl": model})
    recorder = R.get_recorder()
    rid = recorder.id

    # Produce the prediction signal
    print("Generating predictions")
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # Signal analysis: IC, ICIR, Rank IC and Rank ICIR
    print("Performing signal analysis")
    sar = SigAnaRecord(recorder)
    sar.generate()

In [None]:
def _mse_on_complete_rows(frame):
    """Return the MSE between 'label' and 'score' over rows where both exist.

    mean_squared_error rejects NaN input, so rows with a missing label or
    score are left out of the calculation; the number of skipped rows is
    printed when it is non-zero. The frame passed in is not modified.
    """
    complete = frame[["label", "score"]].dropna()
    skipped = len(frame) - len(complete)
    if skipped:
        print(f"\nSkipped {skipped} rows with a missing label or score")
    return mean_squared_error(complete["label"], complete["score"])


# Load prediction and label data
pred_df = recorder.load_object("pred.pkl")
label_df = recorder.load_object("label.pkl")
label_df.columns = ["label"]

# Get normalized label data
label_df_normalized = dataset.prepare(
    "test", col_set=["label"], data_key=DataHandlerLP.DK_I)
label_df_normalized.columns = ["label"]

# Create combined dataframes for raw and normalized data
pred_label = pd.concat([label_df, pred_df],
                       axis=1, sort=True).reindex(label_df.index)
pred_label_normalized = pd.concat(
    [label_df_normalized, pred_df], axis=1, sort=True).reindex(label_df.index)

# Clean up index levels if needed. droplevel removes the outermost level;
# DataFrame.drop(level=0) without labels raises instead. Each frame is handled
# by name so the notebook-level `df` from the data-quality cell is untouched.
if pred_label.index.nlevels > 2:
    pred_label = pred_label.droplevel(0)
if pred_label_normalized.index.nlevels > 2:
    pred_label_normalized = pred_label_normalized.droplevel(0)

# Evaluate
print("===Calculating evaluation loss for raw label===")
print(f"\nraw label: {pred_label['label'].head()}")
print(f"\npred: {pred_label['score'].head()}")
mse = _mse_on_complete_rows(pred_label)
print(f"\nMean Squared Error: {mse:.6f}")

print("\n===Calculating evaluation loss for normalized label===")
print(f"\nnormalized label: {pred_label_normalized['label'].head()}")
print(f"\npred: {pred_label_normalized['score'].head()}")
mse_normalized = _mse_on_complete_rows(pred_label_normalized)
print(f"\nMean Squared Error (Normalized): {mse_normalized:.6f}")

In [None]:
# Visualize model performance from the combined frame built in the evaluation
# cell (it carries the 'label' and 'score' columns).
analysis_model.model_performance_graph(pred_label)