In [16]:
import torch
import sys
import os
import json
import pandas as pd
import numpy as np
from datetime import datetime, timezone

Igazából ha minden címkéhez hozzá akarjuk adni a pole-t, akkor a label előtti dolgokat kell nézni hogy ott van-e pole és azt hozzáadni.  
Ha pedig el akarjuk tüntetni a poleokat, akkor a label elejét kell megnézni és ha ott van pole az első pár gyertyán akkor azt kiszedjük. Ezt akár lehet label arányosan csinálni hogy pl első 10%ot nézzük meg és abból azt a rész kiszedjük ami a legnagyobb meredekségű és abs álép egy határt.


In [None]:
from typing import List, Dict
import numpy as np
import pandas as pd

def extend_labels_with_pole(
    ohlc_df: pd.DataFrame,
    labels: List[Dict],
    price_col: str = "close",
    timestamp_col: str = "timestamp",

    min_bar_slope: float = 0.001,      # ~0.1% move per bar
    min_total_gain: float = 0.01,      # ~1% pole gain
    max_pole_bars: int = 25
) -> List[Dict]:
    """
    For each label interval, scan backward to attach a pole if a steep move precedes the flag.
    Thresholds (`min_bar_slope`, `min_total_gain`) are derived dynamically from the label's own
    price action (percentiles of recent bar slopes and cumulative gains).
    """
    df = ohlc_df.copy()
    df = df.sort_values(timestamp_col).reset_index(drop=True)

    ts_values = df[timestamp_col].to_numpy()
    price = df[price_col].to_numpy()

    def timestamp_to_idx(ts):
        idx = np.searchsorted(ts_values, ts)
        if idx >= len(ts_values) or ts_values[idx] != ts:
            return None
        return idx

    updated_labels = []
    for entry in labels:
        start_idx = timestamp_to_idx(entry["start"])
        end_idx = timestamp_to_idx(entry["end"])
        if start_idx is None or end_idx is None:
            entry = {**entry, "pole_start": entry["start"], "pole_added": False, "pole_gain": 0.0}
            updated_labels.append(entry)
            continue

        direction = 1 if "bull" in entry["label"].lower() else -1
        pole_start_idx = start_idx
        steps = 0

        while pole_start_idx > 0 and steps < max_pole_bars:
            prev_idx = pole_start_idx - 1
            bar_return = direction * ((price[pole_start_idx] - price[prev_idx]) / price[prev_idx])
            if bar_return < min_bar_slope:
                break
            pole_start_idx = prev_idx
            steps += 1

        total_gain = direction * ((price[start_idx] - price[pole_start_idx]) / price[pole_start_idx])
        if total_gain >= min_total_gain and pole_start_idx != start_idx:
            pole_ts = ts_values[pole_start_idx].isoformat()
            updated_labels.append({
                **entry,
                "pole_start": pd.to_datetime(pole_ts, utc=True),
                "pole_added": True,
                "pole_gain": float(total_gain)
            })
        else:
            updated_labels.append({
                **entry,
                "pole_start": pd.to_datetime(entry["start"], utc=True),
                "pole_added": False,
                "pole_gain": float(total_gain)
            })

    return updated_labels


In [None]:
def extend_labels_with_pole(
    ohlc_df: pd.DataFrame,
    labels: List[Dict],
    price_col: str = "close",
    timestamp_col: str = "timestamp",
    max_pole_bars: int = 25,
    label_window: int = 5,
    slope_multiplier: float = 1.25,
    gain_multiplier: float = 1.15,
    default_min_bar_slope: float = 0.0001,
    default_min_total_gain: float = 0.005,
) -> List[Dict]:
    df = ohlc_df.copy().sort_values(timestamp_col).reset_index(drop=True)
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], utc=True)

    ts_values = df[timestamp_col].to_numpy()
    price = df[price_col].to_numpy()

    def timestamp_to_idx(ts):
        idx = np.searchsorted(ts_values, ts)
        if idx >= len(ts_values) or ts_values[idx] != ts:
            return None
        return idx

    # ezt a logikát erőteljesen át kell gondolni mert lehet pont ellentétes irányú néha működik néha nem stb
    def label_threshold(entry):
        start_idx = timestamp_to_idx(entry["start"])
        end_idx = timestamp_to_idx(entry["end"])
        if start_idx is None or end_idx is None or end_idx <= start_idx:
            return default_min_bar_slope, default_min_total_gain

        direction = 1 if "bull" in entry["label"].lower() else -1
        last_idx = min(start_idx + label_window, end_idx)
        bar_slopes = []
        for i in range(start_idx + 1, last_idx + 1):
            bar_return = direction * ((price[i] - price[i - 1]) / price[i - 1])
            if bar_return > 0:
                bar_slopes.append(bar_return)

        base_slope = np.median(bar_slopes) if bar_slopes else default_min_bar_slope
        label_gain = direction * ((price[last_idx] - price[start_idx]) / price[start_idx])
        base_gain = label_gain if label_gain > 0 else default_min_total_gain

        return base_slope * slope_multiplier, base_gain * gain_multiplier

    updated_labels = []
    for entry in labels:
        min_bar_slope, min_total_gain = label_threshold(entry)

        start_idx = timestamp_to_idx(entry["start"])
        end_idx = timestamp_to_idx(entry["end"])
        if start_idx is None or end_idx is None:
            updated_labels.append({**entry, "pole_start": entry["start"], "pole_added": False, "pole_gain": 0.0})
            continue

        direction = 1 if "bull" in entry["label"].lower() else -1
        pole_start_idx = start_idx
        steps = 0

        while pole_start_idx > 0 and steps < max_pole_bars:
            prev_idx = pole_start_idx - 1
            bar_return = direction * ((price[pole_start_idx] - price[prev_idx]) / price[prev_idx])
            if bar_return < min_bar_slope:
                break
            pole_start_idx = prev_idx
            steps += 1

        total_gain = direction * ((price[start_idx] - price[pole_start_idx]) / price[pole_start_idx])
        if total_gain >= min_total_gain and pole_start_idx != start_idx:
            updated_labels.append({
                **entry,
                "pole_start": ts_values[pole_start_idx],
                "pole_added": True,
                "pole_gain": float(total_gain),
            })
        else:
            print(f"Bar Return: {bar_return} Min Bar Slope: {min_bar_slope}, Total gain: {total_gain} Min Total Gain: {min_total_gain}")
            updated_labels.append({
                **entry,
                "pole_start": entry["start"],
                "pole_added": False,
                "pole_gain": float(total_gain),
            })

    return updated_labels

In [None]:
data = data_with_labels["VWXUD6/XAUUSD_5m_001.csv"]['data']
labels = data_with_labels["VWXUD6/XAUUSD_5m_001.csv"]['labels']

extended_labels = extend_labels_with_pole(data, labels)
extended_labels

Ezek néha működtek néha nem úgyhogy inkább nem használtam végül. Még fejleszteni, finomítani vagy pont egyszerűsíteni kéne az algoritmust.

In [12]:
summary_rows = []
for file_name, content in data_with_labels.items():
    df = content["data"].copy()
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
    total_span = (df["timestamp"].max() - df["timestamp"].min()).total_seconds()
    label_durations = {}
    total_label_sec = 0.0
    for label in content["labels"]:
        if label["start"] is None or label["end"] is None:
            continue
        duration = (pd.to_datetime(label["end"], utc=True) - pd.to_datetime(label["start"], utc=True)).total_seconds()
        label_name = label["label"]
        label_durations[label_name] = label_durations.get(label_name, 0.0) + duration
        total_label_sec += max(duration, 0.0)
    coverage = (total_label_sec / total_span) if total_span > 0 else 0.0
    summary_rows.append({
        "file": file_name,
        "rows": len(df),
        "span_sec": total_span,
        "label_sec": total_label_sec,
        "coverage": coverage,
        "label_breakdown": label_durations,
    })
summary_df = pd.DataFrame(summary_rows).sort_values("coverage", ascending=False)
print(summary_df[["file", "rows", "span_sec", "label_sec", "coverage"]].to_string(index=False))
for idx, row in summary_df.iterrows():
    print(f"{row['file']} label breakdown:", row["label_breakdown"])


                                          file   rows    span_sec  label_sec  coverage
                   TYEGJ8/EURUSD_15min_001.csv    452    586800.0   347400.0  0.592025
                    TYEGJ8/EURUSD_5min_002.csv   1355    587400.0   178500.0  0.303882
                   IV92EM/EURUSD_15min_001.csv   8201  10539900.0  2279700.0  0.216292
GFTYRV/EURUSDX_2025-10-25_to_2025-10-29_5m.csv    863    258900.0    50700.0  0.195829
                     TYEGJ8/US500_1min_003.csv   1839    535380.0   102720.0  0.191864
                       W241KQ/EURUSD_15min.csv 143054 181578600.0 32354100.0  0.178182
GFTYRV/EURUSDX_2025-10-30_to_2025-10-30_5m.csv    136     40500.0     7200.0  0.177778
GFTYRV/EURUSDX_2025-09-15_to_2025-09-19_5m.csv   1365    426300.0    52800.0  0.123856
GFTYRV/EURUSDX_2025-09-10_to_2025-09-14_5m.csv    846    253500.0    23700.0  0.093491
                AS76HW/XAU_1m_data_limited.csv 999999 149316360.0 13838520.0  0.092679
GFTYRV/EURUSDX_2025-10-10_to_2025-10-14_5m.

In [24]:
def trim_after_last_label(df: pd.DataFrame, label_col: str = "flag_label", margin: int = 64) -> pd.DataFrame:
    if df.shape[0] < 10000:
        return df
    
    mask = df[label_col] != label_map["No Flag"]
    last_labeled_idx = mask.to_numpy().nonzero()[0][-1]
    end_idx = min(len(df), last_labeled_idx + margin)
    return df.iloc[:end_idx].reset_index(drop=True)


def label_coverage(df: pd.DataFrame, label_col: str = "flag_label") -> dict[str, float]:
    total = len(df)
    if total == 0:
        return {"rows": 0, "labeled": 0, "coverage": 0.0}
    mask = df[label_col] != label_map["No Flag"]
    return {
        "rows": total,
        "labeled": int(mask.sum()),
        "coverage": float(mask.sum() / total),
    }

def print_coverage(file_name, stats_before, stats_after) -> None:
    print(f"{file_name} rows: {stats_before['rows']} rows: {stats_after['rows']}  coverage: {stats_before['coverage']:.3f} coverage: {stats_after['coverage']:.3f}")
    if stats_before['labeled'] != stats_after['labeled']:
        print(f"  Labeled rows changed from {stats_before['labeled']} to {stats_after['labeled']} after trimming.")

for file_name, content in data_with_labels.items():
    data = content["data"]
    labels = content["labels"]

    labeled_df = attach_interval_labels(data, labels, label_map)
    stats_before = label_coverage(labeled_df)
    labeled_df = trim_after_last_label(labeled_df, margin=32*2)
    stats_after = label_coverage(labeled_df)
    print_coverage(file_name, stats_before, stats_after)

AS76HW/XAU_1h_data_limited.csv rows: 121823 rows: 11152  coverage: 0.001 coverage: 0.014
AS76HW/XAU_1m_data_limited.csv rows: 999999 rows: 969593  coverage: 0.095 coverage: 0.098
AS76HW/XAU_5m_data_limited.csv rows: 999999 rows: 478257  coverage: 0.008 coverage: 0.017
AS76HW/XAU_15m_data_limited.csv rows: 480717 rows: 432118  coverage: 0.011 coverage: 0.012
AS76HW/XAU_30m_data_limited.csv rows: 242152 rows: 190510  coverage: 0.030 coverage: 0.038
AY1PC8/EURUSD_1H_005.csv rows: 840 rows: 840  coverage: 0.067 coverage: 0.067
AY1PC8/EURUSD_1min_001.csv rows: 50000 rows: 46398  coverage: 0.077 coverage: 0.083
DO1H50/EURUSD_1hour_001.csv rows: 5057 rows: 5057  coverage: 0.068 coverage: 0.068
DO1H50/EURUSD_1minute_001.csv rows: 82033 rows: 59607  coverage: 0.013 coverage: 0.018
GFTYRV/GSPC_2025-09-25_to_2025-09-29_5m.csv rows: 234 rows: 234  coverage: 0.201 coverage: 0.201
GFTYRV/GSPC_2025-09-30_to_2025-10-04_5m.csv rows: 312 rows: 312  coverage: 0.083 coverage: 0.083
GFTYRV/GSPC_2025-10-05_

Ez sem oldja meg a coverage problémát úgyhogy majd eldobjuk a nagyrészét az unlabeled windowknak