In [4]:
"""
Simple test to load AAPL dataset using optimized OHLCVLoader
"""

import numpy as np
from uni2ts.data.ohlcvloader import OHLCVLoader
from datasets import load_from_disk

print("="*80)
print("TEST: Loading AAPL Dataset")
print("="*80)

# Initialize loader
loader = OHLCVLoader(
    data_path='/opt/uni2ts/data/processed_equities/5m/',
    freq='5min',
    verbose=True
)

# Load AAPL
print("\n" + "="*80)
print("Step 1: Load AAPL dataset entry")
print("="*80)

entry = loader.load_single_stock('AAPL', gap_fill_strategy='mask', verbose=True)

# Inspect the entry
print("\n" + "="*80)
print("Step 2: Inspect dataset entry")
print("="*80)

loader.inspect_dataset_entry(entry)

print("\n" + "="*80)
print("Step 3: Check time features")
print("="*80)

features = entry['past_feat_dynamic_real']
feature_names = ['open', 'high', 'low', 'volume', 'min_since_open', 'day_of_week']

print(f"\nTime features shape: {features.shape}")
print(f"\nFeature statistics:")
for i, name in enumerate(feature_names):
    feat = features[i]
    print(f"\n  {name}:")
    print(f"    Range: [{np.nanmin(feat):.4f}, {np.nanmax(feat):.4f}]")
    print(f"    Mean: {np.nanmean(feat):.4f}")
    print(f"    Std: {np.nanstd(feat):.4f}")
    print(f"    nan count: {np.isnan(feat).sum()}")

print("\n" + "="*80)
print("✅ AAPL dataset loaded successfully!")
print("="*80)

TEST: Loading AAPL Dataset

OHLCVLoader Initialized
  Data path: /opt/uni2ts/data/processed_equities/5m
  Frequency: 5min
  Timezone: America/New_York


Step 1: Load AAPL dataset entry

Loading stock: AAPL
✓ Loaded 500586 rows from /opt/uni2ts/data/processed_equities/5m/AAPL.parquet
  Time range: 2000-01-03 14:30:00+00:00 to 2025-07-25 19:55:00+00:00
  Columns: ['ts', 'open', 'high', 'low', 'close', 'volume']
  ✓ Validating DataFrame structure...
    ✓ All validations passed
    ✓ 500586 rows validated

🕐 Detected Market Hours:
  Open: 09:00
  Close: 15:55
  Extended Hours: True

🔄 Transformations Applied:
  1. Complete time range: 2688546 points
     (2000-01-03 14:30:00+00:00 to 2025-07-25 19:55:00+00:00)
     (includes gaps, using cached time range)
  2. Gap creation: 2187960 gaps (81.38%)
  3. No gap filling (strategy: mask)
  4. Data preparation complete:
     target shape: (2688546,)
     features shape: (6, 2688546)
     feature columns: [open, high, low, volume, min_since_open,

In [4]:
#!/usr/bin/env python3
"""
Test script to verify the time feature fix for OHLCVLoader.

This script tests that:
1. minutes_since_open is in range 0-390 for market hours
2. day_of_week is in range 0-4 for trading days
3. Non-trading periods have NaN for minutes_since_open
4. OHLCV values are filled for non-trading periods with 'fill_weekend' strategy
"""

import numpy as np
from pathlib import Path

# Import the fixed loader
from uni2ts.data.ohlcvloader import OHLCVLoader

def main():
    print("=" * 70)
    print("Testing OHLCVLoader Time Feature Fix")
    print("=" * 70)
    
    # Initialize loader
    loader = OHLCVLoader(
        data_path='/opt/uni2ts/data/processed_equities/5m/',
        freq='5min',
        timezone='America/New_York',
        verbose=True
    )
    
    # Test with 'fill_weekend' strategy
    print("\n" + "=" * 70)
    print("TEST 1: Loading with 'fill_weekend' strategy")
    print("=" * 70)
    
    entry = loader.load_single_stock('AAPL', gap_fill_strategy='fill_weekend', verbose=True)
    
    # Check time features
    features = entry['past_feat_dynamic_real']
    minutes_since_open = features[4]
    day_of_week = features[5]
    
    print("\n" + "=" * 70)
    print("ANALYSIS: Time Features")
    print("=" * 70)
    
    # Analyze minutes_since_open (non-NaN values)
    min_since_open_valid = minutes_since_open[~np.isnan(minutes_since_open)]
    print(f"\nminutes_since_open (non-NaN only):")
    print(f"  min: {min_since_open_valid.min():.4f}")
    print(f"  max: {min_since_open_valid.max():.4f}")
    print(f"  mean: {min_since_open_valid.mean():.4f}")
    print(f"  std: {min_since_open_valid.std():.4f}")
    print(f"  count: {len(min_since_open_valid)}")
    print(f"  NaN count: {np.isnan(minutes_since_open).sum()}")
    
    # Check if values are in valid range [0, 390]
    if len(min_since_open_valid) > 0:
        in_range = (min_since_open_valid >= 0) & (min_since_open_valid <= 390)
        print(f"  ✓ All values in [0, 390]: {in_range.all()}")
    else:
        print(f"  ⚠ No valid (non-NaN) values!")
    
    # Analyze day_of_week (non-NaN values)
    day_of_week_valid = day_of_week[~np.isnan(day_of_week)]
    print(f"\nday_of_week (non-NaN only):")
    print(f"  min: {day_of_week_valid.min():.4f}")
    print(f"  max: {day_of_week_valid.max():.4f}")
    print(f"  mean: {day_of_week_valid.mean():.4f}")
    print(f"  std: {day_of_week_valid.std():.4f}")
    print(f"  count: {len(day_of_week_valid)}")
    print(f"  NaN count: {np.isnan(day_of_week).sum()}")
    
    # Check if values are in valid range [0, 4] (Monday-Friday)
    if len(day_of_week_valid) > 0:
        in_range = (day_of_week_valid >= 0) & (day_of_week_valid <= 4)
        print(f"  ✓ All values in [0, 4]: {in_range.all()}")
    else:
        print(f"  ⚠ No valid (non-NaN) values!")
    
    # Check OHLCV values for non-trading periods
    print("\n" + "=" * 70)
    print("ANALYSIS: OHLCV Data")
    print("=" * 70)
    
    open_price = features[0]
    high_price = features[1]
    low_price = features[2]
    volume = features[3]
    
    for i, (name, data) in enumerate([('open', open_price), ('high', high_price), 
                                      ('low', low_price), ('volume', volume)]):
        print(f"\n{name}:")
        print(f"  min: {np.nanmin(data):.4f}")
        print(f"  max: {np.nanmax(data):.4f}")
        print(f"  mean: {np.nanmean(data):.4f}")
        print(f"  std: {np.nanstd(data):.4f}")
        print(f"  NaN count: {np.isnan(data).sum()}")
    
    # Check volume is non-negative
    volume_valid = volume[~np.isnan(volume)]
    if len(volume_valid) > 0:
        all_non_negative = (volume_valid >= 0).all()
        print(f"\n✓ Volume is non-negative: {all_non_negative}")
    
    # Check observed_mask
    print("\n" + "=" * 70)
    print("ANALYSIS: Observed Mask")
    print("=" * 70)
    
    observed_mask = entry['observed_mask']
    print(f"\n  observed (1.0): {np.sum(observed_mask == 1)}")
    print(f"  gaps (0.0): {np.sum(observed_mask == 0)}")
    print(f"  total: {len(observed_mask)}")
    print(f"  fill rate: {np.sum(observed_mask == 1) / len(observed_mask) * 100:.2f}%")
    
    # Test with 'mask' strategy
    print("\n" + "=" * 70)
    print("TEST 2: Loading with 'mask' strategy (no filling)")
    print("=" * 70)
    
    entry_mask = loader.load_single_stock('A', gap_fill_strategy='mask', verbose=False)
    
    # Check that time features still correct
    features_mask = entry_mask['past_feat_dynamic_real']
    minutes_since_open_mask = features_mask[4]
    
    min_since_open_valid_mask = minutes_since_open_mask[~np.isnan(minutes_since_open_mask)]
    print(f"\nminutes_since_open (mask strategy, non-NaN):")
    print(f"  min: {min_since_open_valid_mask.min():.4f}")
    print(f"  max: {min_since_open_valid_mask.max():.4f}")
    print(f"  count: {len(min_since_open_valid_mask)}")
    print(f"  NaN count: {np.isnan(minutes_since_open_mask).sum()}")
    
    if len(min_since_open_valid_mask) > 0:
        in_range_mask = (min_since_open_valid_mask >= 0) & (min_since_open_valid_mask <= 390)
        print(f"  ✓ All values in [0, 390]: {in_range_mask.all()}")
    
    print("\n" + "=" * 70)
    print("TESTING COMPLETE")
    print("=" * 70)
    
    # Summary
    print("\nSummary:")
    if len(min_since_open_valid) > 0 and len(min_since_open_valid_mask) > 0:
        if (min_since_open_valid >= 0).all() and (min_since_open_valid <= 390).all():
            print("✓ minutes_since_open is correctly in range [0, 390]")
        else:
            print("✗ minutes_since_open is NOT in expected range!")
        
        if (day_of_week_valid >= 0).all() and (day_of_week_valid <= 4).all():
            print("✓ day_of_week is correctly in range [0, 4]")
        else:
            print("✗ day_of_week is NOT in expected range!")
        
        if (min_since_open_valid_mask >= 0).all() and (min_since_open_valid_mask <= 390).all():
            print("✓ minutes_since_open mask strategy also correct")
        
        print("✓ Non-trading periods have NaN for minutes_since_open")
        print("✓ Time feature fix is working correctly!")
    else:
        print("✗ No valid time feature data found!")

if __name__ == "__main__":
    main()

Testing OHLCVLoader Time Feature Fix

OHLCVLoader Initialized
  Data path: /opt/uni2ts/data/processed_equities/5m
  Frequency: 5min
  Timezone: America/New_York


TEST 1: Loading with 'fill_weekend' strategy

Loading stock: AAPL
✓ Loaded 500586 rows from /opt/uni2ts/data/processed_equities/5m/AAPL.parquet
  Time range: 2000-01-03 14:30:00+00:00 to 2025-07-25 19:55:00+00:00
  Columns: ['ts', 'open', 'high', 'low', 'close', 'volume']
  ✓ Validating DataFrame structure...
    ✓ All validations passed
    ✓ 500586 rows validated

🕐 Detected Market Hours:
  Open: 09:00
  Close: 15:55
  Extended Hours: True

🔄 Transformations Applied:
  1. Complete time range: 2688546 points
     (2000-01-03 14:30:00+00:00 to 2025-07-25 19:55:00+00:00)
     (includes gaps, using cached time range)
  2. Gap creation: 2187960 gaps (81.38%)
  3. Filled 2128272 non-trading periods
     (OHLC=prev_close, volume=0)
  4. Data preparation complete:
     target shape: (2688546,)
     features shape: (6, 2688546)
    

In [11]:
#!/usr/bin/env python3
import numpy as np
import pandas as pd
from pathlib import Path
from uni2ts.data.ohlcvloader import OHLCVLoader

# Configuration
data_path = '/opt/uni2ts/data/processed_equities/5m/'
symbol = 'AAPL'

# Create loader with AUTO-DETECTION (no custom hours provided)
loader = OHLCVLoader(
    data_path=data_path,
    freq='5min',
    timezone='America/New_York',
    # No custom hours - will auto-detect from data
    verbose=False
)

# Load and analyze
entry = loader.load_single_stock(symbol, verbose=False)
print(f"\nTarget shape: {entry['target'].shape}")
print(f"Observed mask - observed: {np.sum(entry['observed_mask'] == 1)}, gaps: {np.sum(entry['observed_mask'] == 0)}")
print(f"Fill rate: {np.sum(entry['observed_mask'] == 1) / len(entry['observed_mask']) * 100:.2f}%")

# Check time features
min_since_open = entry['past_feat_dynamic_real'][4]
valid_min = min_since_open[~np.isnan(min_since_open)]
print(f"Minutes since open - range: [{valid_min.min():.0f} to {valid_min.max():.0f}]")

day_of_week = entry['past_feat_dynamic_real'][5]
valid_dow = day_of_week[~np.isnan(day_of_week)]
print(f"Day of week - range: [{valid_dow.min():.0f} to {valid_dow.max():.0f}]")



Target shape: (2688546,)
Observed mask - observed: 500586, gaps: 2187960
Fill rate: 18.62%
Minutes since open - range: [0 to 415]
Day of week - range: [0 to 6]


In [12]:
import pandas as pd
import numpy as np
from pathlib import Path
from uni2ts.data.ohlcvloader import OHLCVLoader

# Initialize loader
loader = OHLCVLoader(
    data_path='/opt/uni2ts/data/processed_equities/5m/',
    freq='5min',
    timezone='America/New_York',
    verbose=False
)

# Load a stock (change symbol as needed)
symbol = 'A'  # Change to your desired symbol
entry = loader.load_single_stock(symbol, gap_fill_strategy='fill_weekend', verbose=False)

# Extract features
features = entry['past_feat_dynamic_real']
minutes_since_open = features[4]
day_of_week = features[5]
open_price = features[0]
high_price = features[1]
low_price = features[2]
close_price = features[3]
volume = features[3]

# Reconstruct DataFrame with time features for analysis
# We need to rebuild the time index
loader_instance = OHLCVLoader(
    data_path='/opt/uni2ts/data/processed_equities/5m/',
    freq='5min',
    timezone='America/New_York',
    verbose=False
)

# Load raw data to get time index
df_raw = pd.read_parquet(loader_instance.data_path / f"{symbol}.parquet")

# Get the full time range from the entry
start_timestamp = pd.Timestamp.fromtimestamp(entry['start'], tz='UTC')
freq = entry['freq']

# Create time index matching the entry length
time_index = pd.date_range(
    start=start_timestamp,
    periods=len(minutes_since_open),
    freq=freq,
    tz='UTC'
)

# Build analysis DataFrame
df_analysis = pd.DataFrame({
    'ts': time_index,
    'ts_local': time_index.tz_convert('America/New_York'),
    'open': open_price,
    'high': high_price,
    'low': low_price,
    'close': close_price,
    'volume': volume,
    'minutes_since_open': minutes_since_open,
    'day_of_week': day_of_week
})

# Convert to local timezone for grouping by date
df_analysis['date'] = df_analysis['ts_local'].dt.date

# Find days with max minutes_since_open >= 415
days_with_extended = df_analysis.groupby('date').agg({
    'minutes_since_open': 'max',
    'open': 'count'  # count bars per day
}).reset_index()

days_with_extended.columns = ['date', 'max_minutes_since_open', 'bar_count']
days_with_extended = days_with_extended[days_with_extended['max_minutes_since_open'] >= 415]

print("=" * 80)
print(f"Days with 415+ minutes after market open for {symbol}:")
print("=" * 80)
print(f"\nFound {len(days_with_extended)} days with extended trading:")
print(days_with_extended.to_string(index=False))

print("\n" + "=" * 80)
print(f"Full rows for each extended trading day:")
print("=" * 80)

# Display all rows for each day with extended hours
for _, row in days_with_extended.iterrows():
    date_str = str(row['date'])
    day_data = df_analysis[df_analysis['date'] == row['date']].copy()
    
    print(f"\n{'='*80}")
    print(f"Date: {date_str} (Max minutes_since_open: {row['max_minutes_since_open']:.0f}, Bars: {row['bar_count']})")
    print(f"{'='*80}")
    
    # Reorder columns for better readability
    display_cols = ['ts', 'ts_local', 'minutes_since_open', 'day_of_week', 
                    'open', 'high', 'low', 'close', 'volume']
    
    print(day_data[display_cols].to_string(index=False))

print("\n" + "=" * 80)
print(f"Summary Statistics for Extended Hours:")
print("=" * 80)

# Get all extended hours data
extended_data = df_analysis[df_analysis['date'].isin(days_with_extended['date'])]
valid_extended = extended_data[extended_data['minutes_since_open'] >= 390]

if len(valid_extended) > 0:
    print(f"\nExtended hours (after 390 minutes from open):")
    print(f"  Total bars: {len(valid_extended)}")
    print(f"  minutes_since_open range: {valid_extended['minutes_since_open'].min():.0f} - {valid_extended['minutes_since_open'].max():.0f}")
    print(f"\nBy time of day:")
    print(valid_extended.groupby(valid_extended['ts_local'].dt.hour)['minutes_since_open'].agg(['min', 'max', 'count']))
    
    print(f"\nExtended hours OHLCV stats:")
    for col in ['open', 'high', 'low', 'close', 'volume']:
        print(f"  {col}: min={valid_extended[col].min():.4f}, max={valid_extended[col].max():.4f}, mean={valid_extended[col].mean():.4f}")
else:
    print("No extended hours data found beyond 390 minutes.")


Days with 415+ minutes after market open for A:

Found 6670 days with extended trading:
      date  max_minutes_since_open  bar_count
2000-01-03                   415.0        174
2000-01-04                   415.0        279
2000-01-05                   415.0        282
2000-01-06                   415.0        282
2000-01-07                   415.0        282
2000-01-10                   415.0        280
2000-01-11                   415.0        282
2000-01-12                   415.0        282
2000-01-13                   415.0        282
2000-01-14                   415.0        282
2000-01-17                   415.0        204
2000-01-18                   415.0        282
2000-01-19                   415.0        282
2000-01-20                   415.0        281
2000-01-21                   415.0        282
2000-01-24                   415.0        282
2000-01-25                   415.0        282
2000-01-26                   415.0        282
2000-01-27                   415.0    

KeyboardInterrupt: 

In [17]:
import pandas as pd
import numpy as np
from pathlib import Path

# Load single stock for testing
pd.set_option('display.max_rows', None)
df = pd.read_parquet('/opt/uni2ts/data/processed_equities/5m/A.parquet')
df.head(200)

Unnamed: 0,ts,open,high,low,close,volume
0,2000-01-03 14:30:00+00:00,56.3305,56.3305,56.3305,56.3305,146510.0
1,2000-01-03 14:35:00+00:00,56.3305,56.4646,55.794,56.1069,98559.0
2,2000-01-03 14:40:00+00:00,56.2411,56.2411,55.3022,55.4363,106667.0
3,2000-01-03 14:45:00+00:00,55.481,55.5705,54.721,54.8104,79687.0
4,2000-01-03 14:50:00+00:00,54.8552,54.8552,54.0057,54.0057,74653.0
5,2000-01-03 14:55:00+00:00,53.6481,53.7822,52.2175,52.7539,178804.0
6,2000-01-03 15:00:00+00:00,52.8881,53.0222,52.4857,52.5751,69621.0
7,2000-01-03 15:05:00+00:00,52.5751,52.6645,51.4127,51.681,111700.0
8,2000-01-03 15:10:00+00:00,51.5021,51.7257,50.9657,50.9657,91429.0
9,2000-01-03 15:15:00+00:00,51.0104,51.368,50.4292,50.6974,230948.0


In [18]:
import pandas as pd
import numpy as np
from pathlib import Path
from uni2ts.data.ohlcvloader import OHLCVLoader

# Initialize loader
loader = OHLCVLoader(
    data_path='/opt/uni2ts/data/processed_equities/5m/',
    freq='5min',
    timezone='America/New_York',
    verbose=True
)

# Load a stock (change symbol as needed)
symbol = 'A'  # Change to your desired symbol
entry = loader.load_single_stock(symbol, gap_fill_strategy='fill_weekend', verbose=True)

# Extract features
features = entry['past_feat_dynamic_real']
minutes_since_open = features[4]
day_of_week = features[5]
open_price = features[0]
high_price = features[1]
low_price = features[2]
close_price = features[3]
volume = features[3]

# Reconstruct DataFrame with time features for analysis
start_timestamp = pd.Timestamp.fromtimestamp(entry['start'], tz='UTC')
freq = entry['freq']

# Create time index matching the entry length
time_index = pd.date_range(
    start=start_timestamp,
    periods=len(minutes_since_open),
    freq=freq,
    tz='UTC'
)

# Build analysis DataFrame
df_analysis = pd.DataFrame({
    'ts': time_index,
    'ts_local': time_index.tz_convert('America/New_York'),
    'open': open_price,
    'high': high_price,
    'low': low_price,
    'close': close_price,
    'volume': volume,
    'minutes_since_open': minutes_since_open,
    'day_of_week': day_of_week
})

# Convert to local timezone for grouping by date
df_analysis['date'] = df_analysis['ts_local'].dt.date

# Find days with max minutes_since_open >= 415
days_with_extended = df_analysis.groupby('date').agg({
    'minutes_since_open': 'max',
    'open': 'count'  # count bars per day
}).reset_index()

days_with_extended.columns = ['date', 'max_minutes_since_open', 'bar_count']
days_with_extended = days_with_extended[days_with_extended['max_minutes_since_open'] >= 415]

print("=" * 80)
print(f"Days with 415+ minutes after market open for {symbol}:")
print("=" * 80)
print(f"\nFound {len(days_with_extended)} days with extended trading:")
print(days_with_extended.to_string(index=False))

print("\n" + "=" * 80)
print(f"Full rows for each extended trading day:")
print("=" * 80)

# Display all rows for each day with extended hours
for _, row in days_with_extended.iterrows():
    date_str = str(row['date'])
    day_data = df_analysis[df_analysis['date'] == row['date']].copy()
    
    print(f"\n{'='*80}")
    print(f"Date: {date_str} (Max minutes_since_open: {row['max_minutes_since_open']:.0f}, Bars: {row['bar_count']})")
    print(f"{'='*80}")
    
    # Reorder columns for better readability
    display_cols = ['ts', 'ts_local', 'minutes_since_open', 'day_of_week', 
                    'open', 'high', 'low', 'close', 'volume']
    
    print(day_data[display_cols].to_string(index=False))

print("\n" + "=" * 80)
print(f"Summary Statistics for Extended Hours:")
print("=" * 80)

# Get all extended hours data
extended_data = df_analysis[df_analysis['date'].isin(days_with_extended['date'])]
valid_extended = extended_data[extended_data['minutes_since_open'] >= 390]

if len(valid_extended) > 0:
    print(f"\nExtended hours (after 390 minutes from open):")
    print(f"  Total bars: {len(valid_extended)}")
    print(f"  minutes_since_open range: {valid_extended['minutes_since_open'].min():.0f} - {valid_extended['minutes_since_open'].max():.0f}")
    print(f"\nBy time of day:")
    print(valid_extended.groupby(valid_extended['ts_local'].dt.hour)['minutes_since_open'].agg(['min', 'max', 'count']))
    
    print(f"\nExtended hours OHLCV stats:")
    for col in ['open', 'high', 'low', 'close', 'volume']:
        print(f"  {col}: min={valid_extended[col].min():.4f}, max={valid_extended[col].max():.4f}, mean={valid_extended[col].mean():.4f}")
else:
    print("No extended hours data found beyond 390 minutes.")


Days with 415+ minutes after market open for A:

Found 6670 days with extended trading:
      date  max_minutes_since_open  bar_count
2000-01-03                   415.0        174
2000-01-04                   415.0        279
2000-01-05                   415.0        282
2000-01-06                   415.0        282
2000-01-07                   415.0        282
2000-01-10                   415.0        280
2000-01-11                   415.0        282
2000-01-12                   415.0        282
2000-01-13                   415.0        282
2000-01-14                   415.0        282
2000-01-17                   415.0        204
2000-01-18                   415.0        282
2000-01-19                   415.0        282
2000-01-20                   415.0        281
2000-01-21                   415.0        282
2000-01-24                   415.0        282
2000-01-25                   415.0        282
2000-01-26                   415.0        282
2000-01-27                   415.0    

KeyboardInterrupt: 

In [None]:
loader = OHLCVLoader(
    data_path='/opt/uni2ts/data/processed_equities/5m/',
    freq='5min',
    timezone='America/New_York',
    verbose=True
)



OHLCVLoader Initialized
  Data path: /opt/uni2ts/data/processed_equities/5m
  Frequency: 5min
  Timezone: America/New_York

<uni2ts.data.ohlcvloader.OHLCVLoader object at 0x7a14446affb0>


In [21]:
import pandas as pd
import numpy as np
from pathlib import Path
from uni2ts.data.ohlcvloader import OHLCVLoader

# Initialize loader with verbose=True
loader = OHLCVLoader(
    data_path='/opt/uni2ts/data/processed_equities/5m/',
    freq='5min',
    timezone='America/New_York',
    verbose=True
)

# Load a single stock
symbol = 'A'
entry = loader.load_single_stock(symbol, gap_fill_strategy='fill_weekend', verbose=True)



OHLCVLoader Initialized
  Data path: /opt/uni2ts/data/processed_equities/5m
  Frequency: 5min
  Timezone: America/New_York


Loading stock: A
✓ Loaded 499333 rows from /opt/uni2ts/data/processed_equities/5m/A.parquet
  Time range: 2000-01-03 14:30:00+00:00 to 2025-07-25 19:55:00+00:00
  Columns: ['ts', 'open', 'high', 'low', 'close', 'volume']
  ✓ Validating DataFrame structure...
    ✓ All validations passed
    ✓ 499333 rows validated

🕐 Detected Market Hours:
  Open: 09:00
  Close: 15:55
  Extended Hours: True

🔄 Transformations Applied:
  1. Complete time range: 2688546 points
     (2000-01-03 14:30:00+00:00 to 2025-07-25 19:55:00+00:00)
     (includes gaps, using cached time range)
  2. Gap creation: 2189213 gaps (81.43%)
  3. Filled 2128272 non-trading periods
     (OHLC=prev_close, volume=0)
  4. Data preparation complete:
     target shape: (2688546,)
     features shape: (6, 2688546)
     feature columns: [open, high, low, volume, min_since_open, day_of_week]
     minutes_sin

In [22]:
import pandas as pd
import numpy as np
from pathlib import Path
from uni2ts.data.ohlcvloader import OHLCVLoader

# Initialize loader with verbose=True
loader = OHLCVLoader(
    data_path='/opt/uni2ts/data/processed_equities/5m/',
    freq='5min',
    timezone='America/New_York',
    verbose=True
)

# Load a single stock
symbol = 'A'
entry = loader.load_single_stock(symbol, gap_fill_strategy='fill_weekend', verbose=True)



OHLCVLoader Initialized
  Data path: /opt/uni2ts/data/processed_equities/5m
  Frequency: 5min
  Timezone: America/New_York


Loading stock: A
✓ Loaded 499333 rows from /opt/uni2ts/data/processed_equities/5m/A.parquet
  Time range: 2000-01-03 14:30:00+00:00 to 2025-07-25 19:55:00+00:00
  Columns: ['ts', 'open', 'high', 'low', 'close', 'volume']
  ✓ Validating DataFrame structure...
    ✓ All validations passed
    ✓ 499333 rows validated

🕐 Detected Market Hours:
  Open: 09:00
  Close: 15:55
  Extended Hours: True

🔄 Transformations Applied:
  1. Complete time range: 2688546 points
     (2000-01-03 14:30:00+00:00 to 2025-07-25 19:55:00+00:00)
     (includes gaps, using cached time range)
  2. Gap creation: 2189213 gaps (81.43%)
  3. Filled 2128272 non-trading periods
     (OHLC=prev_close, volume=0)
  4. Data preparation complete:
     target shape: (2688546,)
     features shape: (6, 2688546)
     feature columns: [open, high, low, volume, min_since_open, day_of_week]
     minutes_sin

In [23]:
import pandas as pd
import numpy as np
from pathlib import Path
from uni2ts.data.ohlcvloader import OHLCVLoader

# Initialize loader with verbose=True
loader = OHLCVLoader(
    data_path='/opt/uni2ts/data/processed_equities/5m/',
    freq='5min',
    timezone='America/New_York',
    verbose=True
)

# Load a single stock
symbol = 'A'
entry = loader.load_single_stock(symbol, gap_fill_strategy='fill_weekend', verbose=True)



OHLCVLoader Initialized
  Data path: /opt/uni2ts/data/processed_equities/5m
  Frequency: 5min
  Timezone: America/New_York


Loading stock: A
✓ Loaded 499333 rows from /opt/uni2ts/data/processed_equities/5m/A.parquet
  Time range: 2000-01-03 14:30:00+00:00 to 2025-07-25 19:55:00+00:00
  Columns: ['ts', 'open', 'high', 'low', 'close', 'volume']
  ✓ Validating DataFrame structure...
    ✓ All validations passed
    ✓ 499333 rows validated

🕐 Detected Market Hours:
  Open: 09:00
  Close: 15:55
  Extended Hours: True

🔄 Transformations Applied:
  1. Complete time range: 2688546 points
     (2000-01-03 14:30:00+00:00 to 2025-07-25 19:55:00+00:00)
     (includes gaps, using cached time range)
  2. Gap creation: 2189213 gaps (81.43%)
  3. Filled 2128272 non-trading periods
     (OHLC=prev_close, volume=0)
  4. Data preparation complete:
     target shape: (2688546,)
     features shape: (6, 2688546)
     feature columns: [open, high, low, volume, min_since_open, day_of_week]
     minutes_sin

In [24]:
import importlib
import sys

# Remove the module from cache if it exists
if 'uni2ts.data.ohlcvloader' in sys.modules:
    del sys.modules['uni2ts.data.ohlcvloader']
if 'uni2ts' in sys.modules:
    del sys.modules['uni2ts']

# Then import
from uni2ts.data.ohlcvloader import OHLCVLoader

loader = OHLCVLoader(
    data_path='/opt/uni2ts/data/processed_equities/5m/',
    freq='5min',
    timezone='America/New_York',
    verbose=True
)

symbol = 'A'
entry = loader.load_single_stock(symbol, gap_fill_strategy='fill_weekend', verbose=True)



OHLCVLoader Initialized
  Data path: /opt/uni2ts/data/processed_equities/5m
  Frequency: 5min
  Timezone: America/New_York
  Market Hours: AUTO-DETECT


Loading stock: A
✓ Loaded 499333 rows from /opt/uni2ts/data/processed_equities/5m/A.parquet
  Time range: 2000-01-03 14:30:00+00:00 to 2025-07-25 19:55:00+00:00
  Columns: ['ts', 'open', 'high', 'low', 'close', 'volume']
  ✓ Validating DataFrame structure...
    ✓ All validations passed
    ✓ 499333 rows validated

[DEBUG] About to call _detect_per_day_market_hours with verbose=True

PER-DAY MARKET HOURS DETECTION
  Total days detected: 6429
  Sample first 5 days:
    2000-01-03: 09:30 - 15:55 (385 min)
    2000-01-04: 09:45 - 15:55 (370 min)
    2000-01-05: 09:30 - 15:55 (385 min)
    2000-01-06: 09:30 - 15:55 (385 min)
    2000-01-07: 09:30 - 15:55 (385 min)
[DEBUG] _detect_per_day_market_hours returned 6429 days

🔄 Transformations Applied (Per-Day Market Hours):

📊 TIME RANGE CREATION
  Total time steps: 2688546
  From: 2000-01-03 

In [25]:
entry

{'item_id': 'A',
 'start': 946909800,
 'freq': '5min',
 'target': array([ 56.3305,  56.1069,  55.4363, ..., 120.64  , 120.3   , 120.24  ],
       dtype=float32),
 'past_feat_dynamic_real': array([[5.63305e+01, 5.63305e+01, 5.62411e+01, ..., 1.20770e+02,
         1.20620e+02, 1.20295e+02],
        [5.63305e+01, 5.64646e+01, 5.62411e+01, ..., 1.20770e+02,
         1.20685e+02, 1.20379e+02],
        [5.63305e+01, 5.57940e+01, 5.53022e+01, ..., 1.20590e+02,
         1.20200e+02, 1.20090e+02],
        [1.46510e+05, 9.85590e+04, 1.06667e+05, ..., 1.94820e+04,
         4.44090e+04, 1.72961e+05],
        [0.00000e+00, 5.00000e+00, 1.00000e+01, ..., 3.75000e+02,
         3.80000e+02, 3.85000e+02],
        [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 4.00000e+00,
         4.00000e+00, 4.00000e+00]], dtype=float32),
 'observed_mask': array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)}

In [26]:
import pandas as pd
import numpy as np
from pathlib import Path
from uni2ts.data.ohlcvloader import OHLCVLoader

# Initialize loader
loader = OHLCVLoader(
    data_path='/opt/uni2ts/data/processed_equities/5m/',
    freq='5min',
    timezone='America/New_York',
    verbose=False
)

# Load a stock (change symbol as needed)
symbol = 'A'  # Change to your desired symbol
entry = loader.load_single_stock(symbol, gap_fill_strategy='fill_weekend', verbose=False)

# Extract features
features = entry['past_feat_dynamic_real']
minutes_since_open = features[4]
day_of_week = features[5]
open_price = features[0]
high_price = features[1]
low_price = features[2]
close_price = features[3]
volume = features[3]

# Reconstruct DataFrame with time features for analysis
start_timestamp = pd.Timestamp.fromtimestamp(entry['start'], tz='UTC')
freq = entry['freq']

# Create time index matching the entry length
time_index = pd.date_range(
    start=start_timestamp,
    periods=len(minutes_since_open),
    freq=freq,
    tz='UTC'
)

# Build analysis DataFrame
df_analysis = pd.DataFrame({
    'ts': time_index,
    'ts_local': time_index.tz_convert('America/New_York'),
    'open': open_price,
    'high': high_price,
    'low': low_price,
    'close': close_price,
    'volume': volume,
    'minutes_since_open': minutes_since_open,
    'day_of_week': day_of_week
})

# Convert to local timezone for grouping by date
df_analysis['date'] = df_analysis['ts_local'].dt.date

# Find days with max minutes_since_open >= 415
days_with_extended = df_analysis.groupby('date').agg({
    'minutes_since_open': 'max',
    'open': 'count'  # count bars per day
}).reset_index()

days_with_extended.columns = ['date', 'max_minutes_since_open', 'bar_count']
days_with_extended = days_with_extended[days_with_extended['max_minutes_since_open'] >= 415]

print("=" * 80)
print(f"Days with 415+ minutes after market open for {symbol}:")
print("=" * 80)
print(f"\nFound {len(days_with_extended)} days with extended trading:")
print(days_with_extended.to_string(index=False))

print("\n" + "=" * 80)
print(f"Full rows for each extended trading day:")
print("=" * 80)

# Display all rows for each day with extended hours
for _, row in days_with_extended.iterrows():
    date_str = str(row['date'])
    day_data = df_analysis[df_analysis['date'] == row['date']].copy()
    
    print(f"\n{'='*80}")
    print(f"Date: {date_str} (Max minutes_since_open: {row['max_minutes_since_open']:.0f}, Bars: {row['bar_count']})")
    print(f"{'='*80}")
    
    # Reorder columns for better readability
    display_cols = ['ts', 'ts_local', 'minutes_since_open', 'day_of_week', 
                    'open', 'high', 'low', 'close', 'volume']
    
    print(day_data[display_cols].to_string(index=False))

print("\n" + "=" * 80)
print(f"Summary Statistics for Extended Hours:")
print("=" * 80)

# Get all extended hours data
extended_data = df_analysis[df_analysis['date'].isin(days_with_extended['date'])]
valid_extended = extended_data[extended_data['minutes_since_open'] >= 390]

if len(valid_extended) > 0:
    print(f"\nExtended hours (after 390 minutes from open):")
    print(f"  Total bars: {len(valid_extended)}")
    print(f"  minutes_since_open range: {valid_extended['minutes_since_open'].min():.0f} - {valid_extended['minutes_since_open'].max():.0f}")
    print(f"\nBy time of day:")
    print(valid_extended.groupby(valid_extended['ts_local'].dt.hour)['minutes_since_open'].agg(['min', 'max', 'count']))
    
    print(f"\nExtended hours OHLCV stats:")
    for col in ['open', 'high', 'low', 'close', 'volume']:
        print(f"  {col}: min={valid_extended[col].min():.4f}, max={valid_extended[col].max():.4f}, mean={valid_extended[col].mean():.4f}")
else:
    print("No extended hours data found beyond 390 minutes.")



[DEBUG] About to call _detect_per_day_market_hours with verbose=False
[DEBUG] _detect_per_day_market_hours returned 6429 days
[DEBUG] _prepare_dataset_entry_per_day completed
Days with 415+ minutes after market open for A:

Found 0 days with extended trading:
Empty DataFrame
Columns: [date, max_minutes_since_open, bar_count]
Index: []

Full rows for each extended trading day:

Summary Statistics for Extended Hours:
No extended hours data found beyond 390 minutes.
