In [11]:
%reload_ext autoreload
%autoreload 2

import ast
import json
import pickle
import sys
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import yaml
from datasets import load_dataset, load_from_disk, Dataset
from torch.utils.data import DataLoader

# Project root = parent of the directory the notebook is run from.
root_path = Path(".").resolve().parent
print("Root path:", root_path)
# Make the project's `src` package importable. The membership check keeps the
# cell idempotent: re-running it does not pile duplicate entries onto sys.path.
if str(root_path) not in sys.path:
    sys.path.append(str(root_path))

import importlib
from src.model_meta.dataset import CSVDataModule, custom_collate_fn

Root path: /Users/takeruito/work/PrfSR


# Load the training config

In [12]:

# Load the training configuration and echo the hyper-parameters used by the
# cells below.
train_config_path = root_path / "src/model_meta/training_config.yaml"
# Raise explicitly instead of `assert cond, FileNotFoundError(...)`: an assert
# surfaces as AssertionError and is stripped entirely under `python -O`.
if not train_config_path.exists():
    raise FileNotFoundError(f"Train config file not found: {train_config_path}")
with open(train_config_path, 'r', encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Single quotes inside the replacement fields keep these f-strings valid on
# Python < 3.12 (reusing the outer quote type is only allowed from 3.12 on).
print(f"Metadata path: {config['metadata_path']}")

# NOTE(review): the saved output shows learning_rate as `3*10**(-4)`, i.e. the
# YAML value is a string expression rather than a float -- confirm that the
# training code converts it before handing it to the optimizer.
for key in (
    "max_epoch",
    "max_value",
    "min_n_tokens_in_batch",
    "test_ratio",
    "val_ratio",
    "num_workers",
    "token_embed_dim",
    "emb_expansion_factor",
    "learning_rate",
):
    print(f"{key}: {config[key]}")

# Transformer settings live in a nested mapping.
for key in (
    "nhead",
    "num_encoder_layers",
    "num_decoder_layers",
    "dim_feedforward",
    "dropout",
):
    print(f"{key}: {config['transformer'][key]}")


Metadata path: /home/takeru/AlphaSymbol/data/training/mini_metadata.yaml
max_epoch: 1000
max_value: 2000
min_n_tokens_in_batch: 2000
test_ratio: 0.1
val_ratio: 0.25
num_workers: 13
token_embed_dim: 16
emb_expansion_factor: 1
learning_rate: 3*10**(-4)
nhead: 16
num_encoder_layers: 4
num_decoder_layers: 6
dim_feedforward: 512
dropout: 0.1


# Load the CSV dataset in chunks

In [14]:
# Smoke-test the data pipeline: stream the CSV in chunks, wrap each chunk in a
# CSVDataModule and pull one training batch to inspect the tensor shapes.

# Derived from root_path instead of a hard-coded absolute user directory so the
# notebook also runs on other machines / checkouts.
dataset_path = root_path / "data/training/superfib_r1_dataset.csv"
CHUNK_SIZE = 10000  # rows read from the CSV per chunk
MAX_CHUNKS = 2      # chunks to check before stopping; set to None to scan the whole file

# Using the reader as a context manager closes the underlying file handle even
# when the loop stops early or the cell is interrupted.
with pd.read_csv(dataset_path, chunksize=CHUNK_SIZE) as chunk_reader:
    for chunk_idx, chunk_df in enumerate(chunk_reader):
        if MAX_CHUNKS is not None and chunk_idx >= MAX_CHUNKS:
            break

        print(f"Chunk {chunk_idx} - Rows: {len(chunk_df)}")
        dataset = Dataset.from_pandas(chunk_df)
        # The "source"/"target" columns are stored as strings and parsed back
        # into Python objects here. ast.literal_eval accepts only literals, so
        # a corrupt or crafted CSV cell cannot execute code the way eval() could.
        # NOTE(review): assumes both columns hold plain literals (e.g. nested
        # lists of numbers) -- confirm against the dataset writer.
        dataset = dataset.map(
            lambda x: {
                "source": ast.literal_eval(x["source"]),
                "target": ast.literal_eval(x["target"]),
            }
        )
        print(f"Processing shard {chunk_idx} with size {len(chunk_df)}")

        # NOTE(review): the train/val split is derived from `test_ratio`;
        # `val_ratio` from the config is not used here -- confirm this is intended.
        dataloader = CSVDataModule(
            dataset=dataset,
            batch_size=config['batch_size'],
            num_workers=0,  # fixed to 0 for this check instead of config['num_workers']
            train_val_split=1 - config["test_ratio"],
            seed=42,
            collate_fn=custom_collate_fn,
            batching_strategy=config['batching_strategy'],
            min_tokens_per_batch=config['min_n_tokens_in_batch'],
            max_batch_size=config['batch_size'],
        )

        dataloader.setup()
        # One batch is enough to verify the collated shapes.
        batch = next(iter(dataloader.train_dataloader()))
        print(f"MiniData {chunk_idx+1} - Source shape: {batch['source'].shape}, Target shape: {batch['target'].shape}")
        print()
    

Chunk 0 - Rows: 10000


Map: 100%|██████████| 10000/10000 [00:01<00:00, 8101.42 examples/s]


Processing shard 0 with size 10000
Train dataset: 9000 samples
Validation dataset: 1000 samples
MiniData 1 - Source shape: torch.Size([64, 10, 4]), Target shape: torch.Size([64, 124])

Chunk 1 - Rows: 10000


Map: 100%|██████████| 10000/10000 [00:01<00:00, 7679.37 examples/s]


Processing shard 1 with size 10000
Train dataset: 9000 samples
Validation dataset: 1000 samples


KeyboardInterrupt: 