# Prototype end to end process
1. Save the train/val/test splits (all three are carved out of the raw training data).
2. Then run a simple process: train on the train split, tune on the val split, and evaluate on the held-out test split. I'll then check the outputs on the actual submission test set.

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import polars as pl
from omegaconf import OmegaConf
from pathlib import Path
import os
from typing import Tuple

import torch

In [3]:
from trav_nlp.misc import polars_train_val_test_split

In [20]:
# Central notebook configuration: raw input locations, where the derived
# splits are cached, and the parameters that control the split itself.
cfg = OmegaConf.create(
    dict(
        # Raw CSV inputs.
        raw_data=dict(
            train_path='../data/train.csv',
            test_path='../data/test.csv',
            sample_submission_path='../data/sample_submission.csv',
        ),
        # Parquet files holding the train/val/test split of the train dataset.
        training_data=dict(
            train_path='../data/splits/train.parquet',
            val_path='../data/splits/val.parquet',
            test_path='../data/splits/test.parquet',
        ),
        # Split proportions and the seed passed to the splitting helper.
        params=dict(
            train_frac=0.8,
            val_frac=0.1,
            test_frac=0.1,
            train_val_test_seed=42,
        ),
    )
)

## Create the train/val/test splits if they don't already exist

In [32]:
# Load the cached train/val/test splits, or build and cache them from the raw
# training CSV when the cache is missing or incomplete.

# Locations of the cached splits and of the raw training data.
train_path = cfg.training_data.train_path
val_path = cfg.training_data.val_path
test_path = cfg.training_data.test_path
raw_train_path = cfg.raw_data.train_path

# Parameters forwarded to the splitting helper.
train_frac = cfg.params.train_frac
val_frac = cfg.params.val_frac
test_frac = cfg.params.test_frac
seed = cfg.params.train_val_test_seed

# Only trust the cache when ALL three files are present. Checking just the
# train file would crash in read_parquet if val/test were missing (e.g. after
# an interrupted write), instead of regenerating the splits.
if all(Path(_p).exists() for _p in (train_path, val_path, test_path)):
    df_train = pl.read_parquet(train_path)
    df_val = pl.read_parquet(val_path)
    df_test = pl.read_parquet(test_path)
else:
    df = pl.read_csv(raw_train_path)

    # Each split may be configured under a different directory, so make sure
    # every output directory exists before writing.
    for _split_path in (train_path, val_path, test_path):
        Path(_split_path).parent.mkdir(parents=True, exist_ok=True)

    df_train, df_val, df_test = polars_train_val_test_split(
        df,
        train_frac=train_frac,
        val_frac=val_frac,
        test_frac=test_frac,
        shuffle=True,
        seed=seed
    )

    df_train.write_parquet(train_path)
    df_val.write_parquet(val_path)
    df_test.write_parquet(test_path)

In [30]:
# Manual sanity check: emits a fixed confirmation message; it does not inspect the splits.
print('great, it worked.')

great, it worked.
