In [None]:
import random

import pandas as pd
import polars as pl
import pyarrow.parquet as pq

import src.pytorch.data.kaggle
import src.pytorch.models
import src.pytorch.models.utils
import src.visualization.performance

from src.pytorch.data.parquet import Dataset
from src.schemas.climsim import INPUT_COLUMNS, OUTPUT_COLUMNS

In [None]:
# Location of the training data. The data-loading cell opens this path with
# both pl.scan_parquet and pq.ParquetFile, so it is expected to be a parquet
# file.
# NOTE(review): the commented-out alternative below is an ".arrow" file --
# confirm it can be read by the parquet readers before switching to it.
# TRAINSET_DATA_PATH = "/home/data/subset_train.arrow"
TRAINSET_DATA_PATH = "/home/data/train.parquet"

In [None]:
# Split the training parquet file into training / validation sets at
# row-group granularity and wrap each split in a `Dataset`.

TRAIN_FRACTION = 0.7  # share of the row groups assigned to the training split
BATCH_SIZE = 3072     # batch size handed to both datasets
SPLIT_SEED = 42       # fixed so that Restart & Run All reproduces the same split

# Total row count, read lazily through polars.
lf = pl.scan_parquet(TRAINSET_DATA_PATH)
n_samples = lf.select(pl.len()).collect().item()

# Memory-mapped handle shared by both datasets.
parquet = pq.ParquetFile(TRAINSET_DATA_PATH, memory_map=True)
all_groups = list(range(parquet.num_row_groups))

# A dedicated, seeded generator keeps the split reproducible without touching
# the global `random` state used elsewhere.
split_rng = random.Random(SPLIT_SEED)
train_groups = split_rng.sample(all_groups, int(TRAIN_FRACTION * len(all_groups)))

# Everything not drawn for training goes to validation, in ascending order so
# the result does not depend on set iteration order.
train_group_set = set(train_groups)
val_groups = [group for group in all_groups if group not in train_group_set]


def _count_rows(groups):
    """Return the exact number of rows stored in the given row groups.

    Reads the per-row-group counts from the parquet footer instead of assuming
    every row group holds the same number of rows (row groups may differ in
    size, so an average-based estimate can be off).
    """
    metadata = parquet.metadata
    return sum(metadata.row_group(group).num_rows for group in groups)


def _build_dataset(groups):
    """Wrap the given row groups of `parquet` in a `Dataset`.

    NOTE(review): `n_samples` is presumably what `Dataset` uses to report its
    length -- confirm against src.pytorch.data.parquet.
    """
    return Dataset(
        parquet=parquet,
        input_cols=INPUT_COLUMNS,
        target_cols=OUTPUT_COLUMNS,
        batch_size=BATCH_SIZE,
        n_samples=_count_rows(groups),
        groups=groups,
    )


# Sanity check: the two splits together must account for every row.
assert _count_rows(train_groups) + _count_rows(val_groups) == n_samples, (
    "train/validation row counts do not add up to the total row count"
)

dataset_train = _build_dataset(train_groups)
dataset_val = _build_dataset(val_groups)

In [None]:
# Fresh MLP built with the constructor's default arguments.
model = src.pytorch.models.MLP()

# One dataloader per split.
trainloader = dataset_train.to_dataloader()
valloader = dataset_val.to_dataloader()

# Train for 10 epochs. The call hands back three values, unpacked here as the
# model, the best weights, and the loss record that is plotted below.
(model, best_weights, loss) = src.pytorch.models.utils.train(
    model=model,
    dataloaders=dict(Training=trainloader, Validation=valloader),
    num_epochs=10,
)

# Plot the loss curve; `close=False` is passed through unchanged.
src.visualization.performance.loss_curve(loss, close=False)

In [None]:
# Destination file for the trained weights. The (commented-out) load_model
# call in the submission cell reads from this same path.
MODEL_WEIGHTS_PATH = "./model.pt"

# Optional: uncomment to save the weights of `model` to MODEL_WEIGHTS_PATH.
# src.pytorch.models.utils.save_weights(
#     model=model,
#     model_path=MODEL_WEIGHTS_PATH,
# )

In [None]:
# Inputs for generating the submission file.
# pandas (`pd`) is imported in the notebook's top import cell, so this cell
# no longer carries its own mid-notebook import.

# Test-set features to run the model on.
TESTSET_DATA_PATH = "/home/data/test.arrow"
# NOTE(review): despite the "WEIGHTS" in the name, this points at the sample
# submission file; the frame read from it is passed as `weights=` in the
# submission cell -- confirm that is the intended meaning.
TESTSET_PREDICTION_WEIGHTS_PATH = "/home/data/sample_submission.arrow"

# Uncomment to load both frames (needed by the submission cell).
# df_submission = pd.read_feather(TESTSET_DATA_PATH)
# df_weights = pd.read_feather(TESTSET_PREDICTION_WEIGHTS_PATH)

# Load model to generate submission file
# Uncomment to rebuild the model from the saved weights instead of reusing the
# in-memory `model` from the training cell.
# NOTE(review): `src.env` is not explicitly imported in the import cell --
# confirm it is reachable (or add `import src.env`) before uncommenting the
# last line.

# model = src.pytorch.models.MLP()
# src.pytorch.models.utils.load_model(model, MODEL_WEIGHTS_PATH)
# model.to(src.env.DEVICE)

In [None]:
# Optional: uncomment to generate the submission output from `model`.
# This needs `df_submission` and `df_weights`, which are only defined once the
# commented-out read_feather lines in the previous cell are uncommented too.

# src.pytorch.data.kaggle.output_compressed_parquet(
#     model=model,
#     df=df_submission,
#     weights=df_weights,
# )