In [None]:
import fasttext
import os
import subprocess
import yaml
from datetime import datetime

## config

In [None]:
# training data: location of the corpus file and the name recorded for it
TRAIN_DATA_PATH = "/veld/input/dewiki.txt"
TRAIN_DATA_NAME = "german wikipedia"

# hyperparameters: passed to fasttext as `epoch` and `dim` respectively
EPOCHS = 5
VECTOR_SIZE = 200

# output: where the trained model and its accompanying metadata are written
MODEL_PATH = "/veld/output/m2/model.bin"
MODEL_METADATA_PATH = "/veld/output/m2/metadata.yaml"

## training and persisting

In [None]:
# create the output folder up front, so that a missing directory cannot make
# the save fail only after the (potentially very long) training has finished
os.makedirs(os.path.dirname(MODEL_PATH) or ".", exist_ok=True)

# train the model and measure how long the training itself takes
time_start = datetime.now()
model = fasttext.train_unsupervised(TRAIN_DATA_PATH, epoch=EPOCHS, dim=VECTOR_SIZE)
time_end = datetime.now()

# duration in minutes; total_seconds() is required here because
# timedelta.seconds only holds the remainder below one day and would
# therefore under-report any training run of 24 hours or longer
duration = (time_end - time_start).total_seconds() / 60

# persist the trained model
model.save_model(MODEL_PATH)

## writing metadata

In [None]:
# calculate human-readable size of a file (used for training data and model)
def calc_size(file):
    """Return the size of a file as a human-readable string.

    The byte count is divided by 1024 until it drops below 1024 and is then
    rounded to one decimal and suffixed with the matching unit, e.g. "512 B",
    "1.5 KB" or "3.2 GB".

    Args:
        file: path of the file to measure.

    Returns:
        The formatted size as a string. Sizes of 1024 TB and above are
        reported in PB.

    Raises:
        OSError: if the file does not exist or is inaccessible
            (raised by os.path.getsize).
    """
    size = os.path.getsize(file)
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if size < 1024:
            return f"{round(size, 1)} {unit}"
        size /= 1024
    # anything that is still >= 1024 TB ends up here; always return a string
    # instead of silently falling off the end of the function with None
    return f"{round(size, 1)} PB"
train_data_size = calc_size(TRAIN_DATA_PATH)
model_data_size = calc_size(MODEL_PATH)

# calculate hash of training data
# check=True turns a failing md5sum call into a CalledProcessError right here,
# instead of an unrelated IndexError when indexing into its empty output
md5sum_result = subprocess.run(
    ["md5sum", TRAIN_DATA_PATH], capture_output=True, text=True, check=True
)
# md5sum prints the hash followed by the file name; only the hash is kept
train_data_md5_hash = md5sum_result.stdout.split()[0]


# collect everything worth recording about this training run in one dictionary
metadata = {
    # what the model was trained on
    "train_data_name": TRAIN_DATA_NAME,
    "train_data_size": train_data_size,
    "train_data_md5_hash": train_data_md5_hash,
    # how it was trained
    "training_epochs": EPOCHS,
    "training_vector_size": VECTOR_SIZE,
    "training_duration (minutes)": round(duration, 1),
    # what came out of the training
    "model_data_size": model_data_size,
}

# write the metadata next to the model as yaml
with open(MODEL_METADATA_PATH, "w") as metadata_file:
    # each entry is dumped on its own, one after the other, so that the keys
    # appear in the file in the same order as in the dictionary
    for key, value in metadata.items():
        yaml.dump({key: value}, metadata_file)