In [1]:
import fasttext
import os
import subprocess
import yaml
from datetime import datetime

## config

In [2]:
# Configuration for the fastText training run. Every constant here is read by
# the training cell and/or the metadata cell further down.

# training data
TRAIN_DATA_PATH = "/veld/input/dewiki.txt"  # file passed to fasttext.train_unsupervised
TRAIN_DATA_NAME = "german wikipedia"  # label stored in the metadata as "train_data_name"

# hyperparameters
EPOCHS = 5  # forwarded as `epoch` to fasttext.train_unsupervised
VECTOR_SIZE = 200  # forwarded as `dim` to fasttext.train_unsupervised

# output
MODEL_PATH = "/veld/output/m2/model.bin"  # target of model.save_model
MODEL_METADATA_PATH = "/veld/output/m2/metadata.yaml"  # target of the metadata yaml

## training and persisting

In [3]:
# Create the output directory up front so a missing directory is dealt with
# before the long training run starts, not when the model is saved afterwards.
model_dir = os.path.dirname(MODEL_PATH)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# Train the unsupervised fastText model and time only the training call.
time_start = datetime.now()
model = fasttext.train_unsupervised(TRAIN_DATA_PATH, epoch=EPOCHS, dim=VECTOR_SIZE)
time_end = datetime.now()

# Training duration in minutes. `timedelta.seconds` is only the seconds
# component of the delta (it wraps every 24 hours and ignores `days`), so
# `total_seconds()` is used to get the full elapsed time.
duration = (time_end - time_start).total_seconds() / 60

model.save_model(MODEL_PATH)

Read 1274M words
Number of words:  2435989
Number of labels: 0
Progress: 100.0% words/sec/thread:   27746 lr:  0.000000 avg.loss:  0.077716 ETA:   0h 0m 0s  1.2% words/sec/thread:   27040 lr:  0.049418 avg.loss:  1.103365 ETA:   5h52m55s  26983 lr:  0.049341 avg.loss:  1.091777 ETA:   5h53m 6s  1.5% words/sec/thread:   26924 lr:  0.049232 avg.loss:  1.054234 ETA:   5h53m 6s  2.8% words/sec/thread:   26747 lr:  0.048598 avg.loss:  0.935665 ETA:   5h50m52s  2.9% words/sec/thread:   26744 lr:  0.048562 avg.loss:  0.933161 ETA:   5h50m38s  2.9% words/sec/thread:   26744 lr:  0.048556 avg.loss:  0.932625 ETA:   5h50m36s  3.2% words/sec/thread:   26750 lr:  0.048391 avg.loss:  0.912477 ETA:   5h49m20s 0.048126 avg.loss:  0.835594 ETA:   5h47m24s  4.3% words/sec/thread:   26789 lr:  0.047854 avg.loss:  0.773376 ETA:   5h44m57s  4.4% words/sec/thread:   26797 lr:  0.047804 avg.loss:  0.762810 ETA:   5h44m29s  26798 lr:  0.047798 avg.loss:  0.761506 ETA:   5h44m25s  4.5% words/sec/thread:   268

## writing metadata 

In [4]:
# calculate human-readable sizes of the training data and the model file
def calc_size(file):
    """Return the size of ``file`` as a human-readable string.

    The byte count from ``os.path.getsize`` is divided by 1024 until it is
    below 1024, and the result is rounded to one decimal place and suffixed
    with the matching unit, e.g. ``"512 B"`` or ``"1.5 KB"``.

    Args:
        file: Path of the file to measure.

    Returns:
        A string of the form ``"<number> <unit>"`` with the unit being one of
        B, KB, MB, GB, TB or PB. Anything of 1024 TB or more is reported in
        PB, so the function always returns a string.

    Raises:
        OSError: Propagated from ``os.path.getsize`` if the file does not
            exist or is inaccessible.
    """
    units = ["B", "KB", "MB", "GB", "TB", "PB"]
    size = os.path.getsize(file)
    # Every unit except the last has an upper bound of 1024.
    for unit in units[:-1]:
        if abs(size) < 1024.0:
            return f"{round(size, 1)} {unit}"
        size /= 1024.0
    # The largest unit is the catch-all; without it the function would fall
    # off the end of the loop and implicitly return None.
    return f"{round(size, 1)} {units[-1]}"
# human-readable sizes of the training data and of the saved model file
train_data_size = calc_size(TRAIN_DATA_PATH)
model_data_size = calc_size(MODEL_PATH)

# calculate hash of training data
# check=True makes a failing md5sum raise CalledProcessError right here,
# instead of surfacing later as an IndexError on an empty stdout.
md5sum_result = subprocess.run(
    ["md5sum", TRAIN_DATA_PATH], capture_output=True, text=True, check=True
)
# the hash is the first whitespace-separated token of md5sum's output
train_data_md5_hash = md5sum_result.stdout.split()[0]


# aggregate into metadata dictionary
metadata = {
    "train_data_name": TRAIN_DATA_NAME,
    "train_data_size": train_data_size,
    "train_data_md5_hash": train_data_md5_hash,
    "training_epochs": EPOCHS,
    "training_vector_size": VECTOR_SIZE,
    "training_duration (minutes)": round(duration, 1),
    "model_data_size": model_data_size,
}

# write to yaml
with open(MODEL_METADATA_PATH, "w") as f:
    # sort_keys=False makes the yaml writer keep the dictionary's insertion
    # order, so the keys appear in the file exactly as listed above
    yaml.dump(metadata, f, sort_keys=False)