### Configuration 

- This first step is the configuration step: set the variables that control how the training proceeds.

In [1]:
# Source of the training formulas. TRAINING_DATA_PATH is handed to the data
# reader; TRAINING_DATA_ORIGIN picks the reader ("wiki" -> WikiDataReader,
# any other value -> MSEDataReader).
TRAINING_DATA_ORIGIN = "wiki"
TRAINING_DATA_PATH = ""  # left empty here; presumably must be set before running

# Formula representation; the data readers are told whether this equals "slt".
DATA_STRUCTURE_TYPE = "slt"

# Encoder-map locations (not referenced by the cells shown in this notebook).
SLT_ENCODER_MAP_PATH = ""
OPT_ENCODER_MAP_PATH = ""

# Training configuration file (passed to Configuration) and the path the
# trained model is saved to.
CONFIGURATION_FILE_PATH = "Configuration/config/config_1"
MODEL_FILE_PATH = "lib/trained_model/slt_model"

# Embedding parameters, forwarded as-is to EncoderManager.encode_tuples.
EMBEDDING_TYPE = 3
IGNORE_FULL_RELATIVE_PATH = True
TOKENIZE_ALL = False
TOKENIZE_NUMBERS = True

In [None]:
# Read the training formulas as tuples in the configured representation (SLT or OPT).
from DataReader.wiki_data_reader import WikiDataReader
from DataReader.mse_data_reader import MSEDataReader

print("Reading data to train the model...")
print(f"Data structure type: {DATA_STRUCTURE_TYPE}")

# Both readers receive a boolean: whether DATA_STRUCTURE_TYPE is 'slt'.
read_slt = DATA_STRUCTURE_TYPE == 'slt'

# "wiki" selects WikiDataReader; any other origin falls back to MSEDataReader.
if TRAINING_DATA_ORIGIN == "wiki":
    data_reader = WikiDataReader(TRAINING_DATA_PATH, read_slt)
else:
    data_reader = MSEDataReader(TRAINING_DATA_PATH, read_slt)

# NOTE: the name says "slt", but the content follows DATA_STRUCTURE_TYPE. The
# name is kept because later cells refer to it.
dictionary_formula_slt_tuple = data_reader.get_collection()

# Report the representation that was actually requested instead of a hard-coded "slt".
print(f"Data read in {DATA_STRUCTURE_TYPE} format! Number of formulas in the training data: {len(dictionary_formula_slt_tuple.keys())}")

In [None]:
# Encode the tuples of every formula and collect the results keyed by formula.
print("Encoding all the tuples and saving in the encoder map...")

from lib.tangentCFT.touple_encoder.encoder import (EncoderManager)

encoder_manager = EncoderManager()
dictionary_slt_encoded_tuples = {}

# Pre-initialise so `count` is still defined (as 0) when there is nothing to encode.
count = 0
for count, formula in enumerate(dictionary_formula_slt_tuple, start=1):
    # Progress report every 1000 formulas, emitted before encoding the current one.
    if count % 1000 == 0:
        print(f"Encoded {count} formulas")

    formula_tuples = dictionary_formula_slt_tuple[formula]
    dictionary_slt_encoded_tuples[formula] = encoder_manager.encode_tuples(
        formula_tuples,
        EMBEDDING_TYPE,
        IGNORE_FULL_RELATIVE_PATH,
        TOKENIZE_ALL,
        TOKENIZE_NUMBERS,
    )

print(f"All {count} formulas encoded and saved in the encoder map!")


In [None]:
# Train the fast text model on the encoded formulas, then save it to MODEL_FILE_PATH.
from services.tanget_cft_service import TangentCFTService
from Configuration.configuration import Configuration

print("loading the fast text training configuration...")
config = Configuration(CONFIGURATION_FILE_PATH)

print("training the fast text model...")
tangent_cft_service = TangentCFTService()

# Only the encoded tuples are passed on, as a plain list; the formula keys are dropped.
encoded_formulas = list(dictionary_slt_encoded_tuples.values())
tangent_cft_service.train_model(config, encoded_formulas)

print(f"saving the fast text model in {MODEL_FILE_PATH}...")
tangent_cft_service.save_model(MODEL_FILE_PATH)