In [5]:
import os
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
import json

from transformers import AutoModel, AutoTokenizer

from bert_clf.src.BertCLF import BertCLF
from bert_clf.src.training_utils import train_evaluate, predict_metrics
from bert_clf.src.preparing_data_utils import prepare_data, prepare_data_notebook, prepare_dataset

In [None]:
# Single configuration dictionary for the whole run; the cells below read
# their settings from its three sections.
config = dict(
    # Settings for the pretrained encoder, the optimizer and the batching.
    transformer_model=dict(
        model="cointegrated/rubert-tiny",
        path_to_state_dict=False,
        # Use the GPU when one is visible to PyTorch, otherwise fall back to
        # the CPU so the notebook still runs on machines without CUDA.
        device="cuda" if torch.cuda.is_available() else "cpu",
        dropout=0.2,
        tiny_bert=True,
        learning_rate=1e-6,
        batch_size=256,
        shuffle=True,
        maxlen=512,
    ),
    # Where the data comes from and how it is split.
    data=dict(
        train_data_path=None,
        test_data_path=None,
        text_column="text",
        target_column="target",
        random_state=42,
        test_size=0.3,
    ),
    # Settings passed on to the training routine.
    training=dict(
        # if False the model will be saved using torch.save(<model_class>)
        # and should be loaded like this: model = torch.load()
        # you will have to install the library to do so
        save_state_dict=False,
        delta=0.001,
        patience=7,
        num_epochs=50,
        average_f1="macro",
        output_dir="../results/",
    ),
)

In [None]:
# Create the results directory up front; an already existing directory is fine.
os.makedirs(name=config["training"]["output_dir"], exist_ok=True)

In [None]:
# Seed PyTorch's global RNG with the value already configured for the data
# split, so that random operations later in the notebook are repeatable across
# kernel restarts. Skipped when the seed is explicitly disabled (None).
# NOTE: seeding alone does not guarantee bit-identical results on CUDA.
if config["data"]["random_state"] is not None:
    torch.manual_seed(config["data"]["random_state"])

# Target device for the encoder and, in later cells, the classifier.
device = torch.device(config["transformer_model"]["device"])

# Tokenizer and encoder are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=config["transformer_model"]["model"]
)
model_bert = AutoModel.from_pretrained(
    pretrained_model_name_or_path=config["transformer_model"]["model"]
).to(device)

Here you can either use the `prepare_data` function, if you just want to pass the
path to your data, or pass the data itself as a pandas DataFrame. In the latter case,
use the `prepare_data_notebook` function.

In [None]:
# Build the label mapping and the train/validation texts and targets from an
# in-memory DataFrame.
# NOTE(review): `df_all` is not defined in any visible cell; it must be loaded
# before this cell runs, otherwise a fresh kernel raises NameError here.
(
    id2label,
    train_texts,
    valid_texts,
    train_targets,
    valid_targets,
) = prepare_data_notebook(train_df=df_all, config=config)

In [None]:
# Wrap the pretrained encoder in the project's classifier, passing along the
# tokenizer, the label mapping and the dropout / tiny-model settings from config.
model = BertCLF(
    pretrained_model=model_bert,
    tokenizer=tokenizer,
    id2label=id2label,
    dropout=config["transformer_model"]["dropout"],
    tiny=config["transformer_model"]["tiny_bert"],
    device=device,
)

In [None]:
# Move the classifier to the configured device.
# NOTE(review): BertCLF also receives `device` in its constructor, so this may
# be redundant — confirm against the BertCLF implementation.
model = model.to(device)

In [None]:
# Adam over every parameter of the classifier; the learning rate is cast to
# float before it is handed to the optimizer.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=float(config["transformer_model"]["learning_rate"]),
)

# NOTE(review): NLLLoss expects log-probabilities as input — confirm that
# BertCLF's forward pass ends with a log-softmax.
criterion = nn.NLLLoss()

# Build the training and validation generators from the prepared texts/targets.
training_generator, valid_generator = prepare_dataset(
    config=config,
    tokenizer=tokenizer,
    train_texts=train_texts,
    valid_texts=valid_texts,
    train_targets=train_targets,
    valid_targets=valid_targets,
)

In [None]:
# Run the training/evaluation routine and rebind `model` to whatever it
# returns. Epoch count and F1 averaging mode come from the training section
# of the config; the full config is passed along as well.
model = train_evaluate(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    training_generator=training_generator,
    valid_generator=valid_generator,
    num_epochs=config["training"]["num_epochs"],
    average=config["training"]["average_f1"],
    config=config,
)

In [60]:
# Persist the trained weights as a state dict inside the output directory.
torch.save(model.state_dict(), os.path.join(config["training"]["output_dir"], "model.pth"))

# Store the model's label mapper next to the weights as UTF-8 JSON;
# ensure_ascii=False keeps non-ASCII label names human-readable.
with open(
    os.path.join(config["training"]["output_dir"], "label_mapper.json"),
    mode="w",
    encoding="utf-8",
) as f:
    json.dump(model.mapper, f, indent=4, ensure_ascii=False)