# Active learning - part 2 - resolve annotation and model training

1. communicate_brat_server: Download annotated data from the BRAT server
2. process_brat_annotation: Resolve annotated data
3. build_training_data: Create training data
4. Model training

In [1]:
import sys

# Extend the import search path with the project's src directory and the
# src directory of the local fast-coref checkout (both given as relative paths).
sys.path.extend(
    [
        "../../src/",
        "../../../../git_clone_repos/fast-coref/src/",
    ]
)

In [2]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the "active_learning" Hydra config found under ../config.
# `config` is bound to None first so the name is defined even if composing raises.
config = None
with initialize(
    version_base=None,
    config_path="../config",
    job_name="active_learning",
):
    config = compose(config_name="active_learning")
# Debug aid: uncomment to dump the composed config as YAML.
# print(OmegaConf.to_yaml(config))

## Step 1: Download annotated data from the BRAT server

In [3]:
import os
from active_learning.communicate_brat_server import RemoteConnection

# Remote directory for the current iteration: <brat data_dir>/iter_<current_iter>.
brat_server_basedir_name = f"iter_{config.current_iter}"
brat_server_dir = os.path.join(config.remote_server.brat.data_dir, brat_server_basedir_name)

# Split the configured local target into its parent directory and its final directory name.
brat_finished_parent_dir, brat_finished_base_dir_name = os.path.split(config.output.brat.finished_dir)

hostname = config.remote_server.brat.hostname
username = config.remote_server.brat.username
password = config.remote_server.brat.password
connection = RemoteConnection(hostname, username, password)
try:
    connection.get_all(
        brat_server_dir,
        brat_finished_parent_dir,
    )
finally:
    # Always release the remote connection, even when the download fails part-way.
    connection.close_client()

# Rename local dir: the rename expects the download at <parent>/iter_<current_iter>
# and moves it to the directory name configured in config.output.brat.finished_dir.
os.rename(
    os.path.join(brat_finished_parent_dir, brat_server_basedir_name),
    os.path.join(brat_finished_parent_dir, brat_finished_base_dir_name),
)

100%|██████████| 2/2 [00:02<00:00,  1.21s/it]
100%|██████████| 10/10 [00:12<00:00,  1.21s/it]
100%|██████████| 10/10 [00:12<00:00,  1.21s/it]
3it [00:29,  9.70s/it]


## Step 2: Resolve BRAT data and build training data

In [3]:
from active_learning.process_brat_annotation import resolve_brat
from active_learning.build_training_data import (
    build_aggregrated_conll,
    concat_previous_conll_and_jsonlines,
    build_individua_conll,
    build_jsonlines,
)

# The steps run strictly in this order; each one receives the composed config.
pipeline_steps = (
    resolve_brat,
    build_individua_conll,
    build_aggregrated_conll,
    build_jsonlines,
    concat_previous_conll_and_jsonlines,
)
for step in pipeline_steps:
    step(config)

print("The training data is created at: \n", config.output.model_training_data.base_dir)

/home/yuxiangliao/PhD/workspace/VSCode_workspace/sr_coref/output/mimic_cxr/active_learning/iter_8/brat_finished/findings/s57786720.txt
/home/yuxiangliao/PhD/workspace/VSCode_workspace/sr_coref/output/mimic_cxr/active_learning/iter_8/brat_finished/findings/s53097934.txt
/home/yuxiangliao/PhD/workspace/VSCode_workspace/sr_coref/output/mimic_cxr/active_learning/iter_8/brat_finished/findings/s58779775.txt
/home/yuxiangliao/PhD/workspace/VSCode_workspace/sr_coref/output/mimic_cxr/active_learning/iter_8/brat_finished/findings/s59074740.txt
/home/yuxiangliao/PhD/workspace/VSCode_workspace/sr_coref/output/mimic_cxr/active_learning/iter_8/brat_finished/findings/s56075000.txt
/home/yuxiangliao/PhD/workspace/VSCode_workspace/sr_coref/output/mimic_cxr/active_learning/iter_8/brat_finished/findings/s56081725.txt
/home/yuxiangliao/PhD/workspace/VSCode_workspace/sr_coref/output/mimic_cxr/active_learning/iter_8/brat_finished/findings/s58011103.txt
/home/yuxiangliao/PhD/workspace/VSCode_workspace/sr_cor

FileNotFoundError: [Errno 2] No such file or directory: '/home/yuxiangliao/PhD/workspace/VSCode_workspace/sr_coref/output/mimic_cxr/active_learning/iter_7/model_training_data/conll/train.conll'

## Step 3: Move training data to the model's resource dir

In [5]:
import shutil

# Recursively copy the generated training-data tree into the coref model's dataset directory.
# NOTE: with default arguments copytree creates the destination and raises if it already exists.
source_dir = config.output.model_training_data.base_dir
des_dir = config.coref_model.dataset_dir
# Kept as the cell's last expression so the notebook displays the returned destination path.
shutil.copytree(src=source_dir, dst=des_dir)

'/home/yuxiangliao/PhD/workspace/git_clone_repos/fast-coref/coref_resources/data/mimic_active_learning_iter_1'

## Step 4: Create model's config files

In [None]:
import os
from active_learning.utils import get_trainset_size

dataset_name = config.coref_model.dataset_name
num_train_docs = get_trainset_size(config)

# Dataset config for the coref model, written as YAML.
# YAML does not allow tab characters for indentation, so nested keys are
# indented with two spaces (tabs would make the generated file unparseable).
content = f"""{dataset_name}:
  name: "{dataset_name}"
  cluster_threshold: 2  # Singletons are ignored for evaluation (also not annotated)
  canonical_cluster_threshold: 2
  targeted_eval: False
  num_train_docs: {num_train_docs}
  num_dev_docs: 25
  num_test_docs: 200
  has_conll: True
  singleton_file: null
"""

# Target: <conf_base_dir>/datasets/<dataset_name>.yaml (overwritten if it already exists).
dataset_conf_file = os.path.join(config.coref_model.conf_base_dir, "datasets", f"{dataset_name}.yaml")
with open(dataset_conf_file, "w", encoding="utf-8") as f:
    f.write(content)

In [None]:
# Experiment config for the coref model, written as YAML.
# Relies on `os`, `config`, `dataset_name` and `num_train_docs` from earlier cells.
# YAML does not allow tab characters for indentation, so every nesting level
# uses two spaces (tabs would make the generated file unparseable).
content = f"""# @package _global_

defaults:
  - override /datasets: {dataset_name}
  - override /trainer: train.yaml
  - override /model/doc_encoder/transformer: longformer_joint_arcca_local

trainer:
  log_frequency: 50
  patience: 10
  max_evals: 100
  eval_per_k_steps: {num_train_docs}

model:
  doc_encoder:
    add_speaker_tokens: True
    finetune: False
"""

# NOTE: the variable name is kept from the original cell; here it holds the
# experiment config path <conf_base_dir>/experiment/arcca_exp_10.yaml.
dataset_conf_file = os.path.join(config.coref_model.conf_base_dir, "experiment", "arcca_exp_10.yaml")
with open(dataset_conf_file, "w", encoding="utf-8") as f:
    f.write(content)

## Step 5: Model training