In [None]:
from pathlib import Path

# Repository root: two directory levels above the kernel's current working directory.
ROOT_PATH = Path.cwd().parent.parent

# Dataset file locations. All of them are relative paths; the cells below
# prefix them with ROOT_PATH when building command lines.
RAW_TRAIN_DATA_PATH = "dataset/raw_data/train.txt"
RAW_TEST_DATA_PATH = "dataset/raw_data/test.txt"

TRAIN_DATA_PATH = "dataset/ner_data/train.data"
TEST_DATA_PATH = "dataset/ner_data/test.data"

TRAIN_GRAINED_DATA_PATH = "dataset/ner_data/train_grained.data"
TEST_GRAINED_DATA_PATH = "dataset/ner_data/test_grained.data"

# Model names selectable through MODEL_SELECT (an index into this list).
# The chosen name is also used as the sub-directory under ROOT_PATH/model/.
MODEL = [
    "CRF",              # 0
    "SVM",              # 1
    "PYTORCH_CRF",      # 2
    "BILSTM_CRF",       # 3
    "BERT_CRF",         # 4
    "BERT_BILSTM_CRF",  # 5
]

# Index of the active model; resolved by name so the choice is readable (evaluates to 3).
MODEL_SELECT = MODEL.index("BILSTM_CRF")

# Set the PYTHONPATH environment variable to the value of ROOT_PATH
# (IPython substitutes the Python variable for $ROOT_PATH before running the magic).
# Presumably this lets the scripts launched with `!python` in the following cells
# import modules from the repository root -- confirm against the project layout.
%set_env PYTHONPATH=$ROOT_PATH

In [None]:
# Generate train, test NER format Data

!python data_generator.py \
            --RAW_TRAIN_DATA_PATH=$ROOT_PATH/$TRAIN_DATA_PATH \
            --RAW_TEST_DATA_PATH=$ROOT_PATH/$TEST_DATA_PATH \
            --TRAIN_DATA_PATH=$ROOT_PATH/$TRAIN_GRAINED_DATA_PATH \
            --TEST_DATA_PATH=$ROOT_PATH/$TEST_GRAINED_DATA_PATH \
            --OUTPUT_TYPE=split


In [None]:
# Preprocess and generate trainable datasets

!python data_preprocessor.py \
            --TRAIN_DATA_PATH=$ROOT_PATH/$TRAIN_GRAINED_DATA_PATH \
            --TEST_DATA_PATH=$ROOT_PATH/$TEST_GRAINED_DATA_PATH \
            --RAW_TEST_DATA_PATH=$ROOT_PATH/$RAW_TEST_DATA_PATH \
            --MODEL_DATA_PATH=$ROOT_PATH/model/{MODEL[MODEL_SELECT]}/data/


In [None]:
# Tokenize and training process, use the dataset pickled from data_preprocessor

!python ner_trainer.py \
            --MODEL={MODEL[MODEL_SELECT]} \
            --TRAIN_DATA_PATH=$ROOT_PATH/$TRAIN_GRAINED_DATA_PATH \
            --MODEL_DATA_PATH=$ROOT_PATH/model/{MODEL[MODEL_SELECT]}/data/ \
            --MODEL_CHECKPOINT_PATH=$ROOT_PATH/model/{MODEL[MODEL_SELECT]}/checkpoint/ \
            --CHECKPOINT_KEEP=3 \
            --SENTENCE_MAX_LENGTH=32 \
            --BATCH_SIZE=16 \
            --EMBEDDING_SIZE=300 \
            --HIIDEN_NUMS=512 \
            --EPOCHS=1 \
            --LEARNING_RATE=1e-3


In [None]:
# Predicting process and export the results, use the model generated from training checkpoints 

!python ner_predictor.py \
            --MODEL={MODEL[MODEL_SELECT]} \
            --MODEL_DATA_PATH=$ROOT_PATH/model/{MODEL[MODEL_SELECT]}/data/ \
            --MODEL_CHECKPOINT_PATH=$ROOT_PATH/model/{MODEL[MODEL_SELECT]}/checkpoint/ \
            --MODEL_OUTPUT_PATH=$ROOT_PATH/model/{MODEL[MODEL_SELECT]}/output/ \
            --EMBEDDING_SIZE=300 \
            --HIIDEN_NUMS=512 \
            --LEARNING_RATE=1e-3
