# GEC Model Training

## Imports and Setup

Checklist:
* Check if GPU is enabled in utils.py Config
* Check the datasets being used in cfgs/train_error_correction_ce_syntax.py

In [1]:
# Auto-reload imported modules before each cell executes, so edits to local
# .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# GCP specific setup. Comment out for SageMaker.
# Connect to google drive
# from google.colab import drive
# drive.mount('/content/drive')

# # Clone repo and install required libraries
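# # SECURITY: never commit a personal access token; supply credentials at runtime and revoke any token that was committed.
# !git clone https://<GITHUB_USER>:<GITHUB_TOKEN>@github.com/team-langbot/model_gec.git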
# %cd /content/model_gec
# !git checkout -b model origin/model


In [2]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.64.3-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting regex (from simpletransformers)
  Downloading regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers>=4.31.0 (from simpletransformers)
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting datasets (from simpletransformers)
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [8]:
# Easier to just install simpletransformers
# !pip install seqeval
# !pip install datasets
# !pip install transformers
# !pip install wandb
# !pip install tqdm
# !pip install tokenizer

# AWS specific
# !pip install tensorflow
# !pip install tensorboard

Collecting tensorflow
  Downloading tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow)
  Downloading flatbuffers-23.5.26-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.5.4-py3-none-any.whl (19 kB)
Collecting h5py>=2.9.0 (from tensorflow)
  Downloading h5py-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.5 kB)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-16.0.6-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting ml-dtypes~=0.2.0 (from tensorflow)
  Downloading ml_dtypes-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.3.0-py3-none-any.wh

In [10]:
import pandas as pd
import pickle
import os
from utils import Config, Training_config
import torch
import wandb
from seqeval.metrics import accuracy_score

torch.multiprocessing.set_sharing_strategy('file_system')
# from simpletransformers.ner import NERModel
from ner import NERModel

# Train on GPU when True; set to False to run on CPU.
USE_CUDA = True
# Number of training epochs (copied onto train_args.epochs below).
EPOCHS = 1
main_args = Config()
# Change to small_dataset for testing logic.
DATA = main_args.full_dataset
# W&B project name; also reused as the file/folder name for locally saved model artifacts.
WANDB_PROJECT_NAME = "tt_gec_beto_cows_l2h_full"
# Disable HuggingFace's parallel tokenization feature to avoid any deadlocks with our small dataset.
%env TOKENIZERS_PARALLELISM=false

# Build the training configuration from the config file, then override the
# device, epoch count and dataset paths with the notebook-level settings above.
train_args = Training_config(main_args.ECC_TRAIN_CONFIG)
train_args.use_cuda = USE_CUDA
train_args.epochs = EPOCHS
# Order matters: this list is later unpacked as (train, dev, test).
train_args.train_dev_data = [
    f'{main_args.PROCESSED_DATA_FOLDER}/{DATA.GEC_TRAIN_NER}',
    f'{main_args.PROCESSED_DATA_FOLDER}/{DATA.GEC_DEV_NER}',
    f'{main_args.PROCESSED_DATA_FOLDER}/{DATA.GEC_TEST_NER}']

def test_config():
  """Print a summary of the active configuration as a pre-training sanity check.

  Reads the notebook-level ``main_args`` and ``train_args`` objects and reports
  the config file path, class ids, debug flag, GPU flag and the dataset files
  (one per line). Returns nothing.
  """
  config_path = main_args.ECC_TRAIN_CONFIG
  print(f'Training config file {config_path}')

  class_ids = main_args.CLASS_IDS
  print(f'Class ids: {class_ids}')

  debug_enabled = train_args.debug
  print(f'debug is enabled? {debug_enabled}')

  gpu_enabled = train_args.use_cuda
  print(f'Is GPU enabled? {gpu_enabled}')

  # One dataset path per output line.
  dataset_listing = "\n".join(train_args.train_dev_data)
  print(f'Dataset files: \n{dataset_listing}')

test_config()

env: TOKENIZERS_PARALLELISM=false
Training config file cfgs/train_error_correction_ce_syntax.py
Class ids: {'article': 's1', 'gender agreement': 's2', 'gender and number agreement': 's3', 'number agreement': 's5'}
debug is enabled? False
Is GPU enabled? True
Dataset files: 
./processed_data/bert_train.pkl
./processed_data/bert_dev.pkl
./processed_data/bert_test.pkl


In [11]:
# Load Config and data
def _read_pickle(message, path):
    """Print ``message`` followed by ``path`` and return the unpickled file contents."""
    print(message, path)
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _debug_slice(rows):
    """Return ``rows`` truncated just before the last row whose first field is < 6.

    NOTE(review): the slice end is exclusive, so that last matching row itself is
    dropped, and an IndexError is raised when no row matches -- confirm intended.
    """
    cutoff = [position for position, row in enumerate(rows) if row[0] < 6][-1]
    return rows[:cutoff]


# Base folder (absolute path) under which the training cell builds its output dirs.
output_base_dir = os.path.join(os.path.abspath('.'), 'outputs')

args = train_args
labels = args.BIO_labels

# if args.model_type != 'interactive':
#     labels = labels[:-1]

train_file, eval_file, test_file = args.train_dev_data

train_data = _read_pickle("reading train file:", train_file)
eval_data = _read_pickle("reading eval file:", eval_file)
test_data = _read_pickle("reading test file:", test_file)

if not args.debug:
    wandb_project = WANDB_PROJECT_NAME
else:
    # Debug runs use a small prefix of every split and disable W&B logging.
    train_data = _debug_slice(train_data)
    eval_data = _debug_slice(eval_data)
    test_data = _debug_slice(test_data)
    wandb_project = False


reading train file: ./processed_data/bert_train.pkl
reading eval file: ./processed_data/bert_dev.pkl
reading test file: ./processed_data/bert_test.pkl


In [12]:
%%time
def train(train_file, eval_file, test_file=None):
  columns=["sentence_id", "words", "labels", "cls_labels", "correction_index", "parsing_embedding"]

  train_df = pd.DataFrame(train_data, columns=columns)
  eval_df = pd.DataFrame(eval_data, columns=columns)
  test_df = None
  if test_file:
      test_df = pd.DataFrame(test_data, columns=columns)
  if not args.parsing_embedding:
      train_df = train_df.drop(['parsing_embedding'], axis=1)
      eval_df = eval_df.drop(['parsing_embedding'], axis=1)
      test_df = test_df.drop(['parsing_embedding'], axis=1)

  print(f'len(train_df):{len(train_df)}, len(eval_df):{len(eval_df)}, len(test_df):{len(test_df) if test_df else 0}')

  if args.only_inference is not None:
      args.model_name = output_base_dir + args.exp_name
  print(args.exp_name)

  if args.only_inference is not None:
      if args.output_dir is None:
          output_dir = output_base_dir + args.exp_name
      else:
          output_dir = args.output_dir
          args.exp_name = output_dir
  else:
      output_dir = output_base_dir + args.exp_name + '/eval'

  model_args = {"overwrite_output_dir": True,
            "num_train_epochs": args.epochs,
            "train_batch_size": args.train_batch_size,
            "eval_batch_size": args.eval_batch_size,
            "output_dir": output_dir,
            "reprocess_input_data": True,
            "special_tokens_list": ["[NONE]", "[MOD]"],
            "wandb_kwargs": {
                "mode": 'online', #'offline',
                "name": args.exp_name,
            },
            "wandb_project": wandb_project,
            "evaluate_during_training": args.evaluate_during_training,
            "evaluate_each_epoch": args.evaluate_each_epoch,
            "learning_rate": args.lr,
            "multi_loss": args.multi_loss,
            "wo_token_labels": args.wo_token_labels,
            "cls_num_labels": 15, # label nums for [CLS] token classification
            "use_multiprocessing_for_evaluation": False,
            "use_multiprocessing": args.use_multiprocessing,
            "loss_weight": args.loss_weight,
            "max_correction_embeddings": args.max_correction_embeddings,
            "max_seq_length": args.max_seq_length,
            "n_gpu": args.n_gpu,
            "dataloader_num_workers": 20,
            "save_eval_checkpoints": False,
            "early_stopping_metric": "f1_score",
            "best_model_dir": output_dir,
            "parsing_embedding": args.parsing_embedding,
            "parsing_embedding_for_embedding": args.parsing_embedding_for_embedding,
            "logging_steps": 0,
            "manual_seed": 42
            }

  model = NERModel(
      model_type=args.model,
      model_name=args.model_name,
      labels=labels,
      args=model_args,
      use_cuda=args.use_cuda,
      weight=None # Todo: add class weights
  )
  if args.only_inference is None:
      # Train the model
      model.train_model(train_df, eval_data=eval_df, test_data=test_df)

  # if args.only_inference is None or 'dev' in args.only_inference:
  #     result, model_outputs, predictions, out_label_list = model.eval_model(eval_df, wandb_log=False)
  # if args.only_inference is None or 'test' in args.only_inference:
  #     result, model_outputs, predictions, out_label_list = model.eval_model(test_df, wandb_log=False)

  #Save model to google drive
  # Skip saving to google drive as we can use WandB.
  # model.model.save_pretrained(f'/content/drive/MyDrive/MIDS/w210/pretrained/{WANDB_PROJECT_NAME}')
  # model.tokenizer.save_pretrained(f'/content/drive/MyDrive/MIDS/w210/pretrained/{WANDB_PROJECT_NAME}')
  # model.config.save_pretrained(f'/content/drive/MyDrive/MIDS/w210/pretrained/{WANDB_PROJECT_NAME}/')

  # Save model locally
  model.model.save_pretrained(f'outputs/{WANDB_PROJECT_NAME}')
  model.tokenizer.save_pretrained(f'outputs/{WANDB_PROJECT_NAME}')
  model.config.save_pretrained(f'outputs/{WANDB_PROJECT_NAME}')

  # Save pickled model to wandb
  with open(f'outputs/{WANDB_PROJECT_NAME}.pkl', 'wb') as f:
    pickle.dump(model, f)
    if wandb_project:
      wandb.save(f'outputs/{WANDB_PROJECT_NAME}.pkl')

  # Create metrics against the dev data set
  result, model_outputs, predictions = model.eval_model(eval_df, wandb_log=True, output_dir='outputs/', accuracy=accuracy_score)

  with open('outputs/dev_model_result.pkl', 'wb') as f:
    pickle.dump(result, f)
  with open('outputs/dev_model_output.pkl', 'wb') as f:
    pickle.dump(model_outputs, f)
  with open('outputs/dev_predictions.pkl', 'wb') as f:
    pickle.dump(predictions, f)

  # # Upload the three result artifacts
  wandb.save('outputs/dev_model_result.pkl')
  wandb.save('outputs/dev_model_output.pkl')
  wandb.save('outputs/dev_predictions.pkl')

  # Wrapup wandb project
  if wandb_project:
    wandb.finish()

  return model, eval_df, test_df

model, eval_df, test_df = train(train_file, eval_file, test_file=None)

len(train_df):129128, len(eval_df):16661, len(test_df):0
/error_correction_ce_syntax/ner_1e5_bs32_bert_large


Some weights of BertForTokenClassification were not initialized from the model checkpoint at espejelomar/beto-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/4144 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

[34m[1mwandb[0m: Currently logged in as: [33mram-senth[0m ([33mlangbot[0m). Use [1m`wandb login --relogin`[0m to force relogin




Running Epoch 0 of 1:   0%|          | 0/518 [00:00<?, ?it/s]



  0%|          | 0/519 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/519 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/9 [00:00<?, ?it/s]



VBox(children=(Label(value='319.291 MB of 417.413 MB uploaded\r'), FloatProgress(value=0.7649280880948914, max…

0,1
eval_loss,▁
f1_score,▁
global_step,▁
precision,▁
recall,▁
train_loss,▁

0,1
eval_loss,0.59493
f1_score,0.11799
global_step,518.0
precision,0.39683
recall,0.0693
train_loss,0.45634


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112734777778617, max=1.0…



VBox(children=(Label(value='0.460 MB of 0.462 MB uploaded\r'), FloatProgress(value=0.9964310646084704, max=1.0…



CPU times: user 54.3 s, sys: 9.96 s, total: 1min 4s
Wall time: 3min 36s


In [None]:
# Explicitly close wandb project if run is aborted.
# wandb.finish(exit_code=-1)