# GEC Model V1 - Errors and Evidence Words
This notebook contains our initial attempt at training a token classification model, following the research paper [Enhancing Grammatical Error Correction Systems with Explanations](https://aclanthology.org/2023.acl-long.413/). As in that paper, we first wanted to predict evidence words along with errors. However, the model did not perform as well as we had hoped, reaching an F1 score of only about 0.35. We therefore pivoted to predicting just the error words first. That part of our experiments is covered in other notebooks in this repo. We will revisit evidence-word prediction in the near future.

## Imports and Setup

In [1]:
# Reload imported modules before each cell runs, so edits to the repo's .py
# files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Execution platform; later cells branch on this value for platform-specific setup.
PLATFORM='GCP' # 'GCP' or 'AWS' or 'LOCAL'

# Disable HuggingFace's parallel tokenization feature to avoid any deadlock with our small dataset.
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [2]:
# GCP specific setup.
if PLATFORM == 'GCP':
    # Connect to google drive
    # from google.colab import drive
    # drive.mount('/content/drive')

    # Clone repo and install required libraries
    !git clone https://ram-senth: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@github.com/team-langbot/model_gec.git

    # !git checkout -b model origin/model

!git config --global user.email "ram.senth@berkeley.edu"
!git config --global user.name "Ram S"

Cloning into 'model_gec'...
remote: Enumerating objects: 299, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 299 (delta 30), reused 40 (delta 17), pack-reused 227[K
Receiving objects: 100% (299/299), 21.14 MiB | 15.95 MiB/s, done.
Resolving deltas: 100% (139/139), done.


In [3]:
# Work from the root of the cloned repo: later cells use relative paths
# (e.g. 'cfgs/...', 'outputs/...') and import modules by bare name.
# NOTE(review): '/content' is the Colab working directory; this path will
# presumably need adjusting on other platforms - confirm.
%cd /content/model_gec
# Sanity check that the checkout is on the expected branch and clean.
!git status

/content/model_gec
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [None]:
!pip install simpletransformers

In [13]:
import pandas as pd
import pickle
import os
from utils import Config, Training_config
import torch
import wandb
from seqeval.metrics import accuracy_score
from ner import NERModel

# Share tensors between worker processes through the file system.
# NOTE(review): presumably chosen to avoid "too many open files" errors from
# the default file-descriptor strategy with multiple dataloader workers - confirm.
torch.multiprocessing.set_sharing_strategy('file_system')

# Set to True for a quick smoke run: load_data() truncates the datasets,
# train() caps training at 2 epochs, and WandB logging is turned off.
# (This flag does not control GPU usage; see train_args.use_cuda below.)
DEBUG = False
WANDB_PROJECT_NAME = "langbot_gec"
ECC_TRAIN_CONFIG = 'cfgs/beto_2classes.py'
# ECC_TRAIN_CONFIG = 'cfgs/mBERT_2classes.py'

main_args = Config()

train_args = Training_config(ECC_TRAIN_CONFIG)
# PLATFORM is documented as 'GCP' / 'AWS' / 'LOCAL' (upper case). Compare
# case-insensitively so that a local run actually disables CUDA; the previous
# comparison against 'local' could never match the documented value.
if PLATFORM.upper() == 'LOCAL':
    train_args.use_cuda = False
train_args.debug = DEBUG

def test_config():
    """Print the active training configuration as a quick sanity check.

    Reads the module-level ``ECC_TRAIN_CONFIG``, ``main_args`` and
    ``train_args`` and prints one summary line per setting.
    """
    def summary_lines():
        # Lazily yield each line so every setting is read right before it is printed.
        yield f'Training config file {ECC_TRAIN_CONFIG}'
        yield f'Class ids: {main_args.CLASS_IDS}'
        yield f'debug is enabled? {train_args.debug}'
        yield f'Is GPU enabled? {train_args.use_cuda}'
        dataset_files = "\n".join(train_args.train_dev_data)
        yield f'Dataset files: \n{dataset_files}'
        yield f'Training label list: {train_args.labels_list}'
        yield f'Model: {train_args.model_name}'
        yield f'Experiment: {train_args.exp_name}'

    for line in summary_lines():
        print(line)

test_config()

Training config file cfgs/beto_2classes.py
Class ids: {'article': 'a', 'gender agreement': 'ga', 'gender and number agreement': 'gna', 'number agreement': 'na'}
debug is enabled? False
Is GPU enabled? True
Dataset files: 
processed_data/bert_train_two_classed.pkl
processed_data/bert_dev_two_classed.pkl
processed_data/bert_test_two_classed.pkl
Training label list: ['B-ga', 'I-ga', 'B-na', 'I-na', 'O']
Model: dccuchile/bert-base-spanish-wwm-uncased
Experiment: beto_cows_l2h_two_classes


In [14]:
def _read_pickle(label, path):
    """Unpickle and return the object stored at ``path``, announcing it as the ``label`` file."""
    print(f"reading {label} file:", path)
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # were produced by this project's own preprocessing.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _debug_subset(rows, limit, max_sentence_id=6):
    """Return a small prefix of ``rows`` for debug runs.

    Rows are cut just before the last row whose first field (the sentence id)
    is below ``max_sentence_id`` and then capped at ``limit`` rows. When no row
    qualifies (including empty input) only the cap is applied, instead of
    raising ``IndexError``.
    """
    matching = [i for i, row in enumerate(rows) if row[0] < max_sentence_id]
    if matching:
        # NOTE(review): the slice excludes the last matching row itself; kept
        # as-is to preserve existing debug behavior - confirm this is intended.
        rows = rows[:matching[-1]]
    return rows[0:limit]


def load_data(debug, train_file, eval_file, test_file):
    """Load the pickled train / eval / test datasets.

    Args:
        debug: When truthy, each dataset is truncated to a small prefix
            (at most 80 train rows, 10 eval rows, 10 test rows).
        train_file: Path to the pickled training data.
        eval_file: Path to the pickled evaluation (dev) data.
        test_file: Path to the pickled test data; ``''`` or ``None`` means
            there is no test split.

    Returns:
        Tuple ``(train_data, eval_data, test_data)``; ``test_data`` is ``None``
        when no test file was given.
    """
    train_data = _read_pickle("train", train_file)
    eval_data = _read_pickle("eval", eval_file)
    # Treat None like '' so a missing test split does not crash in open().
    test_data = _read_pickle("test", test_file) if test_file else None

    if debug:
        train_data = _debug_subset(train_data, 80)
        eval_data = _debug_subset(eval_data, 10)
        if test_data:
            test_data = _debug_subset(test_data, 10)
    return train_data, eval_data, test_data

def evaluate_and_save(args, wandb_project, model, eval_df):
    """Evaluate ``model`` on the dev set and persist the model and results.

    Args:
        args: Training configuration; only ``args.exp_name`` is read here and
            it names the sub-directory of ``outputs/`` that receives the files.
        wandb_project: WandB project name, or a falsy value to skip all WandB
            uploads and the final ``wandb.finish()``.
        model: Trained model exposing ``eval_model`` plus ``model``,
            ``tokenizer`` and ``config`` attributes with ``save_pretrained``.
        eval_df: Dev-set data passed to ``model.eval_model``.

    Side effects:
        Writes the pretrained weights/tokenizer/config and four pickle files
        (the model, dev result, dev outputs, dev predictions) under
        ``outputs/<exp_name>/`` and, when ``wandb_project`` is set, uploads the
        pickles to WandB and finishes the run.
    """
    exp_dir = f'outputs/{args.exp_name}'

    # Create metrics against the dev data set
    result, model_outputs, predictions = model.eval_model(
        eval_df, wandb_log=True, output_dir='outputs/', accuracy=accuracy_score)

    # Save model locally
    model.model.save_pretrained(exp_dir)
    model.tokenizer.save_pretrained(exp_dir)
    model.config.save_pretrained(exp_dir)

    # Saving to google drive is skipped as we can use WandB.

    # NOTE: All artifacts are stored in the second evaluation WandB project.
    artifacts = [
        (f'{args.exp_name}.pkl', model),       # pickled model
        ('dev_result.pkl', result),
        ('dev_output.pkl', model_outputs),
        ('dev_predictions.pkl', predictions),
    ]
    for file_name, payload in artifacts:
        artifact_path = f'{exp_dir}/{file_name}'
        with open(artifact_path, 'wb') as f:
            pickle.dump(payload, f)
        # Upload only after the file has been closed, so WandB never sees a
        # partially written (unflushed) pickle.
        if wandb_project:
            wandb.save(artifact_path)

    # Wrapup wandb project
    if wandb_project:
        wandb.finish()


def train(args, wandb_project):
    """Build the NER model from ``args`` and, unless in inference-only mode, train it.

    Args:
        args: Training configuration. Note that in inference-only mode
            (``args.only_inference`` is not ``None``) this function overwrites
            ``args.model_name`` and, when ``args.output_dir`` is set,
            ``args.exp_name``.
        wandb_project: WandB project name, or a falsy value to disable WandB.

    Returns:
        Tuple ``(model, eval_df, test_df)``; ``test_df`` is ``None`` when no
        test data was loaded.
    """
    output_base_dir = os.path.join(os.path.abspath('.'), 'outputs')
    labels = args.labels_list
    train_file, eval_file, test_file = args.train_dev_data
    train_data, eval_data, test_data = load_data(args.debug, train_file, eval_file, test_file)
    columns=["sentence_id", "words", "labels", "cls_labels", "correction_index", "parsing_embedding"]

    train_df = pd.DataFrame(train_data, columns=columns)
    eval_df = pd.DataFrame(eval_data, columns=columns)
    # The test split is optional.
    test_df = pd.DataFrame(test_data, columns=columns) if test_data else None

    if not args.parsing_embedding:
        train_df = train_df.drop(['parsing_embedding'], axis=1)
        eval_df = eval_df.drop(['parsing_embedding'], axis=1)
        # Guard: without a test split there is no frame to drop the column from.
        if test_df is not None:
            test_df = test_df.drop(['parsing_embedding'], axis=1)

    print(f'len(train_df):{len(train_df)}, len(eval_df):{len(eval_df)}, len(test_df):{len(test_df) if test_df is not None else 0}')

    inference_only = args.only_inference is not None

    if inference_only:
        # Load the previously saved model from outputs/<exp_name>. The path is
        # joined properly; plain string concatenation dropped the separator.
        args.model_name = os.path.join(output_base_dir, args.exp_name)
    print(args.exp_name)

    if inference_only:
        if args.output_dir is None:
            output_dir = os.path.join(output_base_dir, args.exp_name)
        else:
            output_dir = args.output_dir
            args.exp_name = output_dir
    else:
        output_dir = f'{output_base_dir}/{args.exp_name}/eval'

    model_args = {"overwrite_output_dir": True,
            # Debug runs are capped at 2 epochs to keep them short.
            "num_train_epochs": 2 if args.debug else args.epochs,
            "train_batch_size": args.train_batch_size,
            "eval_batch_size": args.eval_batch_size,
            "output_dir": output_dir,
            "reprocess_input_data": True,
            "special_tokens_list": ["[NONE]", "[MOD]"],
            "wandb_kwargs": {
                "mode": 'online', #'offline',
                "name": args.exp_name,
            },
            "wandb_project": wandb_project,
            "evaluate_during_training": args.evaluate_during_training,
            "evaluate_each_epoch": args.evaluate_each_epoch,
            "learning_rate": args.lr,
            "multi_loss": args.multi_loss,
            "wo_token_labels": args.wo_token_labels,
            "use_multiprocessing_for_evaluation": False,
            "use_multiprocessing": args.use_multiprocessing,
            "loss_weight": args.loss_weight,
            "max_correction_embeddings": args.max_correction_embeddings,
            "max_seq_length": args.max_seq_length,
            "n_gpu": args.n_gpu,
            "dataloader_num_workers": 4,
            "save_eval_checkpoints": False,
            "early_stopping_metric": "f1_score",
            "best_model_dir": output_dir,
            "parsing_embedding": args.parsing_embedding,
            "parsing_embedding_for_embedding": args.parsing_embedding_for_embedding,
            "logging_steps": 0,
            "manual_seed": 42
            }

    model = NERModel(
        model_type=args.model,
        model_name=args.model_name,
        labels=labels,
        args=model_args,
        use_cuda=args.use_cuda,
        weight=None # Todo: add class weights
    )
    if not inference_only:
        # Train the model
        model.train_model(train_df, eval_data=eval_df, test_data=test_df)

    # Wrapup wandb project
    if wandb_project:
        wandb.finish()

    return model, eval_df, test_df

In [15]:
# %%time
# Debug runs are kept out of WandB; real runs log to the configured project.
wandb_project = WANDB_PROJECT_NAME if not train_args.debug else False

# Fit the model (or just load it, in inference-only mode).
model, eval_df, test_df = train(train_args, wandb_project)

# Score on the dev set, then persist the model and all result artifacts.
evaluate_and_save(train_args, wandb_project, model, eval_df)

reading train file: processed_data/bert_train_two_classed.pkl
reading eval file: processed_data/bert_dev_two_classed.pkl
reading test file: processed_data/bert_test_two_classed.pkl
len(train_df):109927, len(eval_df):13820, len(test_df):13955
beto_cows_l2h_two_classes


config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/486k [00:00<?, ?B/s]

  0%|          | 0/3545 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/444 [00:00<?, ?it/s]



  0%|          | 0/444 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/443 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/444 [00:00<?, ?it/s]

  0%|          | 0/444 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/443 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/444 [00:00<?, ?it/s]

  0%|          | 0/444 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/443 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/444 [00:00<?, ?it/s]

  0%|          | 0/444 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/443 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/444 [00:00<?, ?it/s]

  0%|          | 0/444 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/443 [00:00<?, ?it/s]

  0%|          | 0/444 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/443 [00:00<?, ?it/s]

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval_loss,█▂▂▁▂▁
f1_score,▁▅▆▆█▇
global_step,▁▃▄▆▇█
precision,▂█▁▃▅▄
recall,▁▄▇▇██
test_eval_loss,█▁▁▁▁▁
test_f1_score,▁▅▇█▇█
test_precision,▁█▃▆▅▅
test_recall,▁▃██▇█
train_loss,▂█▅▂▁▁

0,1
eval_loss,0.38449
f1_score,0.37241
global_step,2220.0
precision,0.45324
recall,0.31605
test_eval_loss,0.39053
test_f1_score,0.36253
test_precision,0.44334
test_recall,0.30663
train_loss,0.37574


  0%|          | 0/444 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/7 [00:00<?, ?it/s]



VBox(children=(Label(value='0.448 MB of 0.449 MB uploaded\r'), FloatProgress(value=0.9987718746547225, max=1.0…

In [None]:
# Explicitly close wandb project if run is aborted.
# wandb.finish(exit_code=-1)