<a href="https://colab.research.google.com/github/teekaytai/cs4248-project/blob/ahiyer/%5BWI_LOCNESS%5D_Final_Experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GENERAL

In [None]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

In [None]:
from google.colab import drive
# Mount Google Drive (Colab only); the dataset paths below live under /content/drive/MyDrive.
drive.mount('/content/drive/')

In [None]:
# Locations of the WI+LOCNESS train/val/test JSON splits on the mounted Drive.
test_path = '/content/drive/MyDrive/datasets/wi+locness/dataset_splits/test.json'
train_path = '/content/drive/MyDrive/datasets/wi+locness/dataset_splits/train.json'
val_path = '/content/drive/MyDrive/datasets/wi+locness/dataset_splits/val.json'

In [None]:
import pandas as pd

In [None]:
# Tokenize sentences with a simple function
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Concatenate
from tensorflow.keras import Input, Model

In [None]:
import tensorflow as tf
import os
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K

In [None]:
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
from collections import namedtuple

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import spacy

In [None]:
import regex as re

# TRANSFORMERS

## TRANSFORMERS (BART) : No Context

> NO CONTEXT



In [None]:
'''
Baseline
Input: Error Sentences
Output: Corrected Sentences
'''
import spacy
import torch
from transformers import BartModel, BartForConditionalGeneration, BartTokenizerFast
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer

# Configs
# Pretrained checkpoint used for both the tokenizer and the model.
BART_MODEL = 'gotutiyan/gec-bart-base'
# Inputs and labels are padded/truncated to these token lengths.
MAX_SOURCE_LENGTH = 240
MAX_TARGET_LENGTH = 240
NUM_EPOCHS = 10
# Hugging Face documentation recommends 1e-4 or 3e-4 for T5.
# NOTE(review): the model fine-tuned here is BART, not T5 — confirm this rate is intended.
LEARNING_RATE = 3e-4
NUM_BEAMS = 5  # beam width passed to model.generate()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BartTokenizerFast.from_pretrained(BART_MODEL)
nlp = spacy.load("en_core_web_sm")
# Only the spaCy tokenizer is needed here: generate() uses it to re-space decoded text.
spacy_tokenizer = nlp.tokenizer

def tokenize_source_sentences(sentences):
    """Tokenize source sentences into fixed-length PyTorch tensors on `device`.

    Every sentence is padded or truncated to MAX_SOURCE_LENGTH tokens.

    Args:
        sentences: list of source (uncorrected) sentence strings.

    Returns:
        The tokenizer's batch encoding, moved to `device`.
    """
    tokenizer_options = dict(
        padding='max_length',
        max_length=MAX_SOURCE_LENGTH,
        truncation=True,
        return_tensors="pt",
    )
    encoded = tokenizer(sentences, **tokenizer_options)
    return encoded.to(device)

def tokenize_target_sentences(sentences):
    """Tokenize target sentences and turn them into loss-ready label ids.

    Sentences are padded/truncated to MAX_TARGET_LENGTH; padding positions are
    overwritten with -100 so the loss ignores them.

    Args:
        sentences: list of corrected (reference) sentence strings.

    Returns:
        The tokenizer's batch encoding on `device`, with padding ids in
        `input_ids` replaced by -100.
    """
    encoded = tokenizer(
        sentences,
        padding='max_length',
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        return_tensors="pt",
    ).to(device)
    label_ids = encoded.input_ids
    padding_positions = label_ids == tokenizer.pad_token_id
    # In-place write: the tensor stored in the encoding is the one being modified.
    label_ids[padding_positions] = -100
    encoded.input_ids = label_ids
    return encoded

def preprocess_dataset(dataset, source_column_name, target_column_name):
    """Build model inputs for a batch of examples (intended for `Dataset.map(batched=True)`).

    Args:
        dataset: batch mapping column names to lists of values.
        source_column_name: column holding the uncorrected sentences.
        target_column_name: column holding the corrected sentences.

    Returns:
        Dict with `input_ids`, `attention_mask` (from the source) and `labels`
        (target ids with padding masked as -100).
    """
    source_encoding = tokenize_source_sentences(dataset[source_column_name])
    target_encoding = tokenize_target_sentences(dataset[target_column_name])
    return {
        'input_ids': source_encoding['input_ids'],
        'attention_mask': source_encoding['attention_mask'],
        'labels': target_encoding['input_ids'],
    }

def train(train_dataset, eval_dataset, output_dir):
    """Fine-tune BART_MODEL on `train_dataset` with the Hugging Face Trainer.

    Evaluation and checkpointing both happen every 500 steps; at most two
    checkpoints are kept and the best one is reloaded when training ends.

    Args:
        train_dataset: preprocessed training dataset.
        eval_dataset: preprocessed validation dataset.
        output_dir: directory where checkpoints are written.
    """
    seq2seq_model = BartForConditionalGeneration.from_pretrained(BART_MODEL)
    seq2seq_model.to(device)

    trainer_settings = dict(
        output_dir=output_dir,
        num_train_epochs=NUM_EPOCHS,
        evaluation_strategy='steps',
        eval_steps=500,
        save_steps=500,
        learning_rate=LEARNING_RATE,
        load_best_model_at_end=True,
        save_total_limit=2,
    )

    Trainer(
        model=seq2seq_model,
        args=TrainingArguments(**trainer_settings),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    ).train()

def generate(model_path, dataset):
    """Correct every sample's 'original' sentence with a saved BART checkpoint.

    Args:
        model_path: directory (or hub id) of the fine-tuned model.
        dataset: iterable of samples exposing an 'original' sentence.

    Returns:
        List of corrected sentences, one per sample, with tokens separated by
        single spaces (spaCy tokenization).
    """
    corrector_model = BartForConditionalGeneration.from_pretrained(model_path)
    corrector_model.to(device)

    def correct(sentence):
        encoded = tokenize_source_sentences([sentence])
        beams = corrector_model.generate(
            encoded.input_ids,
            max_length=MAX_TARGET_LENGTH,
            num_beams=NUM_BEAMS,
            early_stopping=True,
        )
        decoded = tokenizer.decode(beams[0], skip_special_tokens=True)
        # Re-space tokens with spaCy so the text lines up with the tokenized
        # references when edits are scored.
        return ' '.join(tok.text for tok in spacy_tokenizer(decoded))

    return [correct(sample['original']) for sample in dataset]

def get_model(path):
    """Load a BART conditional-generation model from `path` and place it on `device`."""
    return BartForConditionalGeneration.from_pretrained(path).to(device)

In [None]:
!pip install datasets

In [None]:
! pip install -U accelerate
! pip install -U transformers

In [None]:
from datasets import load_dataset

# Each JSON file is loaded on its own; a single data file is exposed under the 'train' split.
dataset_train = load_dataset('json', data_files=train_path, split='train')
dataset_eval = load_dataset('json', data_files=val_path, split='train')

# Tokenize both splits in batches: 'original' is the model input, 'corrected' the label.
preprocessed_train = dataset_train.map(
    preprocess_dataset,
    batched=True,
    fn_kwargs={"source_column_name": "original", "target_column_name": "corrected"}
)
preprocessed_eval = dataset_eval.map(
    preprocess_dataset,
    batched=True,
    fn_kwargs={"source_column_name": "original", "target_column_name": "corrected"}
)

# Checkpoints are written below outputs/model_sentence_2.
train(preprocessed_train, preprocessed_eval, 'outputs/model_sentence_2')

In [None]:
! pip install errant

In [None]:
# Archive the training output directory so it can be downloaded from Colab.
!zip -r /content/outputs.zip /content/outputs

In [None]:
from google.colab import files
# Download the archive created by the zip cell to the local machine.
files.download("/content/outputs.zip")

In [None]:
import errant
import spacy
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import namedtuple

# M2 line written for a sentence pair for which ERRANT finds no edits.
NOOP_EDIT = 'A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0'
nlp = spacy.load("en_core_web_sm")
# ERRANT annotator sharing the spaCy pipeline loaded above.
annotator = errant.load('en', nlp)

def generate_m2(input_sentences, output_sentences, output_path):
    """Write ERRANT edits between each input/output sentence pair as an M2 file.

    Each pair yields an 'S <input>' line, one line per edit (or NOOP_EDIT when
    there are none) and a blank separator line.

    Args:
        input_sentences: iterable of source sentences.
        output_sentences: iterable of system outputs, aligned with the inputs.
        output_path: path of the M2 file to create (overwritten).
    """
    with open(output_path, 'w') as m2_file:
        for source, hypothesis in zip(input_sentences, output_sentences):
            source_doc = annotator.parse(source)
            hypothesis_doc = annotator.parse(hypothesis)
            edits = annotator.annotate(source_doc, hypothesis_doc)
            edit_lines = [edit.to_m2() for edit in edits] or [NOOP_EDIT]
            # Trailing blank line separates sentence groups.
            m2_file.write('\n'.join([f'S {source}', *edit_lines]) + '\n\n')



# Operation prefix of an edit code (the part before the first ':') -> row label.
EDIT_OPS = {'M': 'Missing', 'U': 'Unnecessary', 'R': 'Replacement'}
# Codes that carry no operation prefix and are counted under their own row.
NOOP_EDIT_TYPE = 'noop'
UNK_EDIT_TYPE = 'UNK'
# Error-type part of an edit code (everything after the operation prefix).
EDIT_TYPES = [
    'ADJ', 'ADJ:FORM', 'ADV', 'CONJ', 'CONTR', 'DET', 'MORPH',
    'NOUN', 'NOUN:INFL', 'NOUN:NUM', 'NOUN:POSS',
    'ORTH', 'OTHER', 'PART', 'PREP', 'PRON', 'PUNCT', 'SPELL',
    'VERB', 'VERB:FORM', 'VERB:INFL', 'VERB:SVA', 'VERB:TENSE', 'WO',
]

# One M2 annotation: token span ("start end"), edit code and correction text.
Edit = namedtuple('Edit', ['span', 'code', 'correction'])

def load_edits(m2_file_path):
    """Read an M2 file and return its edits grouped per sentence.

    Sentence groups are separated by blank lines. Within a group only lines
    starting with 'A ' are treated as edits, so the 'S' line, stray blank
    lines and a missing or extra trailing newline no longer produce malformed
    `Edit` tuples (the previous version raised TypeError on them).

    Args:
        m2_file_path: path to the M2 file.

    Returns:
        List with one entry per sentence group; each entry is a list of
        `Edit(span, code, correction)` tuples in file order.
    """
    edits = []
    with open(m2_file_path, 'r') as f:
        for group in f.read().split('\n\n'):
            lines = [line for line in group.split('\n') if line.strip()]
            if not lines:
                continue
            edits.append([
                # Drop the leading "A " and keep the first three '|||' fields.
                Edit(*line[2:].split('|||')[:3])
                for line in lines
                if line.startswith('A ')
            ])
    return edits

def create_error_count_df(gold_edits, output_edits):
    """Count TP/FP/FN per edit operation and error type, then derive P, R and F0.5.

    An edit is a true positive when the identical (span, code, correction)
    tuple appears in both the gold and the system edits of the same sentence.
    Codes other than 'noop'/'UNK' are counted twice: once under their
    operation row and once under their error-type row.

    Args:
        gold_edits: per-sentence lists of gold `Edit` tuples.
        output_edits: per-sentence lists of system `Edit` tuples, aligned with gold.

    Returns:
        DataFrame indexed by operation/type with columns TP, FP, FN, P, R, F0.5
        (ratios are NaN where the denominator is zero).
    """
    row_labels = [*EDIT_OPS.values(), *EDIT_TYPES, NOOP_EDIT_TYPE, UNK_EDIT_TYPE]
    counts = pd.DataFrame(0, index=row_labels, columns=['TP', 'FP', 'FN'])
    unprefixed_codes = (NOOP_EDIT_TYPE, UNK_EDIT_TYPE)

    def tally(edit, outcome):
        if edit.code in unprefixed_codes:
            counts.loc[edit.code, outcome] += 1
            return
        op, type_ = edit.code.split(':', maxsplit=1)
        counts.loc[EDIT_OPS[op], outcome] += 1
        counts.loc[type_, outcome] += 1

    for gold_sentence, output_sentence in zip(gold_edits, output_edits):
        gold, predicted = set(gold_sentence), set(output_sentence)
        buckets = (
            ('TP', gold & predicted),
            ('FP', predicted - gold),
            ('FN', gold - predicted),
        )
        for outcome, bucket in buckets:
            for edit in bucket:
                tally(edit, outcome)

    counts['P'] = counts['TP'] / (counts['TP'] + counts['FP'])
    counts['R'] = counts['TP'] / (counts['TP'] + counts['FN'])
    beta_sq = 0.5**2
    counts['F0.5'] = (1 + beta_sq) * ((counts['P'] * counts['R']) / (beta_sq * counts['P'] + counts['R']))
    return counts

def analyze_error_types(actual_path, predicted_path):
    """Print per-type TP/FP/FN counts and plot a heatmap of P, R and F0.5.

    Args:
        actual_path: path to the gold M2 file.
        predicted_path: path to the system M2 file.
    """
    error_df = create_error_count_df(load_edits(actual_path), load_edits(predicted_path))
    print(error_df)
    score_columns = error_df[['P', 'R', 'F0.5']]
    sns.heatmap(score_columns, vmin=0.0, vmax=1.0, cmap='Reds', annot=True, yticklabels=True)
    plt.show()

def analyze_params(model):
    """Print the total and the trainable parameter counts of `model`."""
    sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]
    total_params = sum(count for count, _ in sizes)
    print(f"{total_params:,} total parameters.")
    total_trainable_params = sum(count for count, trainable in sizes if trainable)
    print(f"{total_trainable_params:,} training parameters.")

In [None]:
# Checkpoint to evaluate; it must exist under the training output directory.
model_path = 'outputs/model_sentence_2/checkpoint-35000'

analyze_params(get_model(model_path))

dataset_test = load_dataset('json', data_files=test_path, split='train')
# generate() loads the checkpoint again from model_path.
generated_sentences = generate(model_path, dataset_test)

# Write the system edits next to the checkpoint, then compare them with the gold M2 file.
generate_m2(dataset_test['original'], generated_sentences, model_path + '/gen.m2')

analyze_error_types('/content/drive/MyDrive/datasets/wi+locness/dataset_splits/test.m2', model_path + '/gen.m2')

In [None]:
# Hand-written two-sentence paragraphs; the trailing comment names the error
# type the pair is meant to probe.
CUSTOM_TESTS = [
    ['She saw a cat.', 'He screams out loud.'],  # PRON, VERB:TENSE
    ['The P versus NP problem is an unsolved problem in computer science.', 'No one has solved them to this day.'],  # PRON
    ['The Millennium Prize Problems are seven very complex mathematical problems.', 'No one has solved it to this day.'],
    ['Car crashes are easily preventable.', 'Most cases occurred because the driver was careless.'],  # VERB:TENSE
    ['A study was done on 1000 car crashes.', 'Most cases occur because the driver is careless.'],
    ["If he thinks about it more, I'm sure he'll figure something out.", 'The right idea eventually came to him.'],  # VERB:TENSE
    ['The right idea will eventually come to him.', 'Many weeks of effort finally paid off.'],
    ['Everyone knows that cats are adorable.', 'But they make for great companions.'],  # CONJ
    ['Cats can be annoying at times.', 'And they make for great companions.'],
    ['I visit the apple store frequently.', "I'm always eager to check out the latest phone."],  # ORTH
    ['I visit the apple store frequently.', 'Fruit works great as a snack.'],
    ['Tom told his sister there was a spider in her hair.', 'Cried out in alarm.'],  # PRON
    ['There have been complaints about long queues in the canteens.', "I'm looking them now."],  # PREP
    ["I lost my earphones earlier.", "I'm looking them now."]
]

def get_custom_tests():
    """Flatten CUSTOM_TESTS into one record per sentence.

    Returns:
        List of dicts with the sentence ('original'), its index within the
        paragraph ('pos') and the full paragraph ('paragraph').
    """
    return [
        {"original": sentence, "pos": pos, "paragraph": para}
        for para in CUSTOM_TESTS
        for pos, sentence in enumerate(para)
    ]

In [None]:
from datasets import Dataset

# Run the hand-written probe sentences through the no-context model.
custom_tests = get_custom_tests()
print(custom_tests)
dataset_test = Dataset.from_list(custom_tests)

model_path = 'outputs/model_sentence_2/checkpoint-35000'
generated_sentences = generate(model_path, dataset_test)

# One corrected sentence per line, stored next to the checkpoint.
with open(model_path + '/custom.txt', 'w') as f:
    for line in generated_sentences:
        f.write(f"{line}\n")

## TRANSFORMERS (BART) : Sentence Flank

> SENTENCE FLANK



> Note that the parameters can be changed and fine-tuned as needed. For consistency's sake, we keep them the same across experiments. Moreover, the 0/1, 1/0, and 1/1 experiments are all run in this section by changing the relevant variables (`prec_range` and `post_range`).

In [None]:
'''
Adding pre/post k sentences to the target sentence
Input: Error sentence + k pre + j post
Output: Corrected sentence
'''

import spacy
import torch
from transformers import BartModel, BartForConditionalGeneration, BartTokenizerFast
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer

# Configs
# Pretrained checkpoint used for both the tokenizer and the model.
BART_MODEL = 'gotutiyan/gec-bart-base'
# Inputs and labels are padded/truncated to these token lengths.
MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 240
NUM_EPOCHS = 7
# Hugging Face documentation recommends 1e-4 or 3e-4 for T5.
# NOTE(review): the model fine-tuned here is BART, not T5 — confirm this rate is intended.
LEARNING_RATE = 3e-4
NUM_BEAMS = 5  # beam width passed to model.generate()
# Separator placed between neighbouring sentences of a context window.
# NOTE(review): it is not registered with the tokenizer in this cell — confirm
# that treating it as ordinary text is intended.
CONCAT_PARA_TOKEN = ' <cct> '

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BartTokenizerFast.from_pretrained(BART_MODEL)
nlp = spacy.load("en_core_web_sm")
# Only the spaCy tokenizer is needed here: generate() uses it to re-space decoded text.
spacy_tokenizer = nlp.tokenizer

def tokenize_source_sentences(sentences, paragraphs, sentence_positions, prec_range, post_range):
    """Tokenize each sentence together with its neighbouring context sentences.

    For every example the window
    `para[pos - prec_range : pos + post_range + 1]` (clipped to the paragraph
    bounds) is joined with CONCAT_PARA_TOKEN and tokenized to
    MAX_SOURCE_LENGTH tokens.

    Fix: the previous version built these windows but then tokenized the bare
    `sentences`, so the context never reached the model.

    Args:
        sentences: list of target sentences; used only as a fallback when a
            window turns out empty.
        paragraphs: list of paragraphs (each a list of sentences), aligned
            with `sentences`.
        sentence_positions: index of each sentence inside its paragraph.
            Presumably `para[pos]` is the sentence itself — verify against the dataset.
        prec_range: number of preceding sentences to include.
        post_range: number of following sentences to include.

    Returns:
        The tokenizer's batch encoding, moved to `device`.
    """
    concatenated_sentences = []
    for sentence, para, pos in zip(sentences, paragraphs, sentence_positions):
        start = max(pos - prec_range, 0)
        end = min(pos + post_range + 1, len(para))
        # An empty window (e.g. empty paragraph) falls back to the sentence alone.
        window = para[start:end] or [sentence]
        concatenated_sentences.append(CONCAT_PARA_TOKEN.join(window))

    return tokenizer(
        concatenated_sentences,
        padding = 'max_length',
        max_length = MAX_SOURCE_LENGTH,
        truncation = True,
        return_tensors = "pt",
    ).to(device)

def tokenize_target_sentences(sentences):
    """Tokenize target sentences and turn them into loss-ready label ids.

    Sentences are padded/truncated to MAX_TARGET_LENGTH; padding positions are
    overwritten with -100 so the loss ignores them.

    Args:
        sentences: list of corrected (reference) sentence strings.

    Returns:
        The tokenizer's batch encoding on `device`, with padding ids in
        `input_ids` replaced by -100.
    """
    encoded = tokenizer(
        sentences,
        padding='max_length',
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        return_tensors="pt",
    ).to(device)
    label_ids = encoded.input_ids
    padding_positions = label_ids == tokenizer.pad_token_id
    # In-place write: the tensor stored in the encoding is the one being modified.
    label_ids[padding_positions] = -100
    encoded.input_ids = label_ids
    return encoded

def preprocess_dataset(dataset, source_column_name, target_column_name, para_column_name, pos_column_name, prec_range, post_range):
    """Build model inputs for a batch of examples (intended for `Dataset.map(batched=True)`).

    Args:
        dataset: batch mapping column names to lists of values.
        source_column_name: column holding the uncorrected sentences.
        target_column_name: column holding the corrected sentences.
        para_column_name: column holding each sentence's paragraph.
        pos_column_name: column holding each sentence's index in its paragraph.
        prec_range: number of preceding context sentences requested.
        post_range: number of following context sentences requested.

    Returns:
        Dict with `input_ids`, `attention_mask` (from the source) and `labels`
        (target ids with padding masked as -100).
    """
    source_encoding = tokenize_source_sentences(
        dataset[source_column_name],
        dataset[para_column_name],
        dataset[pos_column_name],
        prec_range,
        post_range,
    )
    target_encoding = tokenize_target_sentences(dataset[target_column_name])
    return {
        'input_ids': source_encoding['input_ids'],
        'attention_mask': source_encoding['attention_mask'],
        'labels': target_encoding['input_ids'],
    }

def train(train_dataset, eval_dataset, output_dir):
    """Fine-tune BART_MODEL on `train_dataset` with the Hugging Face Trainer.

    Evaluation and checkpointing both happen every 500 steps; at most two
    checkpoints are kept and the best one is reloaded when training ends.

    Args:
        train_dataset: preprocessed training dataset.
        eval_dataset: preprocessed validation dataset.
        output_dir: directory where checkpoints are written.
    """
    seq2seq_model = BartForConditionalGeneration.from_pretrained(BART_MODEL)
    seq2seq_model.to(device)

    trainer_settings = dict(
        output_dir=output_dir,
        num_train_epochs=NUM_EPOCHS,
        evaluation_strategy='steps',
        eval_steps=500,
        save_steps=500,
        learning_rate=LEARNING_RATE,
        load_best_model_at_end=True,
        save_total_limit=2,
    )

    Trainer(
        model=seq2seq_model,
        args=TrainingArguments(**trainer_settings),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    ).train()

def generate(model_path, dataset, prec_range, post_range):
    """Correct every sample with a saved BART checkpoint, printing a running counter.

    Args:
        model_path: directory (or hub id) of the fine-tuned model.
        dataset: iterable of samples exposing 'original', 'paragraph' and 'pos'.
        prec_range: number of preceding context sentences requested.
        post_range: number of following context sentences requested.

    Returns:
        List of corrected sentences, one per sample, with tokens separated by
        single spaces (spaCy tokenization).
    """
    corrector_model = BartForConditionalGeneration.from_pretrained(model_path)
    corrector_model.to(device)

    corrections = []
    for count, sample in enumerate(dataset, start=1):
        print(count)
        encoded = tokenize_source_sentences(
            [sample['original']],
            [sample['paragraph']],
            [sample['pos']],
            prec_range,
            post_range,
        )
        beams = corrector_model.generate(
            encoded.input_ids,
            max_length=MAX_TARGET_LENGTH,
            num_beams=NUM_BEAMS,
            early_stopping=True,
        )
        decoded = tokenizer.decode(beams[0], skip_special_tokens=True)
        # Re-space tokens with spaCy so the text lines up with the tokenized
        # references when edits are scored.
        corrections.append(' '.join(tok.text for tok in spacy_tokenizer(decoded)))

    return corrections

def get_model(path):
    """Load a BART conditional-generation model from `path` and place it on `device`."""
    return BartForConditionalGeneration.from_pretrained(path).to(device)

In [None]:
! pip install datasets

In [None]:
from datasets import load_dataset

# Each JSON file is loaded on its own; a single data file is exposed under the 'train' split.
dataset_train = load_dataset('json', data_files=train_path, split='train')
dataset_eval = load_dataset('json', data_files=val_path, split='train')

# Column names plus the context window: 1 preceding and 0 following sentences.
dataset_kwargs = {
        "source_column_name": "original",
        "target_column_name": "corrected",
        "para_column_name": "paragraph",
        "pos_column_name": "pos",
        "prec_range": 1,
        "post_range": 0,
        }

preprocessed_train = dataset_train.map(
    preprocess_dataset,
    batched=True,
    fn_kwargs=dataset_kwargs
)
preprocessed_eval = dataset_eval.map(
    preprocess_dataset,
    batched=True,
    fn_kwargs=dataset_kwargs
)

# NOTE(review): the output directory is named '1_1' while the window above is 1/0 —
# confirm the directory name matches the experiment being run.
train(preprocessed_train, preprocessed_eval, 'outputs_sf/model_sentence_append_source/1_1')

In [None]:
# Archive the sentence-flank output directory so it can be downloaded from Colab.
!zip -r /content/outputs_sf.zip /content/outputs_sf

In [None]:
from google.colab import files
# Download the archive created by the zip cell to the local machine.
files.download("/content/outputs_sf.zip")

In [None]:
! pip install errant

In [None]:
import errant
import spacy
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import namedtuple

# M2 line written for a sentence pair for which ERRANT finds no edits.
NOOP_EDIT = 'A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0'
nlp = spacy.load("en_core_web_sm")
# ERRANT annotator sharing the spaCy pipeline loaded above.
annotator = errant.load('en', nlp)

def generate_m2(input_sentences, output_sentences, output_path):
    """Write ERRANT edits between each input/output sentence pair as an M2 file.

    Each pair yields an 'S <input>' line, one line per edit (or NOOP_EDIT when
    there are none) and a blank separator line.

    Args:
        input_sentences: iterable of source sentences.
        output_sentences: iterable of system outputs, aligned with the inputs.
        output_path: path of the M2 file to create (overwritten).
    """
    with open(output_path, 'w') as m2_file:
        for source, hypothesis in zip(input_sentences, output_sentences):
            source_doc = annotator.parse(source)
            hypothesis_doc = annotator.parse(hypothesis)
            edits = annotator.annotate(source_doc, hypothesis_doc)
            edit_lines = [edit.to_m2() for edit in edits] or [NOOP_EDIT]
            # Trailing blank line separates sentence groups.
            m2_file.write('\n'.join([f'S {source}', *edit_lines]) + '\n\n')



# Operation prefix of an edit code (the part before the first ':') -> row label.
EDIT_OPS = {'M': 'Missing', 'U': 'Unnecessary', 'R': 'Replacement'}
# Codes that carry no operation prefix and are counted under their own row.
NOOP_EDIT_TYPE = 'noop'
UNK_EDIT_TYPE = 'UNK'
# Error-type part of an edit code (everything after the operation prefix).
EDIT_TYPES = [
    'ADJ', 'ADJ:FORM', 'ADV', 'CONJ', 'CONTR', 'DET', 'MORPH',
    'NOUN', 'NOUN:INFL', 'NOUN:NUM', 'NOUN:POSS',
    'ORTH', 'OTHER', 'PART', 'PREP', 'PRON', 'PUNCT', 'SPELL',
    'VERB', 'VERB:FORM', 'VERB:INFL', 'VERB:SVA', 'VERB:TENSE', 'WO',
]

# One M2 annotation: token span ("start end"), edit code and correction text.
Edit = namedtuple('Edit', ['span', 'code', 'correction'])

def load_edits(m2_file_path):
    """Read an M2 file and return its edits grouped per sentence.

    Sentence groups are separated by blank lines. Within a group only lines
    starting with 'A ' are treated as edits, so the 'S' line, stray blank
    lines and a missing or extra trailing newline no longer produce malformed
    `Edit` tuples (the previous version raised TypeError on them).

    Args:
        m2_file_path: path to the M2 file.

    Returns:
        List with one entry per sentence group; each entry is a list of
        `Edit(span, code, correction)` tuples in file order.
    """
    edits = []
    with open(m2_file_path, 'r') as f:
        for group in f.read().split('\n\n'):
            lines = [line for line in group.split('\n') if line.strip()]
            if not lines:
                continue
            edits.append([
                # Drop the leading "A " and keep the first three '|||' fields.
                Edit(*line[2:].split('|||')[:3])
                for line in lines
                if line.startswith('A ')
            ])
    return edits

def create_error_count_df(gold_edits, output_edits):
    """Count TP/FP/FN per edit operation and error type, then derive P, R and F0.5.

    An edit is a true positive when the identical (span, code, correction)
    tuple appears in both the gold and the system edits of the same sentence.
    Codes other than 'noop'/'UNK' are counted twice: once under their
    operation row and once under their error-type row.

    Args:
        gold_edits: per-sentence lists of gold `Edit` tuples.
        output_edits: per-sentence lists of system `Edit` tuples, aligned with gold.

    Returns:
        DataFrame indexed by operation/type with columns TP, FP, FN, P, R, F0.5
        (ratios are NaN where the denominator is zero).
    """
    row_labels = [*EDIT_OPS.values(), *EDIT_TYPES, NOOP_EDIT_TYPE, UNK_EDIT_TYPE]
    counts = pd.DataFrame(0, index=row_labels, columns=['TP', 'FP', 'FN'])
    unprefixed_codes = (NOOP_EDIT_TYPE, UNK_EDIT_TYPE)

    def tally(edit, outcome):
        if edit.code in unprefixed_codes:
            counts.loc[edit.code, outcome] += 1
            return
        op, type_ = edit.code.split(':', maxsplit=1)
        counts.loc[EDIT_OPS[op], outcome] += 1
        counts.loc[type_, outcome] += 1

    for gold_sentence, output_sentence in zip(gold_edits, output_edits):
        gold, predicted = set(gold_sentence), set(output_sentence)
        buckets = (
            ('TP', gold & predicted),
            ('FP', predicted - gold),
            ('FN', gold - predicted),
        )
        for outcome, bucket in buckets:
            for edit in bucket:
                tally(edit, outcome)

    counts['P'] = counts['TP'] / (counts['TP'] + counts['FP'])
    counts['R'] = counts['TP'] / (counts['TP'] + counts['FN'])
    beta_sq = 0.5**2
    counts['F0.5'] = (1 + beta_sq) * ((counts['P'] * counts['R']) / (beta_sq * counts['P'] + counts['R']))
    return counts

def analyze_error_types(actual_path, predicted_path):
    """Print per-type TP/FP/FN counts and plot a heatmap of P, R and F0.5.

    Args:
        actual_path: path to the gold M2 file.
        predicted_path: path to the system M2 file.
    """
    error_df = create_error_count_df(load_edits(actual_path), load_edits(predicted_path))
    print(error_df)
    score_columns = error_df[['P', 'R', 'F0.5']]
    sns.heatmap(score_columns, vmin=0.0, vmax=1.0, cmap='Reds', annot=True, yticklabels=True)
    plt.show()

def analyze_params(model):
    """Print the total and the trainable parameter counts of `model`."""
    sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]
    total_params = sum(count for count, _ in sizes)
    print(f"{total_params:,} total parameters.")
    total_trainable_params = sum(count for count, trainable in sizes if trainable)
    print(f"{total_trainable_params:,} training parameters.")

In [None]:
# Quick look at the size of the test split (rows, columns).
x = load_dataset('json', data_files=test_path, split='train')
print(x.shape)

In [None]:
from datasets import load_dataset

# Checkpoint to evaluate; it must exist under the sentence-flank output directory.
model_path = 'outputs_sf/model_sentence_append_source/1_1/checkpoint-7000'

analyze_params(get_model(model_path))

dataset_test = load_dataset('json', data_files=test_path, split='train')
# Positional arguments are prec_range=1 and post_range=0, matching the training cell.
generated_sentences = generate(
    model_path,
    dataset_test,
    1,
    0
)

# Write the system edits next to the checkpoint, then compare them with the gold M2 file.
generate_m2(dataset_test['original'], generated_sentences, model_path + '/gen.m2')

analyze_error_types('/content/drive/MyDrive/datasets/wi+locness/dataset_splits/test.m2', model_path + '/gen.m2')

In [None]:
# Re-run only the analysis on the gen.m2 already written for the current model_path.
analyze_error_types('/content/drive/MyDrive/datasets/wi+locness/dataset_splits/test.m2', model_path + '/gen.m2')

In [None]:
# Score the sentence-flank hypothesis M2 against the gold test M2 with ERRANT's CLI.
!errant_compare -hyp "/content/outputs_sf/model_sentence_append_source/1_1/checkpoint-7000/gen.m2" -ref  '/content/drive/MyDrive/datasets/wi+locness/dataset_splits/test.m2'

In [None]:
# Score /content/bart_simple_gen.m2 against the gold test M2.
# NOTE(review): this file is not produced by any cell visible here — presumably uploaded manually.
!errant_compare -hyp "/content/bart_simple_gen.m2" -ref  '/content/drive/MyDrive/datasets/wi+locness/dataset_splits/test.m2'

In [None]:
# Score /content/bart_sf8500_10_gen.m2 against the gold test M2.
# NOTE(review): this file is not produced by any cell visible here — presumably uploaded manually.
!errant_compare -hyp "/content/bart_sf8500_10_gen.m2" -ref  '/content/drive/MyDrive/datasets/wi+locness/dataset_splits/test.m2'

In [None]:
# Hand-written two-sentence paragraphs; the trailing comment names the error
# type the pair is meant to probe.
CUSTOM_TESTS = [
    ['She saw a cat.', 'He screams out loud.'],  # PRON, VERB:TENSE
    ['The P versus NP problem is an unsolved problem in computer science.', 'No one has solved them to this day.'],  # PRON
    ['The Millennium Prize Problems are seven very complex mathematical problems.', 'No one has solved it to this day.'],
    ['Car crashes are easily preventable.', 'Most cases occurred because the driver was careless.'],  # VERB:TENSE
    ['A study was done on 1000 car crashes.', 'Most cases occur because the driver is careless.'],
    ["If he thinks about it more, I'm sure he'll figure something out.", 'The right idea eventually came to him.'],  # VERB:TENSE
    ['The right idea will eventually come to him.', 'Many weeks of effort finally paid off.'],
    ['Everyone knows that cats are adorable.', 'But they make for great companions.'],  # CONJ
    ['Cats can be annoying at times.', 'And they make for great companions.'],
    ['I visit the apple store frequently.', "I'm always eager to check out the latest phone."],  # ORTH
    ['I visit the apple store frequently.', 'Fruit works great as a snack.'],
    ['Tom told his sister there was a spider in her hair.', 'Cried out in alarm.'],  # PRON
    ['There have been complaints about long queues in the canteens.', "I'm looking them now."],  # PREP
    ["I lost my earphones earlier.", "I'm looking them now."]
]

def get_custom_tests():
    """Flatten CUSTOM_TESTS into one record per sentence.

    Returns:
        List of dicts with the sentence ('original'), its index within the
        paragraph ('pos') and the full paragraph ('paragraph').
    """
    return [
        {"original": sentence, "pos": pos, "paragraph": para}
        for para in CUSTOM_TESTS
        for pos, sentence in enumerate(para)
    ]

In [None]:
from datasets import Dataset

# Run the hand-written probe sentences through the sentence-flank model.
custom_tests = get_custom_tests()
dataset_test = Dataset.from_list(custom_tests)

model_path = 'outputs_sf/model_sentence_append_source/1_1/checkpoint-7000'
# Trailing arguments are prec_range=1 and post_range=0.
generated_sentences = generate(model_path, dataset_test, 1, 0)

# One corrected sentence per line, stored next to the checkpoint.
with open(model_path + '/custom.txt', 'w') as f:
    for line in generated_sentences:
        f.write(f"{line}\n")

In [None]:
from google.colab import files
# Download the system M2 file written by the evaluation cell.
files.download('/content/outputs_sf/model_sentence_append_source/1_1/checkpoint-7000/gen.m2')

In [None]:
# Download the custom-test outputs; relies on `files` imported in the previous cell.
files.download('/content/outputs_sf/model_sentence_append_source/1_1/checkpoint-7000/custom.txt')

# EXPERIMENTS

## TRANSFORMER PERFORMANCE RAW

In [None]:
'''
Adding pre/post k sentences to the target sentence
Input: Error sentence + k pre + j post
Output: Corrected sentence
'''

import spacy
import torch
from transformers import BartModel, BartForConditionalGeneration, BartTokenizerFast
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer

# Configs
# Pretrained checkpoint used for both the tokenizer and the model.
BART_MODEL = 'gotutiyan/gec-bart-base'
# Inputs and labels are padded/truncated to these token lengths.
MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 240
NUM_EPOCHS = 7
# Hugging Face documentation recommends 1e-4 or 3e-4 for T5.
# NOTE(review): the model used here is BART, not T5 — confirm this rate is intended.
LEARNING_RATE = 3e-4
NUM_BEAMS = 5  # beam width passed to model.generate()
# Separator placed between neighbouring sentences of a context window.
CONCAT_PARA_TOKEN = ' <cct> '

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BartTokenizerFast.from_pretrained(BART_MODEL)
nlp = spacy.load("en_core_web_sm")
# Only the spaCy tokenizer is needed here: generate() uses it to re-space decoded text.
spacy_tokenizer = nlp.tokenizer

def tokenize_target_sentences(sentences):
    """Tokenize target sentences and turn them into loss-ready label ids.

    Sentences are padded/truncated to MAX_TARGET_LENGTH; padding positions are
    overwritten with -100 so the loss ignores them.

    Args:
        sentences: list of corrected (reference) sentence strings.

    Returns:
        The tokenizer's batch encoding on `device`, with padding ids in
        `input_ids` replaced by -100.
    """
    encoded = tokenizer(
        sentences,
        padding='max_length',
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        return_tensors="pt",
    ).to(device)
    label_ids = encoded.input_ids
    padding_positions = label_ids == tokenizer.pad_token_id
    # In-place write: the tensor stored in the encoding is the one being modified.
    label_ids[padding_positions] = -100
    encoded.input_ids = label_ids
    return encoded

def preprocess_dataset(dataset, source_column_name, target_column_name, para_column_name, pos_column_name, prec_range, post_range):
    """Build model inputs for a batch of examples (intended for `Dataset.map(batched=True)`).

    Relies on `tokenize_source_sentences`, which is not defined in this cell
    and must already exist in the kernel.

    Args:
        dataset: batch mapping column names to lists of values.
        source_column_name: column holding the uncorrected sentences.
        target_column_name: column holding the corrected sentences.
        para_column_name: column holding each sentence's paragraph.
        pos_column_name: column holding each sentence's index in its paragraph.
        prec_range: number of preceding context sentences requested.
        post_range: number of following context sentences requested.

    Returns:
        Dict with `input_ids`, `attention_mask` (from the source) and `labels`
        (target ids with padding masked as -100).
    """
    source_encoding = tokenize_source_sentences(
        dataset[source_column_name],
        dataset[para_column_name],
        dataset[pos_column_name],
        prec_range,
        post_range,
    )
    target_encoding = tokenize_target_sentences(dataset[target_column_name])
    return {
        'input_ids': source_encoding['input_ids'],
        'attention_mask': source_encoding['attention_mask'],
        'labels': target_encoding['input_ids'],
    }

def generate(model_path, dataset, prec_range, post_range):
    """Correct every sample with a saved BART checkpoint, printing a running counter.

    Args:
        model_path: directory (or hub id) of the model.
        dataset: iterable of samples exposing 'original', 'paragraph' and 'pos'.
        prec_range: number of preceding context sentences requested.
        post_range: number of following context sentences requested.

    Returns:
        List of corrected sentences, one per sample, with tokens separated by
        single spaces (spaCy tokenization).
    """
    corrector_model = BartForConditionalGeneration.from_pretrained(model_path)
    corrector_model.to(device)

    corrections = []
    for count, sample in enumerate(dataset, start=1):
        print(count)
        encoded = tokenize_source_sentences(
            [sample['original']],
            [sample['paragraph']],
            [sample['pos']],
            prec_range,
            post_range,
        )
        beams = corrector_model.generate(
            encoded.input_ids,
            max_length=MAX_TARGET_LENGTH,
            num_beams=NUM_BEAMS,
            early_stopping=True,
        )
        decoded = tokenizer.decode(beams[0], skip_special_tokens=True)
        # Re-space tokens with spaCy so the text lines up with the tokenized
        # references when edits are scored.
        corrections.append(' '.join(tok.text for tok in spacy_tokenizer(decoded)))

    return corrections

def get_model(path):
    """Load a BART conditional-generation model from `path` and place it on `device`."""
    return BartForConditionalGeneration.from_pretrained(path).to(device)

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

In [None]:
from transformers import pipeline

# Initialize the text2text-generation pipeline used as an off-the-shelf grammar
# corrector (no fine-tuning is done on it in this notebook).
corrector = pipeline("text2text-generation", "pszemraj/bart-base-grammar-synthesis")

In [None]:
dataset_test = load_dataset('json', data_files=test_path, split='train')
def generated_sentences():
  """Print the corrector's output for the first four test sentences (smoke test)."""
  for index, text in enumerate(dataset_test):
    if index > 3:
      break
    print(corrector(text["original"])[0]["generated_text"])

In [None]:
# Smoke test: print corrections for the first few test sentences.
generated_sentences()

In [None]:
# Accumulator for the raw corrector outputs; re-run this cell before the
# collection loop to avoid appending to earlier results.
generated_sentences_raw = np.array([])

In [None]:
# Run the pretrained corrector over the whole test set.
# Outputs are gathered in a list and converted once: np.append copies the whole
# array on every call (quadratic), and appending to the existing global array
# duplicated results whenever the cell was re-run. The two branches of the old
# loop performed the same append, so they are merged.
corrected_texts = []
for i, text in enumerate(dataset_test, start=1):
  print(i)  # progress counter
  corrected_texts.append(corrector(text["original"])[0]["generated_text"])
generated_sentences_raw = np.array(corrected_texts)


In [None]:
from datasets import load_dataset

model_path = 'pszemraj/bart-base-grammar-synthesis'

analyze_params(get_model(model_path))

dataset_test = load_dataset('json', data_files=test_path, split='train')

# Use the corrector outputs collected in `generated_sentences_raw`. The previous
# version redefined `generated_sentences` as a function that referenced the
# undefined name `dataset_text` and then passed that function object to
# generate_m2, which cannot be zipped with the inputs.
# Outputs are re-spaced with spaCy, as generate() does for the fine-tuned
# models, so tokens line up with the references when edits are scored.
raw_outputs = [
    ' '.join(tok.text for tok in spacy_tokenizer(str(sentence)))
    for sentence in generated_sentences_raw
]
if len(raw_outputs) != len(dataset_test):
    # zip() inside generate_m2 would silently truncate and misalign the pairs.
    raise ValueError(
        f"Expected {len(dataset_test)} corrected sentences, got {len(raw_outputs)}; "
        "re-run the collection loop on a fresh accumulator."
    )

generate_m2(dataset_test['original'], raw_outputs, '/gen.m2')

analyze_error_types('/content/drive/MyDrive/datasets/wi+locness/dataset_splits/test.m2', '/gen.m2')

## BiLSTM Grammar Correction Experiments (Archived)

Credit:

https://medium.com/geekculture/neural-machine-translation-using-seq2seq-model-with-attention-9faea357d70b

https://shreelakshmigp1995.medium.com/grammatical-error-correction-using-deep-learning-c36824de184

https://colab.research.google.com/drive/1XrjPL3O_szhahYZW0z9yhCl9qvIcJJYW

https://suraj1997lodh.medium.com/grammar-error-handling-and-correction-with-dataset-creation-e446fa6863b8

https://medium.com/analytics-vidhya/grammatical-error-correction-using-neural-networks-aaf3e9fc91c

## BILSTM NO CONTEXT


In [None]:
# Load the three JSON splits as DataFrames for the BiLSTM experiments.
train_df = pd.read_json(train_path)
val_df = pd.read_json(val_path)
test_df = pd.read_json(test_path)

In [None]:
# Stack train and validation rows (train first) with a fresh 0..n-1 index.
combined_train_val_df = pd.concat([train_df, val_df], ignore_index=True)

In [None]:
# Wrap every corrected sentence in 'sos' ... 'eos' markers.
# NOTE: this overwrites the column in place, so re-running the cell adds the markers again.
combined_train_val_df['corrected'] =combined_train_val_df.corrected.apply(lambda x: 'sos '+ x + ' eos')

# Convert to lists of sentences, the form passed to the tokenizer below.
org_texts = combined_train_val_df.original.to_list()
cor_texts = combined_train_val_df.corrected.to_list()

In [None]:
def tokenize_sent(text):
  """Fit a fresh Keras `Tokenizer` on `text` and encode the same texts.

  Args:
      text: list of sentence strings.

  Returns:
      Tuple of (fitted tokenizer, list of integer sequences for `text`).
  """
  fitted_tokenizer = Tokenizer()
  fitted_tokenizer.fit_on_texts(text)
  encoded_sequences = fitted_tokenizer.texts_to_sequences(text)
  return fitted_tokenizer, encoded_sequences

In [None]:
# Tokenize: one tokenizer per side (original / corrected), each fitted on its own texts.
org_tokenizer, org_encoded= tokenize_sent(text= org_texts)
cor_tokenizer, cor_encoded= tokenize_sent(text= cor_texts)

# Original index --> word dictionary
org_index_word = org_tokenizer.index_word

# Original word --> index dictionary
org_word_index= org_tokenizer.word_index

# Size of the original vocabulary for the encoder input.
# +1 because index 0 is reserved for zero padding.
ORG_VOCAB_SIZE = len(org_tokenizer.word_counts)+1

# Corrected word --> index dictionary
cor_word_index= cor_tokenizer.word_index

# Corrected index --> word dictionary
cor_index_word = cor_tokenizer.index_word

# Corrected vocabulary size for the decoder output (+1 for zero padding).
COR_VOCAB_SIZE=len(cor_tokenizer.word_counts)+1

In [None]:
# Longest encoded sequence on each side; used as the padding length below.
max_org_len = max((len(seq) for seq in org_encoded), default=0)

# Fix: the previous loop gated the update on len(org_encoded[i]), so max_cor_len
# could end up smaller than the longest corrected sequence and padding would
# then truncate targets.
max_cor_len = max((len(seq) for seq in cor_encoded), default=0)


In [None]:
# Padding both
org_padded = pad_sequences(org_encoded, maxlen=max_org_len, padding='post')
cor_padded = pad_sequences(cor_encoded, maxlen=max_cor_len, padding='post')

# Convert to array
org_padded= np.array(org_padded)
cor_padded= np.array(cor_padded)

In [None]:
# Split the combined rows back into train / validation by position.
# NOTE(review): 28066+1 is presumably the number of training rows (len(train_df)) —
# verify, since the boundary is hard-coded.
X_train, X_val = org_padded[:28066+1], org_padded[28066+1:]
y_train, y_val = cor_padded[:28066+1], cor_padded[28066+1:]

In [None]:
# Encoder input: one padded sequence of max_org_len token indices per example.
encoder_inputs = Input(shape=(max_org_len,))

# Embedding layer with a 1024-dimensional output; other sizes (100, 256, 512, 1000) can be tried.
enc_emb = Embedding(ORG_VOCAB_SIZE, 1024)(encoder_inputs)

# Bidirectional LSTM (256 units per direction) returning the full output
# sequence plus the final h/c states of both directions.
enc_lstm1 = Bidirectional(LSTM(256,return_sequences=True,return_state=True))
encoder_outputs1, forw_state_h, forw_state_c, back_state_h, back_state_c = enc_lstm1(enc_emb)

# Concatenate the forward and backward states, separately for h and c.
final_enc_h = Concatenate()([forw_state_h,back_state_h])
final_enc_c = Concatenate()([forw_state_c,back_state_c])

# Context vector: the concatenated final states [h, c].
encoder_states =[final_enc_h, final_enc_c]

In [None]:
# -*- coding: utf-8 -*-
"""attention.ipynb
Automatically generated by Colaboratory.
Original file is located at
    https://colab.research.google.com/drive/1XrjPL3O_szhahYZW0z9yhCl9qvIcJJYW
"""

class BahdAttentionLayer(Layer):
    """
    Additive (Bahdanau) attention, https://arxiv.org/pdf/1409.0473.pdf.

    Three weight matrices are learned:
      * W_a - projects every encoder output,
      * U_a - projects the current decoder output,
      * V_a - reduces tanh(S.W_a + h_j.U_a) to one score per encoder step.

    Call the layer with [encoder_output_sequence, decoder_output_sequence];
    it returns (context_vectors, attention_weights).
    """

    def __init__(self, **kwargs):
        super(BahdAttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable weights from [encoder_shape, decoder_shape]."""
        assert isinstance(input_shape, list)
        en_hidden = input_shape[0][2]  # last dimension of the encoder outputs
        de_hidden = input_shape[1][2]  # last dimension of the decoder outputs

        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((en_hidden, en_hidden)),
                                   initializer='uniform',
                                   trainable=True)
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((de_hidden, en_hidden)),
                                   initializer='uniform',
                                   trainable=True)
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((en_hidden, 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(BahdAttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs, verbose=False):
        """
        Compute attention weights and context vectors for every decoder step.

        inputs: [encoder_output_sequence, decoder_output_sequence]
        verbose: when True, print the shapes of the intermediate tensors.

        Returns (c_outputs, e_outputs): the context vectors and the softmax
        attention weights, one entry per decoder step.
        """
        # isinstance (rather than an exact type comparison) also accepts list
        # subclasses and tuples, which unpack the same way.
        assert isinstance(inputs, (list, tuple))
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)

        def energy_step(inputs, states):
            """ Step function for computing energy for a single decoder state """

            assert_msg = "States must be a list. However states {} is of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            # Sizes needed for reshaping
            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]

            # S.Wa where S = [s0, s1, ..., si] are the encoder outputs
            # <= batch_size*en_seq_len, latent_dim
            reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
            # <= batch_size, en_seq_len, latent_dim
            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden))
            if verbose:
                print('wa.s>',W_a_dot_s.shape)

            # hj.Ua for the current decoder output
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # <= batch_size, 1, latent_dim
            if verbose:
                print('Ua.h>',U_a_dot_h.shape)

            # tanh(S.Wa + hj.Ua); the decoder term broadcasts over encoder steps
            # <= batch_size*en_seq_len, latent_dim
            reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
            if verbose:
                print('Ws+Uh>', reshaped_Ws_plus_Uh.shape)

            # softmax(va.tanh(S.Wa + hj.Ua))
            # <= batch_size, en_seq_len
            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', e_i.shape)

            return e_i, [e_i]

        def context_step(inputs, states):
            """ Step function for computing ci using ei """
            # Weighted sum of the encoder outputs
            # <= batch_size, hidden_size
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        def create_inital_state(inputs, hidden_size):
            # The initial states are not used, but K.rnn requires one to be passed.
            # Built from `inputs` so the batch dimension matches at run time.
            fake_state = K.zeros_like(inputs)  # <= (batch_size, enc_seq_len, latent_dim)
            fake_state = K.sum(fake_state, axis=[1, 2])  # <= (batch_size)
            fake_state = K.expand_dims(fake_state)  # <= (batch_size, 1)
            fake_state = K.tile(fake_state, [1, hidden_size])  # <= (batch_size, hidden_size)
            return fake_state

        fake_state_c = create_inital_state(encoder_out_seq, encoder_out_seq.shape[-1])  # <= (batch_size, latent_dim)
        fake_state_e = create_inital_state(encoder_out_seq, encoder_out_seq.shape[1])  # <= (batch_size, enc_seq_len)

        # Energy outputs: iterate energy_step over the decoder time steps
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        # Context vectors: iterate context_step over the attention weights
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """ Outputs produced by the layer """
        # Fix: the context vectors are weighted sums of *encoder* outputs, so
        # their last dimension is the encoder width (input_shape[0][2]). The
        # decoder width used before is only correct when both happen to match.
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

In [None]:
from tensorflow.keras import layers
from tensorflow.keras import Model

In [None]:
# Decoder input: target token ids; the sequence length is left unspecified
# (None) so the same layers can be reused with a different length at inference
decoder_inputs = Input(shape=(None,))

# Decoder embedding with the same output dimension (1024) as the encoder embedding
dec_emb_layer = Embedding(COR_VOCAB_SIZE, 1024)
dec_emb = dec_emb_layer(decoder_inputs)   # layer kept in a variable because it is reused for prediction

# The encoder is bidirectional (two LSTMs of 256 units), so the single decoder
# LSTM needs 256*2=512 units to accept the concatenated encoder states.
# The decoder LSTM starts from the encoder's final states.
decoder_lstm = LSTM(512, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Attention over the encoder outputs, queried with the decoder outputs
attention_layer = BahdAttentionLayer()
attention_result, attention_weights = attention_layer([encoder_outputs1, decoder_outputs])

# Concatenate the decoder LSTM output with the attention context on the feature axis
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attention_result])

# Dense softmax layer producing a distribution over the target vocabulary per step
decoder_dense = Dense(COR_VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)


# Training model: (source ids, shifted target ids) -> next-token probabilities
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Compile the model.
# Fix: metrics=['accuracy'] is needed so that Keras logs `val_accuracy`.
# EarlyStopping below monitors that value; without any compiled metric it is
# never available and early stopping cannot trigger.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Callbacks: write checkpoints to "MODEL_CHECKPOINTS" and stop when
# val_accuracy has not improved for 5 epochs
checkpoint = ModelCheckpoint("MODEL_CHECKPOINTS")
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)
callbacks_list = [checkpoint, early_stopping]

# Training set
encoder_input_data = X_train
# Decoder input: every target position except the last, so that it has the
# same length as the target data below
decoder_input_data = y_train[:,:-1]
# Decoder target: the same sequence shifted one step ahead (teacher forcing)
decoder_target_data =  y_train[:,1:]

# Development (validation) set, prepared the same way
encoder_input_test = X_val
decoder_input_test = y_val[:,:-1]
decoder_target_test=  y_val[:,1:]

In [None]:
# Train for 5 epochs in batches of 32, evaluating on the development set after
# every epoch and running the callbacks defined above.
history = model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=32,
    epochs=5,
    callbacks=callbacks_list,
    validation_data=([encoder_input_test, decoder_input_test], decoder_target_test),
)

In [None]:
# Persist only the trained weights (not the architecture); the model has to be
# rebuilt before these weights can be loaded again.
model.save_weights("model.h5") # a full path can be given instead of a bare file name

In [None]:
# Recursively archive the /content/outputs directory into /content/outputs.zip.
# NOTE(review): nothing in this section writes to /content/outputs — confirm it is still needed.
!zip -r /content/outputs.zip /content/outputs

In [None]:
# It is good practice to restart the runtime, rebuild the model and then load the weights
model.load_weights("model.h5")

# INFERENCE MODEL
# Encoder inference model: source ids -> (per-step encoder outputs, final h, final c)
encoder_model = Model(encoder_inputs, outputs = [encoder_outputs1, final_enc_h, final_enc_c])

# Decoder inference: the previous decoder states are fed in explicitly.
# 512 has to match the number of units of the trained decoder LSTM.
decoder_state_h = Input(shape=(512,))
decoder_state_c = Input(shape=(512,))

# The attention layer needs the encoder outputs for every source position.
# Fix: the sequence length was hard-coded to 36; it has to equal the encoder
# input length (max_org_len), otherwise the encoder outputs cannot be fed in
# whenever the longest source sentence is not exactly 36 tokens.
decoder_hidden_state_input = Input(shape=(max_org_len,512))
# Decoder states coming in
dec_states = [decoder_state_h, decoder_state_c]

# Reuse the trained embedding and LSTM layers
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=dec_states)

# Attention at inference time, reusing the trained attention layer
attention_result_inf, attention_weights_inf = attention_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_concat_input_inf = Concatenate(axis=-1, name='concat_layer')([decoder_outputs2, attention_result_inf])

# Decoder states going out, and the next-token distribution
dec_states2= [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_concat_input_inf)

# Decoder inference model:
# (target ids, encoder outputs, h, c) -> (token probabilities, new h, new c)
decoder_model= Model(
                    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_h, decoder_state_c],
                     [decoder_outputs2]+ dec_states2)

In [None]:
def get_predicted_sentence(input_seq):
    """Greedily decode the corrected sentence for one padded source sequence.

    Args:
        input_seq: encoder input for a single sentence, including the batch
            dimension (callers pass ``row.reshape(1, max_org_len)``).

    Returns:
        The decoded words as one string in which every word is preceded by a
        space. The string ends with ``' eos'`` when the decoder produced the
        end token; when decoding stopped because ``max_org_len`` steps were
        taken, there is no end marker.
    """
    # Encode the input; the final encoder states seed the decoder.
    enc_output, state_h, state_c = encoder_model.predict(input_seq)

    # Target sequence of length 1 holding the start token.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = cor_word_index['sos']

    # Decoding loop for a single sentence (batch of size 1).
    decoded_sentence = ''
    steps = 0
    stop_condition = False

    while not stop_condition:
        output_tokens, state_h, state_c = decoder_model.predict(
            [target_seq] + [enc_output, state_h, state_c])
        steps += 1

        # Greedy choice: the most probable token of the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        # Fix: index 0 is the padding index and has no vocabulary entry; a
        # plain lookup raised KeyError whenever the model predicted it.
        sampled_word = cor_index_word.get(sampled_token_index, '')
        if sampled_word:
            decoded_sentence += ' '+sampled_word

        # Exit condition: end token found or maximum length reached. Steps are
        # counted (instead of words in the string) so that a run of padding
        # predictions cannot keep the loop alive forever.
        if sampled_word == 'eos' or steps >= max_org_len:
            stop_condition = True

        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

    return decoded_sentence

In [None]:
def get_cor_sentence(input_sequence):
    """Turn a padded sequence of target token ids back into text.

    Zeros (padding) are skipped; every remaining id is looked up in
    ``cor_index_word`` and followed by a single space.
    """
    return ''.join(
        cor_index_word[token_id] + ' '
        for token_id in input_sequence
        if token_id != 0
    )

def get_org_sentence(input_sequence):
    """Turn a padded sequence of source token ids back into text.

    Zeros (padding) are skipped; every remaining id is looked up in
    ``org_index_word`` and followed by a single space.
    """
    return ''.join(
        org_index_word[token_id] + ' '
        for token_id in input_sequence
        if token_id != 0
    )

# # using simple loop we will take 15 random numbers from x_test and get results
# for i in np.random.randint(10, 1000, size=15):
#   print("Org Sentence:",get_org_sentence(X_test[i]))
#   print("Cor Sentence:",get_cor_sentence(y_test[i])[4:-4])
#   # Before passing input it has to be reshape as following
#   print("Predicted Cor:",get_predicted_sentence(X_test[i].reshape(1,max_org_len))[:-4])


In [None]:
! pip install errant

In [None]:
import errant
import spacy
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import namedtuple

# M2 annotation line written by generate_m2 when a sentence pair has no edits
NOOP_EDIT = 'A -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0'
# spaCy English pipeline, handed to ERRANT so that it uses this loaded model
nlp = spacy.load("en_core_web_sm")
# ERRANT annotator used to parse sentences and extract edits between them
annotator = errant.load('en', nlp)

def generate_m2(input_sentences, output_sentences, output_path):
    """Write the edits between paired sentences to ``output_path`` in M2 format.

    For every (input, output) pair the file receives an ``S <input>`` line,
    one annotation line per edit found by the ERRANT ``annotator`` (or the
    ``NOOP_EDIT`` line when there are none), and a blank divider line.

    Args:
        input_sentences: iterable of source sentences.
        output_sentences: iterable of the matching corrected sentences.
        output_path: path of the M2 file to create or overwrite.
    """
    with open(output_path, 'w') as m2_file:
        for source, corrected in zip(input_sentences, output_sentences):
            source_doc = annotator.parse(source)
            corrected_doc = annotator.parse(corrected)
            edits = annotator.annotate(source_doc, corrected_doc)
            if edits:
                annotation_lines = [edit.to_m2() for edit in edits]
            else:
                annotation_lines = [NOOP_EDIT]
            # The trailing '' yields the blank divider line after each sentence.
            print(f'S {source!s}', *annotation_lines, '', sep='\n', file=m2_file)



# Operation prefix of an edit code -> row label used in the results table
EDIT_OPS = dict(M='Missing', U='Unnecessary', R='Replacement')

# Edit codes that carry no "<operation>:<type>" structure
NOOP_EDIT_TYPE = 'noop'
UNK_EDIT_TYPE = 'UNK'

# Edit type labels (the part after the operation prefix), one table row each
EDIT_TYPES = [
    'ADJ',
    'ADJ:FORM',
    'ADV',
    'CONJ',
    'CONTR',
    'DET',
    'MORPH',
    'NOUN',
    'NOUN:INFL',
    'NOUN:NUM',
    'NOUN:POSS',
    'ORTH',
    'OTHER',
    'PART',
    'PREP',
    'PRON',
    'PUNCT',
    'SPELL',
    'VERB',
    'VERB:FORM',
    'VERB:INFL',
    'VERB:SVA',
    'VERB:TENSE',
    'WO',
]

# One parsed M2 annotation: token span, edit code and replacement text
Edit = namedtuple('Edit', 'span code correction')

def load_edits(m2_file_path):
    """Read an M2 file and return its edits grouped per sentence.

    Sentence blocks are separated by blank lines. The first line of a block
    (the sentence itself) is ignored; every following line has its two-character
    prefix removed and is split on ``|||``, of which the first three fields
    become an ``Edit(span, code, correction)``.

    Returns:
        A list with one list of ``Edit`` tuples per sentence block.
    """
    with open(m2_file_path, 'r') as m2_file:
        raw_text = m2_file.read()

    edits_per_sentence = []
    for block in raw_text.split('\n\n'):
        if block:
            _sentence_line, *annotation_lines = block.split('\n')
            parsed = [Edit(*line[2:].split('|||')[:3]) for line in annotation_lines]
            edits_per_sentence.append(parsed)
    return edits_per_sentence

def create_error_count_df(gold_edits, output_edits):
    """Count TP / FP / FN edits per operation and per edit type.

    Gold and predicted edits are compared sentence by sentence as sets: edits
    in both are true positives, edits only in the output are false positives,
    edits only in the gold set are false negatives. An edit with code
    ``<op>:<type>`` is counted once under its operation row and once under its
    type row; ``noop`` and ``UNK`` edits are counted under their own row.

    Args:
        gold_edits: list (per sentence) of lists of ``Edit`` tuples.
        output_edits: same structure for the system output.

    Returns:
        DataFrame indexed by operation/type label with columns
        TP, FP, FN, P, R and F0.5 (NaN where a ratio is undefined).
    """
    row_labels = list(EDIT_OPS.values()) + list(EDIT_TYPES) + [NOOP_EDIT_TYPE, UNK_EDIT_TYPE]
    counts = pd.DataFrame(0, index=row_labels, columns=['TP', 'FP', 'FN'])

    def bump(label, outcome):
        counts.loc[label, outcome] += 1

    for gold_sentence, output_sentence in zip(gold_edits, output_edits):
        gold, predicted = set(gold_sentence), set(output_sentence)
        outcomes = (
            ('TP', gold & predicted),
            ('FP', predicted - gold),
            ('FN', gold - predicted),
        )
        for outcome, matching_edits in outcomes:
            for edit in matching_edits:
                if edit.code == NOOP_EDIT_TYPE or edit.code == UNK_EDIT_TYPE:
                    bump(edit.code, outcome)
                    continue
                # Only the first ':' separates operation from type ('R:VERB:SVA').
                operation, edit_type = edit.code.split(':', maxsplit=1)
                bump(EDIT_OPS[operation], outcome)
                bump(edit_type, outcome)

    true_pos, false_pos, false_neg = counts['TP'], counts['FP'], counts['FN']
    counts['P'] = true_pos / (true_pos + false_pos)
    counts['R'] = true_pos / (true_pos + false_neg)
    beta_squared = 0.5**2
    counts['F0.5'] = (1 + beta_squared) * (
        (counts['P'] * counts['R']) / (beta_squared * counts['P'] + counts['R'])
    )
    return counts

def analyze_error_types(actual_path, predicted_path):
    """Print and plot per-error-type precision, recall and F0.5.

    Loads the gold edits from ``actual_path`` and the system edits from
    ``predicted_path`` (both M2 files), prints the count table and shows a
    heatmap of its P, R and F0.5 columns on a fixed 0-1 colour scale.
    """
    error_df = create_error_count_df(load_edits(actual_path), load_edits(predicted_path))
    print(error_df)

    score_columns = ['P', 'R', 'F0.5']
    sns.heatmap(
        error_df[score_columns],
        annot=True,
        cmap='Reds',
        vmin=0.0,
        vmax=1.0,
        yticklabels=True,
    )
    plt.show()

def analyze_params(model):
    """Print the total and the trainable parameter count of ``model``.

    ``model`` must offer ``parameters()`` yielding objects with ``numel()``
    and ``requires_grad``. Counts are printed with thousands separators.
    """
    total_params = 0
    for param in model.parameters():
        total_params += param.numel()
    print(f"{total_params:,} total parameters.")

    total_trainable_params = 0
    for param in model.parameters():
        if param.requires_grad:
            total_trainable_params += param.numel()
    print(f"{total_trainable_params:,} training parameters.")

In [None]:
# Row indices of X_val in random order (shuffled in place).
# NOTE(review): no seed is set in this cell, so the order presumably differs
# between runs — TODO confirm whether a seed is set earlier in the notebook.
xt_2 = np.arange( X_val.shape[ 0 ] )
np.random.shuffle(xt_2)

In [None]:
# Display the shuffled index order
xt_2

In [None]:
# Rows of X_val at the first 100 shuffled indices (all columns): the sample
# that is decoded below
slice1 = X_val[ xt_2[ :100 ], : ]

In [None]:
def get_cor_test_cases():
  """Decode every row of ``slice1`` and return the predictions as an array.

  Each row is reshaped to ``(1, max_org_len)`` and passed to
  ``get_predicted_sentence``. The row index is printed as a progress marker.

  Returns:
      NumPy array with one predicted sentence (without the end marker) per
      row of ``slice1``.
  """
  predictions = []
  for i in range(slice1.shape[0]):
    predicted = get_predicted_sentence(slice1[i].reshape(1,max_org_len))
    # Fix: only strip the trailing ' eos' when it is there. Decoding can also
    # stop on the length limit, in which case cutting the last four characters
    # unconditionally removed real text.
    if predicted.endswith(' eos'):
      predicted = predicted[:-4]
    print(i)
    predictions.append(predicted)
  # Collected in a list and converted once; np.append inside the loop copied
  # the whole array on every iteration.
  return np.array(predictions)


In [None]:
# Decode every sampled validation sentence (one prediction per row of slice1)
generated_sentences = get_cor_test_cases()

In [None]:
# Source sentences of the sample, decoded back to text.
# Fix: stored as a list instead of a one-shot `map` iterator, which is empty
# after its first use, so re-running the display cell showed nothing.
x = list(map(get_org_sentence, slice1))

In [None]:
# Display the decoded source sentences of the sample
list(x)

In [None]:
# Display the model's predictions for the sample
generated_sentences

## BILSTM (SENTENCE PAD LEFT + RIGHT)

In [None]:
import pandas as pd
import numpy as np


# Load the three dataset splits from their JSON files (train, then val, then test)
train_df, val_df, test_df = (
    pd.read_json(split_path) for split_path in (train_path, val_path, test_path)
)

def make_context_sents(para, pos):
    """Return the sentence at ``pos`` together with its direct neighbours.

    Args:
        para: list of sentences forming one paragraph.
        pos: index of the sentence of interest within ``para``.

    Returns:
        A list slice of ``para`` holding the previous sentence (if any), the
        sentence at ``pos`` and the next sentence (if any), in that order.
    """
    has_previous = pos - 1 >= 0
    has_next = pos + 1 < len(para)
    start = pos - 1 if has_previous else pos
    stop = pos + 2 if has_next else pos + 1
    # Fix: a paragraph with a single sentence used to return the bare string
    # para[pos]. Callers then run ' '.join(...) over the result, which put a
    # space between every character. A slice always yields a list.
    return para[start:stop]


# Add an 'original_with_context' column to every split: first the neighbouring
# sentences as returned by make_context_sents, then joined into one
# space-separated, stripped string.
for _frame in (train_df, test_df, val_df):
    _frame['original_with_context'] = _frame.apply(
        lambda row: make_context_sents(row.paragraph, row.pos), axis = 1)
    _frame['original_with_context'] = _frame['original_with_context'].apply(
        lambda sentences: (' '.join(sentences)).strip())


def add_context_to_target(para, pos, target):
    """Surround the corrected sentence with its neighbouring source sentences.

    Args:
        para: list of sentences forming one paragraph.
        pos: index within ``para`` of the sentence that was corrected.
        target: the corrected version of ``para[pos]``.

    Returns:
        One string: previous sentence (if any), ``target``, next sentence
        (if any), separated by single spaces and stripped.
    """
    parts = []
    if pos - 1 >= 0:
        parts.append(para[pos - 1])
    # Fix: a paragraph with a single sentence used to return para[pos], i.e.
    # the uncorrected sentence, instead of the target.
    parts.append(target)
    if pos + 1 < len(para):
        parts.append(para[pos + 1])
    # Fix: sentences used to be concatenated with no separator, fusing the
    # words at the boundaries. Joined like the 'original_with_context' input.
    return ' '.join(parts).strip()

# Add a 'corrected_with_context' column to every split: the corrected sentence
# combined with its neighbouring sentences by add_context_to_target.
for _frame in (train_df, test_df, val_df):
    _frame['corrected_with_context'] = _frame.apply(
        lambda row: add_context_to_target(row.paragraph, row.pos, row.corrected), axis = 1)

print(train_df)

In [None]:
# Stack train and validation rows (train first) under a fresh 0..n-1 index so
# that one tokenizer can be fitted on both
combined_train_val_df = pd.concat([train_df, val_df], ignore_index=True)

In [None]:
# Wrap every target in the start ('sos') and end ('eos') tokens.
# NOTE(review): this is built from `corrected`, so it replaces the
# context-augmented 'corrected_with_context' column computed earlier; the
# decoder target is then the corrected sentence alone — confirm this is intended.
combined_train_val_df['corrected_with_context'] =combined_train_val_df.corrected.apply(lambda x: 'sos '+ x + ' eos')

# Convert to plain lists of sentences, which is what the tokenizer is given
org_texts = combined_train_val_df.original_with_context.to_list()
cor_texts = combined_train_val_df.corrected_with_context.to_list()

def tokenize_sent(text):
  '''
  Fit a new Tokenizer on a list of texts and encode those same texts.

  Returns the fitted tokenizer and the texts converted to sequences
  of integer word indices.
  '''
  fitted_tokenizer = Tokenizer()
  fitted_tokenizer.fit_on_texts(text)
  encoded_texts = fitted_tokenizer.texts_to_sequences(text)
  return fitted_tokenizer, encoded_texts


# Fit one tokenizer per side and encode the texts as integer sequences
org_tokenizer, org_encoded= tokenize_sent(text= org_texts)
cor_tokenizer, cor_encoded= tokenize_sent(text= cor_texts)

# Source-side lookup keyed by integer index (index -> word)
org_index_word = org_tokenizer.index_word

# Source-side lookup keyed by word (word -> integer index)
org_word_index= org_tokenizer.word_index

# Size of the source vocabulary, used as the encoder Embedding input size.
# +1 because index 0 is used for padding and never assigned to a word.
ORG_VOCAB_SIZE = len(org_tokenizer.word_counts)+1

# Target-side lookup keyed by word (word -> integer index); used to find 'sos'
cor_word_index= cor_tokenizer.word_index

# Target-side lookup keyed by integer index (index -> word); used when decoding
cor_index_word = cor_tokenizer.index_word

# Size of the target vocabulary (+1 for the padding index), used by the
# decoder Embedding and the softmax output layer
COR_VOCAB_SIZE=len(cor_tokenizer.word_counts)+1

# Longest encoded source / target sequence; these become the padded widths.
# Fix: the target loop used to compare len(org_encoded[i]) against max_cor_len,
# so max_cor_len was only updated at rows where the *source* text was long.
# That could leave it below the true maximum, in which case pad_sequences
# truncates the longer targets (by default from the front, dropping 'sos').
# default=0 keeps the previous result for an empty corpus.
max_org_len = max((len(seq) for seq in org_encoded), default=0)
max_cor_len = max((len(seq) for seq in cor_encoded), default=0)


# Right-pad (padding='post') every encoded text with zeros up to the longest
# length on its side, and make sure the results are NumPy arrays.
org_padded = np.array(pad_sequences(org_encoded, padding='post', maxlen=max_org_len))
cor_padded = np.array(pad_sequences(cor_encoded, padding='post', maxlen=max_cor_len))


In [None]:
# Split the padded arrays back into their train and validation rows.
# The combined frame was built as train rows followed by validation rows, so
# the boundary is the number of training rows. Derived from train_df instead
# of the hard-coded 28066+1, which silently mixes the two sets if the split
# files ever change size.
n_train = len(train_df)
X_train, X_val = org_padded[:n_train], org_padded[n_train:]
y_train, y_val = cor_padded[:n_train], cor_padded[n_train:]

In [None]:
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, Concatenate
from tensorflow.keras import Input, Model

# Encoder input: one padded source sequence of max_org_len token ids per example
encoder_inputs = Input(shape=(max_org_len,))

# Embedding layer: maps each source token id to a 1024-dimensional vector
# (other output dims such as 100, 256, 512 or 1000 can be tried)
enc_emb = Embedding(ORG_VOCAB_SIZE, 1024)(encoder_inputs)

# Bidirectional LSTM with 256 units per direction. It returns the per-timestep
# outputs plus the final hidden (h) and cell (c) state of each direction.
enc_lstm1 = Bidirectional(LSTM(256,return_sequences=True,return_state=True))
encoder_outputs1, forw_state_h, forw_state_c, back_state_h, back_state_c = enc_lstm1(enc_emb)

# Concatenate the forward and backward final states (256 + 256 = 512 each)
final_enc_h = Concatenate()([forw_state_h,back_state_h])
final_enc_c = Concatenate()([forw_state_c,back_state_c])

# Final encoder states [h, c]; given to the decoder LSTM as its initial state
encoder_states =[final_enc_h, final_enc_c]

In [None]:
# -*- coding: utf-8 -*-
"""attention.ipynb
Automatically generated by Colaboratory.
Original file is located at
    https://colab.research.google.com/drive/1XrjPL3O_szhahYZW0z9yhCl9qvIcJJYW
"""

import tensorflow as tf
import os
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K


class AttentionLayer(Layer):
    """
    Additive (Bahdanau) attention, https://arxiv.org/pdf/1409.0473.pdf.

    Three weight matrices are learned:
      * W_a - projects every encoder output,
      * U_a - projects the current decoder output,
      * V_a - reduces tanh(S.W_a + h_j.U_a) to one score per encoder step.

    Call the layer with [encoder_output_sequence, decoder_output_sequence];
    it returns (context_vectors, attention_weights).
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable weights from [encoder_shape, decoder_shape]."""
        assert isinstance(input_shape, list)
        en_hidden = input_shape[0][2]  # last dimension of the encoder outputs
        de_hidden = input_shape[1][2]  # last dimension of the decoder outputs

        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((en_hidden, en_hidden)),
                                   initializer='uniform',
                                   trainable=True)
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((de_hidden, en_hidden)),
                                   initializer='uniform',
                                   trainable=True)
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((en_hidden, 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs, verbose=False):
        """
        Compute attention weights and context vectors for every decoder step.

        inputs: [encoder_output_sequence, decoder_output_sequence]
        verbose: when True, print the shapes of the intermediate tensors.

        Returns (c_outputs, e_outputs): the context vectors and the softmax
        attention weights, one entry per decoder step.
        """
        # isinstance (rather than an exact type comparison) also accepts list
        # subclasses and tuples, which unpack the same way.
        assert isinstance(inputs, (list, tuple))
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)

        def energy_step(inputs, states):
            """ Step function for computing energy for a single decoder state """

            assert_msg = "States must be a list. However states {} is of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            # Sizes needed for reshaping
            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]

            # S.Wa where S = [s0, s1, ..., si] are the encoder outputs
            # <= batch_size*en_seq_len, latent_dim
            reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
            # <= batch_size, en_seq_len, latent_dim
            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden))
            if verbose:
                print('wa.s>',W_a_dot_s.shape)

            # hj.Ua for the current decoder output
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # <= batch_size, 1, latent_dim
            if verbose:
                print('Ua.h>',U_a_dot_h.shape)

            # tanh(S.Wa + hj.Ua); the decoder term broadcasts over encoder steps
            # <= batch_size*en_seq_len, latent_dim
            reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
            if verbose:
                print('Ws+Uh>', reshaped_Ws_plus_Uh.shape)

            # softmax(va.tanh(S.Wa + hj.Ua))
            # <= batch_size, en_seq_len
            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', e_i.shape)

            return e_i, [e_i]

        def context_step(inputs, states):
            """ Step function for computing ci using ei """
            # Weighted sum of the encoder outputs
            # <= batch_size, hidden_size
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        def create_inital_state(inputs, hidden_size):
            # The initial states are not used, but K.rnn requires one to be passed.
            # Built from `inputs` so the batch dimension matches at run time.
            fake_state = K.zeros_like(inputs)  # <= (batch_size, enc_seq_len, latent_dim)
            fake_state = K.sum(fake_state, axis=[1, 2])  # <= (batch_size)
            fake_state = K.expand_dims(fake_state)  # <= (batch_size, 1)
            fake_state = K.tile(fake_state, [1, hidden_size])  # <= (batch_size, hidden_size)
            return fake_state

        fake_state_c = create_inital_state(encoder_out_seq, encoder_out_seq.shape[-1])  # <= (batch_size, latent_dim)
        fake_state_e = create_inital_state(encoder_out_seq, encoder_out_seq.shape[1])  # <= (batch_size, enc_seq_len)

        # Energy outputs: iterate energy_step over the decoder time steps
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        # Context vectors: iterate context_step over the attention weights
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """ Outputs produced by the layer """
        # Fix: the context vectors are weighted sums of *encoder* outputs, so
        # their last dimension is the encoder width (input_shape[0][2]). The
        # decoder width used before is only correct when both happen to match.
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

In [None]:
# Decoder input: target token ids; the sequence length is left unspecified
# (None) so the same layers can be reused with a different length at inference
decoder_inputs = Input(shape=(None,))

# Decoder embedding with the same output dimension (1024) as the encoder embedding
dec_emb_layer = Embedding(COR_VOCAB_SIZE, 1024)
dec_emb = dec_emb_layer(decoder_inputs)   # layer kept in a variable because it is reused for prediction

# The encoder is bidirectional (two LSTMs of 256 units), so the single decoder
# LSTM needs 256*2=512 units to accept the concatenated encoder states.
# The decoder LSTM starts from the encoder's final states.
decoder_lstm = LSTM(512, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Attention over the encoder outputs, queried with the decoder outputs
attention_layer = AttentionLayer()
attention_result, attention_weights = attention_layer([encoder_outputs1, decoder_outputs])

# Concatenate the decoder LSTM output with the attention context on the feature axis
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attention_result])

# Dense softmax layer producing a distribution over the target vocabulary per step
decoder_dense = Dense(COR_VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)


# Training model: (source ids, shifted target ids) -> next-token probabilities
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Compile the model.
# Fix: metrics=['accuracy'] is needed so that Keras logs `val_accuracy`.
# Both callbacks below monitor that value; without any compiled metric it is
# never available and early stopping cannot trigger.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Callbacks: write checkpoints to "MODEL_CHECKPOINTS" and stop when
# val_accuracy has not improved for 5 epochs
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
checkpoint = ModelCheckpoint("MODEL_CHECKPOINTS", monitor='val_accuracy')
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)
callbacks_list = [checkpoint, early_stopping]

# Training set
encoder_input_data = X_train
# Decoder input: every target position except the last, so that it has the
# same length as the target data below
decoder_input_data = y_train[:,:-1]
# Decoder target: the same sequence shifted one step ahead (teacher forcing)
decoder_target_data =  y_train[:,1:]

# Development (validation) set, prepared the same way
encoder_input_test = X_val
decoder_input_test = y_val[:,:-1]
decoder_target_test=  y_val[:,1:]

# Train for 5 epochs in batches of 32, validating after every epoch
history = model.fit([encoder_input_data, decoder_input_data],decoder_target_data,
                    epochs=5,
                    batch_size=32,
                    validation_data = ([encoder_input_test, decoder_input_test],decoder_target_test),
                    callbacks= callbacks_list)

# Save the weights of the trained model (a full path can be given instead)
model.save_weights("model_sent.h5")

In [None]:
# It is good practice to restart the runtime, rebuild the model and then load the weights
model.load_weights("model_sent.h5")

# INFERENCE MODEL
# Encoder inference model: source ids -> (per-step encoder outputs, final h, final c)
encoder_model = Model(encoder_inputs, outputs = [encoder_outputs1, final_enc_h, final_enc_c])

# Decoder inference: the previous decoder states are fed in explicitly
decoder_state_h = Input(shape=(512,)) # 512 has to match the number of units of the trained decoder LSTM
decoder_state_c = Input(shape=(512,))

# The attention layer needs the encoder outputs for every source position,
# so this input has the encoder's sequence length (max_org_len)
decoder_hidden_state_input = Input(shape=(max_org_len,512))
# Decoder states coming in
dec_states = [decoder_state_h, decoder_state_c]

# Reuse the trained embedding and LSTM layers
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=dec_states)

# Attention at inference time, reusing the trained attention layer
attention_result_inf, attention_weights_inf = attention_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_concat_input_inf = Concatenate(axis=-1, name='concat_layer')([decoder_outputs2, attention_result_inf])

# Decoder states going out, and the next-token distribution
dec_states2= [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_concat_input_inf)

# Decoder inference model:
# (target ids, encoder outputs, h, c) -> (token probabilities, new h, new c)
decoder_model= Model(
                    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_h, decoder_state_c],
                     [decoder_outputs2]+ dec_states2)

In [None]:
def get_predicted_sentence(input_seq):
    """Greedily decode the corrected sentence for one padded source sequence.

    Args:
        input_seq: encoder input for a single sentence, including the batch
            dimension (callers pass ``row.reshape(1, max_org_len)``).

    Returns:
        The decoded words as one string in which every word is preceded by a
        space. The string ends with ``' eos'`` when the decoder produced the
        end token; when decoding stopped because ``max_org_len`` steps were
        taken, there is no end marker.
    """
    # Encode the input; the final encoder states seed the decoder.
    enc_output, state_h, state_c = encoder_model.predict(input_seq)

    # Target sequence of length 1 holding the start token.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = cor_word_index['sos']

    # Decoding loop for a single sentence (batch of size 1).
    decoded_sentence = ''
    steps = 0
    stop_condition = False

    while not stop_condition:
        output_tokens, state_h, state_c = decoder_model.predict(
            [target_seq] + [enc_output, state_h, state_c])
        steps += 1

        # Greedy choice: the most probable token of the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        # Fix: index 0 is the padding index and has no vocabulary entry; a
        # plain lookup raised KeyError whenever the model predicted it.
        sampled_word = cor_index_word.get(sampled_token_index, '')
        if sampled_word:
            decoded_sentence += ' '+sampled_word

        # Exit condition: end token found or maximum length reached. Steps are
        # counted (instead of words in the string) so that a run of padding
        # predictions cannot keep the loop alive forever.
        if sampled_word == 'eos' or steps >= max_org_len:
            stop_condition = True

        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

    return decoded_sentence

In [None]:
def get_cor_sentence(input_sequence):
    """Turn a padded sequence of target token ids back into text.

    Zeros (padding) are skipped; every remaining id is looked up in
    ``cor_index_word`` and followed by a single space.
    """
    return ''.join(
        cor_index_word[token_id] + ' '
        for token_id in input_sequence
        if token_id != 0
    )

def get_org_sentence(input_sequence):
    """Render a sequence of original-vocabulary ids as text.

    Ids equal to 0 are skipped; every other id is looked up in
    `org_index_word` and followed by a single space, so a non-empty
    result ends with a trailing space.
    """
    return ''.join(
        org_index_word[token_id] + ' '
        for token_id in input_sequence
        if token_id != 0
    )

# Spot-check the model on 15 randomly drawn validation sentences.
# The index range is clamped to len(X_val) so a validation split with fewer
# than 1000 rows cannot raise IndexError; with >= 1000 rows the draw is the
# same randint(10, 1000) as before.
sample_stop = min(1000, len(X_val))
sample_start = min(10, sample_stop - 1)
for i in np.random.randint(sample_start, sample_stop, size=15):
  print("Org Sentence:",get_org_sentence(X_val[i]))
  # Drops 4 characters at each end; presumably the 'sos ' / 'eos ' markers
  # around the reference sentence (assumes y_val rows keep both markers).
  print("Cor Sentence:",get_cor_sentence(y_val[i])[4:-4])
  # The prediction helper is called with a batch of one: shape (1, max_org_len).
  predicted = get_predicted_sentence(X_val[i].reshape(1,max_org_len))
  # Strip the trailing ' eos' only when it is really there: decoding can also
  # stop at the length limit, and a blind [:-4] would then cut off real text.
  if predicted.endswith(' eos'):
    predicted = predicted[:-4]
  print("Predicted Cor:",predicted)
  print("----------------------------------------------------------------------------------------")

In [None]:
# Training-time decoder: embedding -> LSTM seeded with the encoder states ->
# attention over the encoder outputs -> softmax over the corrected vocabulary.
# The layer objects created here (dec_emb_layer, decoder_lstm, attention_layer,
# decoder_dense) are reused by the inference-model cell further down.
decoder_inputs = Input(shape=(None,))

# Decoder embedding: COR_VOCAB_SIZE ids mapped to 1024-dimensional vectors.
dec_emb_layer = Embedding(COR_VOCAB_SIZE, 1024)
dec_emb = dec_emb_layer(decoder_inputs)   # keep the layer object so the inference decoder can call it again

# In encoder we used Bidirectional so it's having two LSTM's so we have to take double units(256*2=512) for single decoder lstm
# (NOTE(review): the encoder is defined outside this cell; confirm its unit count.)
# LSTM using encoder's final states as initial state
decoder_lstm = LSTM(512, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Attention between the encoder outputs and the decoder outputs.
attention_layer = AttentionLayer()
attention_result, attention_weights = attention_layer([encoder_outputs1, decoder_outputs])

# Concat attention output and decoder LSTM output along the feature axis.
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attention_result])

# Dense layer with softmax over the corrected vocabulary.
# Note: `decoder_outputs` is rebound here from the LSTM output to the softmax output.
decoder_dense = Dense(COR_VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)


# Define the training model: (encoder tokens, decoder tokens) -> per-step word distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Compile the model.
# 'accuracy' is tracked so that the `val_accuracy` quantity monitored by the
# callbacks below is actually logged; without a metric Keras only reports
# loss/val_loss and EarlyStopping has nothing to act on.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define callbacks
checkpoint = ModelCheckpoint("MODEL_CHECKPOINTS", monitor='val_accuracy')
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)
callbacks_list = [checkpoint, early_stopping]

# Training set
encoder_input_data = X_train
# Decoder input: every target position except the last one.
decoder_input_data = y_train[:,:-1]
# Decoder target: the same sequence shifted one step ahead, so at each
# position the model learns to predict the next token.
decoder_target_data =  y_train[:,1:]

# Development (validation) set, split the same way.
encoder_input_test = X_val
decoder_input_test = y_val[:,:-1]
decoder_target_test=  y_val[:,1:]

# NOTE(review): with epochs=1 the EarlyStopping patience of 5 can never be
# reached; raise `epochs` for a real training run.
history = model.fit([encoder_input_data, decoder_input_data],decoder_target_data,
                    epochs=1,
                    batch_size=32,
                    validation_data = ([encoder_input_test, decoder_input_test],decoder_target_test),
                    callbacks= callbacks_list)

# Save the trained weights; the inference cell reloads them from this file.
model.save_weights("model_sent.h5") # can give whole path to save model

In [None]:
# Build the inference-time encoder and decoder models from the trained layers.
# It's good to restart the runtime, re-create the model and then load the weights.
model.load_weights("model_sent.h5")

# INFERENCE MODEL
# Encoder inference model: source tokens -> (encoder outputs, final h, final c).
encoder_model = Model(encoder_inputs, outputs = [encoder_outputs1, final_enc_h, final_enc_c])

# Decoder inference: the recurrent state is passed in explicitly at each step.
decoder_state_h = Input(shape=(512,)) # must match the units of the trained decoder LSTM (512)
decoder_state_c = Input(shape=(512,))

# Placeholder fed with the encoder outputs, which the attention layer needs.
decoder_hidden_state_input = Input(shape=(max_org_len,512))
# Initial decoder states for one decoding step.
dec_states = [decoder_state_h, decoder_state_c]

# Reuse the trained embedding and LSTM layers on the decoder input.
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=dec_states)

# Attention inference: same attention layer, now fed from the placeholder.
attention_result_inf, attention_weights_inf = attention_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_concat_input_inf = Concatenate(axis=-1, name='concat_layer')([decoder_outputs2, attention_result_inf])

# Updated states are returned so the caller can feed them into the next step.
# Note: `decoder_outputs2` is rebound from the LSTM output to the softmax output.
dec_states2= [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_concat_input_inf)

# Decoder inference model:
# (token, encoder outputs, h, c) -> (word distribution, new h, new c).
decoder_model= Model(
                    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_h, decoder_state_c],
                     [decoder_outputs2]+ dec_states2)

In [None]:
def get_predicted_sentence(input_seq):
    """Greedily decode a corrected sentence for one encoded source sentence.

    Uses the notebook-level `encoder_model`, `decoder_model`,
    `cor_word_index`, `cor_index_word` and `max_org_len`.

    Args:
        input_seq: Source token ids for a single sentence, in the
            batch-of-one form accepted by `encoder_model.predict`.

    Returns:
        The decoded words, each preceded by one space (e.g. ' i ate eos').
        The string ends with ' eos' only when the model emitted that stop
        token. Decoding also stops once `max_org_len` words were produced,
        or when the model predicts an id that has no entry in
        `cor_index_word` (that id is not added to the result).
    """
    # Encode the source once; the encoder states seed the decoder.
    enc_output, state_h, state_c = encoder_model.predict(input_seq, verbose=0)

    # The decoder is fed one token per step, starting from the 'sos' marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = cor_word_index['sos']

    decoded_words = []
    while True:
        # verbose=0 keeps predict() from printing a progress bar on every step.
        output_tokens, state_h, state_c = decoder_model.predict(
            [target_seq, enc_output, state_h, state_c], verbose=0)

        # Greedy choice: the most probable token at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = cor_index_word.get(sampled_token_index)
        if sampled_word is None:
            # An id without a vocabulary entry (e.g. 0, which the sentence
            # helpers also skip) would raise KeyError; treat it as the end.
            break
        decoded_words.append(sampled_word)

        # Exit condition: either find the stop token or hit the max length.
        if sampled_word == 'eos' or len(decoded_words) >= max_org_len:
            break

        # Feed the token just produced back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return ''.join(' ' + word for word in decoded_words)

In [None]:
def get_cor_sentence(input_sequence):
    """Render a sequence of corrected-vocabulary ids as text.

    Ids equal to 0 are skipped; every other id is looked up in
    `cor_index_word` and followed by a single space, so a non-empty
    result ends with a trailing space.
    """
    words = [cor_index_word[token_id] for token_id in input_sequence if token_id != 0]
    return ''.join(word + ' ' for word in words)

def get_org_sentence(input_sequence):
    """Render a sequence of original-vocabulary ids as text.

    Ids equal to 0 are skipped; every other id is looked up in
    `org_index_word` and followed by a single space, so a non-empty
    result ends with a trailing space.
    """
    return ''.join(
        org_index_word[token_id] + ' '
        for token_id in input_sequence
        if token_id != 0
    )

# Spot-check the model on 15 randomly drawn validation sentences.
# The index range is clamped to len(X_val) so a validation split with fewer
# than 1000 rows cannot raise IndexError; with >= 1000 rows the draw is the
# same randint(10, 1000) as before.
sample_stop = min(1000, len(X_val))
sample_start = min(10, sample_stop - 1)
for i in np.random.randint(sample_start, sample_stop, size=15):
  print("Org Sentence:",get_org_sentence(X_val[i]))
  # Drops 4 characters at each end; presumably the 'sos ' / 'eos ' markers
  # around the reference sentence (assumes y_val rows keep both markers).
  print("Cor Sentence:",get_cor_sentence(y_val[i])[4:-4])
  # The prediction helper is called with a batch of one: shape (1, max_org_len).
  predicted = get_predicted_sentence(X_val[i].reshape(1,max_org_len))
  # Strip the trailing ' eos' only when it is really there: decoding can also
  # stop at the length limit, and a blind [:-4] would then cut off real text.
  if predicted.endswith(' eos'):
    predicted = predicted[:-4]
  print("Predicted Cor:",predicted)
  print("----------------------------------------------------------------------------------------")

In [None]:
!pip install bertviz

In [None]:
from transformers import AutoTokenizer, AutoModel
from transformers import BartModel, BartForConditionalGeneration, BartTokenizerFast

# Load the GEC BART checkpoint with output_attentions=True; the attention
# visualisation cell below reads the attention tensors from the model output.
# NOTE(review): this rebinds `model`, which earlier cells use for the Keras
# seq2seq model; re-running those cells after this one needs `model` rebuilt.
model = BartModel.from_pretrained("gotutiyan/gec-bart-base", output_attentions=True)

In [None]:
# Tokenizer loaded from the same checkpoint name as the BartModel above.
tokenizer = BartTokenizerFast.from_pretrained("gotutiyan/gec-bart-base")

In [None]:
# Hand-written two-sentence test cases.
# NOTE(review): each entry looks like [preceding sentence, sentence under test],
# where the second sentence can only be judged correct or wrong in light of the
# first; confirm against the code that consumes this list.
# The trailing comments appear to be error-type labels for the case on that
# line and the unlabelled case(s) that follow it; TODO confirm.
CUSTOM_TESTS = [
    ['She saw a cat.', 'He screams out loud.'],  # PRON, VERB:TENSE
    ['The P versus NP problem is an unsolved problem in computer science.', 'No one has solved them to this day.'],  # PRON
    ['The Millennium Prize Problems are seven very complex mathematical problems.', 'No one has solved it to this day.'],
    ['Car crashes are easily preventable.', 'Most cases occurred because the driver was careless.'],  # VERB:TENSE
    ['A study was done on 1000 car crashes.', 'Most cases occur because the driver is careless.'],
    ["If he thinks about it more, I'm sure he'll figure something out.", 'The right idea eventually came to him.'],  # VERB:TENSE
    ['The right idea will eventually come to him.', 'Many weeks of effort finally paid off.'],
    ['Everyone knows that cats are adorable.', 'But they make for great companions.'],  # CONJ
    ['Cats can be annoying at times.', 'And they make for great companions.'],
    ['I visit the apple store frequently.', "I'm always eager to check out the latest phone."],  # ORTH
    ['I visit the apple store frequently.', 'Fruit works great as a snack.'],
    ['Tom told his sister there was a spider in her hair.', 'Cried out in alarm.'],  # PRON
    ['There have been complaints about long queues in the canteens.', "I'm looking them now."],  # PREP
    ["I lost my earphones earlier.", "I'm looking them now."]
]

In [None]:
! pip install errant

In [None]:
# Run ERRANT's errant_compare with the BART hypothesis M2 file (-hyp) against
# the reference M2 file of the test split (-ref).
# NOTE(review): the hypothesis file under /content must already exist when
# this cell runs; it is not created in this cell.
!errant_compare -hyp "/content/bart_sf3500_test.m2" -ref  '/content/drive/MyDrive/datasets/wi+locness/dataset_splits/test.m2'

## VISUALIZING ATTENTION

In [None]:
!pip install bertviz

In [None]:
from bertviz import model_view

In [None]:
# Sentence to visualise; it contains the non-standard verb form "eated".
text = "I eated dinner tonight because I am hungry"
encoder_input_ids = tokenizer(text, return_tensors="pt", add_special_tokens=True).input_ids
# `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()`
# context manager for tokenising decoder-side text.
decoder_input_ids = tokenizer(text_target=text, return_tensors="pt", add_special_tokens=True).input_ids
# Inference only: no gradients are needed for the attention maps.
with torch.no_grad():
  outputs = model(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids)
# Token strings used as axis labels by the attention visualisation.
encoder_text = tokenizer.convert_ids_to_tokens(encoder_input_ids[0])
decoder_text = tokenizer.convert_ids_to_tokens(decoder_input_ids[0])

In [None]:
from bertviz import model_view
# Render bertviz's model view from the encoder, decoder and cross attentions
# of the forward pass above, labelled with the corresponding token strings.
model_view(
    encoder_attention=outputs.encoder_attentions,
    decoder_attention=outputs.decoder_attentions,
    cross_attention=outputs.cross_attentions,
    encoder_tokens= encoder_text,
    decoder_tokens = decoder_text
)