In [3]:
import json
from dataclasses import dataclass, field
from typing import Optional
import os
import argparse
from transformers import AutoTokenizer, BartForConditionalGeneration, Text2TextGenerationPipeline
from transformers import HfArgumentParser, TrainingArguments, Trainer, set_seed
from datasets import load_dataset, Dataset
from loguru import logger
import torch
import csv

class CsvDataset(object):
    def __init__(self, file_path):
        self.file_path = file_path

    def load(self):
        data_list = []
        with open(self.file_path, 'r', encoding='utf-8') as file:
            reader = csv.reader(file)
            # Skip the first row 
            next(reader)
            for row in reader:
                # correct, incorrect
                correct_text = row[0]
                incorrect_text = row[1]
                data_list.append(correct_text + '\t' + incorrect_text)
        return {'text': data_list}

d = CsvDataset("./data/data/train_cleaned.csv")
data_dict = d.load()
train_dataset = Dataset.from_dict(data_dict, split='train')

d = CsvDataset("./data/data/dev.csv")
data_dict = d.load()
valid_dataset = Dataset.from_dict(data_dict, split='test')
logger.info(train_dataset)
logger.info(valid_dataset)

[32m2024-04-09 14:00:03.952[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m37[0m - [1mDataset({
    features: ['text'],
    num_rows: 33935
})[0m
[32m2024-04-09 14:00:03.953[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m38[0m - [1mDataset({
    features: ['text'],
    num_rows: 4384
})[0m


In [4]:
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")

  return self.fget.__get__(instance, owner)()


In [5]:
def tokenize_dataset(tokenizer, dataset, max_len):
    def convert_to_features(example_batch):
        src_texts = []
        trg_texts = []
        for example in example_batch['text']:
            terms = example.split('\t', 1)
            src_texts.append(terms[0])
            trg_texts.append(terms[1])
        input_encodings = tokenizer.batch_encode_plus(
            src_texts,
            truncation=True,
            padding='max_length',
            max_length=max_len,
        )
        target_encodings = tokenizer.batch_encode_plus(
            trg_texts,
            truncation=True,
            padding='max_length',
            max_length=max_len,
        )

        encodings = {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'target_ids': target_encodings['input_ids'],
            'target_attention_mask': target_encodings['attention_mask']
        }

        return encodings
    dataset = dataset.map(convert_to_features, batched=True)
    # Set the tensor type and the columns which the dataset should return
    columns = ['input_ids', 'target_ids', 'attention_mask', 'target_attention_mask']
    dataset.with_format(type='torch', columns=columns)
    # Rename columns to the names that the forward method of the selected
    # model expects
    dataset = dataset.rename_column('target_ids', 'labels')
    dataset = dataset.rename_column('target_attention_mask', 'decoder_attention_mask')
    dataset = dataset.remove_columns(['text'])
    return dataset

train_data = tokenize_dataset(tokenizer, train_dataset,128)
valid_data = tokenize_dataset(tokenizer, valid_dataset,128)

Map: 100%|██████████| 33935/33935 [00:11<00:00, 3044.79 examples/s]
Map: 100%|██████████| 4384/4384 [00:01<00:00, 2659.05 examples/s]


In [6]:
training_args = TrainingArguments(
    num_train_epochs=5,          # total # of training epochs 
    per_device_train_batch_size=32,  # batch size per device during training 
    per_device_eval_batch_size=32,   # batch size for evaluation 
    learning_rate=1e-4,             # learning rate
    save_steps=False,
    logging_steps=50
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=valid_data,
)
trainer.train()
## save models
model.save_pretrained("result_bart/")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
50,4.5692
100,0.2037
150,0.1529
200,0.2472
250,0.1774
300,0.1663
350,0.1575
400,0.1233
450,0.1257
500,0.1173


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [10]:
new_model = BartForConditionalGeneration.from_pretrained("./result_bart/")

In [4]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
new_model = BartForConditionalGeneration.from_pretrained("./bart_model/")

In [5]:
def bart_correct(tokenizer, model, text: str, max_length: int = 128):
    import numpy as np

    text = "< " + text
    inputs = tokenizer(text, return_tensors='pt')
    
    input_ids = inputs['input_ids'][:, :max_length]
    attention_mask = inputs['attention_mask'][:, :max_length]
    
    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
        
        predicted_token_indexes = torch.argmax(logits, dim=-1)
        
        decode_tokens = tokenizer.decode(predicted_token_indexes[0], skip_special_tokens=True)
        
        decode_tokens = decode_tokens.strip()
        
    return decode_tokens

In [6]:
bart_correct(tokenizer, new_model, "He are a nice person.", 128)

'He is a nice person.'

In [8]:
import pandas as pd

test = pd.read_csv('data/train_300samples.csv',na_filter=False)

In [9]:
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize

def pred_sentences(df):
    temp = pd.DataFrame()
    a = []
    b = []
    for i in tqdm(range(300)):
        #if i == 170:
        #    continue
        reference = df['correct'].iloc[i]
        pred = bart_correct(tokenizer, new_model, df['incorrect'].iloc[i], 128)
        tokens = word_tokenize(pred)
        pred = " ".join(tokens)
        a.append(reference)
        b.append(pred)
    temp['Target Sentence'] = a
    temp['Corrected Sentence'] = b
    return temp

In [10]:
temp = pd.DataFrame()
temp = pred_sentences(test)
temp.to_csv('data/sentences.csv',index=False)

100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [03:25<00:00,  1.46it/s]


In [11]:
import csv
import numpy as np
from Levenshtein import distance as levenshtein_distance
from jiwer import wer
from rouge import Rouge

# Function to read sentences from a CSV file
def read_sentences_from_csv(file_path):
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        target_sentences = []
        correct_sentences = []
        for row in reader:
            target_sentences.append(row['Target Sentence'])
            correct_sentences.append(row['Corrected Sentence'])
    return target_sentences, correct_sentences

# Compute Exact Match Accuracy
def exact_match_accuracy(targets, corrects):
    exact_matches = [1 for target, correct in zip(targets, corrects) if target == correct]
    return sum(exact_matches) / len(targets)

# Calculate Levenshtein Distance
def average_levenshtein_distance(targets, corrects):
    distances = [levenshtein_distance(target, correct) for target, correct in zip(targets, corrects)]
    return np.mean(distances)

# Determine Word Error Rate
def average_wer(targets, corrects):
    error_rates = [wer(correct, target) for target, correct in zip(targets, corrects)]
    return np.mean(error_rates)

# Compute ROUGE Score
def compute_rouge(targets, corrects):
    rouge = Rouge()
    scores = rouge.get_scores(targets, corrects, avg=True)
    return scores

# Main function to compute metrics
def main(csv_file_path):
    target_sentences, correct_sentences = read_sentences_from_csv(csv_file_path)
    
    exact_match = exact_match_accuracy(target_sentences, correct_sentences)
    levenshtein_dist = average_levenshtein_distance(target_sentences, correct_sentences)
    word_error_rate = average_wer(target_sentences, correct_sentences)
    rouge_scores = compute_rouge(target_sentences, correct_sentences)
    
    print(f"Exact Match Accuracy: {exact_match}")
    print(f"Average Levenshtein Distance: {levenshtein_dist}")
    print(f"Average Word Error Rate: {word_error_rate}")
    print("ROUGE Scores:", rouge_scores)

# Replace 'sentences.csv' with the path to your actual CSV file
main('data/sentences.csv')

Exact Match Accuracy: 0.30333333333333334
Average Levenshtein Distance: 7.8966666666666665
Average Word Error Rate: 0.11998602052400152
ROUGE Scores: {'rouge-1': {'r': 0.9209161400211485, 'p': 0.9128017739721874, 'f': 0.9157767270105097}, 'rouge-2': {'r': 0.8111905453400288, 'p': 0.8044661676197792, 'f': 0.8069622536383714}, 'rouge-l': {'r': 0.9168034109452109, 'p': 0.9086315376565004, 'f': 0.9116454336142031}}
