<a href="https://colab.research.google.com/github/tzf101/BDA-Bangla-Text-Data-Augmentation/blob/main/utils_notebook/iml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading Libraries



### Libraries

In [65]:
from google.colab import drive
drive.mount("/content/MyDrive", force_remount=True)

Mounted at /content/MyDrive


In [66]:
from google.colab import drive

In [67]:
!pip install transformers



In [68]:
!pip install git+https://github.com/csebuetnlp/normalizer

Collecting git+https://github.com/csebuetnlp/normalizer
  Cloning https://github.com/csebuetnlp/normalizer to /tmp/pip-req-build-1_u06icj
  Running command git clone --filter=blob:none --quiet https://github.com/csebuetnlp/normalizer /tmp/pip-req-build-1_u06icj
  Resolved https://github.com/csebuetnlp/normalizer to commit d405944dde5ceeacb7c2fd3245ae2a9dea5f35c9
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [69]:
!pip install sentencepiece
!pip install rouge
!pip install sacrebleu
!pip install -U sentence-transformers
!pip install bert-score



In [70]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from normalizer import normalize
import sentencepiece as spm
import pandas as pd

import sacrebleu
from rouge import Rouge
from sacrebleu import corpus_bleu
from bert_score import score
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import sentence_bleu

### Loading Models

In [71]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglabert_generator")
model = AutoModelForMaskedLM.from_pretrained("csebuetnlp/banglabert_generator")

In [72]:
sbert_model = SentenceTransformer('l3cube-pune/bengali-sentence-bert-nli')

### Score calculation

In [73]:
def calculate_sbert_score(original, augmented):
    emb1 = sbert_model.encode(original)
    emb2 = sbert_model.encode(augmented)
    cosine_scores = util.pytorch_cos_sim(emb1, emb2)
    sbert_score = cosine_scores.item()
    return sbert_score

In [74]:
def calculate_scores(original, augmented):
    # BLEU Score
    reference = original
    candidate = augmented
    # bleu_score = sentence_bleu(reference, candidate)
    bleu_score = [sacrebleu.corpus_bleu([aug], [[orig]]).score for aug, orig in zip(augmented, original)]
    # BERTScore
    P, R, F1 = score([augmented], [original], lang="bn", rescale_with_baseline=True)

    # SBERT Score with Cosine Similarity
    emb1 = sbert_model.encode(original)
    emb2 = sbert_model.encode(augmented)
    cosine_scores = util.pytorch_cos_sim(emb1, emb2)
    sbert_score = cosine_scores.item()

    return bleu_score, F1.item(), sbert_score

### Masking and Unmasking functions

In [75]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import random

class IML:
    def __init__(self, model, tokenizer):
        self.tokenizer = tokenizer
        self.model = model

    def tokenize(self, text: str) -> torch.Tensor:
        return self.tokenizer.encode(text, return_tensors='pt')

    def predict_tokens(self, input_ids: torch.Tensor) -> torch.Tensor:
        with torch.no_grad():
            outputs = self.model(input_ids)
            return outputs[0]

    def find_masked_indices(self, input_ids: torch.Tensor) -> torch.Tensor:
        return (input_ids == self.tokenizer.mask_token_id).nonzero(as_tuple=True)[1]

    def replace_masked_tokens(self, input_ids: torch.Tensor, predictions: torch.Tensor, masked_indices: torch.Tensor) -> torch.Tensor:
        for idx in masked_indices:
            predicted_token_id = predictions[0, idx].topk(1).indices.item()
            input_ids[0, idx] = predicted_token_id
        return input_ids

    def unmask_text(self, masked_text: str) -> str:
        input_ids = self.tokenize(masked_text)
        predictions = self.predict_tokens(input_ids)
        masked_indices = self.find_masked_indices(input_ids)
        unmasked_input_ids = self.replace_masked_tokens(input_ids, predictions, masked_indices)
        return self.tokenizer.decode(unmasked_input_ids[0], skip_special_tokens=True)

    def specific_index_masking(self, text: str, idx: int) -> str:
        token_ids = self.tokenizer.encode(text, return_tensors='pt')[0]
        unk_token_id = self.tokenizer.unk_token_id  # Get the UNK token ID

        if idx < 0 or idx >= len(token_ids) - 1:
            return text
            # raise ValueError("Index is out of range for the text length.")

        # Check if the token is an OOV token
        if token_ids[idx] == unk_token_id:
            return text  # Return the original text if the token is OOV

        masked_token_ids = [self.tokenizer.mask_token_id if i == idx else token_id for i, token_id in enumerate(token_ids)]
        return self.tokenizer.decode(masked_token_ids)

    def iterative_mask_and_unmask(self, text: str) -> str:
        # print(f'Original: {text}')
        unmasked_text = text

        # Splitting the string into words
        words = text.split()
        # Enumerating and printing each word with its index
        for index, word in enumerate(words):
            # print(f'Step{index} Input   : {unmasked_text}')
            masked_text = self.specific_index_masking(unmasked_text, index)
            # print(f'Step{index} Masked  : {masked_text}')
            unmasked_text = self.unmask_text(masked_text)
            # print(f'Step{index} Unmasked: {unmasked_text}')
            # print('------------------------------------')
        return unmasked_text



### Testing

In [76]:
# Usage
iml = IML(model, tokenizer)

text = "একজন ভাই জাতীয় বিশ্ববিদ্যালয় নিয়ে একটু ত্যানাপ্যাচাল করছে, জাতীয় নিয়ে কোন Bad comment করবেন না। বাপের লাখ টাকা অার স্যারদের দেয়া সাজেসন্সে পড়ি না। নিজে কষ্ট করে পড়াশোনা করে রেজাল্ট করছি। হ্যা অামি মানি যারা পাবলিকে চান্স পায়না বা অপেক্ষা তালিকায় থেকে বাদ পড়ে যায়, তারাই জাতীয় বিশ্ববিদ্যালয় তে পড়ে, যাদের মেধা অাছে, অার্থিক অবস্থা ভালো না, বা কেউ যেনো কোনদিন বলতে না পারে বাপের টাকা দিয়া পড়ছি। জাতীয়তে পড়তেও প্রতিযোগিতা করতে হয়। রিংকু সাহা ঢাকা কলেজ ২০১০-১১ হিঃবি"
text = "এগিয়ে যাও আমরা আছি তোমাদের সাথে"
masked_text = iml.iterative_mask_and_unmask(text)

print(masked_text)

আজ আমিও আছি তোমার সাথে


### Apply MLM on dataset

In [77]:
# Define a function to apply paraphrasing
def apply_iml(row):
    return iml.iterative_mask_and_unmask(row['original_sentence'])

In [78]:
def iml_and_evaluate_dataset(file_path, original_col_name, new_col_name):
    # Load the dataset
    df = pd.read_csv(file_path)
    df = df.rename(columns={original_col_name: 'original_sentence'})

    # Apply the text augmentation function
    df[new_col_name] = df.apply(apply_iml, axis=1)
    # Compute ROUGE scores
    rouge = Rouge()
    rouge_scores = rouge.get_scores(df[new_col_name], df["original_sentence"])
    rouge_df = pd.DataFrame([
        {
            'r1p': score['rouge-1']['p'],
            'r1r': score['rouge-1']['r'],
            'r1f': score['rouge-1']['f'],
            'r2p': score['rouge-2']['p'],
            'r2r': score['rouge-2']['r'],
            'r2f': score['rouge-2']['f'],
            'rlp': score['rouge-l']['p'],
            'rlr': score['rouge-l']['r'],
            'rlf': score['rouge-l']['f'],
        }
        for score in rouge_scores
    ])

    # Compute SacreBLEU scores
    sacrebleu_scores = [sacrebleu.corpus_bleu([aug], [[orig]]).score for aug, orig in zip(df[new_col_name], df["original_sentence"])]
    df["sacrebleu_score"] = sacrebleu_scores

    # SBERT Score with Cosine Similarity
    df["sbert_score"] = [calculate_sbert_score(orig, aug) for orig, aug in zip(df["original_sentence"], df[new_col_name])]

    # Combine the dataframes
    result_df = pd.concat([df, rouge_df], axis=1)
    result_df["method"] = "iml"

    return result_df

# Running on dataset

In [79]:
file_path = '/content/MyDrive/MyDrive/Research/Thesis: BDA/Main/evaluation/(old)Youtube/Datasets/yt_sentiment_train_10.csv'

In [80]:
result_df = iml_and_evaluate_dataset(file_path, 'sentence1', 'augmented_sentence')

KeyboardInterrupt: 

In [None]:
result_df

### Saving augmented dataset

In [None]:
result_df.to_csv('/content/MyDrive/MyDrive/Research/Thesis: BDA/Main/evaluation/(old)Youtube/Datasets/yt_sentiment_train_10_iml.csv', index=False)