<a href="https://colab.research.google.com/github/tzf101/BDA-Bangla-Text-Data-Augmentation/blob/main/utils_notebook/rs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading Libraries



### Libraries

In [14]:
# Mount Google Drive under /content/MyDrive; the dataset paths used later in
# this notebook live below that mount point. force_remount=True asks Colab to
# mount again even if a mount from an earlier run is still present.
from google.colab import drive
drive.mount("/content/MyDrive", force_remount=True)

Mounted at /content/MyDrive


In [15]:
!pip install sentencepiece
!pip install rouge
!pip install sacrebleu
!pip install -U sentence-transformers
!pip install bert-score



In [16]:
import sentencepiece as spm
import pandas as pd

import sacrebleu
from rouge import Rouge
from sacrebleu import corpus_bleu
from bert_score import score
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import sentence_bleu

### Loading Models

In [17]:
# Sentence-embedding model shared by the SBERT cosine-similarity scoring
# functions defined below (they read this module-level name).
sbert_model = SentenceTransformer('l3cube-pune/bengali-sentence-bert-nli')

### Score calculation

In [18]:
def calculate_sbert_score(original, augmented):
    """Return the SBERT cosine similarity between two texts as a Python float.

    Both texts are embedded with the module-level ``sbert_model`` and compared
    with ``util.pytorch_cos_sim``.

    Args:
        original: The source sentence.
        augmented: The augmented sentence to compare against ``original``.

    Returns:
        The single cosine-similarity value. ``.item()`` is used, so the
        similarity result must contain exactly one element (presumably one
        sentence per argument -- confirm before passing batches).
    """
    # Encode in the same order as the arguments: original first, then augmented.
    original_embedding, augmented_embedding = (
        sbert_model.encode(text) for text in (original, augmented)
    )
    similarity = util.pytorch_cos_sim(original_embedding, augmented_embedding)
    return similarity.item()

In [19]:
def calculate_scores(original, augmented):
    """Score one augmented sentence against its original sentence.

    Args:
        original: The source sentence (a single string).
        augmented: The augmented sentence (a single string).

    Returns:
        A tuple ``(bleu_score, bert_f1, sbert_score)`` where
        ``bleu_score`` is the SacreBLEU score of ``augmented`` against
        ``original``, ``bert_f1`` is the BERTScore F1 value and
        ``sbert_score`` is the SBERT cosine similarity.
    """
    # SacreBLEU: one hypothesis scored against one reference stream.
    # The previous version zipped over the two strings, which iterates them
    # character by character and produced a list of per-character scores
    # instead of a sentence-level BLEU.
    bleu_score = sacrebleu.corpus_bleu([augmented], [[original]]).score

    # BERTScore: only the F1 component is reported.
    _, _, F1 = score([augmented], [original], lang="bn", rescale_with_baseline=True)

    # SBERT cosine similarity, computed by the shared helper so both scoring
    # paths in this notebook use the same logic.
    sbert_score = calculate_sbert_score(original, augmented)

    return bleu_score, F1.item(), sbert_score

### Random Swap (RS) function

In [20]:
!pip install bnlp



In [21]:
!pip install bnlp-toolkit



In [22]:
import random
from random import shuffle
import re

In [23]:
from bnlp import BengaliCorpus as corpus

In [24]:
from bnlp import BengaliWord2Vec

In [25]:
class RS():
    """Random Swap (RS) text augmenter.

    Produces a variant of a sentence by exchanging the positions of randomly
    chosen pairs of whitespace-separated words.
    """

    def swap_word(self, new_words):
        """Swap two distinct, randomly chosen entries of ``new_words`` in place.

        Args:
            new_words: List of words; it is modified in place.

        Returns:
            The same list object. Lists with fewer than two words are returned
            unchanged because no distinct pair of positions exists.
        """
        # Guard: with one word the rejection loop below could never find a
        # second distinct index (infinite loop), and with zero words
        # random.randint(0, -1) raises ValueError.
        if len(new_words) < 2:
            return new_words

        last_index = len(new_words) - 1
        random_idx_1 = random.randint(0, last_index)
        random_idx_2 = random.randint(0, last_index)
        # Redraw until the second index differs from the first.
        while random_idx_2 == random_idx_1:
            random_idx_2 = random.randint(0, last_index)
        new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
        return new_words

    def augment(self, text, n, debug=False):
        """Return ``text`` after ``n`` random word swaps.

        Args:
            text: Input sentence; it is split on whitespace.
            n: Number of swap operations to perform.
            debug: When true, the marker ``"(rs)"`` is appended to the output.

        Returns:
            The swapped words joined with single spaces. Texts with fewer than
            two words come back with their words unchanged.
        """
        # str.split() already returns a fresh list, so no extra copy is needed.
        new_words = text.split()
        for _ in range(n):
            new_words = self.swap_word(new_words)
        output = ' '.join(new_words)
        if debug:
            output += "(rs)"
        return output

### Testing

In [26]:
# Smoke test: apply two random swaps to a sample sentence and print the result.
# `rs` is also the module-level instance that apply_rs() reads later on.
rs = RS()
text = "সঠিক তদন্ত করতে হবে। বিচারের আওতায় আনতে হবে যে এই কাজ টা করেছে।"
augmented_text = rs.augment(text, n=2)
print(augmented_text)

সঠিক বিচারের করতে হবে তদন্ত আওতায় আনতে হবে। যে এই কাজ টা করেছে।


### Apply RS on dataset

In [27]:
# Row-wise helper for DataFrame.apply: random-swap augmentation of one row
def apply_rs(row):
    """Return a random-swap augmentation of ``row['original_sentence']``.

    Uses the module-level ``rs`` augmenter and performs two swaps.

    Args:
        row: A mapping-like row exposing the key ``'original_sentence'``.

    Returns:
        The augmented sentence produced by ``rs.augment``.
    """
    sentence = row['original_sentence']
    augmented = rs.augment(sentence, n=2)
    return augmented

In [28]:
def sr_and_evaluate_dataset(file_path, original_col_name, new_col_name, n=2):
    """Augment a CSV dataset with random swap and score every sentence pair.

    Args:
        file_path: Path of the CSV file to load.
        original_col_name: Column holding the source sentences; it is renamed
            to ``'original_sentence'`` in the result.
        new_col_name: Name of the column that receives the augmented sentences.
        n: Number of random swaps applied to each sentence. Defaults to 2,
            which matches the previously hard-coded behavior.

    Returns:
        A DataFrame containing the loaded columns, the augmented column,
        ``sacrebleu_score``, ``sbert_score``, the ROUGE columns
        (``r1p``/``r1r``/``r1f``, ``r2p``/``r2r``/``r2f``,
        ``rlp``/``rlr``/``rlf``) and a ``method`` column set to ``f"rs{n}"``
        (``"rs2"`` by default).
    """
    # Load the dataset and normalize the name of the source-text column.
    df = pd.read_csv(file_path)
    df = df.rename(columns={original_col_name: 'original_sentence'})

    # Random-swap augmentation via the module-level `rs` augmenter.
    df[new_col_name] = df['original_sentence'].apply(
        lambda sentence: rs.augment(sentence, n=n)
    )

    originals = df['original_sentence'].tolist()
    augmented = df[new_col_name].tolist()

    # ROUGE: hypotheses (augmented) first, references (original) second.
    # Plain lists are passed instead of pandas Series.
    rouge = Rouge()
    rouge_scores = rouge.get_scores(augmented, originals)
    rouge_metrics = (('r1', 'rouge-1'), ('r2', 'rouge-2'), ('rl', 'rouge-l'))
    # `entry` (not `score`) so the bert_score `score` import is not reused as
    # a loop variable name.
    rouge_df = pd.DataFrame([
        {
            prefix + stat: entry[metric][stat]
            for prefix, metric in rouge_metrics
            for stat in ('p', 'r', 'f')
        }
        for entry in rouge_scores
    ])

    # SacreBLEU: each augmented sentence scored against its own original.
    df["sacrebleu_score"] = [
        sacrebleu.corpus_bleu([aug], [[orig]]).score
        for aug, orig in zip(augmented, originals)
    ]

    # SBERT score with cosine similarity.
    df["sbert_score"] = [
        calculate_sbert_score(orig, aug)
        for orig, aug in zip(originals, augmented)
    ]

    # Combine the per-row scores with the ROUGE columns and tag the method.
    result_df = pd.concat([df, rouge_df], axis=1)
    result_df["method"] = f"rs{n}"

    return result_df

# Running on dataset

In [29]:
# Input CSV on the mounted Drive; passed to sr_and_evaluate_dataset below.
file_path = '/content/MyDrive/MyDrive/Research/Thesis: BDA/Main/evaluation/(old)Youtube/Datasets/yt_sentiment_train_10.csv'

In [None]:
# Augment and score the dataset: 'sentence1' is the source-text column and the
# augmented text is written to a new 'augmented_sentence' column.
result_df = sr_and_evaluate_dataset(file_path, 'sentence1', 'augmented_sentence')

In [None]:
# Display the augmented and scored DataFrame.
result_df

### Saving augmented dataset

In [None]:
# Write the scored dataset to the same Drive folder as the input CSV, with an
# "_rs" suffix; index=False leaves the DataFrame index out of the file.
result_df.to_csv('/content/MyDrive/MyDrive/Research/Thesis: BDA/Main/evaluation/(old)Youtube/Datasets/yt_sentiment_train_10_rs.csv', index=False)