# Loading Libraries

### Libraries

In [85]:
# from google.colab import drive
# drive.mount("/content/MyDrive", force_remount=True)

In [86]:
!pip install git+https://github.com/csebuetnlp/normalizer

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/csebuetnlp/normalizer
  Cloning https://github.com/csebuetnlp/normalizer to /tmp/pip-req-build-gk4t0pd1
  Running command git clone --filter=blob:none --quiet https://github.com/csebuetnlp/normalizer /tmp/pip-req-build-gk4t0pd1
  Resolved https://github.com/csebuetnlp/normalizer to commit d405944dde5ceeacb7c2fd3245ae2a9dea5f35c9
  Preparing metadata (setup.py) ... [?25ldone


In [87]:
!pip install sentencepiece

Defaulting to user installation because normal site-packages is not writeable


In [88]:
# !pip install datasets transformers[sentencepiece]
# !pip install sentencepiece

In [89]:
!pip install rouge

Defaulting to user installation because normal site-packages is not writeable


In [90]:
!pip install sacrebleu

Defaulting to user installation because normal site-packages is not writeable


In [91]:
!pip install -U sentence-transformers

Defaulting to user installation because normal site-packages is not writeable


In [92]:
!pip install bert-score

Defaulting to user installation because normal site-packages is not writeable


In [93]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from normalizer import normalize
import sentencepiece as spm
import pandas as pd

import sacrebleu
from rouge import Rouge
from sacrebleu import corpus_bleu
from bert_score import score
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import sentence_bleu

In [94]:
import torch

# Query availability first: the device-specific calls below raise on a
# CPU-only machine, which previously aborted the cell before the
# availability flag was ever printed.
cuda_available = torch.cuda.is_available()
if cuda_available:
    print(torch.cuda.current_device())
    print(torch.cuda.device(0))
    print(torch.cuda.device_count())
    print(torch.cuda.get_device_name(0))
print(cuda_available)


0
<torch.cuda.device object at 0x7f333ea70310>
1
NVIDIA GeForce RTX 3090
True


### Models

In [95]:
# Load the seq2seq paraphrase checkpoint from a local directory (path is relative to the notebook).
# The same directory is used for the tokenizer further down.
model = AutoModelForSeq2SeqLM.from_pretrained("../SSD Files/models/paraphrase/banglat5_banglaparaphrase")

In [96]:
# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)
print(f'Model is using: {device}')

Model is using: cuda


In [97]:
# Tokenizer is loaded from the same local directory as the paraphrase model.
# NOTE(review): use_fast=False presumably selects the SentencePiece-based (non-fast) tokenizer — confirm.
tokenizer = AutoTokenizer.from_pretrained("../SSD Files/models/paraphrase/banglat5_banglaparaphrase", use_fast=False)

In [98]:
# Local directory of the sentence-embedding model used for similarity scoring.
local_model_directorysbert = "../SSD Files/models/SBERT/bengali-sentence-bert-nli"

# `sbert_model` is read as a module-level global by the scoring functions defined below.
sbert_model = SentenceTransformer(local_model_directorysbert)

### Paraphrase function

In [99]:
def calculate_sbert_score(original, augmented):
    """Return the SBERT cosine similarity between two texts as a Python float.

    Both inputs are encoded with the module-level ``sbert_model``. The final
    ``.item()`` call only succeeds when the similarity result holds exactly
    one value, i.e. when a single pair is being compared.
    """
    embeddings = [sbert_model.encode(text) for text in (original, augmented)]
    similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1])
    return similarity.item()

In [100]:

def calculate_scores(original, augmented):
    """Score one paraphrase against its source sentence.

    Args:
        original: The source sentence (a single string), used as the reference.
        augmented: The paraphrased sentence (a single string), used as the candidate.

    Returns:
        A ``(bleu, bert_f1, sbert)`` tuple of floats: the SacreBLEU score of the
        pair, the BERTScore F1 (``lang="bn"``, baseline-rescaled) and the SBERT
        cosine similarity.
    """
    # Score the sentence pair as a whole. Zipping the two strings iterated them
    # character by character and yielded one meaningless BLEU value per
    # character position instead of a single sentence-level score.
    bleu_score = sacrebleu.corpus_bleu([augmented], [[original]]).score

    # BERTScore expects lists of candidates and references; only F1 is used.
    _, _, F1 = score([augmented], [original], lang="bn", rescale_with_baseline=True)

    # SBERT score: cosine similarity between the two sentence embeddings.
    emb1 = sbert_model.encode(original)
    emb2 = sbert_model.encode(augmented)
    cosine_scores = util.pytorch_cos_sim(emb1, emb2)
    sbert_score = cosine_scores.item()

    return bleu_score, F1.item(), sbert_score

In [101]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

class BanglaParaphraseGenerator:
    """Thin wrapper that paraphrases text with the notebook's seq2seq model.

    The instance reuses the module-level ``model``, ``tokenizer`` and ``device``
    objects instead of loading its own copies.
    """

    def __init__(self):
        # Place the shared model on the device chosen earlier in the notebook.
        self.model = model.to(device)
        self.tokenizer = tokenizer

    def generateParaphrase(self, sentence, max_length=100):
        """Generate one paraphrase for ``sentence``.

        The sentence is normalized, tokenized, passed through ``generate`` with
        the given ``max_length``, and the first decoded sequence is returned
        with special tokens stripped.
        """
        encoded = self.tokenizer(normalize(sentence), return_tensors="pt")
        # The input ids must live on the same device as the model.
        input_ids = encoded.input_ids.to(device)

        output_ids = self.model.generate(input_ids, max_length=max_length)
        candidates = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        return candidates[0]





In [102]:
# Define a function to apply paraphrasing
def apply_paraphrasing(row):
    """Paraphrase the ``'original_sentence'`` field of one row.

    Relies on the module-level ``paraphrase_generator`` instance, which must
    exist by the time this function is called.
    """
    source_sentence = row['original_sentence']
    return paraphrase_generator.generateParaphrase(source_sentence)


In [103]:
def paraphrase_and_evaluate_dataset(file_path, original_col_name, new_col_name):
    """Paraphrase every sentence of a CSV file and return the augmented frame.

    Despite the name, no evaluation metrics are computed here: the returned
    frame only carries the paraphrases and a method tag.

    Args:
        file_path: Path of the CSV file, loaded with ``pd.read_csv``.
        original_col_name: Column holding the source sentences. It is renamed
            to ``'original_sentence'`` in the returned frame.
        new_col_name: Name of the column that receives the paraphrases.

    Returns:
        The loaded DataFrame with the source column renamed, the paraphrase
        column added and a constant ``"method"`` column set to ``"pp"``.

    Raises:
        KeyError: If the frame has no ``'original_sentence'`` column after the
            rename, i.e. ``original_col_name`` does not exist in the CSV.
    """
    df = pd.read_csv(file_path).rename(columns={original_col_name: 'original_sentence'})
    if 'original_sentence' not in df.columns:
        # rename() silently ignores unknown labels, so without this check the
        # mistake would only surface as an opaque KeyError on the first row.
        raise KeyError(
            f"Column {original_col_name!r} not found in {file_path!r}; "
            f"available columns: {list(df.columns)}"
        )

    paraphrased_texts = []
    # Count rows by position so the progress report does not depend on the
    # DataFrame's index labels.
    for rows_done, (_, row) in enumerate(df.iterrows(), start=1):
        paraphrased_texts.append(apply_paraphrasing(row))
        if rows_done % 100 == 0:
            print(f"{rows_done} rows processed")

    df[new_col_name] = paraphrased_texts
    df["method"] = "pp"

    return df
     

# Testing

In [104]:
# Example usage:
# Single-sentence smoke test of the paraphraser.
input_text = "সঠিক তদন্ত করতে হবে। বিচারের আওতায় আনতে হবে যে এই কাজ টা করেছে।"
# This module-level instance is also the one `apply_paraphrasing` looks up by
# name, so this cell has to run before the dataset is processed.
paraphrase_generator = BanglaParaphraseGenerator()
output_text = paraphrase_generator.generateParaphrase(input_text)
print(output_text)
     

সঠিক তদন্ত করা উচিত এবং সেই ব্যক্তিকে বিচারের আওতায় আনা উচিত, যিনি তা করেছেন।


# Running on Dataset

In [105]:
# Input CSV to paraphrase (path is relative to the notebook).
file_path = '../evaluation/sentnob/cleaned_original/train.csv'


In [106]:
import pandas as pd
# Load the input CSV for a quick inspection before the full run.
train = pd.read_csv(file_path)
# Lift the pandas display limits (None = no limit on columns or rows shown)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Preview the first 5 rows of the train dataframe
train.head(5)


Unnamed: 0,sentence1,label
0,"স্বাস্থ্যবান হতে চাই , আমি বয়সের তুলনায় অনেক ব...",0
1,ভাইয়া নতুন ভিডিও আসে না কেন,0
2,সৌরভ গাঙ্গুলী ছাড়া দাদাগিরি কখনো জমে উঠত না,0
3,ক্রিকেট কে বাচাতে হলে পাপকে অতিশিগ্রিই তাকেও গ...,2
4,আমিতো সেই ঝালপ্রিয়ো মানুষ,1


### eval

In [107]:
# Paraphrase the 'sentence1' column of the input CSV; results go into 'augmented_sentence'.
# Progress is printed every 100 rows.
result_df = paraphrase_and_evaluate_dataset(file_path, 'sentence1', 'augmented_sentence')

100 rows processed
200 rows processed
300 rows processed
400 rows processed
500 rows processed
600 rows processed
700 rows processed
800 rows processed
900 rows processed
1000 rows processed
1100 rows processed
1200 rows processed
1300 rows processed
1400 rows processed
1500 rows processed
1600 rows processed
1700 rows processed
1800 rows processed
1900 rows processed
2000 rows processed
2100 rows processed
2200 rows processed
2300 rows processed
2400 rows processed
2500 rows processed
2600 rows processed
2700 rows processed
2800 rows processed
2900 rows processed
3000 rows processed
3100 rows processed
3200 rows processed
3300 rows processed
3400 rows processed
3500 rows processed
3600 rows processed
3700 rows processed
3800 rows processed
3900 rows processed
4000 rows processed
4100 rows processed
4200 rows processed
4300 rows processed
4400 rows processed
4500 rows processed
4600 rows processed
4700 rows processed
4800 rows processed
4900 rows processed
5000 rows processed
5100 rows

### view

In [None]:
# result_df

### Saving the dataset

In [None]:
# Use a dedicated name for the destination: reassigning `file_path` here would
# make a re-run of the earlier cells read the augmented file instead of the
# original input data.
output_path = '../evaluation/sentnob/augmented/train_100_pp.csv'
result_df.to_csv(output_path, index=False)