<a href="https://colab.research.google.com/github/tzf101/BDA-Bangla-Text-Data-Augmentation/blob/main/utils_notebook/sr_ri_rd_rs_(latest_working).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mounting Google Drive

In [1]:
# Mount Google Drive at /content/MyDrive/ so the dataset CSVs read and written
# later in this notebook (under /content/MyDrive/MyDrive/...) are reachable.
from google.colab import drive
drive.mount("/content/MyDrive/", force_remount=True)

Mounted at /content/MyDrive/


# Installing Libraries



In [2]:
# Install the Bengali NLP packages needed for the `bnlp` imports in this notebook.
# NOTE(review): versions are not pinned; the logged run resolved bnlp 0.8 and
# bnlp-toolkit 4.0.0 -- consider pinning those for reproducibility.
!pip install bnlp bnlp-toolkit

Collecting bnlp
  Downloading bnlp-0.8.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bnlp-toolkit
  Downloading bnlp_toolkit-4.0.0-py3-none-any.whl (22 kB)
Collecting sklearn-crfsuite (from bnlp-toolkit)
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting ftfy (from bnlp-toolkit)
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emoji==1.7.0 (from bnlp-toolkit)
  Downloading emoji-1.7.0.tar.gz (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite->bnlp-toolkit)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

# Imports

In [3]:
import random
from random import shuffle
import re
from bnlp import BengaliCorpus as corpus
from bnlp import BengaliWord2Vec
import random
import pandas as pd

punkt not found. downloading...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# SR (Synonym Replacement)

In [55]:
import random

from bnlp import BengaliWord2Vec, BengaliCorpus

class BengaliSynonymReplacer:
    """Synonym-replacement (SR) augmenter for Bengali text.

    Replacement candidates are the nearest neighbours returned by
    ``BengaliWord2Vec.get_most_similar_words``. Words found in
    ``BengaliCorpus.stopwords`` are never selected for replacement.
    """

    def __init__(self):
        self.bwv = BengaliWord2Vec()
        self.stopwords = set(BengaliCorpus.stopwords)

    def get_synonyms(self, word):
        """Return the embedding neighbours of ``word`` (at most 10), without ``word`` itself.

        The neighbours keep the order in which the model returned them, so a
        later ``random.choice`` is reproducible under ``random.seed``.
        Returns an empty list when the lookup raises ``KeyError``.
        """
        try:
            similar_words = self.bwv.get_most_similar_words(word, topn=10)
        except KeyError:
            # Ignore words not in vocabulary
            return []
        # dict.fromkeys de-duplicates while preserving order (a set would make
        # the order depend on string hash randomisation).
        candidates = dict.fromkeys(word_tuple[0] for word_tuple in similar_words)
        candidates.pop(word, None)
        return list(candidates)

    def augment(self, text, n, debug=False):
        """Replace up to ``n`` distinct non-stopword words of ``text`` with a synonym.

        Every occurrence of a chosen word is replaced by the same synonym.
        Words without any synonym are skipped and do not count towards ``n``.

        Args:
            text: Whitespace-separated sentence.
            n: Maximum number of distinct words to replace; ``n <= 0`` leaves
                the words untouched.
            debug: When true, ``"(sr)"`` is appended to the result.

        Returns:
            The augmented sentence, re-joined with single spaces.
        """
        words = text.split()
        new_words = words.copy()
        # Order-preserving de-duplication keeps the shuffle reproducible when
        # the caller seeds ``random``.
        candidates = list(dict.fromkeys(word for word in words if word not in self.stopwords))
        random.shuffle(candidates)
        num_replaced = 0
        for random_word in candidates:
            # Check the budget before replacing so that n == 0 replaces nothing.
            if num_replaced >= n:
                break
            synonyms = self.get_synonyms(random_word)
            if synonyms:
                synonym = random.choice(synonyms)
                new_words = [synonym if word == random_word else word for word in new_words]
                num_replaced += 1

        output = ' '.join(new_words)
        if debug:
            output += "(sr)"
        return output

    def apply_sr(self, row, n):
        """Augment the ``'sentence1'`` field of one DataFrame row."""
        return self.augment(row['sentence1'], n)

    def sr_and_evaluate_dataset(self, df, n):
        """Return a copy of ``df`` with ``'augmented_text'`` and ``'method'`` columns added.

        The input frame is left unmodified so that several augmenters can be
        run on the same source frame without their columns leaking into each
        other's output.
        """
        augmented = df.copy()
        augmented['augmented_text'] = augmented.apply(lambda row: self.apply_sr(row, n), axis=1)
        augmented["method"] = "sr" + str(n)
        return augmented


# RI (Random Insertion)

In [84]:
import random
from bnlp import BengaliWord2Vec, BengaliCorpus

class BengaliRandomInsertion:
    """Random-insertion (RI) augmenter for Bengali text.

    Each insertion picks a random word of the sentence, looks up its
    neighbours via ``BengaliWord2Vec.get_most_similar_words`` and inserts one
    of them at a random position.
    """

    def __init__(self):
        self.bwv = BengaliWord2Vec()
        self.stopwords = set(BengaliCorpus.stopwords)

    def get_synonyms(self, word):
        """Return the embedding neighbours of ``word`` (at most 10), without ``word`` itself.

        The neighbours keep the order in which the model returned them, so a
        later ``random.choice`` is reproducible under ``random.seed``.
        Returns an empty list when the lookup raises ``KeyError``.
        """
        try:
            similar_words = self.bwv.get_most_similar_words(word, topn=10)
        except KeyError:
            # Ignore words not in vocabulary
            return []
        # dict.fromkeys de-duplicates while preserving order (a set would make
        # the order depend on string hash randomisation).
        candidates = dict.fromkeys(word_tuple[0] for word_tuple in similar_words)
        candidates.pop(word, None)
        return list(candidates)

    def add_word(self, new_words):
        """Insert one synonym of a randomly chosen word into ``new_words`` in place.

        Up to 10 random words are tried; if none of them has a synonym the
        list is left unchanged. An empty list is also left unchanged (there is
        no word to draw a synonym from).
        """
        if not new_words:
            return

        synonyms = []
        max_attempts = 10  # Set a maximum number of attempts to find a synonym
        for _ in range(max_attempts):
            synonyms = self.get_synonyms(random.choice(new_words))
            if synonyms:
                break

        if synonyms:
            random_synonym = random.choice(synonyms)  # Choose a random synonym
            # The index is drawn from existing positions, so the synonym is
            # inserted before an existing word and never appended at the end.
            random_idx = random.randint(0, len(new_words) - 1)
            new_words.insert(random_idx, random_synonym)

    def augment(self, text, n, debug=False):
        """Return ``text`` with up to ``n`` synonyms inserted at random positions.

        Args:
            text: Whitespace-separated sentence.
            n: Number of insertion attempts.
            debug: When true, ``"(ri)"`` is appended to the result.

        Returns:
            The augmented sentence, re-joined with single spaces.
        """
        new_words = text.split()
        for _ in range(n):
            self.add_word(new_words)
        output = ' '.join(new_words)
        if debug:
            output += "(ri)"
        return output

    def apply_ri(self, row, n):
        """Augment the ``'sentence1'`` field of one DataFrame row."""
        return self.augment(row['sentence1'], n)

    def ri_and_evaluate_dataset(self, df, n):
        """Return a copy of ``df`` with ``'augmented_text'`` and ``'method'`` columns added.

        The input frame is left unmodified so that several augmenters can be
        run on the same source frame without their columns leaking into each
        other's output.
        """
        augmented = df.copy()
        augmented['augmented_text'] = augmented.apply(lambda row: self.apply_ri(row, n), axis=1)
        augmented["method"] = "ri" + str(n)
        return augmented


# RD (Random Deletion)

In [4]:
class RD:
    """Random-deletion (RD) augmenter: drops each word independently with probability ``p``."""

    def __init__(self):
        pass

    def augment(self, text, p=0.3, debug=False):
        """Return ``text`` with each word removed with probability ``p``.

        If every word would be removed, one randomly chosen original word is
        kept instead. Text without any words yields an empty sentence.

        Args:
            text: Whitespace-separated sentence.
            p: Per-word deletion probability.
            debug: When true, ``" (rd)"`` is appended to the result.

        Returns:
            The augmented sentence, re-joined with single spaces.
        """
        words = text.split()

        # Keep a word when its draw lands above the deletion probability.
        new_words = [word for word in words if random.uniform(0, 1) > p]

        # Everything was deleted: fall back to a single random word. The guard
        # on ``words`` avoids sampling from an empty sentence.
        if not new_words and words:
            new_words = [random.choice(words)]

        output = ' '.join(new_words)
        if debug:
            output += " (rd)"
        return output

    def evaluate(self, df, p):
        """Return a copy of ``df`` with ``'augmented_sentence'`` and ``'method'`` columns added.

        The input frame is left unmodified so that several augmenters can be
        run on the same source frame without their columns leaking into each
        other's output.
        """
        augmented = df.copy()
        augmented['augmented_sentence'] = [self.augment(sample, p) for sample in augmented['sentence1']]
        augmented['method'] = 'rd' + str(p)
        return augmented

# RS (Random Swap)

In [16]:
import random

class RS:
    """Random-swap (RS) augmenter: exchanges the positions of random word pairs."""

    def __init__(self):
        pass

    def swap_word(self, new_words):
        """Swap two distinct random positions of ``new_words`` in place and return the list.

        Lists with fewer than two items are returned unchanged.
        """
        if len(new_words) > 1:  # Only swap if there are at least 2 words
            first, second = random.sample(range(len(new_words)), 2)
            new_words[first], new_words[second] = new_words[second], new_words[first]
        return new_words

    def augment(self, text, n, debug=False):
        """Return ``text`` after ``n`` random pairwise word swaps.

        Args:
            text: Whitespace-separated sentence.
            n: Number of swaps to perform.
            debug: When true, ``" (rs)"`` is appended to the result.

        Returns:
            The augmented sentence, re-joined with single spaces.
        """
        words = text.split()
        for _ in range(n):
            self.swap_word(words)  # Modify in place
        output = ' '.join(words)
        if debug:
            output += " (rs)"
        return output

    def evaluate(self, df, n):
        """Return a copy of ``df`` with ``'augmented_sentence'`` and ``'method'`` columns added.

        The input frame is left unmodified so that several augmenters can be
        run on the same source frame without their columns leaking into each
        other's output.
        """
        augmented = df.copy()
        augmented['augmented_sentence'] = [self.augment(sample, n) for sample in augmented['sentence1']]
        augmented['method'] = 'rs' + str(n)
        return augmented

In [7]:
# Dataset folder on the mounted Drive; the augmented CSVs are written back here.
# NOTE(review): absolute, user-specific path -- adjust when running elsewhere.
root_dir = '/content/MyDrive/MyDrive/Research/Thesis: BDA/Main/datasets/post mid/bmoc/'
# Training split to augment; the augmenters in this notebook read its 'sentence1' column.
df = pd.read_csv(f'{root_dir}training_50.csv')

In [82]:
# Synonym replacement with n=2; the result carries 'augmented_text' and 'method' columns.
SR_Aug = BengaliSynonymReplacer()
output = SR_Aug.sr_and_evaluate_dataset(df, 2)

In [None]:
# Save the SR result next to the source CSV. to_csv is called with its defaults,
# so the DataFrame index is written as an extra unnamed leading column.
output.to_csv(f'{root_dir}training_50_sr2.csv')

In [99]:
# Random insertion with n=2; the result carries 'augmented_text' and 'method' columns.
RI_Aug = BengaliRandomInsertion()
output = RI_Aug.ri_and_evaluate_dataset(df, 2)

In [88]:
# Save the RI result next to the source CSV. to_csv is called with its defaults,
# so the DataFrame index is written as an extra unnamed leading column.
output.to_csv(f'{root_dir}training_50_ri2.csv')

In [10]:
# Random deletion with per-word probability p=0.2; the result carries
# 'augmented_sentence' and 'method' columns.
RD_Aug = RD()
output = RD_Aug.evaluate(df, 0.2)

In [12]:
# Save the RD result next to the source CSV (index is written as an extra column).
# NOTE(review): the file is named "rd2" while the run above used p=0.2 and tags
# rows as 'rd0.2' -- confirm the naming is intentional.
output.to_csv(f'{root_dir}training_50_rd2.csv')

In [18]:
# Random swap with n=2 swaps per sentence; the result carries
# 'augmented_sentence' and 'method' columns.
rs = RS()
# Uses the `df` frame loaded from training_50.csv earlier in this notebook.
output = rs.evaluate(df, 2)

In [20]:
# Save the RS result next to the source CSV. to_csv is called with its defaults,
# so the DataFrame index is written as an extra unnamed leading column.
output.to_csv(f'{root_dir}training_50_rs2.csv')

In [19]:
# Display the most recently computed augmentation result for a visual sanity check.
output

Unnamed: 0,sentence1,label,augmented_sentence,method
0,প্রতিদিন বাঁচো যেভাবে ভাল্লাগে দুরাশা নয় শ...,joy,প্রতিদিন দুরাশা যেভাবে হাঁসিতে বাঁচো নয় শুদ্ধ ...,rs2
1,ওখানে গিয়ে না খাইলেই হয় সবাই যদি এইগুলা খাওয়া...,anger,ওখানে গিয়ে না খাইলেই হয় সবাই যদি এইগুলা কেনার ...,rs2
2,চমকে লাফ দিয়ে শোয়া থেকে উঠে বসলো সার্জন হাপরে...,surprise,চমকে লাফ সার্জন শোয়া থেকে উঠে বসলো দিয়ে হাপরের...,rs2
3,চুয়েটে যাওয়ার আগে প্রতিবছর পহেলা বৈশাখ মানে ছি...,joy,চুয়েটে যাওয়ার আগে প্রতিবছর পহেলা বৈশাখ মানে ছি...,rs2
4,সত্য বলা ও আমানত রক্ষা করা মুমিন ব্যক্তির সর্ব...,sadness,সত্য বলা রক্ষা আমানত ও করা মুমিন ব্যক্তির সর্ব...,rs2
...,...,...,...,...
2095,এগুলো সঠিকভাবে তুলে না ধরলে সরকার ধামাচাপা দিত...,fear,হবে সঠিকভাবে তুলে না ধরলে সরকার ধামাচাপা দিতে ...,rs2
2096,দীর্ঘ আট বছর যাবত শরীরের কোন হচ্ছে না ওজন ...,sadness,দীর্ঘ পারছি বছর যাবত শরীরের কোন হচ্ছে না ওজন ই...,rs2
2097,আমার চোখের আড়ালেই আমারে সারপ্রাইজ দিছে সবাই মি...,surprise,আমার চোখের আড়ালেই আমারে সারপ্রাইজ দিছে সবাই বছ...,rs2
2098,আসলে এখানকার মুসলমানরা কখনোই বুঝবে না সংখ্যালঘ...,disgust,আসলে এখানকার মুসলমানরা কখনোই বুঝবে না সংখ্যালঘ...,rs2
