## Install

In [None]:
!pip install transformers
!pip install nlpaug

## Import the required libraries

In [None]:
import re
import string
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import nltk
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

import nlpaug.augmenter.word.context_word_embs as aug

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import word_tokenize

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Text Cleanup Utilities

In [None]:
def convert_to_lower(text):
    """Return ``text`` with all of its cased characters lower-cased."""
    lowered = text.lower()
    return lowered

def remove_numbers(text):
    """Replace every run of consecutive digits in ``text`` with one space."""
    return re.sub(r"\d+", " ", text)

def lemmatizing(text):
    """Lemmatize every token of ``text`` and join the results with spaces.

    The text is split with NLTK's ``word_tokenize`` and each token is passed
    to ``WordNetLemmatizer.lemmatize`` with its default arguments.

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces; an empty string when
        ``text`` yields no tokens.
    """
    lemmatizer = WordNetLemmatizer()
    # Join only after *all* tokens were processed. Returning from inside the
    # loop would stop after the first token and yield None for empty text.
    return " ".join(lemmatizer.lemmatize(token) for token in word_tokenize(text))
    
def remove_punctuation(text):
    """Delete every character found in ``string.punctuation`` from ``text``."""
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

def remove_stopwords(text):
    """Return ``text`` without its English stop words.

    The text is tokenized with NLTK's ``word_tokenize``; tokens that appear in
    ``stopwords.words("english")`` are dropped and the remaining tokens are
    joined by single spaces. The comparison is exact (case-sensitive).

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens joined by spaces; an empty string when nothing
        remains.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list once per token.
    stop_words = set(stopwords.words("english"))
    return " ".join(
        token for token in word_tokenize(text) if token not in stop_words
    )

def remove_extra_white_spaces(text):
    """Drop isolated single-letter tokens from ``text``.

    Every run of single ASCII letters that are each surrounded by whitespace
    is replaced, together with that whitespace, by one space. A single letter
    at the very start or end of the text has whitespace on one side only and
    is left in place. Whitespace that does not border such a letter is not
    modified.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with the isolated single-letter tokens removed.
    """
    # The group repeats so that consecutive single letters ("x a b y") are
    # removed in one match. Matching one letter at a time consumes the
    # whitespace the next letter needs in front of it, so every second
    # letter would be skipped.
    single_char_run_pattern = r"(?:\s+[a-zA-Z])+\s+"
    return re.sub(pattern=single_char_run_pattern, repl=" ", string=text)

## Data Import and Cleanup

In [None]:
import os

# Location of the input workbook; replace the placeholder with a real path.
fname = os.path.join("<<your path and input excel file name with extension>>")
# Read the sheet and turn every missing cell into an empty string.
df = pd.read_excel(fname).fillna('')

In [None]:
# Keep only the text column and its class label, then display the frame.
df = df.loc[:, ['Comments', 'label']]
df

In [None]:
# Per-column count of missing values. isna() and isnull() are aliases, so
# both reports show the same numbers.
for missing_counts in (df.isnull().sum(), df.isna().sum()):
    print(missing_counts)

In [None]:
# Number of rows per class label in the raw data.
df.label.value_counts()

In [None]:
# How often each distinct comment text occurs.
df.Comments.value_counts()

In [None]:
# Discard rows whose label or comment is an empty string ...
for column in ('label', 'Comments'):
    df = df.drop(df[df[column] == ''].index)

# ... and rows where either of them is null.
for column in ('label', 'Comments'):
    df = df.drop(df[df[column].isnull()].index)

In [None]:
# Cleanup steps, applied to every comment in exactly this order.
cleanup_steps = (
    convert_to_lower,
    remove_numbers,
    remove_punctuation,
    remove_stopwords,
    remove_extra_white_spaces,
    lemmatizing,
)
for step in cleanup_steps:
    df['Comments'] = df['Comments'].apply(step)

In [None]:
# Cleaning can reduce a comment to nothing (e.g. one made only of stop
# words), so filter out empty and null rows once more.
df = df.drop(index=df[df.label == ''].index)
df = df.drop(index=df[df.Comments == ''].index)

df = df.drop(index=df[df.label.isnull()].index)
df = df.drop(index=df[df.Comments.isnull()].index)

In [None]:
# Missing-value counts per column after cleanup (isnull/isna are aliases).
print(df.isnull().sum(), df.isna().sum(), sep="\n")

In [None]:
# Class distribution of the cleaned data.
df.label.value_counts()

In [None]:
# Random 80 % sample (fixed seed) becomes the training set ...
train_df = df.sample(frac=0.8, random_state=1)
# ... and every row that was not sampled forms the hold-out set.
test_df = df.drop(index=train_df.index)

In [None]:
# Class distribution of the training split.
train_df.value_counts(subset='label')

In [None]:
# Class distribution of the hold-out split.
test_df.value_counts(subset='label')

In [None]:
# Write the hold-out rows to disk; replace the placeholder with a real path.
holdout_path = "<<your path and output excel file name with extension for the hold-out data>>"
test_df.to_excel(holdout_path)

## Use BERT contextual embeddings augmentation

In [None]:
# nlpaug contextual-embedding augmenter backed by 'bert-base-uncased',
# configured with the "insert" action.
augmenter = aug.ContextualWordEmbsAug(
    action="insert",
    model_path='bert-base-uncased',
)

In [None]:
def augmentData(df, augmenter, repetitions, num_samples, label):
    """Oversample one class of ``df`` with augmented copies of its comments.

    ``num_samples`` comments of class ``label`` are drawn at random (with
    replacement), each drawn comment is augmented ``repetitions`` times, and
    the generated texts are added to ``df`` under the same label.

    Args:
        df: DataFrame with a 'Comments' text column and a 'label' column.
        augmenter: Object exposing ``augment(text)`` that returns either one
            augmented string or a list of augmented strings.
        repetitions: Number of augmentations generated per drawn comment.
        num_samples: Number of comments drawn from the class.
        label: Value of the 'label' column identifying the class to augment.

    Returns:
        A new DataFrame holding the rows of ``df`` plus the generated rows,
        in shuffled order.

    Raises:
        ValueError: If ``df`` contains no row labelled ``label``.
    """
    class_texts = df.loc[df['label'] == label, 'Comments'].reset_index(drop=True)
    if class_texts.empty:
        # Fail with a clear message instead of numpy's "low >= high" error.
        raise ValueError(f"No rows with label {label!r} available to augment.")

    augmented_texts = []
    for i in tqdm(np.random.randint(0, len(class_texts), num_samples)):
        for _ in range(repetitions):
            result = augmenter.augment(class_texts.iloc[i])
            # Depending on the nlpaug version, augment() returns a plain
            # string or a list of strings; flatten so that the 'Comments'
            # column always holds strings.
            if isinstance(result, str):
                augmented_texts.append(result)
            else:
                augmented_texts.extend(result)

    aug_df = pd.DataFrame({
        'Comments': augmented_texts,
        'label': [label] * len(augmented_texts),
    })
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    combined = pd.concat([df, aug_df]).reset_index(drop=True)
    return shuffle(combined)

## Check if text augmentation works for a few samples 

In [None]:
# Take the training comment at position 100 as a test input and display it.
sample_text = train_df.Comments.iloc[100]
sample_text

In [None]:
# Build the augmenter (same configuration as used for the real run) and
# augment the sample once; the bare name displays the result.
augmenter = aug.ContextualWordEmbsAug(
    action="insert",
    model_path='bert-base-uncased',
)
augmented_sample_text = augmenter.augment(sample_text)
augmented_sample_text

In [None]:
# Augment the same sample five times and print each result for comparison.
for attempt in range(5):
    print(augmenter.augment(sample_text))

## Augment the data for each of the minority classes

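To balance the data, the output classes do not need exactly the same number of records; approximately equal counts are enough. The labels "Output minority class 1" to "Output minority class 5" used below are placeholders: replace them with the actual minority labels shown in the counts, and adjust the number of samples per class as needed.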

In [None]:
# Class distribution of the training data before augmentation.
train_df.label.value_counts()

In [None]:
# Start from the training data and add augmented texts for the first class.
aug_df = augmentData(
    df=train_df,
    augmenter=augmenter,
    repetitions=1,
    num_samples=1500,
    label="Output minority class 1",
)
aug_df.label.value_counts()

In [None]:
# Add augmented texts for the second class on top of the previous result.
aug_df = augmentData(
    df=aug_df,
    augmenter=augmenter,
    repetitions=1,
    num_samples=1500,
    label="Output minority class 2",
)

In [None]:
# Add augmented texts for the third class on top of the previous result.
aug_df = augmentData(
    df=aug_df,
    augmenter=augmenter,
    repetitions=1,
    num_samples=1500,
    label="Output minority class 3",
)

In [None]:
# Add augmented texts for the fourth class on top of the previous result.
aug_df = augmentData(
    df=aug_df,
    augmenter=augmenter,
    repetitions=1,
    num_samples=1500,
    label="Output minority class 4",
)

In [None]:
# Add augmented texts for the fifth class on top of the previous result.
aug_df = augmentData(
    df=aug_df,
    augmenter=augmenter,
    repetitions=1,
    num_samples=1500,
    label="Output minority class 5",
)

In [None]:
# Class distribution after all augmentation passes.
aug_df.label.value_counts()

In [None]:
# Write the augmented training set to disk; replace the placeholder with a real path.
augmented_path = "<<your path and output excel file name with extension for the augmented training data>>"
aug_df.to_excel(augmented_path)