In [None]:
import pandas as pd
import numpy as np

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action

import re
import string 

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the ';'-delimited dataset and print a summary of its columns.
# NOTE(review): the path is a blank placeholder in this notebook — point it at the real file before running.
df = pd.read_csv(filepath_or_buffer="  ", sep=";")
df.info()

Text processing 

In [None]:
def texts_cleaner(text):
    """Strip digits and normalise whitespace in a single text string.

    Steps, in order:
      1. drop every character for which ``str.isdigit()`` is true;
      2. trim leading and trailing whitespace;
      3. collapse each run of whitespace into a single space;
      4. delete every match of the colon/backslash pattern (see note below).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    without_digits = ''.join(ch for ch in text if not ch.isdigit())

    # One strip() already trims both ends. The raw string keeps '\s' a regex
    # escape instead of an invalid Python string escape.
    collapsed = re.sub(r'\s+', ' ', without_digits.strip())

    # NOTE(review): the spaces around '|' are part of the pattern, so this removes
    #   * ':' + non-space characters + one trailing space, or
    #   * one space + a literal backslash + one or more 'S' characters.
    # Presumably the intent was to drop ':token' / '\token' fragments — confirm
    # before changing the pattern; it is kept as-is to preserve existing output.
    return re.sub(r'(:\S+) | (\\S+)', '', collapsed)

In [None]:
# Clean every text entry, then peek at the last rows to check the result.
df['text'] = df['text'].apply(texts_cleaner)
df['text'].tail()

In [None]:
# Keep only the modelling columns, drop exact duplicate rows and renumber the index.
# The calls are chained (no inplace=True) so that a column slice of the original
# frame is never mutated in place, which pandas flags with SettingWithCopyWarning.
df = (
    df[['text', 'label', 'translated']]
    .drop_duplicates()
    .reset_index(drop=True)
)
df.info()


### Map textual labels to numeric ids using `LabelEncoder`

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode each distinct textual label as an integer id, stored in a new 'label2' column.
label_encoder = LabelEncoder()
df["label2"] = label_encoder.fit_transform(df["label"])
df.head(2)

### Text augmentation function

In [None]:
## Augmented data 
from sklearn.utils import shuffle
from tqdm.auto import tqdm

# Contextual word-embedding augmenter backed by multilingual BERT (used here for French text).
# aug_p is the fraction of words in each text that gets augmented.
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-multilingual-uncased',
    aug_p=0.2,
)

def augment_text(df, samples, label):
    """Create augmented texts for the requested classes.

    For every class id in ``label``, ``samples`` row positions of that class are
    drawn at random (with replacement) from ``df`` and each drawn text is passed
    through the module-level ``aug`` augmenter.

    Args:
        df: DataFrame with a ``text`` column and a ``label2`` column.
        samples: Number of rows to draw per class.
        label: Iterable of ``label2`` values to augment.

    Returns:
        dict mapping each augmented text to the ``label2`` value of its source
        row. Texts are dict keys, so an augmented text produced more than once
        is stored only once (the last label seen wins).
    """
    res = {}

    for class_id in label:
        class_rows = df[df['label2'] == class_id].reset_index(drop=True)
        if class_rows.empty:
            # Nothing to sample from; np.random.randint(0, 0, ...) would raise.
            continue

        # Random row positions within this class, sampled with replacement.
        for row_pos in tqdm(np.random.randint(0, len(class_rows), samples)):
            source_text = class_rows['text'].iloc[row_pos]
            source_label = class_rows['label2'].iloc[row_pos]

            augmented = aug.augment(source_text)
            # Recent nlpaug releases return a list even for a single input
            # string, and a list cannot be a dict key. Normalise both the
            # list and the plain-string return shapes to a list of strings.
            if isinstance(augmented, str):
                augmented = [augmented]
            for new_text in augmented:
                res[new_text] = source_label

    return res

In [None]:
# Classes with exactly 5 texts: draw 6 augmented samples per class.
num_text = 5
samples = 6
# One row per label2; the other columns hold that class's non-null counts.
x = df.groupby('label2').count().reset_index()
df1 = x.loc[x['text'] == num_text]
label = df1['label2'].tolist()
aug_text1 = augment_text(df, samples, label)

In [None]:
# Classes with exactly 6 texts: draw 6 augmented samples per class.
num_text = 6
samples = 6
df1 = x.loc[x['text'] == num_text]
label = df1['label2'].tolist()
aug_text2 = augment_text(df, samples, label)
# Merge the new texts into the running dictionary of augmented samples.
aug_text1.update(aug_text2)

In [None]:
# Classes with exactly 7 texts: draw 3 augmented samples per class.
num_text = 7
samples = 3
df1 = x.loc[x['text'] == num_text]
label = df1['label2'].tolist()
aug_text3 = augment_text(df, samples, label)
# Merge the new texts into the running dictionary of augmented samples.
aug_text1.update(aug_text3)

In [None]:
# Classes with exactly 10 texts: draw 4 augmented samples per class.
num_text = 10
samples = 4
df1 = x.loc[x['text'] == num_text]
label = df1['label2'].tolist()
aug_text4 = augment_text(df, samples, label)
# Merge the new texts into the running dictionary of augmented samples.
aug_text1.update(aug_text4)

In [None]:
# Classes with exactly 9 texts: draw 5 augmented samples per class.
num_text = 9
samples = 5
df1 = x.loc[x['text'] == num_text]
label = df1['label2'].tolist()
aug_text5 = augment_text(df, samples, label)
# Merge the new texts into the running dictionary of augmented samples.
aug_text1.update(aug_text5)

### Build a DataFrame from the augmented texts

In [None]:
# Turn the {augmented text: label2} dictionary into a two-column frame, then shuffle the rows.
new = pd.DataFrame(list(aug_text1.items()), columns=['text', 'label2'])
df_augmented = shuffle(new).reset_index(drop=True)
df_augmented.info()

In [None]:
# Tag every augmented row with translated == 5 (marks text generated by the nlpaug
# library), then drop any exact duplicate rows.
df_augmented = df_augmented.assign(translated=5).drop_duplicates()
df_augmented.info()

### Append the original and augmented DataFrames

In [None]:
# Align the original frame's columns with those of the augmented frame.
df = df[['text', 'label2', 'translated']]
# Stack original and augmented rows. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
df_final = pd.concat([df, df_augmented], ignore_index=True)
# Remove exact duplicate rows, then shuffle and renumber the index.
df_final = df_final.drop_duplicates()
df_final = shuffle(df_final).reset_index(drop=True)
df_final.info()

In [None]:
# Write the combined original + augmented dataset to CSV, omitting the row index.
df_final.to_csv(path_or_buf='augmented_text_byNLPaug.csv', index=False)