In [None]:
import pandas as pd
import numpy as np

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load Clean_dataset.csv; the path is relative to the notebook's working directory.
df = pd.read_csv("./Clean_dataset.csv")

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Horizontal bar chart of the number of rows per label. Each bar is annotated
# with its absolute count and its share (in %) of all rows in `df`.
plt.style.use('ggplot')

label_counts = df["label"].value_counts()
num_classes = len(label_counts)

# One Dark2 colour per class, sampled evenly across the colormap.
colors = plt.cm.Dark2(np.linspace(0, 1, num_classes))

ax = label_counts.plot.barh(
    title="Reviews for each label (n, %)",
    ylabel="labels",
    color=colors,
    figsize=(9, 9),
)

total_rows = df.shape[0]
for position, (count, bar_color) in enumerate(zip(label_counts, colors)):
    share = round(count * 100 / total_rows, 2)
    # Place the annotation at the tip of the bar, in the bar's own colour.
    ax.text(
        count,
        position,
        f" {count}, {share}%",
        color=bar_color,
        va='center',
        fontweight='bold',
    )

In [None]:
# Preview the first three rows.
df.head(3)

In [None]:
# Encode the textual labels as integer class ids and store them in a new
# "label2" column.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df["label2"] = label_encoder.fit_transform(df["label"])
df.head(3)

In [None]:
# Per-label non-null counts of every other column, with "label" turned back
# into a regular column.
counts_by_label = df.groupby('label').count()
df_orig = counts_by_label.reset_index()
df_orig.head(3)


### DataFrame augmented by translation

In [None]:
# Load data_after_translation.csv (relative to the working directory) and
# preview its first three rows.
df_trans = pd.read_csv("data_after_translation.csv")
df_trans.head(3)

In [None]:
# Prepare the comparison table for the "original vs. translated" plot.
# Per-label counts of the translated data:
df_aug = df_trans.groupby('label').count().reset_index()
# Join with the original counts on "label"; columns present in both frames
# receive the "_x" (original) and "_y" (translated) suffixes.
data_df = df_orig.merge(df_aug, on='label')

In [None]:
# Grouped bar chart: per-label text counts of the original data ("text_x")
# next to those of the translated data ("text_y").
data_df = data_df.sort_values('text_x', ascending=True)

# FiveThirtyEight plot style, single axis to draw the bars on.
plt.style.use("fivethirtyeight")
fig, ax = plt.subplots(1, 1, figsize=(12, 8))

label = data_df["label"]
x = np.arange(len(label))  # one tick position per label
width = 0.3                # width of a single bar

# Each tick sits in the middle of its pair of bars: the first bar is centred
# half a width to the left of the tick, the second half a width to the right.
bar_specs = (
    (-width / 2, "text_x", "original", "red"),
    (width / 2, "text_y", "translated", "blue"),
)
rect1, rect2 = (
    ax.bar(
        x + shift,
        data_df[column],
        width=width,
        label=legend_name,
        color=bar_color,
        edgecolor="white",
    )
    for shift, column, legend_name, bar_color in bar_specs
)

# Axis labels and title.
ax.set_ylabel("texts", fontsize=10, labelpad=10)
ax.set_xlabel("label", fontsize=12, labelpad=12)
ax.set_title("Labels per class", fontsize=12, pad=20)

# Ticks: one per label, named after the label.
ax.set_xticks(x)
ax.set_xticklabels(label)

# Legend built from the `label=` of each bar container.
ax.legend(title="Text", fontsize=8, title_fontsize=20)

# Tick appearance: rotated class names on x, smaller numbers on y.
tick_styles = (
    ("x", {"labelsize": 10, "labelrotation": 90}),
    ("y", {"labelsize": 8}),
)
for axis_name, style in tick_styles:
    ax.tick_params(axis=axis_name, which="both", **style)

# Write the value of every bar above it.
for container in ax.containers:
    ax.bar_label(container, size=7)

### After balancing

In [None]:
# Load df_augmented.csv and preview it.
# NOTE: this rebinds `df`, replacing the frame read from Clean_dataset.csv;
# every later cell works on the augmented data.
# pandas is already imported in the first cell; the import is repeated here.
import pandas as pd
df = pd.read_csv("./df_augmented.csv")
df.head(3)

In [None]:
# Print the column names, dtypes and non-null counts of the augmented frame.
df.info()

In [None]:
# Same chart as for the original data, now for the frame currently bound to
# `df`: rows per label, annotated with count and percentage of all rows.
plt.style.use('ggplot')

label_counts = df["label"].value_counts()
num_classes = len(label_counts)

# Evenly spaced Dark2 colours, one per class.
colors = plt.cm.Dark2(np.linspace(0, 1, num_classes))

ax = label_counts.plot.barh(
    title="Reviews for each label (n, %)",
    ylabel="labels",
    color=colors,
    figsize=(9, 9),
)

n_rows = df.shape[0]
for row_pos, (n_texts, text_color) in enumerate(zip(label_counts, colors)):
    percentage = round(n_texts * 100 / n_rows, 2)
    annotation = f" {n_texts}, {percentage}%"
    ax.text(n_texts, row_pos, annotation,
            color=text_color, va='center', fontweight='bold')

In [None]:
# Encode the textual labels of the augmented frame as integer class ids
# ("label2"); the augmentation cells below select classes by this id.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df["label2"] = encoder.fit_transform(df["label"])
df.head(2)

### Group by label

In [None]:
# Non-null count of every column per encoded label; x["text"] is the number
# of texts in each class.
grouped_by_id = df.groupby('label2')
x = grouped_by_id.count().reset_index()
x.head(3)

In [None]:
# Encoded labels of the classes that have exactly 3 texts.
has_3_texts = x["text"] == 3
df1 = x.loc[has_3_texts]
list_3text = df1["label2"].tolist()
list_3text

In [None]:
## Augmented data 
from sklearn.utils import shuffle
from tqdm.auto import tqdm

# Contextual word-embedding augmenter (nlpaug) using the
# 'bert-base-multilingual-uncased' checkpoint; per the original note it is
# used here to augment French text.
aug = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased', aug_p=0.2)  # aug_p: share of the words in a text that will be augmented

def augment_text(df, samples=7, labels=None, augmenter=None):
    """Create augmented texts for a set of classes.

    For every class id in ``labels`` the rows of ``df`` with that
    ``label2`` are selected, ``samples`` row positions are drawn uniformly
    at random (with replacement), and the ``text`` of each drawn row is
    passed through the augmenter.

    Args:
        df: DataFrame with a ``text`` and a ``label2`` column.
        samples: Number of random draws per class.
        labels: Iterable of ``label2`` values to augment. Defaults to the
            notebook-level ``list_3text`` (classes with 3 texts).
        augmenter: Object with an ``augment(text)`` method. Defaults to the
            notebook-level ``aug``.

    Returns:
        dict mapping each augmented text to the ``label2`` of the row it was
        generated from. Identical augmented texts collapse into one key; the
        label of the last occurrence is kept.
    """
    if labels is None:
        labels = list_3text
    if augmenter is None:
        augmenter = aug

    res = {}
    for class_id in labels:
        df_n = df[df.label2 == class_id].reset_index(drop=True)
        if df_n.empty:
            # np.random.randint(0, 0, ...) raises ValueError; a class without
            # rows simply contributes nothing.
            continue

        ## data augmentation loop
        for i in tqdm(np.random.randint(0, len(df_n), samples)):
            row = df_n.iloc[i]
            augmented = augmenter.augment(row['text'])
            # Recent nlpaug releases return a list of strings instead of a
            # single string; a list is unhashable and cannot be a dict key.
            if isinstance(augmented, str):
                augmented = [augmented]
            for new_text in augmented or []:
                res[new_text] = row['label2']

    return res

In [None]:
# Augment the classes listed in list_3text (7 draws per class, the defaults
# of the augment_text defined in the previous cell). The result maps each
# augmented text to its label2 and is displayed below.
aug_text1 = augment_text(df)
aug_text1

In [None]:
# Encoded labels of the classes that have exactly 5 texts.
has_5_texts = x["text"] == 5
df2 = x.loc[has_5_texts]
list_5text = df2["label2"].tolist()
list_5text

In [None]:
## Augmented data  
from sklearn.utils import shuffle
from tqdm.auto import tqdm

# Contextual word-embedding augmenter (nlpaug) using the
# 'bert-base-multilingual-uncased' checkpoint.
# NOTE(review): an augmenter with exactly these arguments is already bound to
# `aug` by an earlier cell; this line constructs it again.
aug = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased', aug_p=0.2)  # aug_p: share of the words in a text that will be augmented

def augment_text(df, samples=5, labels=None, augmenter=None):
    """Create augmented texts for a set of classes.

    For every class id in ``labels`` the rows of ``df`` with that
    ``label2`` are selected, ``samples`` row positions are drawn uniformly
    at random (with replacement), and the ``text`` of each drawn row is
    passed through the augmenter.

    Args:
        df: DataFrame with a ``text`` and a ``label2`` column.
        samples: Number of random draws per class.
        labels: Iterable of ``label2`` values to augment. Defaults to the
            notebook-level ``list_5text`` (classes with 5 texts).
        augmenter: Object with an ``augment(text)`` method. Defaults to the
            notebook-level ``aug``.

    Returns:
        dict mapping each augmented text to the ``label2`` of the row it was
        generated from. Identical augmented texts collapse into one key; the
        label of the last occurrence is kept.
    """
    if labels is None:
        labels = list_5text
    if augmenter is None:
        augmenter = aug

    res = {}
    for class_id in labels:
        df_n = df[df.label2 == class_id].reset_index(drop=True)
        if df_n.empty:
            # np.random.randint(0, 0, ...) raises ValueError; a class without
            # rows simply contributes nothing.
            continue

        ## data augmentation loop
        for i in tqdm(np.random.randint(0, len(df_n), samples)):
            row = df_n.iloc[i]
            augmented = augmenter.augment(row['text'])
            # Recent nlpaug releases return a list of strings instead of a
            # single string; a list is unhashable and cannot be a dict key.
            if isinstance(augmented, str):
                augmented = [augmented]
            for new_text in augmented or []:
                res[new_text] = row['label2']

    return res

In [None]:
# Augment the classes listed in list_5text (5 draws per class, the defaults
# of the augment_text defined in the previous cell).
aug_text2 = augment_text(df)
# The resulting dictionary is intentionally not displayed.

In [None]:
# Merge the augmentations of the 5-text classes into aug_text1.
# dict.update mutates aug_text1 in place; a text already present as a key
# gets its label overwritten.
aug_text1.update(aug_text2)

In [None]:
# Encoded labels of the classes that have exactly 6 texts.
has_6_texts = x["text"] == 6
df3 = x.loc[has_6_texts]
list_6text = df3["label2"].tolist()
list_6text

In [None]:
## Augmented data 
from sklearn.utils import shuffle
from tqdm.auto import tqdm

# Contextual word-embedding augmenter (nlpaug) using the
# 'bert-base-multilingual-uncased' checkpoint.
# NOTE(review): an augmenter with exactly these arguments is already bound to
# `aug` by an earlier cell; this line constructs it again.
aug = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased', aug_p=0.2)  # aug_p: share of the words in a text that will be augmented

def augment_text(df, samples=4, labels=None, augmenter=None):
    """Create augmented texts for a set of classes.

    For every class id in ``labels`` the rows of ``df`` with that
    ``label2`` are selected, ``samples`` row positions are drawn uniformly
    at random (with replacement), and the ``text`` of each drawn row is
    passed through the augmenter.

    Args:
        df: DataFrame with a ``text`` and a ``label2`` column.
        samples: Number of random draws per class.
        labels: Iterable of ``label2`` values to augment. Defaults to the
            notebook-level ``list_6text`` (classes with 6 texts).
        augmenter: Object with an ``augment(text)`` method. Defaults to the
            notebook-level ``aug``.

    Returns:
        dict mapping each augmented text to the ``label2`` of the row it was
        generated from. Identical augmented texts collapse into one key; the
        label of the last occurrence is kept.
    """
    if labels is None:
        labels = list_6text
    if augmenter is None:
        augmenter = aug

    res = {}
    for class_id in labels:
        df_n = df[df.label2 == class_id].reset_index(drop=True)
        if df_n.empty:
            # np.random.randint(0, 0, ...) raises ValueError; a class without
            # rows simply contributes nothing.
            continue

        ## data augmentation loop
        for i in tqdm(np.random.randint(0, len(df_n), samples)):
            row = df_n.iloc[i]
            augmented = augmenter.augment(row['text'])
            # Recent nlpaug releases return a list of strings instead of a
            # single string; a list is unhashable and cannot be a dict key.
            if isinstance(augmented, str):
                augmented = [augmented]
            for new_text in augmented or []:
                res[new_text] = row['label2']

    return res

In [None]:
# Augment the classes listed in list_6text (4 draws per class, the defaults
# of the augment_text defined in the previous cell).
aug_text3 = augment_text(df)
# The resulting dictionary is intentionally not displayed.

In [None]:
# Merge the augmentations of the 6-text classes into aug_text1.
# dict.update mutates aug_text1 in place; a text already present as a key
# gets its label overwritten.
aug_text1.update(aug_text3)

In [None]:
# Encoded labels of the classes that have exactly 7 texts.
has_7_texts = x["text"] == 7
df4 = x.loc[has_7_texts]
list_7text = df4["label2"].tolist()
list_7text

In [None]:
## Augmented data 
from sklearn.utils import shuffle
from tqdm.auto import tqdm

# Contextual word-embedding augmenter (nlpaug) using the
# 'bert-base-multilingual-uncased' checkpoint.
# NOTE(review): an augmenter with exactly these arguments is already bound to
# `aug` by an earlier cell; this line constructs it again.
aug = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased', aug_p=0.2)  # aug_p: share of the words in a text that will be augmented

def augment_text(df, samples=3, labels=None, augmenter=None):
    """Create augmented texts for a set of classes.

    For every class id in ``labels`` the rows of ``df`` with that
    ``label2`` are selected, ``samples`` row positions are drawn uniformly
    at random (with replacement), and the ``text`` of each drawn row is
    passed through the augmenter.

    Args:
        df: DataFrame with a ``text`` and a ``label2`` column.
        samples: Number of random draws per class.
        labels: Iterable of ``label2`` values to augment. Defaults to the
            notebook-level ``list_7text`` (classes with 7 texts).
        augmenter: Object with an ``augment(text)`` method. Defaults to the
            notebook-level ``aug``.

    Returns:
        dict mapping each augmented text to the ``label2`` of the row it was
        generated from. Identical augmented texts collapse into one key; the
        label of the last occurrence is kept.
    """
    if labels is None:
        labels = list_7text
    if augmenter is None:
        augmenter = aug

    res = {}
    for class_id in labels:
        df_n = df[df.label2 == class_id].reset_index(drop=True)
        if df_n.empty:
            # np.random.randint(0, 0, ...) raises ValueError; a class without
            # rows simply contributes nothing.
            continue

        ## data augmentation loop
        for i in tqdm(np.random.randint(0, len(df_n), samples)):
            row = df_n.iloc[i]
            augmented = augmenter.augment(row['text'])
            # Recent nlpaug releases return a list of strings instead of a
            # single string; a list is unhashable and cannot be a dict key.
            if isinstance(augmented, str):
                augmented = [augmented]
            for new_text in augmented or []:
                res[new_text] = row['label2']

    return res

In [None]:
# Augment the classes listed in list_7text (3 draws per class, the defaults
# of the augment_text defined in the previous cell).
aug_text4 = augment_text(df)
# The resulting dictionary is intentionally not displayed.

In [None]:
# Merge the augmentations of the 7-text classes into aug_text1.
# dict.update mutates aug_text1 in place; a text already present as a key
# gets its label overwritten.
aug_text1.update(aug_text4)

In [None]:
# Encoded labels of the classes that have exactly 9 texts.
has_9_texts = x["text"] == 9
df5 = x.loc[has_9_texts]
list_9text = df5["label2"].tolist()
list_9text

In [None]:
## DICTIONARY 
from sklearn.utils import shuffle
from tqdm.auto import tqdm

# Contextual word-embedding augmenter (nlpaug) using the
# 'bert-base-multilingual-uncased' checkpoint.
# NOTE(review): an augmenter with exactly these arguments is already bound to
# `aug` by an earlier cell; this line constructs it again.
aug = naw.ContextualWordEmbsAug(model_path='bert-base-multilingual-uncased', aug_p=0.2)  # aug_p: share of the words in a text that will be augmented

def augment_text(df, samples=1, labels=None, augmenter=None):
    """Create augmented texts for a set of classes.

    For every class id in ``labels`` the rows of ``df`` with that
    ``label2`` are selected, ``samples`` row positions are drawn uniformly
    at random (with replacement), and the ``text`` of each drawn row is
    passed through the augmenter.

    Args:
        df: DataFrame with a ``text`` and a ``label2`` column.
        samples: Number of random draws per class.
        labels: Iterable of ``label2`` values to augment. Defaults to the
            notebook-level ``list_9text`` (classes with 9 texts).
        augmenter: Object with an ``augment(text)`` method. Defaults to the
            notebook-level ``aug``.

    Returns:
        dict mapping each augmented text to the ``label2`` of the row it was
        generated from. Identical augmented texts collapse into one key; the
        label of the last occurrence is kept.
    """
    if labels is None:
        labels = list_9text
    if augmenter is None:
        augmenter = aug

    res = {}
    for class_id in labels:
        df_n = df[df.label2 == class_id].reset_index(drop=True)
        if df_n.empty:
            # np.random.randint(0, 0, ...) raises ValueError; a class without
            # rows simply contributes nothing.
            continue

        ## data augmentation loop
        for i in tqdm(np.random.randint(0, len(df_n), samples)):
            row = df_n.iloc[i]
            augmented = augmenter.augment(row['text'])
            # Recent nlpaug releases return a list of strings instead of a
            # single string; a list is unhashable and cannot be a dict key.
            if isinstance(augmented, str):
                augmented = [augmented]
            for new_text in augmented or []:
                res[new_text] = row['label2']

    return res

In [None]:
# Augment the classes listed in list_9text (1 draw per class, the defaults
# of the augment_text defined in the previous cell) and display the result.
aug_text5 = augment_text(df)
aug_text5

In [None]:
# Merge the augmentations of the 9-text classes into aug_text1, which now
# holds the augmented texts of all selected class sizes.
# dict.update mutates aug_text1 in place; a text already present as a key
# gets its label overwritten.
aug_text1.update(aug_text5)

In [None]:
# Turn the {augmented text: label2} dictionary into a two-column frame and
# tag every row with translated = 2.
# NOTE(review): presumably 2 marks rows produced by this augmentation step,
# as opposed to the values already in the CSV - confirm.
df_augmented = pd.DataFrame(
    aug_text1.items(), columns=['text', 'label2']
).assign(translated=2)

In [None]:
# Display the augmented frame.
df_augmented

In [None]:
# Drop fully identical rows, then summarise the result. The frame was built
# from dictionary keys, so its "text" values are already unique.
df_augmented = df_augmented.drop_duplicates()
df_augmented.info()

## Append DataFrames

In [None]:
# Preview the frame the augmented rows will be appended to.
df.head()

In [None]:
# "label2" was already added to df by the LabelEncoder cell above, so no
# re-encoding is needed here.
# Keep only the columns shared with df_augmented, in the same order.
# NOTE(review): assumes df_augmented.csv provides a "translated" column - confirm.
df = df[['text','label2','translated']]
df

In [None]:
# Stack the existing rows and the newly augmented rows into one frame with a
# fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported equivalent.
df_final = pd.concat([df, df_augmented], ignore_index=True)
df_final

In [None]:
# Remove rows that are identical in every column, then summarise the final
# frame.
df_final = df_final.drop_duplicates()
df_final.info()