# Data Augmentation

## Install and Imports

In [None]:
!pip install -q datasets transformers[torch]
!pip install accelerate -U -q
!pip install -q huggingface_hub
!pip install -q wandb
!pip install -q nlpaug
!pip install -q sacremoses nltk

In [None]:
from urllib import request
import os
import csv

In [None]:
import pandas as pd
import numpy as np

## Fetch all data

In [None]:
def fetch_url(module_url):
  """Download `module_url` and save it in the current working directory.

  The local file name is the last '/'-separated segment of the URL, used
  verbatim (percent-escapes are not decoded).

  Args:
    module_url: URL of a UTF-8 encoded text resource.

  Raises:
    urllib.error.URLError: if the resource cannot be opened.
    UnicodeDecodeError: if the payload is not valid UTF-8.
  """
  module_name = module_url.split('/')[-1]
  print(f'Fetching {module_url}')
  # The payload is decoded as UTF-8, so write it back with the same explicit
  # encoding; the platform default may be unable to represent non-ASCII text.
  with request.urlopen(module_url) as f, open(module_name, 'w', encoding='utf-8') as outf:
    outf.write(f.read().decode('utf-8'))

In [None]:
# Paragraph ids and labels of the training split. fetch_url() saves the file
# under its last URL segment, train_semeval_parids-labels.csv.
# (Plain string: the URL has no placeholders, so no f-prefix is needed.)
train_parids_labels_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/practice%20splits/train_semeval_parids-labels.csv"
fetch_url(train_parids_labels_url)

In [None]:
# Download the TSV that load_train() reads later from ./dontpatronizeme_pcl.tsv
# (fetch_url names the local file after the last URL segment).
train_url = "https://raw.githubusercontent.com/CRLala/NLPLabs-2024/main/Dont_Patronize_Me_Trainingset/dontpatronizeme_pcl.tsv"
fetch_url(train_url)


In [None]:
# Download the dev-split paragraph ids/labels; fetch_url saves them as
# dev_semeval_parids-labels.csv, which is read back with pd.read_csv below.
dev_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/practice%20splits/dev_semeval_parids-labels.csv"
fetch_url(dev_url)

In [None]:
# NOTE(review): this URL is identical to `dev_url` in the preceding cell, so
# the same file is downloaded and overwritten a second time. Confirm whether
# one of the two cells was meant to fetch a different file.
dev_parids_labels_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/practice%20splits/dev_semeval_parids-labels.csv"
fetch_url(dev_parids_labels_url)

## Create Train and Val dataset

In [None]:
def load_test(test_path):
  """Load a tab-separated test file into a DataFrame.

  Args:
    test_path: path to a TSV file with five fields per line, in the order
      par_id, art_id, keyword, country, text.

  Returns:
    A DataFrame of strings with columns par_id, art_id, keyword, country
    and text, one row per non-blank line.

  Raises:
    ValueError: if a non-blank line does not have exactly five fields.
  """
  with open(test_path) as f:
    # Blank lines (e.g. a trailing newline) carry no record and would
    # otherwise break the five-column layout.
    rows = [line.strip().split('\t') for line in f if line.strip()]
  # Return the frame: previously it was built and then discarded, so every
  # caller received None.
  return pd.DataFrame(rows, columns="par_id art_id keyword country text".split())

In [None]:
def load_train(train_path):
  """Parse the training TSV into a DataFrame with a binarised label.

  The first four lines of the file are skipped. Every remaining non-blank
  line is split on tabs: fields 0-4 are par_id, art_id, keyword, country and
  text, and the last field is the original label.

  Args:
    train_path: path to the tab-separated training file.

  Returns:
    A DataFrame with columns par_id, art_id, keyword, country, text,
    label and orig_label. `label` is 0 when the original label is '0' or
    '1' and 1 otherwise; `orig_label` keeps the original string.

  Raises:
    IndexError: if a non-blank data line has fewer than five fields.
  """
  rows = []
  with open(train_path) as f:
    for line in f.readlines()[4:]:
      # Split once per line instead of once per extracted field.
      fields = line.strip().split('\t')
      if fields == ['']:
        # Blank line (e.g. trailing newline): nothing to parse.
        continue
      orig_label = fields[-1]
      rows.append({
        'par_id': fields[0],
        'art_id': fields[1],
        'keyword': fields[2],
        'country': fields[3],
        'text': fields[4],
        'label': 0 if orig_label in ('0', '1') else 1,
        'orig_label': orig_label,
      })
  return pd.DataFrame(
    rows,
    columns=['par_id', 'art_id', 'keyword', 'country', 'text', 'label', 'orig_label'],
  )

In [None]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
  """Write every row of `p` to `outf_path` as one comma-separated line.

  Args:
    p: iterable of rows; each row is an iterable whose items are written
      via str().
    outf_path: destination file, created or overwritten.
  """
  with open(outf_path, 'w') as sink:
    sink.writelines(','.join(map(str, row)) + '\n' for row in p)

In [None]:
# Parse the downloaded TSV; `data` gets one row per data line, including the
# binary `label` column derived by load_train().
train_path = "./dontpatronizeme_pcl.tsv"
data = load_train(train_path)

In [None]:
# Paragraph ids (with labels) that make up the training split.
trids = pd.read_csv('train_semeval_parids-labels.csv')
# load_train() keeps par_id as a string, so cast here before matching on it.
trids['par_id'] = trids['par_id'].astype(str)
trids.head()

In [None]:
# Build a par_id -> {keyword, text, label} table once (first occurrence wins,
# as with `.values[0]`), so each id is a dictionary lookup rather than three
# full boolean-mask scans of `data`.
train_lookup = (
    data.drop_duplicates(subset='par_id')
        .set_index('par_id')[['keyword', 'text', 'label']]
        .to_dict('index')
)

rows = [] # will contain par_id, community, text and binary label
for parid in trids.par_id:
  # Raises KeyError when a split id is missing from the original dataset.
  record = train_lookup[parid]
  rows.append({
      'par_id':parid,
      'community':record['keyword'],
      'text':record['text'],
      'label':record['label']
  })

In [None]:
# Training frame with columns par_id, community, text and label.
trdf1 = pd.DataFrame(rows)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
trdf1.info()
trdf1.head()

In [None]:
# Paragraph ids (with labels) that make up the dev split, used for validation.
valids = pd.read_csv('dev_semeval_parids-labels.csv')
# load_train() keeps par_id as a string, so cast here before matching on it.
valids['par_id'] = valids['par_id'].astype(str)
valids.head()


In [None]:
# Build a par_id -> {keyword, text, label} table once (first occurrence wins,
# as with `.values[0]`), so each id is a dictionary lookup rather than three
# full boolean-mask scans of `data`.
valid_lookup = (
    data.drop_duplicates(subset='par_id')
        .set_index('par_id')[['keyword', 'text', 'label']]
        .to_dict('index')
)

rows = [] # will contain par_id, community, text and binary label
for parid in valids.par_id:
  # Raises KeyError when a split id is missing from the original dataset.
  record = valid_lookup[parid]
  rows.append({
      'par_id':parid,
      'community':record['keyword'],
      'text':record['text'],
      'label':record['label']
  })

In [None]:
# Validation frame with the same columns as the training frame
# (par_id, community, text, label).
valdf1 = pd.DataFrame(rows)
valdf1.head()

## Augmentations

In [None]:
import pandas as pd
from tqdm import tqdm

def augment_dataset(dataframe, augmenter):
    """Append one augmented copy of every positive (label == 1) row.

    Args:
        dataframe: DataFrame with 'par_id', 'community', 'text' and 'label'
            columns.
        augmenter: object exposing ``augment(text)``. The result may be a
            string or a list of strings; lists are joined with single spaces.

    Returns:
        A new DataFrame holding the original rows followed by the augmented
        rows, with a fresh 0..n-1 index. Augmented rows reuse the par_id,
        community and label of the row they were derived from and always
        carry a plain string in 'text'.
    """
    minority_class = dataframe[dataframe['label'] == 1]
    # Sampling the whole class only shuffles it; the fixed seed keeps the
    # order reproducible between runs.
    samples_to_augment = minority_class.sample(n=len(minority_class), random_state=42)

    augmented_texts = []
    for text in tqdm(samples_to_augment['text']):
        augmented_text = augmenter.augment(text)
        # Normalise list results here so the 'text' column never mixes lists
        # and strings; callers that still flatten afterwards see a no-op.
        if isinstance(augmented_text, list):
            augmented_text = ' '.join(augmented_text)
        augmented_texts.append(augmented_text)

    # The Series keep the sampled rows' index and the text list matches them
    # by position, so every augmented text stays with its source metadata.
    augmented_df = pd.DataFrame({
        'par_id': samples_to_augment['par_id'],
        'community': samples_to_augment['community'],
        'text': augmented_texts,
        'label': samples_to_augment['label'],
    })

    # Original rows first, augmented rows after.
    return pd.concat([dataframe, augmented_df]).reset_index(drop=True)


### Back Translation

In [None]:
import nlpaug.augmenter.word as naw

# Back-translation augmenter. Going by the model names, text is translated
# English -> German by the first model and German -> English by the second.
back_translation_aug = naw.BackTranslationAug(
    from_model_name='facebook/wmt19-en-de',
    to_model_name='facebook/wmt19-de-en'
)

In [None]:
# Add a back-translated copy of every positive (label == 1) training row.
augmented_trdf1 = augment_dataset(trdf1, back_translation_aug)

In [None]:
from datasets import Dataset, DatasetDict
def _join_if_list(value):
  """Join a list of strings with single spaces; return anything else as is."""
  if isinstance(value, list):
    return ' '.join(value)
  return value

# The augmenter may hand back lists of strings; flatten them to plain text.
augmented_trdf1['text'] = augmented_trdf1['text'].apply(_join_if_list)

# Augmented training split plus the untouched validation split.
train_split = Dataset.from_pandas(augmented_trdf1)
valid_split = Dataset.from_pandas(valdf1)
data = DatasetDict({"train": train_split, "valid": valid_split})

In [None]:
# Upload both splits to the Hub repository named below. The token string is a
# placeholder: supply a real access token when running, and keep it out of
# version control.
data.push_to_hub("ImperialIndians23/nlp_cw_data_unprocessed_augmented", token="put your token here")

### Synonym Augmenter

In [None]:
import nlpaug.augmenter.word as naw
import nltk
# The augmenter below is configured with aug_src='wordnet', so make sure the
# WordNet corpus is available locally first.
nltk.download('wordnet')

# Word-level augmenter backed by WordNet as its synonym source.
synonym_aug = naw.SynonymAug(aug_src='wordnet')

In [None]:
# Start again from the un-augmented trdf1, so this variant contains synonym
# copies of the positive rows only (no back-translations).
augmented_trdf1 = augment_dataset(trdf1, synonym_aug)

In [None]:
from datasets import Dataset, DatasetDict
def _join_if_list(value):
  """Join a list of strings with single spaces; return anything else as is."""
  if isinstance(value, list):
    return ' '.join(value)
  return value

# The augmenter may hand back lists of strings; flatten them to plain text.
augmented_trdf1['text'] = augmented_trdf1['text'].apply(_join_if_list)

# Synonym-augmented training split plus the untouched validation split.
train_split = Dataset.from_pandas(augmented_trdf1)
valid_split = Dataset.from_pandas(valdf1)
data = DatasetDict({"train": train_split, "valid": valid_split})

In [None]:
# Show the DatasetDict summary as the cell output (sanity check before upload).
data

In [None]:
# Upload the synonym-augmented splits to their own Hub repository. The token
# string is a placeholder: supply a real access token when running, and keep
# it out of version control.
data.push_to_hub("ImperialIndians23/nlp_cw_data_unprocessed_augmented_synonym", token="put your token here")

### Back Translation + Synonym

- Runs synonym augmentation on the training set that already contains both the original and the back-translated sentences.


In [None]:
import nlpaug.augmenter.word as naw
import nltk
# Repeats the setup from the "Synonym Augmenter" section so this section can
# be run on its own: fetch WordNet, then build the WordNet-backed augmenter.
nltk.download('wordnet')

synonym_aug = naw.SynonymAug(aug_src='wordnet')

In [None]:
from datasets import load_dataset
# Reload the dataset from the same repository id that the "Back Translation"
# section pushes to.
dataset = load_dataset("ImperialIndians23/nlp_cw_data_unprocessed_augmented")

In [None]:
# Show the loaded splits as the cell output.
dataset

In [None]:
# Turn the loaded train split into a DataFrame, the input type that
# augment_dataset() expects.
train_df = pd.DataFrame(dataset["train"])

In [None]:
# Add a synonym-augmented copy of every positive row in the loaded train
# split (the section intro describes it as original + back-translated text).
augmented_trdf1 = augment_dataset(train_df, synonym_aug)

In [None]:
from datasets import Dataset, DatasetDict
def _join_if_list(value):
  """Join a list of strings with single spaces; return anything else as is."""
  if isinstance(value, list):
    return ' '.join(value)
  return value

# The augmenter may hand back lists of strings; flatten them to plain text.
augmented_trdf1['text'] = augmented_trdf1['text'].apply(_join_if_list)

# Training split with both augmentations; the validation split is the
# in-session valdf1, not the one loaded from the Hub.
train_split = Dataset.from_pandas(augmented_trdf1)
valid_split = Dataset.from_pandas(valdf1)
data = DatasetDict({"train": train_split, "valid": valid_split})

In [None]:
# Show the DatasetDict summary as the cell output (sanity check before upload).
data

In [None]:
# Upload the doubly-augmented splits to their own Hub repository. The token
# string is a placeholder: supply a real access token when running, and keep
# it out of version control.
data.push_to_hub("ImperialIndians23/nlp_cw_data_unprocessed_augmented_both", token="put your token here")