In [1]:
import os

COLAB = False
if 'google.colab' in str(get_ipython()):
    COLAB = True

if COLAB:
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    repo_path = '/content/drive/Othercomputers/My Mac/266-implicit-hate-speech-detection'

    hf_token = userdata.get('hf_token')

else:
    repo_path = '..'

!python -m pip install nlpaug



In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import nlpaug.augmenter.word as naw

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Corpora handled by this notebook; each name fills both '%s' slots of the
# file templates below (sub-folder and file prefix).
datasets = ['ihc', 'ishate']

# Input folder with the processed splits and output folder for augmented data.
processed_dir, augment_dir = (
    os.path.join(repo_path, sub_dir)
    for sub_dir in ('data/processed', 'data/easy_data_augmentation')
)

# '%s/%s_train.csv' templates: source training file and augmented destination.
orig_train_file, save_file = (
    os.path.join(base_dir, '%s/%s_train.csv')
    for base_dir in (processed_dir, augment_dir)
)

## Easy Data Augmentation


In [6]:
# One random word-level augmenter per supported action. Each is configured
# with aug_min=1; every other setting is left at the library default.
substitute_aug, crop_aug, swap_aug, delete_aug = (
    naw.random.RandomWordAug(action=action_name, aug_min=1)
    for action_name in ('substitute', 'crop', 'swap', 'delete')
)

In [7]:
def map_aug_method(method):
    """Translate a numeric augmentation code into its action name.

    Codes 0, 1, 2 and 3 map to "substitute", "crop", "swap" and "delete"
    respectively. Any other value yields None.
    """
    action_names = ("substitute", "crop", "swap", "delete")
    for code, action_name in enumerate(action_names):
        # Equality comparison, so integer-like values (e.g. numpy ints) match too.
        if method == code:
            return action_name
    return None

In [8]:
def easy_data_augment(text, method):
    out = ''
    match method:
        case 0:
            out = substitute_aug.augment(text)
        case 1:
            out = crop_aug.augment(text)
        case 2:
            out = swap_aug.augment(text)
        case 3:
            out = delete_aug.augment(text)

    return out

In [7]:
# Seed shared by every sampling step in this cell, so that a re-run selects
# the same rows and the same augmentation method per row.
# NOTE: randomness inside the nlpaug augmenters themselves is not seeded here.
SEED = 42

for corpus in datasets:
    df = pd.read_csv(orig_train_file % (corpus, corpus))

    # value_counts() orders labels by frequency (descending): position 0 is
    # the largest class, 1 the middle one, 2 the smallest. The code assumes at
    # least three label classes; any further classes are not carried over.
    counts = df['label'].value_counts()
    counts = tuple(zip(counts.index, counts.values))

    print(counts)

    # Every class is brought to the size of the middle class.
    threshold = counts[1][1]
    l_label = counts[0][0]
    m_label = counts[1][0]
    s_label = counts[2][0]
    s_size = counts[2][1]

    # Keep the original id/text so augmented rows can be traced back.
    df['orig_id'] = df['id']
    df['orig_cleaned_text'] = df['cleaned_text']

    # Largest class: down-sample to the threshold.
    l_sample = df.loc[df['label'] == l_label].sample(threshold, random_state=SEED)
    l_sample['aug_method'] = 'sampled_from_largest_class'

    # Middle class: kept in full (shuffled).
    m_sample = df.loc[df['label'] == m_label].sample(frac=1, random_state=SEED)
    m_sample['aug_method'] = 'untouched'

    # Smallest class: kept in full, then topped up with augmented copies.
    s_orig = df.loc[df['label'] == s_label].sample(frac=1, random_state=SEED)
    s_orig['aug_method'] = 'untouched'

    diff = threshold - s_size
    # Only texts with more than two spaces are used as augmentation sources;
    # rows are drawn with replacement because diff can exceed the pool size.
    long_enough = s_orig['cleaned_text'].str.count(' ') > 2
    s_sample = s_orig.loc[long_enough].sample(diff, random_state=SEED, replace=True)
    # One of the four augmentation codes (0-3) per sampled row, drawn from a
    # seeded generator instead of the unseeded global NumPy state.
    s_sample['aug_method'] = np.random.default_rng(SEED).integers(4, size=diff)

    tqdm.pandas(desc=f'Applying augmentation for {corpus}_train.csv')
    s_sample['cleaned_text'] = s_sample.progress_apply(
        lambda row: easy_data_augment(row['cleaned_text'], row['aug_method']),
        axis=1,
    )
    # Replace the numeric code by its readable action name.
    s_sample['aug_method'] = s_sample['aug_method'].apply(map_aug_method)

    out = pd.concat([l_sample, m_sample, s_orig, s_sample])
    # reset_index() keeps the previous index as an 'index' column and to_csv
    # also writes the new index; both are left as-is so the layout of the
    # saved file does not change.
    out = out.sample(frac=1, random_state=SEED).reset_index()

    save_path = save_file % (corpus, corpus)
    # Create the per-corpus output folder if it is missing; to_csv does not.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    out.to_csv(save_path)
    print(out.shape)

((0, 9304), (2, 4970), (1, 762))


Applying augmentation for ihc_train.csv: 100%|██████████| 4208/4208 [00:00<00:00, 4252.79it/s]


(14910, 8)
((0, 12508), (1, 7007), (2, 866))


Applying augmentation for ishate_train.csv: 100%|██████████| 6141/6141 [00:03<00:00, 1822.03it/s]


(21021, 8)


In [8]:
# List the IHC output folder to confirm the augmented file was written.
# The path is derived from augment_dir so the check works both in Colab and
# in a local checkout instead of only on the mounted Drive path.
os.listdir(os.path.join(augment_dir, 'ihc'))

['ihc_train.csv']