In [1]:
import os
import time
import random
import argparse
import numpy as np
import pandas as pd
from rich.progress import track

import torch
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
from nlpaug.util import Action

from sentence_transformers import SentenceTransformer, util

# 1. Contextual (masked-word) augmentation with BERT and RoBERTa

In [2]:
def contextual_augment(data_source, data_target, textcol="text", aug_p=0.2, device1="cuda", device2="cuda"):
    """Augment a CSV text column with RoBERTa and BERT word substitution.

    Reads the CSV at ``data_source``, creates one RoBERTa-substituted and one
    BERT-substituted variant of every value in ``textcol``, stores them in two
    new columns named ``roberta_<pct>`` and ``bert_<pct>`` (``pct`` is ``aug_p``
    expressed as a whole percentage) and writes the frame to ``data_target``.
    The first five original/augmented triples are printed as a preview.

    Args:
        data_source: Path of the input CSV file.
        data_target: Path the augmented CSV is written to (index omitted).
        textcol: Name of the column holding the text to augment.
        aug_p: Fraction of words to substitute; forwarded to both augmenters.
        device1: Device passed to the RoBERTa augmenter.
        device2: Device passed to the BERT augmenter.
    """

    def _first_or_fallback(result, fallback):
        # augment() is expected to return a list, but it can come back empty
        # for input the augmenter rejects, and older nlpaug releases are
        # believed to return a bare string (TODO confirm against the pinned
        # version). Indexing blindly would crash or keep only one character.
        if isinstance(result, str):
            return result or fallback
        return result[0] if len(result) > 0 else fallback

    print("\n-----transformer_augment-----\n")
    roberta_augmenter = naw.ContextualWordEmbsAug(
        model_path='roberta-base', action="substitute", aug_min=1, aug_p=aug_p, device=device1)

    bert_augmenter = naw.ContextualWordEmbsAug(
        model_path='bert-base-uncased', action="substitute", aug_min=1, aug_p=aug_p, device=device2)

    train_data = pd.read_csv(data_source)
    # Missing cells become '.' so every entry handed to the augmenters is a str.
    train_text = train_data[textcol].fillna('.').astype(str).values
    first_type = type(train_text[0]) if len(train_text) > 0 else None
    print("train_text:", len(train_text), first_type)

    auglist1, auglist2 = [], []
    # Both augmenters are called per text, in this order, so the sequence of
    # random draws matches a one-text-at-a-time run.
    for txt in train_text:
        auglist1.append(_first_or_fallback(roberta_augmenter.augment(txt), txt))
        auglist2.append(_first_or_fallback(bert_augmenter.augment(txt), txt))

    # round() instead of int(): 0.29 * 100 evaluates to 28.999999999999996,
    # which int() would truncate to 28 and mislabel the column.
    pct = str(int(round(float(aug_p) * 100)))
    train_data['roberta_' + pct] = auglist1
    train_data['bert_' + pct] = auglist2
    train_data.to_csv(data_target, index=False)

    for o, a1, a2 in zip(train_text[:5], auglist1[:5], auglist2[:5]):
        print("-----Original Text: \n", o)
        print("-----Augmented Text1: \n", a1)
        print("-----Augmented Text2: \n", a2)

In [3]:
def set_global_random_seed(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), then configures cuDNN for reproducible runs.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every GPU, not just the current one; ignored when CUDA is absent.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes the convolution algorithm and may choose a
    # different one from run to run, so it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False


set_global_random_seed(0)

In [None]:
# contextual_augment('../dataset/StackOverflow.csv', 'StackOverflow.csv', aug_p=0.2)

# 2. Strong (EDA) and weak (RoBERTa contextual) augmentation

In [2]:
from eda import *

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
from nlpaug.util import Action

def AugS(texts):
    """Build the "strong" view: one EDA-augmented variant per input text.

    Every text is passed to ``eda`` with all four operation strengths set to
    0.2 and ``num_aug=1``; the first element of each result is kept.

    Args:
        texts: Iterable of source texts.

    Returns:
        A list with one augmented text per input, in input order.
    """
    eda_settings = dict(
        alpha_sr=0.2,  # how much to replace each word by synonyms
        alpha_ri=0.2,  # how much to insert new words that are synonyms
        alpha_rs=0.2,  # how much to swap words
        p_rd=0.2,      # how much to delete words
        num_aug=1,     # generate more data with standard augmentation
    )
    # The source text goes in positionally; only the first variant is used.
    return [eda(sentence, **eda_settings)[0] for sentence in texts]

In [6]:
def AugW(texts, aug_p=0.2, device=None):
    """Build the "weak" view: one RoBERTa word-substitution variant per text.

    Args:
        texts: Iterable of source texts (strings).
        aug_p: Fraction of words to substitute; forwarded to the augmenter.
        device: Device for the augmenter. When ``None``, "cuda" is used if
            ``torch.cuda.is_available()`` and "cpu" otherwise.

    Returns:
        A list with one augmented text per input, in input order. A text for
        which the augmenter produces nothing is returned unchanged.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    augmenter = naw.ContextualWordEmbsAug(
        model_path='roberta-base',
        action="substitute",
        aug_min=1,
        aug_p=aug_p,
        device=device
    )

    aug_texts = []
    for text in texts:
        result = augmenter.augment(text)
        if isinstance(result, str):
            # Older nlpaug releases are believed to return a bare string
            # (TODO confirm); indexing it would keep only the first character.
            candidate = result
        else:
            # An empty result (e.g. for an empty input text) must not abort
            # the whole run with an IndexError.
            candidate = result[0] if len(result) > 0 else text
        aug_texts.append(candidate or text)

    return aug_texts

In [None]:
# Load the corpus and normalise the text column the same way
# contextual_augment does: missing values become '.' and everything is cast to
# str, so no float NaN is ever handed to the augmenters.
data_df = pd.read_csv('../dataset/StackOverflow.csv')
texts = data_df['text'].fillna('.').astype(str).tolist()

# Weak view: RoBERTa contextual substitution. Strong view: EDA.
aug_weak = AugW(texts)
aug_strong = AugS(texts)

data_df['w_aug'] = aug_weak
data_df['s_aug'] = aug_strong

# Written to the working directory, leaving the source file in ../dataset intact.
data_df.to_csv('./StackOverflow.csv', index=False)
print('StackOverflow, download over!!!')