In [55]:
import jieba
import synonyms
import random
from random import shuffle
from stopwordsiso import stopwords

random.seed(14)
stop_words = stopwords(["zh"])

def synonym_replacement(words, n):
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stop_words]))     
    random.shuffle(random_word_list)
    num_replaced = 0  
    for random_word in random_word_list:          
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(synonyms)   
            new_words = [synonym if word == random_word else word for word in new_words]   
            num_replaced += 1
        if num_replaced >= n: 
            break

    sentence = ' '.join(new_words)
    new_words = sentence.split(' ')

    return new_words

def get_synonyms(word):
    return synonyms.nearby(word)[0]

def random_insertion(words, n):
    new_words = words.copy()
    for _ in range(n):
        add_word(new_words)
    return new_words

def add_word(new_words):
    synonyms = []
    counter = 0    
    while len(synonyms) < 1:
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = get_synonyms(random_word)
        counter += 1
        if counter >= 10:
            return
    random_synonym = random.choice(synonyms)
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)

def random_swap(words, n):
    new_words = words.copy()
    for _ in range(n):
        new_words = swap_word(new_words)
    return new_words

def swap_word(new_words):
    random_idx_1 = random.randint(0, len(new_words)-1)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words)-1)
        counter += 1
        if counter > 3:
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1] 
    return new_words

def random_deletion(words, p):
    if len(words) == 1:
        return words

    new_words = []
    for word in words:
        r = random.uniform(0, 1)
        if r > p:
            new_words.append(word)

    if len(new_words) == 0:
        rand_int = random.randint(0, len(words)-1)
        return [words[rand_int]]

    return new_words

# augmentation
def augmentation(sentence, alpha_sr=0.3, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    if len(sentence) <=4:
        alpha_sr = 0.7
    seg_list = jieba.cut(sentence)
    seg_list = ' '.join(seg_list)
    words = list(seg_list.split())
    num_words = len(words)

    augmented_sentences = []
    num_new_per_technique = int(num_aug/4)+1
    n_sr = max(1, int(alpha_sr * num_words))
    n_ri = max(1, int(alpha_ri * num_words))
    n_rs = max(1, int(alpha_rs * num_words))
    
    for _ in range(num_new_per_technique):
        a_words = synonym_replacement(words, n_sr)
        augmented_sentences.append(''.join(a_words))

    for _ in range(num_new_per_technique):
        a_words = random_insertion(words, n_ri)
        augmented_sentences.append(''.join(a_words))

    for _ in range(num_new_per_technique):
        a_words = random_swap(words, n_rs)
        augmented_sentences.append(''.join(a_words))

    for _ in range(num_new_per_technique):
        a_words = random_deletion(words, p_rd)
        augmented_sentences.append(''.join(a_words))
    
    shuffle(augmented_sentences)

    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    augmented_sentences.append(sentence)

    return augmented_sentences

In [56]:
print(augmentation(sentence="皮克斯的想象力一绝，真的是极度治愈的一部电影，缓解了最近的执念和焦虑。"))

['皮克斯的想象力一绝，真的是极度治愈的一部电影，缓解了最近的执念和焦虑。', '皮克斯的缓解一绝，最近是极度治愈的一部电影，想象力了真的的执念和焦虑。', '皮克斯的想象力一绝，真的消除是极度或者说治愈的一部电影，缓解了最近的执念和焦虑。', '皮克斯的想象力，真的是极度的一部电影缓解了最近的执念和焦虑。', '皮克斯的想象力缓解，真的是极度治愈的一部电影，一绝了最近和执念的焦虑。', '孩之宝的幽默感一绝，或许是极度治愈的第一部电影，舒缓了近来的执念和焦虑。', '皮克斯的想象力一绝，真的是极度治愈了一部电影执念缓解的最近的，和焦虑。', '第四部的想象力ZR19，也许是极度治愈的一部电影，减轻了不久前的本该和焦虑。', '皮克斯的想象力一绝，是极度治愈的一部电影，了最近的执念和焦虑。', '皮克斯的想象力一绝，真的是极度治愈的一部电影，缓解了最近的执念和焦虑。']


In [57]:
synonyms.display("梦想")

'梦想'近义词：
  1. 梦想:1.0
  2. 心愿:0.7027635
  3. 梦:0.67332053
  4. 快乐:0.66758704
  5. 愿望:0.65512615
  6. 青春:0.6494938
  7. 追梦:0.64306444
  8. 夙愿:0.6325592
  9. 雄心壮志:0.6264964
  10. 梦想成真:0.6178243


In [58]:
synonyms.display("青春")

'青春'近义词：
  1. 青春:1.0
  2. 热血:0.7377811
  3. 励志:0.7299028
  4. 纯情:0.67202914
  5. 纯真:0.6693155
  6. 美少女:0.65399474
  7. 梦幻:0.63923883
  8. 光阴:0.63336986
  9. 纯爱:0.6236114
  10. 花样年华:0.6167907


In [None]:
import pandas as pd
def gen_aug_pd(input_file, output_file, num_aug=3):
    print("creating augmentated sentences...")
    df = pd.read_csv(input_file)

    df['short_comment'] = df['short_comment'].apply(lambda x: augmentation(x, num_aug=num_aug))

    df = df.explode('short_comment').reset_index(drop=True)

    df.to_csv(output_file, index=False)

    print("Created augmentated sentences.")

gen_aug_pd("finalized_reviews.csv", "augmented.csv")