<a href="https://colab.research.google.com/github/tchih11/qiita_eda/blob/main/notebooks/02_make_eda_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Clone the project repository and switch the working directory into it.
# NOTE(review): `{user_name}` is IPython variable interpolation, but no visible cell
# defines `user_name` -- confirm whether credentials are needed for this clone at all.
!git clone https://{user_name}:@github.com/tchih11/qiita_eda.git
%cd /content/qiita_eda

Cloning into 'qiita_eda'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 18 (delta 3), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (18/18), done.
/content/qiita_eda


# 各種インストール

In [3]:
# MeCabをインストール
%%capture 
!pip install mecab-python3
!pip install ipadic
!pip install swifter

import gzip
import os
import random
import shutil
import sqlite3
from math import ceil

import ipadic
import MeCab
import pandas as pd
import swifter
from tqdm import tqdm

# 日本語WordNet、stop word の設定

In [4]:
# 日本語wordnetをDLして解凍
! wget "http://compling.hss.ntu.edu.sg/wnja/data/1.1/wnjpn.db.gz"  # 1~2分

with gzip.open('wnjpn.db.gz', 'rb') as f_in:
    with open('wnjpn.db', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# synset(概念ID)とlemma(単語)の組み合わせDataFrameの作成
conn = sqlite3.connect("wnjpn.db")
q = 'SELECT synset,lemma FROM sense,word USING (wordid) WHERE sense.lang="jpn"'
sense_word = pd.read_sql(q, conn)

--2021-02-08 03:52:46--  http://compling.hss.ntu.edu.sg/wnja/data/1.1/wnjpn.db.gz
Resolving compling.hss.ntu.edu.sg (compling.hss.ntu.edu.sg)... 155.69.255.27
Connecting to compling.hss.ntu.edu.sg (compling.hss.ntu.edu.sg)|155.69.255.27|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 60390049 (58M) [application/x-gzip]
Saving to: ‘wnjpn.db.gz’


2021-02-08 03:54:02 (788 KB/s) - ‘wnjpn.db.gz’ saved [60390049/60390049]



In [5]:
# Download the SlothLib Japanese stop-word file and load its first column as a list.
! wget "http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt"
stop_words = pd.read_csv("Japanese.txt",header=None)[0].to_list()        

--2021-02-08 03:54:04--  http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt
Resolving svn.sourceforge.jp (svn.sourceforge.jp)... 44.240.209.230
Connecting to svn.sourceforge.jp (svn.sourceforge.jp)|44.240.209.230|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2202 (2.2K) [text/plain]
Saving to: ‘Japanese.txt’


2021-02-08 03:54:04 (237 MB/s) - ‘Japanese.txt’ saved [2202/2202]



# 関数の定義

In [6]:
def get_synonyms(word):
    """Return the synonyms of ``word`` found in the module-level ``sense_word`` table.

    Args:
        word: Lemma to look up in the ``lemma`` column of ``sense_word``.

    Returns:
        A list of the distinct lemmas that share at least one synset with ``word``,
        excluding ``word`` itself. The list is empty when the word is unknown.
        The order is whatever set iteration yields.
    """
    # Synsets (concept IDs) in which the word appears.
    matching_synsets = sense_word.loc[sense_word.lemma == word, "synset"]

    # Every lemma attached to any of those synsets, de-duplicated.
    shares_synset = sense_word.synset.isin(matching_synsets)
    candidates = set(sense_word.loc[shares_synset, "lemma"])

    # A word is not reported as its own synonym.
    candidates.discard(word)

    return list(candidates)

def wakati_text(text, hinshi=['名詞', '動詞']):
    """Tokenize ``text`` with MeCab and derive the words eligible for synonym lookup.

    Args:
        text: Sentence to tokenize.
        hinshi: Part-of-speech labels to keep; compared with the first
            comma-separated feature field of each token.

    Returns:
        A ``(raw_words, original_words)`` pair of equally long lists.
        ``raw_words`` holds the surface form of every token. ``original_words``
        holds, at the same position, the seventh feature field (the base form)
        when the token's part of speech is in ``hinshi`` and the value is not in
        the module-level ``stop_words``; otherwise an empty string.
    """
    tagger = MeCab.Tagger(ipadic.MECAB_ARGS)
    parsed = tagger.parse(text)

    # Each line is "<surface>\t<features>". The last two pieces of the split are
    # dropped -- presumably the EOS marker and the trailing empty string.
    rows = [line.split("\t") for line in parsed.split("\n")][:-2]

    raw_words = []
    original_words = []
    for row in rows:
        # Surface form, exactly as it appears in the text.
        raw_words.append(row[0])

        # Base form for the synonym search, limited to the requested parts of speech.
        features = row[1].split(",")
        base_form = features[6] if features[0] in hinshi else ""
        if base_form in stop_words:
            base_form = ""
        original_words.append(base_form)

    return raw_words, original_words


def synonym_replacement(raw_words, original_words, n):
    """Replace up to ``n`` distinct words of a tokenized sentence with synonyms.

    Args:
        raw_words: Surface-form tokens of the sentence, in order.
        original_words: List aligned with ``raw_words``. A non-empty entry is the
            word passed to ``get_synonyms``; an empty string marks a token that is
            never replaced.
        n: Number of successful replacements after which the search stops.

    Returns:
        A new token list; ``raw_words`` itself is not modified. Every occurrence
        of a chosen surface form is replaced by the same synonym.
    """
    new_words = raw_words.copy()

    # Visit the replaceable positions in random order.
    candidate_idx = [i for i, word in enumerate(original_words) if word != ""]
    random.shuffle(candidate_idx)

    num_replaced = 0
    # Surface forms already substituted. Without this, a word occurring several
    # times would be counted as several replacements although the later ones
    # change nothing, so fewer than ``n`` distinct words would be replaced.
    replaced_surfaces = set()
    for idx in candidate_idx:
        raw_word = raw_words[idx]
        if raw_word in replaced_surfaces:
            continue

        synonyms = get_synonyms(original_words[idx])
        if synonyms:
            synonym = random.choice(synonyms)
            # Match against the original tokens so that a synonym inserted earlier
            # is never mistaken for a word that still needs replacing.
            new_words = [
                synonym if source == raw_word else current
                for source, current in zip(raw_words, new_words)
            ]
            replaced_surfaces.add(raw_word)
            num_replaced += 1
        if num_replaced >= n:
            break

    return new_words

def random_insertion(raw_words, original_words, n):
    """Return a copy of ``raw_words`` after ``n`` synonym-insertion attempts.

    Args:
        raw_words: Surface-form tokens of the sentence.
        original_words: Words handed to ``add_word`` as the synonym source.
        n: Number of times ``add_word`` is invoked.

    Returns:
        A new token list; ``raw_words`` is left unchanged.
    """
    augmented = raw_words.copy()
    for _attempt in range(n):
        # add_word modifies the list it receives.
        add_word(augmented, original_words)
    return augmented

def add_word(new_words, original_words):
    """Insert one synonym of a randomly chosen word into ``new_words`` in place.

    Args:
        new_words: Token list to modify.
        original_words: Candidate source words; empty strings are ignored.

    Returns:
        None. ``new_words`` is left untouched when there is no candidate word or
        when none of up to 10 randomly drawn candidates has a synonym.
    """
    candidates = [word for word in original_words if word]
    # Without candidates there is nothing to look up (and randint(0, -1) would raise).
    if not candidates:
        return

    # Up to 10 draws; a synonym list found on the last draw is still used.
    for _ in range(10):
        synonyms = get_synonyms(random.choice(candidates))
        if synonyms:
            break
    else:
        return

    # Pick the synonym at random instead of always taking the first list entry.
    random_synonym = random.choice(synonyms)
    # Same position range as before (never after the last token), but an empty
    # token list falls back to position 0 instead of raising.
    random_idx = random.randint(0, max(len(new_words) - 1, 0))
    new_words.insert(random_idx, random_synonym)


def random_deletion(words, p):
    """Drop each token of ``words`` independently with probability ``p``.

    Args:
        words: Token list of the sentence.
        p: Deletion probability applied to every token.

    Returns:
        ``words`` itself when it has at most one token (nothing to delete);
        otherwise a new list with the surviving tokens in order. If every token
        was deleted, a one-element list holding a randomly chosen token.
    """
    # Nothing can be deleted from an empty or single-token sentence. Treating the
    # empty case here also avoids randint(0, -1) raising further down.
    if len(words) <= 1:
        return words

    # Keep a token whenever its uniform draw exceeds p.
    new_words = [word for word in words if random.uniform(0, 1) > p]

    # If everything was deleted, fall back to a single random token.
    if not new_words:
        return [random.choice(words)]

    return new_words

def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` calls to ``swap_word``.

    Args:
        words: Token list of the sentence.
        n: Number of swap attempts.

    Returns:
        A new token list; ``words`` is left unchanged.
    """
    swapped = words.copy()
    for _attempt in range(n):
        # swap_word exchanges elements of the list it is given, so its return
        # value does not need to be kept.
        swap_word(swapped)

    return swapped

def swap_word(new_words):
    """Swap two tokens at distinct random positions of ``new_words`` in place.

    Args:
        new_words: Token list to modify.

    Returns:
        The same list object. Lists with fewer than two tokens are returned
        unchanged, because there is no pair of positions to exchange.
    """
    # Empty input used to raise in randint(0, -1); a single token cannot be swapped.
    if len(new_words) < 2:
        return new_words

    # Draw two different indices directly instead of retrying until they differ,
    # which could give up (or discard a valid final draw) without swapping.
    idx_1, idx_2 = random.sample(range(len(new_words)), 2)
    new_words[idx_1], new_words[idx_2] = new_words[idx_2], new_words[idx_1]

    return new_words

def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    """Generate augmented variants of ``sentence`` with the four EDA operations.

    Args:
        sentence: Text to augment.
        alpha_sr: Fraction of tokens to replace with synonyms (disabled when <= 0).
        alpha_ri: Fraction of tokens that sets the number of synonym insertions
            (disabled when <= 0).
        alpha_rs: Fraction of tokens that sets the number of random swaps
            (disabled when <= 0).
        p_rd: Per-token deletion probability (disabled when <= 0).
        num_aug: Number of augmented sentences to keep.

    Returns:
        A list with at most ``num_aug`` augmented sentences in random order,
        followed by the original ``sentence`` as the last element. Returns
        ``None`` when every technique is disabled.
    """
    # Count the enabled techniques with the same ``> 0`` test that gates each
    # technique below. ceil(rate) over-counted rates above 1, which could leave
    # fewer than ``num_aug`` generated sentences.
    techniques = sum(rate > 0 for rate in (alpha_sr, alpha_ri, alpha_rs, p_rd))
    if techniques == 0:
        return

    # Tokenize the sentence.
    raw_words, original_words = wakati_text(sentence)
    num_words = len(raw_words)

    augmented_sentences = []
    # One extra per technique so that at least num_aug candidates exist.
    num_new_per_technique = int(num_aug/techniques)+1

    # Synonym replacement: replace n randomly chosen words with synonyms.
    if (alpha_sr > 0):
        n_sr = max(1, int(alpha_sr*num_words))
        for _ in range(num_new_per_technique):
            a_words = synonym_replacement(raw_words, original_words, n_sr)
            augmented_sentences.append(''.join(a_words))

    # Random insertion: insert n synonyms of words occurring in the sentence.
    if (alpha_ri > 0):
        n_ri = max(1, int(alpha_ri*num_words))
        for _ in range(num_new_per_technique):
            a_words = random_insertion(raw_words, original_words, n_ri)
            augmented_sentences.append(''.join(a_words))

    # Random swap: exchange the positions of two words, n times.
    if (alpha_rs > 0):
        n_rs = max(1, int(alpha_rs*num_words))
        for _ in range(num_new_per_technique):
            a_words = random_swap(raw_words, n_rs)
            augmented_sentences.append(''.join(a_words))

    # Random deletion: drop each word with probability p_rd.
    if (p_rd > 0):
        for _ in range(num_new_per_technique):
            a_words = random_deletion(raw_words, p_rd)
            augmented_sentences.append(''.join(a_words))

    # Keep a random selection of the requested size.
    random.shuffle(augmented_sentences)
    augmented_sentences = augmented_sentences[:num_aug]

    # The original sentence always goes last.
    augmented_sentences.append(sentence)

    return augmented_sentences

# EDA（Easy Data Augmentation）の実行


In [15]:
def make_eda_datasets(sample_n, num_aug, alpha, save_dir=None):
    """Sample the training data, augment every text with ``eda`` and return the result.

    Args:
        sample_n: Number of rows sampled from the source data (fixed random_state).
        num_aug: Number of augmented sentences requested per text.
        alpha: Value used for all four ``eda`` rates
            (``alpha_sr``, ``alpha_ri``, ``alpha_rs``, ``p_rd``).
        save_dir: Optional directory; when given, the result is also written
            there as a gzip-compressed pickle.

    Returns:
        A DataFrame with one row per sentence and the columns ``text``,
        ``label_index``, ``text_id`` (position of the source text in the sample),
        ``aug_id`` (running number within a text) and ``raw_flg`` (1 where
        ``aug_id == num_aug``, the slot in which ``eda`` places the original text).
    """
    # Source data. NOTE(review): read_pickle can execute arbitrary code; only
    # load files from a trusted location.
    train_eval = pd.read_pickle("./data/train_eval_df.pkl")
    train_eval.columns = ["text", "label_index"]

    # Sample the requested number of rows. reset_index returns a new frame, so its
    # result must be kept; previously it was discarded and the call had no effect.
    train_eval_sampled = train_eval.sample(n=sample_n, random_state=0)
    train_eval_sampled = train_eval_sampled.reset_index(drop=True)

    # Run EDA on every text.
    train_eval_sampled["text"] = train_eval_sampled.text.swifter.apply(lambda x: eda(
        x, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=num_aug))
    # Each cell now holds a list of sentences; expand it to one row per sentence.
    train_eval_sampled = train_eval_sampled.explode("text")

    # Assign ids: explode repeats the index of the source row, so it identifies the text.
    train_eval_sampled["text_id"] = train_eval_sampled.index
    train_eval_sampled["aug_id"] = train_eval_sampled.groupby("text_id").cumcount()

    # The un-augmented original is the last entry of each text_id group; flag it.
    train_eval_sampled["raw_flg"] = (train_eval_sampled.aug_id == num_aug)*1
    # Replace the repeated post-explode index with a unique one (result kept this time).
    train_eval_sampled = train_eval_sampled.reset_index(drop=True)

    # Save as a pickle when an output directory is given.
    if save_dir:
        train_eval_sampled.to_pickle(
            f"{save_dir}/train_eval_eda_{sample_n}_{int(alpha*100)}_{num_aug}_gzip.pkl", compression="gzip")

    return train_eval_sampled

In [16]:
%%time
# Directory that receives the gzip-compressed pickle written by make_eda_datasets.
save_dir = "/content/qiita_eda/data"

# 2,000 sampled texts, alpha=0.05, 16 augmented sentences per text.
# The recorded run of this cell took roughly 5.5 hours of wall time.
eda_2000_5_16 = make_eda_datasets(sample_n=2000,alpha=0.05,num_aug=16,save_dir=save_dir)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=2000.0, style=ProgressStyle(descriptio…


CPU times: user 5h 30min 3s, sys: 10.5 s, total: 5h 30min 13s
Wall time: 5h 31min


In [None]:
%%time
eda_2000_10_16 = make_eda_tsv_datasets(sample_n=2000,alpha=0.1,num_aug=16,save_dir=save_dir)