In [131]:
import nltk
nltk.download('wordnet')
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from nltk.corpus import wordnet, stopwords 
import random
from random import shuffle
import re
import string
from transformers import AdamW, AutoModel, AutoTokenizer

random.seed(42)

[nltk_data] Downloading package wordnet to /home/toshiya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 実験

In [12]:
root = "/home/toshiya/Workspace/learning/signate/SIGNATE_Student_Cup_2020/"

In [41]:
def preprocessing(line):
    clean_line = ""

    line = line.replace("’", "")
    line = line.replace("'", "")
    line = line.replace("-", " ") #replace hyphens with spaces
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.lower()

    for char in line:
        if char in 'qwertyuiopasdfghjklzxcvbnm ':
            clean_line += char
        else:
            clean_line += ' '

    clean_line = re.sub(' +',' ',clean_line) #delete extra spaces
    if clean_line[0] == ' ':
        clean_line = clean_line[1:]
    return clean_line.split(' ')

In [54]:
def synonym_replacement(words, n):
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    replaced_dict = {}
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            replaced_dict[random_word] = synonym
#             new_words = [synonym if word == random_word else word for word in new_words]
            print("replaced", random_word, "with", synonym)
            num_replaced += 1
        if num_replaced >= n: #only replace up to n words
            break

    #this is stupid but we need it, trust me
#     sentence = ' '.join(new_words)
#     new_words = sentence.split(' ')
#     return new_words
    return replaced_dict

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word): 
        for l in syn.lemmas(): 
            synonym = l.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
            synonyms.add(synonym) 
    if word in synonyms:
        synonyms.remove(word)
    return list(synonyms)

In [57]:
text = "Executes and writes portions of testing plans, protocols, and documentation for assigned portion of application; identifies and debugs issues with code and suggests changes or improvements."
text = preprocessing(text)
text

['executes',
 'and',
 'writes',
 'portions',
 'of',
 'testing',
 'plans',
 'protocols',
 'and',
 'documentation',
 'for',
 'assigned',
 'portion',
 'of',
 'application',
 'identifies',
 'and',
 'debugs',
 'issues',
 'with',
 'code',
 'and',
 'suggests',
 'changes',
 'or',
 'improvements',
 '']

In [58]:
synonym_replacement(text, 2)

replaced testing with examination
replaced application with application program


{'testing': 'examination', 'application': 'application program'}

In [37]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [40]:
text = "Executes and writes portions of testing plans, protocols, ; or improvements."
encoded = tokenizer.encode(text)
tokenizer.decode(encoded)

'[CLS] executes and writes portions of testing plans, protocols, ; or improvements. [SEP]'

# Augmentation

- n個の単語を類似語で置換 synonym_replacement
    - input: ~~synonym_replacementのための前処理を施したテキスト~~ 生テキスト
    - output: dict(key=置換前の単語, value=置換後の単語)
- text_augmentation
    - synonym_replacementから得たdictに従って単語を置換し同じラベルをつけて結合
    - input: lower処理だけを行ったテキスト
    - output: 置換したテキスト（データフレーム）

In [71]:
def synonym_replace(line, n):
    clean_line = ""

    stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 
            'ours', 'ourselves', 'you', 'your', 'yours', 
            'yourself', 'yourselves', 'he', 'him', 'his', 
            'himself', 'she', 'her', 'hers', 'herself', 
            'it', 'its', 'itself', 'they', 'them', 'their', 
            'theirs', 'themselves', 'what', 'which', 'who', 
            'whom', 'this', 'that', 'these', 'those', 'am', 
            'is', 'are', 'was', 'were', 'be', 'been', 'being', 
            'have', 'has', 'had', 'having', 'do', 'does', 'did',
            'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
            'because', 'as', 'until', 'while', 'of', 'at', 
            'by', 'for', 'with', 'about', 'against', 'between',
            'into', 'through', 'during', 'before', 'after', 
            'above', 'below', 'to', 'from', 'up', 'down', 'in',
            'out', 'on', 'off', 'over', 'under', 'again', 
            'further', 'then', 'once', 'here', 'there', 'when', 
            'where', 'why', 'how', 'all', 'any', 'both', 'each', 
            'few', 'more', 'most', 'other', 'some', 'such', 'no', 
            'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 
            'very', 's', 't', 'can', 'will', 'just', 'don', 
            'should', 'now', '']

    line = line.replace("’", "")
    line = line.replace("'", "")
    line = line.replace("-", " ") #replace hyphens with spaces
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.lower()

    for char in line:
        if char in 'qwertyuiopasdfghjklzxcvbnm ':
            clean_line += char
        else:
            clean_line += ' '

    clean_line = re.sub(' +',' ',clean_line) #delete extra spaces
    if clean_line[0] == ' ':
        clean_line = clean_line[1:]

    words = clean_line.split(' ')
    
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    replaced_dict = {}
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            replaced_dict[random_word] = synonym
            num_replaced += 1
        if num_replaced >= n: #only replace up to n words
            break
    return replaced_dict

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word): 
        for l in syn.lemmas(): 
            synonym = l.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
            synonyms.add(synonym) 
    if word in synonyms:
        synonyms.remove(word)
    return list(synonyms)

In [72]:
text = "Executes and writes portions of testing plans, protocols, and documentation for assigned portion of application; identifies and debugs issues with code and suggests changes or improvements."
synonym_replace(text, n=3)

{'improvements': 'improvement',
 'executes': 'carry through',
 'writes': 'drop a line'}

In [81]:
def text_augmentation(text, replaced_dict):
    text = text.lower()
    words = text.split(' ')
    new_words = words.copy()
    
    for target_w, replace_w in replaced_dict.items():
        for i, w in enumerate(words):
            if target_w == w:
                new_words[i] = replace_w
    new_text = ''
    for nw in new_words:
        new_text += nw + ' '
    return new_text[:-1]

In [85]:
text = "Executes and writes portions of testing plans, protocols, and documentation for assigned portion of application; identifies and debugs issues with code and suggests changes or improvements."
replaced_dict = synonym_replace(text, n=3)
text_augmentation(text, replaced_dict)

'executes and writes portions of testing plans, protocols, and software documentation for assigned portion of application; identifies and debug issues with code and suggests changes or improvements.'

# 実行

In [123]:
root = "/home/toshiya/Workspace/learning/signate/SIGNATE_Student_Cup_2020/"

In [124]:
train = pd.read_csv(os.path.join(root, "data", "train.csv"))
train

Unnamed: 0,id,description,jobflag
0,0,"Executes and writes portions of testing plans,...",2
1,1,Maintain Network Performance by assisting with...,3
2,2,Supports the regional compliance manager with ...,4
3,3,Keep up to date with local and national busine...,1
4,4,Assist with Service Organization Control (SOC)...,4
...,...,...,...
2926,2926,Preparation of reports for operational and man...,3
2927,2927,Line and/or indirect management of up to 20 st...,3
2928,2928,Partner with external agencies as needed,1
2929,2929,"Design, Implement and test software for embedd...",3


In [125]:
train.groupby(by='jobflag').count()

Unnamed: 0_level_0,id,description
jobflag,Unnamed: 1_level_1,Unnamed: 2_level_1
1,624,624
2,348,348
3,1376,1376
4,583,583


- 4クラスの分布から以下2パターンのAugmentationを考えた
    - 1,2,4を増やして均一化
    - 1,2を3と同じくらい増やして、4は3の2倍にする

In [126]:
def class_augment(target_job, aug_num):
    """
    target_jobのラベルのデータをaug_num個まで増やす
    """
    df = train[train["jobflag"] == target_job]
    augmented_df = df.loc[:, ["description", "jobflag"]].copy()
    
    while len(augmented_df) < aug_num:
        for d, j in zip(df["description"], df["jobflag"]):
            if len(augmented_df) < aug_num:
                replaced_dict = synonym_replace(d, n=3)
                new_d = text_augmentation(d, replaced_dict)
                tmp = pd.DataFrame({
                    'description': [new_d],
                    'jobflag': [j]
                })
                augmented_df = pd.concat([augmented_df, tmp])
            else:
                break
                
    return augmented_df

## 均一化

In [127]:
# 均一化
auged_1 = class_augment(1, aug_num=1376)
auged_2 = class_augment(2, aug_num=1376)
auged_4 = class_augment(4, aug_num=1376)

In [128]:
auged_df = pd.DataFrame()
for a in [auged_1, auged_2, train[train["jobflag"]==3], auged_4]:
    auged_df = pd.concat([auged_df, a])
auged_df

Unnamed: 0,description,jobflag,id
3,Keep up to date with local and national busine...,1,
7,Must be able to work independently in a small ...,1,
8,Work with stakeholders to identify opportuniti...,1,
24,Undertake preprocessing of structured and unst...,1,
27,Work with cross-functional teams to develop id...,1,
...,...,...,...
0,get and wield cryptic product knowledge and sk...,4,
0,day to day administration/development of sytel...,4,
0,direction of project alter petition and scope ...,4,
0,"body of work with the business stakeholders, o...",4,


In [129]:
# IDを振り直す
auged_df["id"] = list(range(len(auged_df)))
auged_df

Unnamed: 0,description,jobflag,id
3,Keep up to date with local and national busine...,1,0
7,Must be able to work independently in a small ...,1,1
8,Work with stakeholders to identify opportuniti...,1,2
24,Undertake preprocessing of structured and unst...,1,3
27,Work with cross-functional teams to develop id...,1,4
...,...,...,...
0,get and wield cryptic product knowledge and sk...,4,5499
0,day to day administration/development of sytel...,4,5500
0,direction of project alter petition and scope ...,4,5501
0,"body of work with the business stakeholders, o...",4,5502


In [130]:
# 保存
auged_df.to_csv(os.path.join(root, "processed_data", "02_train_augmented_balanced.csv"), index=False)

## テストデータの推定分布に合わせる

In [135]:
nums = np.array([404, 320, 345, 674])
ratio = nums / np.sum(nums)
ratio = ratio / ratio[2]
augment_num = ratio * 1376
augment_num

array([1611.31594203, 1276.28985507, 1376.        , 2688.18550725])

In [136]:
auged_1 = class_augment(1, aug_num=1611)
auged_2 = class_augment(2, aug_num=1276)
auged_4 = class_augment(4, aug_num=2688)

auged_df = pd.DataFrame()
for a in [auged_1, auged_2, train[train["jobflag"]==3], auged_4]:
    auged_df = pd.concat([auged_df, a])

auged_df["id"] = list(range(len(auged_df)))

In [139]:
auged_df.to_csv(os.path.join(root, "processed_data", "02_train_augmented_imbalanced.csv"), index=False)