In [1]:
!pip install tokenizers



In [2]:
import os
import pandas as pd
import csv

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# using LADEC dataset: The Large Database of English Compounds
LADEC_csv = pd.read_csv('/content/drive/MyDrive/Colab_CS/Tesser/data/LADECv1-2019.csv')
LADEC_compound = LADEC_csv['stim'].values.tolist()
LADEC_n1 = LADEC_csv['c1'].values.tolist()
LADEC_n2 = LADEC_csv['c2'].values.tolist()

LADEC_data = []
LADEC_data = LADEC_compound + LADEC_n1 + LADEC_n2 # 더 많은 데이터 셋을 사용하기 위해 복합/개별 단어를 합침
LADEC_data = list(filter(None,set(LADEC_data))) # 중복 단어 / None 제거

print(LADEC_data[:5])

['mouthbreeder', 'play', 'chap', 'beadwork', 'bowmen']


In [4]:
train_csv = pd.read_csv('/content/drive/MyDrive/Colab_CS/Tesser/data/ml_6_spacing_train.csv')
train_csv = train_csv.where(pd.notnull(train_csv), None) # NaN to None

train_compound = train_csv['compound'].values.tolist()
train_n1 = train_csv['n1'].values.tolist()
train_n2 = train_csv['n2'].values.tolist()
train_n3 = train_csv['n3'].values.tolist()

train_data = []
train_data = train_compound + train_n1 + train_n2 + train_n3 # 더 많은 데이터 셋을 사용하기 위해 복합/개별 단어를 합침
train_data = list(filter(None,set(train_data))) # 중복 단어 / None 제거

print(train_data[10:15])

['jitterbug', 'buyback', 'mile', 'hogtie', 'skate']


In [5]:
# merge LADEC datasets + train datasets
all_train_data = train_data + LADEC_data

len(all_train_data)

14752

In [6]:
with open('/content/drive/MyDrive/Colab_CS/Tesser/data/train_text.txt', 'w', encoding='utf-8') as f:
    for line in all_train_data:
        f.write(line+'\n')

In [7]:
from tokenizers import ByteLevelBPETokenizer  # 데이터 셋이 문장이 아닌 단어이므로 Byte Level BPE 알고리즘 선택

tokenizer = ByteLevelBPETokenizer()

corpus_file   = ['/content/drive/MyDrive/Colab_CS/Tesser/data/train_text.txt']  # data path
vocab_size    = 4000  # 만들고자하는 vocab의 size
min_frequency = 2  # merge를 수행할 최소 빈도수

# Train
tokenizer.train(files=corpus_file,
               vocab_size=vocab_size,
               min_frequency=min_frequency,
               show_progress=True)
print('train complete')


# sample word
word = 'copywriter'
output = tokenizer.encode(word)
print(f'word : {word}')
print('=>idx   : %s'%output.ids)
print('=>tokens: %s'%output.tokens)
print('=>offset: %s'%output.offsets)
print('=>decode: %s\n'%tokenizer.decode(output.ids))

word = 'showman'
output = tokenizer.encode(word)
print(f'word : {word}')
print('=>idx   : %s'%output.ids)
print('=>tokens: %s'%output.tokens)
print('=>offset: %s'%output.offsets)
print('=>decode: %s\n'%tokenizer.decode(output.ids))

# save tokenizer
hf_model_path='/content/drive/MyDrive/Colab_CS/Tesser/tokenizer_model'
if not os.path.isdir(hf_model_path):
    os.mkdir(hf_model_path)
tokenizer.save_model(hf_model_path)

train complete
word : copywriter
=>idx   : [982, 1048]
=>tokens: ['copy', 'writer']
=>offset: [(0, 4), (4, 10)]
=>decode: copywriter

word : showman
=>idx   : [754, 299]
=>tokens: ['show', 'man']
=>offset: [(0, 4), (4, 7)]
=>decode: showman



['/content/drive/MyDrive/Colab_CS/Tesser/tokenizer_model/vocab.json',
 '/content/drive/MyDrive/Colab_CS/Tesser/tokenizer_model/merges.txt']

In [8]:
result = [tokenizer.encode(token).tokens[:3] for token in train_compound]  

In [10]:
print(len(result))
print(train_compound[:5])
print(result[:5])

1500
['humbug', 'pastureland', 'alleyway', 'earmark', 'roundup']
[['hum', 'bug'], ['pasture', 'land'], ['alley', 'way'], ['ear', 'mark'], ['round', 'up']]


In [11]:
# compound word + pred data

compound_col = pd.DataFrame(train_compound)
compound_col.columns = ['compound']

eval_train = pd.DataFrame(result)
eval_train.columns = ['n1','n2','n3']

eval_data = pd.concat([compound_col, eval_train], axis=1)

In [12]:
eval_data.to_csv('/content/drive/MyDrive/Colab_CS/Tesser/data/train_eval.csv', index=False)

In [13]:
train_csv.head()

Unnamed: 0,compound,n1,n2,n3
0,humbug,hum,bug,
1,pastureland,pasture,land,
2,alleyway,alley,way,
3,earmark,ear,mark,
4,roundup,round,up,


In [14]:
eval_data.head()

Unnamed: 0,compound,n1,n2,n3
0,humbug,hum,bug,
1,pastureland,pasture,land,
2,alleyway,alley,way,
3,earmark,ear,mark,
4,roundup,round,up,


In [15]:
# train acc check
correct = 0
for i in range(len(train_csv)):
    if train_csv.values[i][1] == eval_data.values[i][1] and train_csv.values[i][2] == eval_data.values[i][2] and train_csv.values[i][3] == eval_data.values[i][3]:
        correct += 1
score = (correct / len(train_csv)) * 100
print(f"train acc : {round(score,2)}%")

train acc : 79.13%


In [16]:
test_csv = pd.read_csv('/content/drive/MyDrive/Colab_CS/Tesser/data/ml_6_spacing_test.csv')
test_csv = test_csv.where(pd.notnull(test_csv), None) # NaN to None
test_data = []

test_compound = test_csv['compound'].values.tolist()
test_data = test_compound
test_data = list(filter(None,set(test_data))) # 중복 단어 / None 제거

print(test_data[10:15])

['shoelace', 'supercollider', 'goalkeeper', 'showcase', 'checkup']


In [17]:
# evaluation

result = [tokenizer.encode(token).tokens[:3] for token in test_compound]  # 500개의 테스트 데이터에 학습한 모델 적용 

print(len(result))
print(test_compound[:5])
print(result[:5])

# compound word + pred data

compound_col = pd.DataFrame(test_compound)
compound_col.columns = ['compound']

eval_test = pd.DataFrame(result)
eval_test.columns = ['n1','n2','n3']

eval_data = pd.concat([compound_col, eval_test], axis=1)

eval_data.to_csv('/content/drive/MyDrive/Colab_CS/Tesser/data/220314_evaluation_test.csv', index=False)

eval_data.head()

500
['longwinded', 'copywriter', 'sportswriter', 'heartbroken', 'showman']
[['long', 'wind', 'ed'], ['copy', 'writer'], ['sports', 'writer'], ['heart', 'bro', 'ken'], ['show', 'man']]


Unnamed: 0,compound,n1,n2,n3
0,longwinded,long,wind,ed
1,copywriter,copy,writer,
2,sportswriter,sports,writer,
3,heartbroken,heart,bro,ken
4,showman,show,man,


In [18]:
%%shell
jupyter nbconvert --to html /content/drive/MyDrive/Colab_CS/Tesser/220314_LADEC_Tesser.ipynb

[NbConvertApp] Converting notebook /content/drive/MyDrive/Colab_CS/Tesser/220314_LADEC_Tesser.ipynb to html
[NbConvertApp] Writing 317926 bytes to /content/drive/MyDrive/Colab_CS/Tesser/220314_LADEC_Tesser.html


