# [20250328] Do it NLP (~chapter2)

In [1]:
# Mount Google Drive inside the Colab VM; later cells save the trained
# tokenizer files under /content/drive/MyDrive/nlpbook.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install Korpora



In [3]:
from Korpora import Korpora

# Load the NSMC (Naver sentiment movie corpus) dataset through Korpora.
# force_download=True asks Korpora to fetch the files again instead of reusing
# a previously downloaded copy (the download progress appears in the output).
nsmc = Korpora.load('nsmc', force_download=True)


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/



[nsmc] download ratings_train.txt: 14.6MB [00:00, 128MB/s]                            
[nsmc] download ratings_test.txt: 4.90MB [00:00, 81.1MB/s]


In [4]:
import os

def write_lines(path, lines):
  """Write every item of `lines` to `path` as UTF-8 text, one item per line.

  Args:
    path: Destination file path; an existing file is overwritten.
    lines: Iterable of items; each is formatted with an f-string and
      followed by a newline character.
  """
  with open(path, 'w', encoding='utf-8') as out_file:
    out_file.writelines(f'{text}\n' for text in lines)

# Dump the texts of each NSMC split to a local file (each text followed by a
# newline); these files are the training input for the tokenizers below.
for split_path, split in (
    ('/content/train.txt', nsmc.train),
    ('/content/test.txt', nsmc.test),
):
  write_lines(split_path, split.get_all_texts())

In [5]:
# Create the Drive folder that will hold the byte-level BPE vocabulary files;
# exist_ok=True makes the cell safe to re-run.
os.makedirs('/content/drive/MyDrive/nlpbook/bbpe', exist_ok=True)

In [7]:
from tokenizers import ByteLevelBPETokenizer
# Train a byte-level BPE vocabulary (GPT-style) on both dumped NSMC text files.
bytebpe_tokenizer = ByteLevelBPETokenizer()
bytebpe_tokenizer.train(
    files=['/content/train.txt', '/content/test.txt'],
    vocab_size=10000,
    special_tokens=['[PAD]']
)

# Persist the trained model to Drive; the returned list of written files
# (vocab.json and merges.txt) is displayed as the cell output.
bytebpe_tokenizer.save_model('/content/drive/MyDrive/nlpbook/bbpe')

['/content/drive/MyDrive/nlpbook/bbpe/vocab.json',
 '/content/drive/MyDrive/nlpbook/bbpe/merges.txt']

In [8]:
# Create the Drive folder that will hold the WordPiece vocabulary file;
# exist_ok=True makes the cell safe to re-run.
os.makedirs('/content/drive/MyDrive/nlpbook/wordpiece', exist_ok=True)

In [9]:
from tokenizers import BertWordPieceTokenizer
# Train a WordPiece vocabulary (BERT-style) on the same two NSMC text files.
# lowercase=False is paired with do_lower_case=False when the vocabulary is
# loaded into BertTokenizer in a later cell.
wordpiece_tokenizer = BertWordPieceTokenizer(lowercase=False)
wordpiece_tokenizer.train(
    files=['/content/train.txt', '/content/test.txt'],
    vocab_size=10000
)

# Persist the trained vocabulary to Drive; the returned list of written files
# (vocab.txt) is displayed as the cell output.
wordpiece_tokenizer.save_model('/content/drive/MyDrive/nlpbook/wordpiece')

['/content/drive/MyDrive/nlpbook/wordpiece/vocab.txt']

In [10]:
from transformers import GPT2Tokenizer

# Load the byte-level BPE files saved above into a transformers GPT-2 tokenizer.
tokenizer_gpt = GPT2Tokenizer.from_pretrained('/content/drive/MyDrive/nlpbook/bbpe')
# Use the '[PAD]' special token registered during training as the padding
# token; the batch-encoding cell below relies on padding='max_length'.
tokenizer_gpt.pad_token = '[PAD]'

In [13]:
# Sample sentences reused by every tokenization cell below.
sentences = [
    '아 더빙... 진짜 짜증나네요 목소리',
    '흠...포스터보고 초딩영화줄...오버연기조차 가볍지 않구나',
    '별루 였다..'
]
# Split each sentence into tokens with the GPT-2 (byte-level BPE) tokenizer.
tokenized_sentences = list(map(tokenizer_gpt.tokenize, sentences))

# Show the first sentence next to its token sequence.
print(f'original  sentence: {sentences[0]}')
print(f'tokenized sentence: {tokenized_sentences[0]}')

original  sentence: 아 더빙... 진짜 짜증나네요 목소리
tokenized sentence: ['ìķĦ', 'ĠëįĶë¹Ļ', '...', 'Ġì§Ħì§ľ', 'Ġì§ľì¦ĿëĤĺ', 'ëĦ¤ìļĶ', 'Ġëª©ìĨĮë¦¬']


In [15]:
# Encode all sample sentences in one call. Every row is padded up to, or
# truncated down to, exactly max_length=12 token ids.
batch_inputs = tokenizer_gpt(
    sentences,
    padding='max_length',
    max_length=12,
    truncation=True
)

In [16]:
# Token ids per sentence, each row 12 long; the trailing 0s are padding
# (they coincide with the 0 entries of the attention mask shown below).
batch_inputs['input_ids']

[[334, 2338, 336, 581, 4055, 464, 3808, 0, 0, 0, 0, 0],
 [3693, 336, 2876, 758, 2883, 356, 806, 336, 9875, 875, 2960, 7292],
 [4957, 451, 3653, 263, 0, 0, 0, 0, 0, 0, 0, 0]]

In [17]:
# Attention mask per sentence: 1 where input_ids holds a real token,
# 0 where the position was filled by padding.
batch_inputs['attention_mask']

[[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]

In [23]:
from transformers import BertTokenizer
# Load the WordPiece vocabulary saved above into a transformers BERT tokenizer.
# do_lower_case=False matches lowercase=False used when the vocabulary was trained.
tokenizer_bert = BertTokenizer.from_pretrained(
    '/content/drive/MyDrive/nlpbook/wordpiece',
    do_lower_case=False
)

In [25]:
# Split each sample sentence into WordPiece tokens with the BERT tokenizer.
BertTokenized_sentences = list(map(tokenizer_bert.tokenize, sentences))

In [26]:
# Show the first sentence next to its WordPiece token sequence.
print(
    f'original  sentence: {sentences[0]}',
    f'tokenized sentence: {BertTokenized_sentences[0]}',
    sep='\n'
)

original  sentence: 아 더빙... 진짜 짜증나네요 목소리
tokenized sentence: ['아', '더빙', '.', '.', '.', '진짜', '짜증나', '##네요', '목소리']


In [27]:
# Encode all sample sentences in one call with the BERT tokenizer, using the
# same settings as the GPT-2 batch: rows are padded/truncated to max_length=12.
BertBatch_inputs = tokenizer_bert(
    sentences,
    padding='max_length',
    max_length=12,
    truncation=True
)

In [28]:
# Token ids per sentence, each row 12 long. In the output every row starts
# with id 2 and its real tokens end with id 3 (presumably the [CLS]/[SEP]
# markers added by the BERT tokenizer - confirm against vocab.txt);
# trailing 0s are padding.
BertBatch_inputs['input_ids']

[[2, 621, 2631, 16, 16, 16, 1993, 3678, 1990, 3323, 3, 0],
 [2, 997, 16, 16, 16, 2609, 2045, 2796, 1981, 1158, 16, 3],
 [2, 3274, 9507, 16, 16, 3, 0, 0, 0, 0, 0, 0]]