In [10]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp311-cp311-win_amd64.whl (977 kB)
     ---------------------------------------- 0.0/977.5 kB ? eta -:--:--
     --------------------- --------------- 573.4/977.5 kB 18.2 MB/s eta 0:00:01
     ------------------------------------  972.8/977.5 kB 15.3 MB/s eta 0:00:01
     ------------------------------------- 977.5/977.5 kB 10.3 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Note: you may need to restart the kernel to use updated packages.


In [11]:
import sentencepiece as spm
import pandas as pd
import urllib.request
import csv

In [12]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv", filename="IMDb_Reviews.csv")
# 영화 리뷰 데이터

('IMDb_Reviews.csv', <http.client.HTTPMessage at 0x2576c93a5d0>)

In [13]:
train_df = pd.read_csv('IMDb_Reviews.csv')
train_df['review']
# 이 상태론 안됨. 텍스트 파일로 변환해야함

0        My family and I normally do not watch local mo...
1        Believe it or not, this was at one time the wo...
2        After some internet surfing, I found the "Home...
3        One of the most unheralded great works of anim...
4        It was the Sixties, and anyone with long hair ...
                               ...                        
49995    the people who came up with this are SICK AND ...
49996    The script is so so laughable... this in turn,...
49997    "So there's this bride, you see, and she gets ...
49998    Your mind will not be satisfied by this nobud...
49999    The chaser's war on everything is a weekly sho...
Name: review, Length: 50000, dtype: object

In [14]:
# 텍스트 파일로 변환
with open('imdb_review.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(train_df['review']))

In [15]:
# vocabulary생성
spm.SentencePieceTrainer.Train('--input=imdb_review.txt --model_prefix=imdb --vocab_size=5000 --model_type=bpe --max_sentence_length=9999')

In [18]:
vocab_list = pd.read_csv('imdb.vocab', sep = '\t', header = None, quoting = csv.QUOTE_NONE)
vocab_list.sample(10)

Unnamed: 0,0,1
4041,▁blind,-4038
1314,▁flick,-1311
4578,▁hits,-4575
1095,▁left,-1092
2673,▁trad,-2670
3212,round,-3209
2833,▁est,-2830
1168,▁stupid,-1165
1717,band,-1714
3636,entially,-3633


In [19]:
sp = spm.SentencePieceProcessor()
vocab_file ='imdb.model'
sp.load(vocab_file)

True

In [20]:
lines = [
  "I didn't at all think of it this way.",
  "I have waited a long time for someone to film"
]

In [27]:
sp.encode_as_pieces(lines[0]) # 문장 -> 서브 워드로 변환
sp.encode_as_ids(lines[0])    # 문장 -> 정수 코드로 변환

[41, 624, 4950, 4926, 139, 170, 378, 30, 58, 73, 413, 4945]

In [26]:
sp.PieceToId('▁I')

41

In [28]:
# 네이버 영화 리뷰 데이터
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

('ratings.txt', <http.client.HTTPMessage at 0x2570b7a5850>)

In [29]:
naver_df = pd.read_table('ratings.txt')
naver_df[:5]

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [30]:
naver_df = naver_df.dropna(how = 'any') # Null 값이 존재하는 행 제거
print(naver_df.isnull().values.any()) # Null 값이 존재하는지 확인

False


In [31]:
print('리뷰 개수 :',len(naver_df)) # 리뷰 개수 출력

리뷰 개수 : 199992


In [34]:
with open('naver_review.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(naver_df['document']))

In [35]:
# 모델 토크나이저 생성
spm.SentencePieceTrainer.Train('--input=naver_review.txt --model_prefix=naver --vocab_size=5000 --model_type=bpe --max_sentence_length=9999')

In [36]:
vocab_list = pd.read_csv('naver.vocab', sep='\t', header=None, quoting=csv.QUOTE_NONE)
vocab_list[:10]

Unnamed: 0,0,1
0,<unk>,0
1,<s>,0
2,</s>,0
3,..,0
4,영화,-1
5,▁영화,-2
6,▁이,-3
7,▁아,-4
8,...,-5
9,ᄏᄏ,-6


In [37]:
sp = spm.SentencePieceProcessor()
vocab_file = "naver.model"
sp.load(vocab_file)

True

In [38]:
lines = [
  "뭐 이딴 것도 영화냐.",
  "진짜 최고의 영화입니다 ㅋㅋ",
]

In [40]:
print(sp.encode_as_pieces(lines[1]))

['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']


In [41]:
sp.DecodePieces(['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ'])

'진짜 최고의 영화입니다 ᄏᄏ'

In [42]:
sp.DecodeIds([54, 200, 821, 85])

'진짜 원 산~~'

In [43]:
print(sp.encode('진짜 최고의 영화입니다 ㅋㅋ', out_type=str))
print(sp.encode('진짜 최고의 영화입니다 ㅋㅋ', out_type=int))

['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']
[54, 204, 825, 121]


In [44]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") # Bert-base의 토크나이저

100%|███████████████████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 380729.25B/s]


In [45]:
tokenizer.vocab_size

30522

In [48]:
tokenizer.vocab
print(tokenizer.vocab['do'])
print(tokenizer.vocab['love'])

2079
2293


In [52]:
# 없는 것은 여러 단어들의 조합으로 가능
# OOV embedding
tokenizer.vocab['em']
tokenizer.vocab['##bed']
tokenizer.vocab['##ding']
tokenizer.vocab['##s']

2015

In [53]:
tokenizer.tokenize('Here is the sentence I want embeddings for.')

['here',
 'is',
 'the',
 'sentence',
 'i',
 'want',
 'em',
 '##bed',
 '##ding',
 '##s',
 'for',
 '.']

In [55]:
with open('vocabulary.txt', 'w', encoding='utf8') as f:
    for token in tokenizer.vocab.keys():
        f.write(token + '\n')

In [56]:
df = pd.read_fwf('vocabulary.txt', header=None)
df

Unnamed: 0,0
0,[PAD]
1,[unused0]
2,[unused1]
3,[unused2]
4,[unused3]
...,...
30517,##．
30518,##／
30519,##：
30520,##？


In [None]:
# 특수 토큰
'''
[PAD] - 0
[UNK] - 100
[CLS] - 101
[SEP] - 102
[MASK] - 103
'''