In [1]:
pip install sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [9]:
import sentencepiece as spm
import pandas as pd
import csv

In [10]:
# Load the IMDb reviews CSV and preview the 'review' text column.
train_df = pd.read_csv('IMDb_Reviews.csv')
train_df['review']

0        My family and I normally do not watch local mo...
1        Believe it or not, this was at one time the wo...
2        After some internet surfing, I found the "Home...
3        One of the most unheralded great works of anim...
4        It was the Sixties, and anyone with long hair ...
                               ...                        
49995    the people who came up with this are SICK AND ...
49996    The script is so so laughable... this in turn,...
49997    "So there's this bride, you see, and she gets ...
49998    Your mind will not be satisfied by this nobud...
49999    The chaser's war on everything is a weekly sho...
Name: review, Length: 50000, dtype: object

In [11]:
len(train_df)  # number of rows loaded from the CSV

50000

In [12]:
# Dump the reviews to a plain-text corpus, one review per line.
# The file name must be 'imdb_review.txt' (dot, not comma): that is the path
# passed to the SentencePiece trainer via --input.
with open('imdb_review.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(train_df['review']))

In [16]:
# Train a BPE model with a 5000-piece vocabulary on the IMDb corpus.
# The flags are written one per line; adjacent string literals are joined into
# the single space-separated argument string the trainer expects.
spm.SentencePieceTrainer.Train(
    '--input=imdb_review.txt '
    '--model_prefix=imdb '
    '--vocab_size=5000 '
    '--model_type=bpe '
    '--max_sentence_length=9999'
)

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=imdb_review.txt --model_prefix=imdb --vocab_size=5000 --model_type=bpe --max_sentence_length=9999
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: imdb_review.txt
  input_format: 
  model_prefix: imdb
  model_type: BPE
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 9999
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1

In [17]:
# imdb.vocab is tab-separated with no header row. QUOTE_NONE makes pandas treat
# quote characters inside a piece as ordinary text instead of CSV quoting.
vocab_list = pd.read_table('imdb.vocab', header=None, quoting=csv.QUOTE_NONE)
vocab_list.sample(5)  # five random rows (unseeded, so they change on every run)

Unnamed: 0,0,1
3024,▁critic,-3021
3747,▁clean,-3744
29,ll,-26
1132,▁came,-1129
2715,▁creepy,-2712


In [18]:
len(vocab_list)  # number of rows read from imdb.vocab

5000

In [19]:
# Create a processor and load the trained IMDb model into it.
sp = spm.SentencePieceProcessor()
vocab_file = 'imdb.model'  # NOTE(review): despite the name, this is the .model file, not imdb.vocab
sp.load(vocab_file)  # the return value is shown as the cell output

True

In [20]:
# Sample sentences to tokenize with the loaded model.
lines = [
    "I didn't at all think of it this way.",
    "I have waited a long time for someone to film",
]

for text in lines:
    pieces = sp.encode_as_pieces(text)  # sentence -> subword sequence
    ids = sp.encode_as_ids(text)        # sentence -> integer id sequence
    # One item per line, followed by a blank separator line.
    print(text, pieces, ids, sep='\n', end='\n\n')

I didn't at all think of it this way.
['▁I', '▁didn', "'", 't', '▁at', '▁all', '▁think', '▁of', '▁it', '▁this', '▁way', '.']
[41, 624, 4950, 4926, 139, 170, 378, 30, 58, 73, 413, 4945]

I have waited a long time for someone to film
['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[41, 142, 1364, 1121, 4, 668, 285, 93, 1079, 33, 91]



In [21]:
sp.GetPieceSize() # vocabulary size

5000

In [22]:
sp.IdToPiece(300) # subword piece corresponding to the id

'ure'

In [24]:
sp.PieceToId('ing') # id of the subword piece

20

In [25]:
# NOTE(review): several of these ids differ from the encode_as_ids result printed
# for this sentence (e.g. 141 vs 142, 1120 vs 1121), which is why the decoded
# text is not the original sentence.
sp.DecodeIds([41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91]) # turn a list of ids back into a sentence

'Iul wa fall aold timeooland to film'

In [27]:
# Show which subword piece each id maps to, as one comma-terminated row.
lst = [41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91]

for piece_id in lst:
    print(f"'{sp.IdToPiece(piece_id)}'", end=',')

'▁I','ul','▁wa','▁fall','▁a','old','▁time','oo','land','▁to','▁film',

In [28]:
sp.DecodePieces(['▁I','▁have','▁wa','ited','▁a','▁long','▁time','▁for','▁someone','▁to','▁film']) # rebuild the sentence from subword pieces

'I have waited a long time for someone to film'

In [29]:
print(sp.encode('I have waited a long time for someone to film')) # encode the sentence

[41, 142, 1364, 1121, 4, 668, 285, 93, 1079, 33, 91]


In [31]:
# Encode the same sentence twice, selecting the output type each time:
# str yields subword pieces, int yields their ids.
sentence = 'I have waited a long time for someone to film'
for out_type in (str, int):
    print(sp.encode(sentence, out_type=out_type))

['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[41, 142, 1364, 1121, 4, 668, 285, 93, 1079, 33, 91]


In [32]:
# Load ratings.txt (tab-separated; the preview shows columns id, document, label)
# and display the first rows.
naver_df = pd.read_table('ratings.txt')
naver_df.head()

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [35]:
# Report the number of rows; the printed label is Korean for "number of reviews".
print('리뷰 개수 : ', len(naver_df))

리뷰 개수 :  200000


In [36]:
print(naver_df.isnull().values.any())  # True if any cell in the frame is null

True


In [37]:
# Drop every row that contains a null value, then re-check that none remain.
naver_df = naver_df.dropna()
print(naver_df.isnull().values.any())

False


In [38]:
# Row count after the null rows were dropped.
print('리뷰 개수 : ', len(naver_df))

리뷰 개수 :  199992


In [39]:
# Write the 'document' column to a UTF-8 text file, one review per line,
# to serve as the training corpus.
with open('naver_review.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(naver_df['document']))

In [44]:
# Train a BPE model with a 5000-piece vocabulary on the Naver review corpus.
# Every option is written as a well-formed `--key=value` flag separated by a
# single space (no trailing comma on the value, no missing `--` prefix).
spm.SentencePieceTrainer.Train('--input=naver_review.txt --model_prefix=naver --vocab_size=5000 --model_type=bpe --max_sentence_length=9999')

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=naver_review.txt --model_prefix=naver --vocab_size=5000, model_type=bpe --max_sentence_length=9999
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: naver_review.txt
  input_format: 
  model_prefix: naver
  model_type: BPE
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 9999
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id

In [45]:
# naver.vocab is tab-separated with no header row. QUOTE_NONE makes pandas treat
# quote characters inside a piece as ordinary text instead of CSV quoting.
vocab_list = pd.read_table('naver.vocab', header=None, quoting=csv.QUOTE_NONE)
vocab_list.head()  # first rows of the vocabulary file

Unnamed: 0,0,1
0,<unk>,0
1,<s>,0
2,</s>,0
3,..,0
4,영화,-1


In [46]:
vocab_list.sample(5)  # five random rows of the vocabulary (unseeded, so they vary per run)

Unnamed: 0,0,1
1883,▁a,-1880
4956,렙,-4953
2768,!!!!!,-2765
4940,쉼,-4937
4162,겉,-4159


In [47]:
len(vocab_list)  # number of rows read from naver.vocab

5000

In [48]:
# Rebind `sp` to a fresh processor loaded with the Naver model; cells below
# tokenize with this model rather than the IMDb one.
sp = spm.SentencePieceProcessor()
vocab_file = 'naver.model'  # NOTE(review): despite the name, this is the .model file, not naver.vocab
sp.load(vocab_file)  # the return value is shown as the cell output

True

In [49]:
# Sample sentences to tokenize with the loaded model.
lines = [
    '의미없는 삶을 두려워하라',  # replace with any sentence you like
    '배우들의 연기는 좋았지만 스토리는 별로였음',
]

for text in lines:
    pieces = sp.encode_as_pieces(text)  # sentence -> subword sequence
    ids = sp.encode_as_ids(text)        # sentence -> integer id sequence
    # One item per line, followed by a blank separator line.
    print(text, pieces, ids, sep='\n', end='\n\n')

의미없는 삶을 두려워하라
['▁의미', '없는', '▁삶을', '▁두', '려', '워', '하라']
[776, 228, 2263, 279, 3401, 3520, 3228]

배우들의 연기는 좋았지만 스토리는 별로였음
['▁배우들의', '▁연기는', '▁좋았', '지만', '▁스토리는', '▁별로', '였음']
[928, 1024, 209, 35, 1364, 258, 1481]



In [50]:
sp.GetPieceSize()  # vocabulary size of the loaded model

5000

In [51]:
sp.IdToPiece(773)  # subword piece stored under id 773

'▁그러'

In [54]:
sp.PieceToId('려')  # id of the subword piece '려'

3401

In [57]:
# The argument is a list of integer ids, so decode it with DecodeIds
# (DecodePieces is the counterpart for lists of piece strings).
# NOTE(review): these ids differ from the encode_as_ids result printed for the
# sample sentence (e.g. 773 vs 776, 3223 vs 3228), so the decoded text is not
# that sentence.
sp.DecodeIds([773, 226, 2260, 277, 3401, 3520, 3223])

'그러 때다운 위려워 좋아할'

In [58]:
# Rebuild the sentence from its subword pieces.
sp.DecodePieces(['▁배우들의', '▁연기는', '▁좋았', '지만', '▁스토리는', '▁별로', '였음'])

'배우들의 연기는 좋았지만 스토리는 별로였음'

In [59]:
# Encode the same sentence twice, selecting the output type each time:
# str yields subword pieces, int yields their ids.
sentence = '의미없는 삶을 두려워하라'
for out_type in (str, int):
    print(sp.encode(sentence, out_type=out_type))

['▁의미', '없는', '▁삶을', '▁두', '려', '워', '하라']
[776, 228, 2263, 279, 3401, 3520, 3228]
