In [45]:
import torch
import pandas as pd
from torch import nn
from torch.utils.data import Dataset
from collections import Counter
from torchtext.vocab import vocab
from tqdm import tqdm
import ast


class BertDataSet(Dataset) :
    def __init__(self,dir,index_col=0) -> None:
        super().__init__()
        self.data_load = pd.read_csv(dir,index_col=index_col)
        self.counter = Counter()
        self.vocab = None
        self.sentences = self._merge_sentences()

    def _change_string_to_list(self,str_list):
        """

        dataframe안에 list를 통으로 넣으면 str으로 저장된다.
        ast 라이브러리를 쓰면 원래 ㅣist 이지만 str 타입으로 표현된 값을 다시 list 타입으로 바꿔준다.

        """
        return ast.literal_eval(str_list)

    def _merge_sentences(self) :
        """
        
        하나의 list 안에 모든 sentence 넣기

        """
        total = []
        for row in self.data_load.iterrows() :
            book_info = row[1]
            t = []
            for i in range(1,4) :
                t += self._change_string_to_list(book_info.iloc[i])

            total += t
        total = list(filter(None,total))
        
        return total

    

    # def __getitem__() :
    #     pass
    # def __call__(self, ) :
    #     pass


In [7]:
# 리스트 하나에 문장 전체 넣기
# counter로 단어 중복 제거하기
# vocab으로 index와 단어 매치하기 

In [46]:
test = BertDataSet('./data/bookraw_total.csv')
pd.DataFrame(test.sentences)[0].to_csv('./data/bookraw_list.txt',index=False)

In [88]:
import sentencepiece as spm
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(lowercase=True, strip_accents=False)

tokenizer.train('./data/bookraw_list.txt',vocab_size=100000,limit_alphabet=6000, min_frequency=10)

# Tokenizing Test
tokenizer.encode('python과 javascript를 만들며 고민했다').tokens

# Saving Vocab
tokenizer.save_model('.', 'bert')






['python과', 'javascript를', '만들며', '고민', '##했다']

In [123]:
## Tokenizer 불러오기
## 한글을 불러올 땐 strip accents = False 필수

vocab = './data/vocab.txt'
tokenizer = BertWordPieceTokenizer.from_file(vocab=vocab,strip_accents=False)


encoded = tokenizer.encode('python과 javascript를 만들며 고민했다')
print(encoded.tokens)

['[CLS]', 'python', '##과', 'javascript', '##를', '만들며', '고민', '##했다', '[SEP]']


In [1]:
from transformers import DistilBertModel,BertModel, AutoTokenizer

a = DistilBertModel.from_pretrained('monologg/distilkobert')


Some weights of the model checkpoint at monologg/distilkobert were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [136]:
tokenizer = AutoTokenizer.from_pretrained("monologg/distilkobert")