In [12]:
import glob
import pandas as pd
import json
import torch
from torch.utils.data import Dataset
import gzip
import numpy as np
from transformers import AutoTokenizer
from xml.etree.ElementTree import parse
from sys import getsizeof
import numpy as np
import tqdm


def merge_dataset(dataset_directories: list[str], result_dir:str):
    datasets = glob.glob(f"{dataset_directories}/*.gz")
    assert len(datasets) != 0, "Please check that the dataset file exists."
    result = []

    pbar = tqdm.tqdm(
        datasets,
        smoothing=0,
        leave=True,
        dynamic_ncols=True,
    )
    for dataset_dir in pbar:
        f = gzip.GzipFile(dataset_dir, "r")
        temp_tokens = np.load(f)
        result = np.concatenate((temp_tokens, result), axis=0) if len(result) != 0 else temp_tokens 
    
    print(result.shape)
    with gzip.open(result_dir, "wb") as f:
        np.save(f, result)


def encode_from_texts(texts:list[str], tokenizer: AutoTokenizer, block_size:int, BOS_TOKEN:str="[BOS]", EOS_TOKEN:str="[EOS]"):
    tokens = []
    pbar = tqdm.tqdm(
        texts,
        smoothing=0,
        leave=True,
        dynamic_ncols=True,
    )
    for text in pbar:
        # print(text)
        if text == "":
            continue
        
        # text = text.replace("\n", f"{SEP_TOKEN}")
        text = f"{BOS_TOKEN} {text} {EOS_TOKEN}" 

        temp_tokens = np.array(tokenizer.encode(text), dtype=np.int64)
        length = len(temp_tokens)
        padding = -length % (block_size+1)
        temp_tokens = np.reshape(np.concatenate((temp_tokens, np.ones(padding)*tokenizer.encode("[PAD]"))), (-1, block_size+1))
        # print(temp_tokens.shape)
        tokens = np.concatenate((tokens, temp_tokens), axis=0) if len(tokens) != 0 else temp_tokens

    return tokens

def read_text_from_xml(xml_dir:str):
    try:
        tree = parse(xml_dir)
        root = tree.getroot()
        text = " ".join([x.text for x in root.findall("text")[0].findall("p")])
        return text
    except: return ''

def encode_text_from_xml(folder_dir: str, tokenizer: AutoTokenizer, block_size:int, BOS_TOKEN:str, EOS_TOKEN:str):
    assert folder_dir[-1] != "/", "Check the directory please."
    xml_file_directories = glob.glob(f"{folder_dir}/*")

    texts = [read_text_from_xml(xml_dir) for xml_dir in xml_file_directories]
    
    tokens = encode_from_texts(texts, tokenizer, block_size, BOS_TOKEN, EOS_TOKEN)

    return tokens

def read_text_from_txt(txt_dir: str, encoding):
    with open(txt_dir, "r", encoding=encoding) as f:
        texts = f.read()
    print(texts[:100])
    return texts

def encode_text_from_txt(folder_dir: str, tokenizer: AutoTokenizer, block_size: int, encoding):
    assert folder_dir[-1] != "/", "Check the directory please."
    txt_file_directories = glob.glob(f"{folder_dir}/*.txt")

    texts = [read_text_from_txt(txt_dir, encoding=encoding) for txt_dir in txt_file_directories]
    
    tokens = encode_from_texts(texts, tokenizer, block_size)

    return tokens


In [13]:
tokenizer = AutoTokenizer.from_pretrained(
    'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16',
    bos_token='[BOS]', eos_token='[EOS]', unk_token='[UNK]', pad_token='[PAD]', mask_token='[MASK]'
    )

In [20]:
text = read_text_from_txt("../dataset/korean_murim_book.txt", encoding="cp949")
temp_tokens = np.array(tokenizer.encode(text), dtype=np.int64)
length = len(temp_tokens)
padding = -length % (64+1)
temp_tokens = np.reshape(np.concatenate((temp_tokens, np.ones(padding)*tokenizer.encode("[PAD]"))), (-1, 64+1))

로그인 무림

프롤로그

러시아 속담에 이런 말이 있다.

'공짜 치즈는 쥐덫 위에 있다.'

지금 생각해 보면 그날의 모든 것이 누군가의 쥐덫이 아닌가 싶다.

7년간 일했던 직


In [23]:
with gzip.open("../tmp/murim.tar.gz", "wb") as f:
    np.save(f, temp_tokens)

In [8]:
dataset1 = encode_text_from_xml("./NIKL_NP_v1.2/국립국어원 비출판물 말뭉치(버전 1.2)", tokenizer=tokenizer, block_size=128, BOS_TOKEN="[BOS]", EOS_TOKEN="[EOS]")

0it [00:00, ?it/s]


In [9]:
with gzip.open("dataset_cache.tar.gz", "wb") as f:
    np.save(f, dataset1)

In [10]:
dirs_to_process = glob.glob("./030.웹데이터 기반 한국어 말뭉치 데이터/01.데이터/1.Training/라벨링데이터/TL1/*")

dirs_to_process

[]

In [11]:
directory = dirs_to_process[0]
files_to_process = glob.glob(f"{directory}/*.json")

IndexError: list index out of range

In [None]:
import json

file_path = "./test.json"

with open(files_to_process[0], 'r') as file:
    data = json.load(file)

In [None]:
"【워싱턴=신화/뉴시스】(이름) 기자 = 도널드 트럼프 미 대통령은 오는 5".index("(이름) 기자")
"【워싱턴=신화/뉴시스】(이름) 기자 = 도널드 트럼프 미 대통령은 오는 5"[:-2]
["d", "d", "k", "a"][:-2]

['d', 'd']

In [None]:
dirs_to_process = glob.glob("../../dataset/030.웹데이터 기반 한국어 말뭉치 데이터/TL1/*/*.json")
len(dirs_to_process)

51830

In [None]:
import json

sources = []

total = 0

def load_dataset(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)

    for entity in data["named_entity"]:
        paragraph = []
        for content in entity["content"][:-2]:
            sentence = content["sentence"]

            if "(이름) 기자" in sentence:
                pos = sentence.index("(이름) 기자") + 10
                sentence = sentence[pos:]
            ignore_signs = ["참조링크", "관련기사"]
            for sign in ignore_signs: 
                if sign in sentence:
                    sentence = ""
                    break
            paragraph.append(sentence)
            
        sources.append(" ".join(paragraph))
        del paragraph
    
import tqdm
for filepath in tqdm.tqdm(dirs_to_process):
    load_dataset(filepath)

100%|██████████| 51830/51830 [07:46<00:00, 111.10it/s]


In [14]:
with open("../tmp/corpus.txt", "w", encoding="utf-8") as f:
    f.writelines("\n\n====\n\n".join(sources))

In [2]:
with open("../tmp/corpus.txt", "r") as f:
    source = f.read()

3262903551

In [7]:
import os
from pathlib import Path
from typing import Optional

import torch
from sentencepiece import SentencePieceProcessor, SentencePieceTrainer

# https://github.com/Lightning-AI/lit-llama/blob/main/lit_llama/tokenizer.py
class Tokenizer:
    """Tokenizer for LLaMA."""
    def __init__(self, model_path: Path) -> None:
        self.processor = SentencePieceProcessor(model_file=str(model_path))
        self.bos_id = self.processor.bos_id()
        self.eos_id = self.processor.eos_id()
        self.pad_id = self.processor.pad_id()

    @property
    def vocab_size(self) -> int:
        return self.processor.vocab_size()

    def encode(
        self,
        string: str,
        bos: bool = True,
        eos: bool = False,
        max_length: int = -1,
        pad: bool = False,
        device: Optional[torch.device] = None
    ) -> torch.Tensor:
        tokens = self.processor.encode(string)
        if bos:
            tokens = [self.bos_id] + tokens
        if eos:
            tokens = tokens + [self.eos_id]
        if max_length > 0:
            tokens = tokens[:max_length]
        if pad and len(tokens) < max_length:
            tokens += [self.pad_id] * (max_length - len(tokens))

        return torch.tensor(tokens, dtype=torch.int, device=device)

    def decode(self, tokens: torch.Tensor) -> str:
        return self.processor.decode(tokens.tolist())

    @staticmethod
    def train(input: str, destination: str, vocab_size=32000) -> None:
        model_prefix = os.path.join(destination, "tokenizer")
        SentencePieceTrainer.Train(input=input, model_prefix=model_prefix, vocab_size=vocab_size)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import os

destination_path = "../tmp/tokenizer"
os.makedirs(destination_path, exist_ok=True)
Tokenizer.train(input="../tmp/corpus.txt", destination=destination_path, vocab_size=480000)

In [4]:
source = source.split("\n\n====\n\n")

In [6]:
len(source)

4862715

In [17]:
import random
random.shuffle(source)
n = len(source)
train_data = source[:int(n*0.9)]
val_data = source[int(n*0.9):]
del source

In [22]:
tokenizer = Tokenizer("../tmp/tokenizer/tokenizer.model")

import tqdm 
import numpy as np
def encode_from_texts_v2(texts:list[str], tokenizer: Tokenizer, block_size:int):
    tokens = []
    pbar = tqdm.tqdm(
        texts,
        smoothing=0,
        leave=True,
        dynamic_ncols=True,
    )
    for text in pbar:
        # print(text)
        if text == "":
            continue
        
        encoded_text = tokenizer.encode(text, bos=True, eos=True)
        temp_tokens = np.array(encoded_text, dtype=np.int64)
        length = len(temp_tokens)
        padding = -length % (block_size+1)
        temp_tokens = np.reshape(np.concatenate((temp_tokens, np.ones(padding)*tokenizer.pad_id)), (-1, block_size+1))
        tokens = np.concatenate((tokens, temp_tokens), axis=0) if len(tokens) != 0 else temp_tokens

    return tokens

# train_ids = encode_from_texts_v2(train_data, tokenizer, block_size=256)
val_ids = encode_from_texts_v2(val_data, tokenizer, block_size=256)

100%|██████████| 486272/486272 [14:55:21<00:00,  9.05it/s]  


In [24]:
tokenizer.pad_id

-1

In [23]:
import gzip
# with gzip.open("train_ids_dataset.tar.gz", "wb") as f:
#     np.save(f, train_ids)
with gzip.open("large_dataset.tar.gz", "wb") as f:
    np.save(f, val_ids)

In [25]:
target_indices = val_ids == -1
val_ids[target_indices] = 480001

In [None]:
import json

sources = []

total = 0

def load_dataset(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)

    for entity in data["named_entity"]:
        paragraph = []
        for content in entity["content"][:-2]:
            sentence = content["sentence"]

            if "(이름) 기자" in sentence:
                pos = sentence.index("(이름) 기자") + 10
                sentence = sentence[pos:]
            ignore_signs = ["참조링크", "관련기사"]
            for sign in ignore_signs: 
                if sign in sentence:
                    sentence = ""
                    break
            paragraph.append(sentence)
            
        sources.append(" ".join(paragraph))
        del paragraph
    
    total += 1
    print(f"{total} was processed")

import multiprocessing

pool = multiprocessing.Pool(processes=8)
pool.map(load_dataset, dirs_to_process)
pool.close()
pool.join()

In [None]:
with open("corpus.txt", "w", encoding="utf-8") as f:
    f.writelines("\n\n====\n\n".join(sources))

4862715

In [44]:
tokens = encode_from_texts(sources, tokenizer, block_size=128, BOS_TOKEN="[BOS]", EOS_TOKEN="[EOS]")

  0%|▌                                                                                                                                                           | 17384/4862715 [12:59<60:19:44, 22.31it/s]


KeyboardInterrupt: 

In [4]:
import random
randomed_selected_sources = random.sample(sources, 500000)

In [7]:
tokens = encode_from_texts(randomed_selected_sources, tokenizer, block_size=128, BOS_TOKEN="[BOS]", EOS_TOKEN="[EOS]")

  7%|▋         | 34895/500000 [33:17<7:23:49, 17.47it/s]


KeyboardInterrupt: 

In [51]:
with gzip.open("korean_corpus_dataset_cache.tar.gz", "wb") as f:
    np.save(f, tokens)

In [54]:
merge_dataset("./corpus", "corpus.tar.gz")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.86s/it]


(204886, 129)
