In [1]:
from pathlib import Path
import json

# Load notebook settings and derive one directory per data source under the
# preprocessed-data root.
settings = json.loads(Path('settingstest.json').read_text())
preprocessed_data_path = Path(settings['preprocessed_data_path'])
data_sources = ['AIHub', 'ModuCorpus', 'NIKL']
# Materialized as a list (not a one-shot `map` iterator) so that cells which
# loop over `data_dirs` can be re-run without silently iterating nothing.
data_dirs = [preprocessed_data_path / source_name for source_name in data_sources]

In [2]:
# Report file for the size listing. Opened explicitly as UTF-8 because the
# summary line written to it contains Korean text, and the platform default
# encoding is not guaranteed to be able to encode it.
f = Path('jsonl_file_sizes.txt').open('w', encoding='utf-8')

def get_directory_sizes(directory):
    """Lazily report the combined ``.jsonl`` size of each ``preprocessed`` folder.

    Looks for ``<directory>/*/preprocessed`` entries in sorted order and, for
    each one, adds up the byte sizes of the files matching ``*/*.jsonl``
    beneath it (i.e. exactly one sub-folder deep).

    Args:
        directory: Root folder to scan; anything accepted by ``Path``.

    Yields:
        ``(preprocessed_path, size)`` tuples, where ``size`` is the byte total
        divided by 1024**3.
    """
    bytes_per_unit = 1024 * 1024 * 1024
    root = Path(directory)
    for preprocessed_dir in sorted(root.glob("*/preprocessed")):
        byte_count = sum(
            jsonl.stat().st_size for jsonl in preprocessed_dir.glob("*/*.jsonl")
        )
        yield preprocessed_dir, byte_count / bytes_per_unit

# Walk every data source, writing one line per dataset plus a per-source
# subtotal to the report file `f`, and echo the per-source summary to stdout.
total_size = 0
try:
    for data_dir in data_dirs:
        data_size = 0      # accumulated size for this source
        dataset_size = 0   # number of `preprocessed` folders found in this source
        for file_path, size in get_directory_sizes(data_dir):
            # `file_path` is the `preprocessed` folder; its parent is the dataset folder.
            f.write(f"{size:>7.4f} GB\t{file_path.parent}\n")
            data_size += size
            dataset_size += 1
        f.write(f"{data_size:>7.4f} GB\t{data_dir}\n")
        print(f"{data_dir} 파일 크기: {data_size:.4f} GB")
        print(dataset_size)
        dataset_size = 0   # reset kept so the name is left at 0 after the loop, as before
        total_size += data_size

    f.write(f"{total_size:.4f} GB\t전체 파일 크기\n")
    print(f"전체 파일 크기: {total_size:.4f} GB")
finally:
    # Close the report even when a stat() or write fails part-way through, so
    # the handle does not stay open in the kernel.
    f.close()


/home/nlp-02/kkb2/DICELLM/data/AIHub 파일 크기: 337.9671 GB
141
/home/nlp-02/kkb2/DICELLM/data/ModuCorpus 파일 크기: 42.2595 GB
58
/home/nlp-02/kkb2/DICELLM/data/NIKL 파일 크기: 1.1582 GB
4
전체 파일 크기: 381.3848 GB


In [5]:
from urllib3 import PoolManager
from pathlib import Path
import random
import json

# HTTP client plus the Notion credentials read from a local JSON file.
http = PoolManager()
# read_text() opens and closes the file itself; the previous open().read()
# left the handle open until garbage collection.
code = json.loads(Path('notion_code.json').read_text(encoding='utf-8'))
auth = code['NOTION_AUTHORIZATION_KEY']
db = code['NOTION_DATABASE_ID']

# Query the Notion database page by page, collecting every row in `datasets`.
url = f"https://api.notion.com/v1/databases/{db}/query"
headers = {
    'Authorization': auth,
    'Notion-Version': '2022-06-28',
    "Content-Type": "application/json"
}
body = {}
has_more = True
datasets = []
while has_more:
    response = http.request('POST',
                            url,
                            body=json.dumps(body),
                            headers=headers,
                            retries=False)
    if response.status != 200:
        # Fail with the API's own error payload instead of an opaque
        # KeyError on 'results' further down.
        raise RuntimeError(
            f"Notion query failed with HTTP {response.status}: "
            f"{response.data.decode('utf-8', errors='replace')}"
        )
    source = json.loads(response.data.decode('utf-8'))
    datasets.extend(source['results'])
    has_more = source['has_more']
    # The cursor returned with this page is sent as `start_cursor` of the next request.
    body['start_cursor'] = source['next_cursor']

# Map each dataset name to the list of task names attached to it, ordered by
# dataset name.
dataset2task = dict()
for dataset in datasets:
    # The title property is a list of text segments; join them all so a title
    # split into several segments is not truncated to its first piece.
    dataset_name = ''.join(
        segment['plain_text']
        for segment in dataset['properties']['Dataset Name']['title']
    )
    if not dataset_name:
        # Rows without a title cannot be keyed; skip them rather than crash.
        continue
    # NOTE: the unused 'Source' lookup was removed — it raised on rows whose
    # Source select is empty and its value was never read.
    tasks = [task['name'] for task in dataset['properties']['Tasks']['multi_select']]
    dataset2task[dataset_name] = tasks

# Keys are unique, so sorting the items orders purely by dataset name.
dataset2task = dict(sorted(dataset2task.items()))

# Persist the mapping. The `with` block guarantees the file is flushed and
# closed, and UTF-8 is pinned because ensure_ascii=False writes non-ASCII
# characters verbatim.
with Path('dataset2task.json').open('w', encoding='utf-8') as out_file:
    json.dump(dataset2task, out_file, ensure_ascii=False, indent=4)