In [9]:
import random
random.seed(1234)

import logging
from pprint import pprint
from sys import stdout as STDOUT

# 모든 출력을 임시 디렉터리에 기록함
import atexit
import gc
import io
import os
import tempfile

# Create a temporary directory and schedule its removal at interpreter exit.
TEST_DIR = tempfile.TemporaryDirectory()
atexit.register(TEST_DIR.cleanup)

# Make sure Windows processes exit cleanly: remember the starting working
# directory, restore it at exit, and work inside the temporary directory
# in the meantime.
OLD_CWD = os.getcwd()
atexit.register(lambda: os.chdir(OLD_CWD))
os.chdir(TEST_DIR.name)

def close_open_files():
    """Close every ``io.IOBase`` instance the garbage collector tracks."""
    for candidate in gc.get_objects():
        # Anything that is not a file-like object is left untouched.
        if not isinstance(candidate, io.IOBase):
            continue
        candidate.close()

atexit.register(close_open_files)

<function __main__.close_open_files()>

In [5]:
# 여러 로그 파일을 한 출력 스트림으로 병합해 디버깅을 돕는 프로그램

# 여러 파일 핸들이 주어지면 새로운 데이터가 도착했는지 감지해서 다음 줄을 반환할 방법이 필요

class NoNewData(Exception):
    """Raised when a file handle has no unread data past its current offset."""

def readline(handle):
    """Return the next line from ``handle``, or raise if nothing new exists.

    The size of the file is measured by seeking to its end. When the read
    position on entry is already at the end -- or beyond it, which happens
    if the file was truncated after earlier reads -- there is nothing to
    return and ``NoNewData`` is raised.

    Args:
        handle: A seekable file object opened for reading.

    Returns:
        Whatever ``handle.readline()`` yields when started from the position
        the handle had on entry.

    Raises:
        NoNewData: No data exists past the current read position. The handle
            is left positioned at the end of the file in that case.
    """
    offset = handle.tell()  # Position we have read up to so far.
    handle.seek(0, os.SEEK_END)
    length = handle.tell()  # Current size of the file.

    # ``<=`` rather than ``==``: after a truncation the saved offset lies past
    # the end, and reading from there would return empty data forever, making
    # callers spin instead of sleeping.
    if length <= offset:
        raise NoNewData

    handle.seek(offset, os.SEEK_SET)  # Go back to where we left off.
    return handle.readline()

In [2]:
import time

def tail_file(handle, interval, write_func):
    """Forward each new line of ``handle`` to ``write_func`` until it closes.

    Args:
        handle: File object polled through ``readline``.
        interval: Seconds to sleep whenever no new data is available.
        write_func: Callable invoked with every line that was read.
    """
    while True:
        if handle.closed:
            return

        try:
            next_line = readline(handle)
        except NoNewData:
            # Nothing new yet; wait before polling again.
            time.sleep(interval)
            continue

        write_func(next_line)

In [7]:
# 스레드에서 데이터를 쓰는 순서를 직렬화하고, 각 줄이 중간에 충돌해서 뒤섞이는 일이 없게 만든다.

from threading import Lock, Thread

def run_threads(handles, interval, output_path):
    """Tail every handle in its own thread, merging lines into one file.

    Args:
        handles: Iterable of file objects passed to ``tail_file``.
        interval: Polling interval forwarded to ``tail_file``.
        output_path: Path of the file (opened in ``'wb'`` mode) that receives
            the merged output.
    """
    with open(output_path, 'wb') as output:
        write_lock = Lock()

        def write(data):
            # Only one thread at a time may write, so lines never interleave.
            with write_lock:
                output.write(data)

        workers = []
        for source in handles:
            worker = Thread(
                target=tail_file,
                args=(source, interval, write),
            )
            worker.start()
            workers.append(worker)

        # Keep the output file open until every tailing thread has finished.
        for worker in workers:
            worker.join()
            
        

In [8]:
import collections
import os
import random
import string
from tempfile import TemporaryDirectory

def setup():
    """Prepare the inputs for a merge run.

    Creates a temporary directory, obtains input paths from
    ``start_write_threads`` (defined elsewhere), opens each path for binary
    reading, and starts a thread running ``close_all`` (defined elsewhere)
    on the opened handles.

    Returns:
        A ``(tmpdir, input_paths, handles, output_path)`` tuple, where
        ``output_path`` is ``'merged'`` inside the temporary directory.
    """
    tmpdir = TemporaryDirectory()
    input_paths = start_write_threads(tmpdir.name, 5)

    handles = [open(path, 'rb') for path in input_paths]

    closer = Thread(target=close_all, args=(handles,))
    closer.start()

    return tmpdir, input_paths, handles, os.path.join(tmpdir.name, 'merged')


# 예제 5
def confirm_merge(input_paths, output_path):
    """Assert that the merged output holds each input's lines, in order.

    Each line of the merged file is attributed to every input path whose
    encoded form the line starts with. The lines gathered for a path must
    equal the lines of that input file.

    Args:
        input_paths: Paths of the input files.
        output_path: Path of the merged output file.

    Raises:
        AssertionError: If the lines found for any input differ from the
            lines the input file contains.
    """
    found = collections.defaultdict(list)
    with open(output_path, 'rb') as merged:
        for merged_line in merged:
            for path in input_paths:
                if merged_line.startswith(path.encode()):
                    found[path].append(merged_line)

    expected = collections.defaultdict(list)
    for path in input_paths:
        with open(path, 'rb') as source:
            expected[path].extend(source)

    for key, want in expected.items():
        got = found[key]
        assert want == got, f'{want!r} == {got!r}'

# Placeholder bindings (Ellipsis); each name is rebound by the setup() call
# below.
input_paths = ...
handles = ...
output_path = ...

# Build the temporary directory, input paths, open handles and output path.
tmpdir, input_paths, handles, output_path = setup()

NameError: name 'TemporaryDirectory' is not defined

In [3]:
# top-down
'''
    1. 최상위 함수가 def 대신 async def를 사용하게 변경
    2. 최상위 함수가 I/O를 호출하는 모든 부분을 loop.run_in_executor로 감싸라
    3. run_in_executor 호출이 사용하는 자원이나 콜백이 제대로 동기화됐는지(Lock이나 asyncio.run_coroutine_threadsafe 함수를 사용했는지) 확인
    4. 호출 계층의 잎 쪽으로 내려가면서 중간에 있는 함수와 메서드를 코루틴으로 변경하며 get_event_loop와 run_in_executor 호출을 없애려고 시도해라
'''

import asyncio

async def run_tasks_mixed(handles, interval, output_path):
    """Merge lines from ``handles`` using threads driven by the event loop.

    Each handle is tailed by the blocking ``tail_file`` function in the
    loop's default executor. Lines are written to the output file by a
    coroutine scheduled back onto the event loop, so every write happens on
    the loop's thread.

    Args:
        handles: Iterable of file objects passed to ``tail_file``.
        interval: Polling interval forwarded to ``tail_file``.
        output_path: Path of the file (opened in ``'wb'`` mode) that receives
            the merged output.
    """
    # Inside a coroutine the running loop is the one we want; this is the
    # accessor the asyncio documentation recommends over get_event_loop().
    loop = asyncio.get_running_loop()

    with open(output_path, 'wb') as output:
        async def write_async(data):
            output.write(data)

        def write(data):
            # Called from executor threads: hand the write to the event loop
            # and block this worker until it has completed.
            coro = write_async(data)
            future = asyncio.run_coroutine_threadsafe(coro, loop)
            future.result()

        tasks = []
        for handle in handles:
            # Run the blocking tail_file in the default executor.
            task = loop.run_in_executor(
                None, tail_file, handle, interval, write
            )
            tasks.append(task)

        # Fan in: wait until every tail_file call has finished.
        await asyncio.gather(*tasks)
        

In [None]:
        
async def tail_async(handle, interval, write_func):
    """Forward each new line of ``handle`` to ``write_func`` until it closes.

    The blocking ``readline`` helper runs in the loop's default executor;
    waiting for new data and writing are awaited on the event loop.

    Args:
        handle: File object polled through ``readline``.
        interval: Seconds to sleep whenever no new data is available.
        write_func: Coroutine function awaited with every line that was read.
    """
    # Inside a coroutine the running loop is the one we want; this is the
    # accessor the asyncio documentation recommends over get_event_loop().
    loop = asyncio.get_running_loop()

    while not handle.closed:
        try:
            line = await loop.run_in_executor(
                None, readline, handle
            )
        except NoNewData:
            # Nothing new yet; yield to other tasks before polling again.
            await asyncio.sleep(interval)
        else:
            await write_func(line)
            
async def run_tasks(handles, interval, output_path):
    """Merge lines from ``handles`` into one file using coroutines only.

    Args:
        handles: Iterable of file objects passed to ``tail_async``.
        interval: Polling interval forwarded to ``tail_async``.
        output_path: Path of the file (opened in ``'wb'`` mode) that receives
            the merged output.
    """
    with open(output_path, 'wb') as output:
        async def write_async(data):
            output.write(data)

        # One task per handle, all sharing the same writer coroutine.
        pending = [
            asyncio.create_task(tail_async(source, interval, write_async))
            for source in handles
        ]

        # Keep the output file open until every tailing task has finished.
        await asyncio.gather(*pending)