In [1]:
import random
import struct

def generate_random_big_endian():
    # 生成一个随机的32位无符号整数
    num = random.randint(0, 2**32 - 1)
    # 将这个数转换为大端序的二进制形式
    return struct.pack('>I', num)

# 创建一个大文件
with open('big_file.bin', 'wb') as f:
    for _ in range(2 * 1024**3 // 4):  # 2GB / 4 bytes per number
        f.write(generate_random_big_endian())

In [1]:
import mmap
import numpy as np
import concurrent.futures
import time

def process_data(data):
    total = np.sum(data.astype(np.uint64))
    min_num = np.min(data)
    max_num = np.max(data)
    return total, min_num, max_num

def mmap_multithreaded_compute(filename):
    start_time = time.time()
    with open(filename, 'rb') as f:
        mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        data = np.frombuffer(mm, dtype=np.dtype('>u4'))

        chunk_size = len(data) // 8  # Divide into 8 chunks
        chunks = [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]
        if len(data) % chunk_size != 0:
            chunks.append(data[len(data) // chunk_size * chunk_size:])

        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = list(executor.map(process_data, chunks))

        total = sum(result[0] for result in results)
        min_num = min(result[1] for result in results)
        max_num = max(result[2] for result in results)

    end_time = time.time()
    return total, min_num, max_num, end_time - start_time

total, min_num, max_num, running_time = mmap_multithreaded_compute('big_file.bin')

print(f"Total: {total}")
print(f"Min: {min_num}")
print(f"Max: {max_num}")
print(f"Running time for mmap and multithreading: {running_time} seconds")

Total: 1.152923360342725e+18
Min: 5
Max: 4294967260
Running time for mmap and multithreading: 0.5519099235534668 seconds


In [2]:
import struct
import time

def simple_read_and_compute(filename):
    total = 0
    min_num = 2**32 - 1
    max_num = 0
    with open(filename, 'rb') as f:
        while True:
            bytes_read = f.read(4)
            if not bytes_read:
                break
            number = struct.unpack('>I', bytes_read)[0]
            total += number
            if number < min_num:
                min_num = number
            if number > max_num:
                max_num = number
    return total, min_num, max_num

start_time = time.time()
total, min_num, max_num = simple_read_and_compute('big_file.bin')
end_time = time.time()

print(f"Total: {total}")
print(f"Min: {min_num}")
print(f"Max: {max_num}")
print(f"Running time for simple read: {end_time - start_time} seconds")


Total: 1152923360342724960
Min: 5
Max: 4294967260
Running time for simple read: 84.41593670845032 seconds
