In [1]:
import random
import struct

def generate_random_big_endian():
    # 生成一个随机的32位无符号整数
    num = random.randint(0, 2**32 - 1)
    # 将这个数转换为大端序的二进制形式
    return struct.pack('>I', num)

# 创建一个大文件
with open('big_file.bin', 'wb') as f:
    for _ in range(2 * 1024**3 // 4):  # 2GB / 4 bytes per number
        f.write(generate_random_big_endian())

In [2]:
import numpy as np
import time

start_time = time.time()

# 使用numpy的fromfile函数读取二进制文件
data = np.fromfile('big_file.bin', dtype=np.dtype('>u4'))

# 计算总和
total = np.sum(data)

# 找到最小和最大的数字
min_num = np.min(data)
max_num = np.max(data)

end_time = time.time()
print(f'Running time of the first code: {end_time - start_time} seconds')

print(f'Total: {total}')
print(f'Min: {min_num}')
print(f'Max: {max_num}')

Running time of the first code: 1.130702257156372 seconds
Total: 310006112
Min: 5
Max: 4294967260


In [3]:
import mmap
import numpy as np
import concurrent.futures
import time

start_time = time.time()

def process_data(data):
    # 计算这部分数据的总和以及最小和最大的数字
    total = np.sum(data)
    min_num = np.min(data)
    max_num = np.max(data)
    return total, min_num, max_num

# 映射文件
with open('big_file.bin', 'rb') as f:
    mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
    data = np.frombuffer(mm, dtype=np.dtype('>u4'))

# 使用多线程并行处理所有数据
with concurrent.futures.ThreadPoolExecutor() as executor:
    chunk_size = len(data) // 10  # 分10份处理
    futures = [executor.submit(process_data, data[i:i+chunk_size]) for i in range(0, len(data), chunk_size)]
    results = [f.result() for f in concurrent.futures.as_completed(futures)]

# 计算总和以及最小和最大的数字
total = sum(result[0] for result in results)
min_num = min(result[1] for result in results)
max_num = max(result[2] for result in results)

end_time = time.time()
print(f'Running time of the second code: {end_time - start_time} seconds')

print(f'Total: {total}')
print(f'Min: {min_num}')
print(f'Max: {max_num}')

Running time of the second code: 0.2974250316619873 seconds
Total: 26079809888
Min: 5
Max: 4294967260
