# Pandasにおけるデータ読込高速化
- Language
    - Python

- How to process large datasets quickly while keeping memory usage low
    - specify data types
    - use 'chunksize' option in read_csv function
    - read only the columns you need (the 'usecols' option in read_csv function)
    - delete columns and pandas DataFrames that are no longer needed

- 参考にした記事：
    - https://cream-kuchen.hatenablog.com/entry/how_to_speedup_python_pandas_data_process
    - https://acro-engineer.hatenablog.com/entry/2022/12/12/124822

In [1]:
# import fundamental packages
import pandas as pd
import numpy as np

- pandasのデータフレームのレコードに合わせて各カラムの型を適切なサイズまで小さくする

In [5]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the narrowest dtype that covers their range.

    Each column whose dtype is one of int16/int32/int64/float16/float32/float64
    is inspected: its min and max are compared against the limits of
    progressively wider dtypes, and the column is cast to the first one that
    contains both. Columns of any other dtype are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place (columns are
            reassigned on the object that was passed in).
        verbose: Currently unused; accepted so existing callers that pass it
            keep working.

    Returns:
        The same DataFrame object, with downcast columns.

    Note:
        Float columns are chosen by range only, so casting to float16/float32
        can round values (float16 keeps roughly 3 significant decimal digits).
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        dtype_name = str(df[col].dtypes)
        if dtype_name not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if dtype_name.startswith('int'):
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's own min/max
                # (e.g. 127 for int8) is representable and must not force
                # the next wider type.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches only when min/max are NaN (empty column);
            # the column is then left as it is.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out-of-range, infinite or all-NaN columns fall back to float64.
                df[col] = df[col].astype(np.float64)
    return df

## Read sample dataset

### Usual reading method

In [16]:
# Baseline: load the entire CSV in a single read_csv call.
inpath = './data/AirQualityUCI.csv'
df0 = pd.read_csv(filepath_or_buffer=inpath, sep=';')
# Sum the per-column byte counts and express the total in units of 2**20 bytes.
memory_size = df0.memory_usage().sum() / 2**20
print(f"Memory usage of DataFrame: {memory_size:.2f} MB")

Memory usage of DataFrame: 1.23 MB


### Sequential reading

In [17]:
def split_reader(inpath, size, sep=";"):
    """Read a delimited text file chunk by chunk and return one DataFrame.

    Every chunk is passed through ``reduce_mem_usage`` before the chunks are
    concatenated, so the wide default dtypes only ever exist for one chunk
    at a time.

    Args:
        inpath: Path (or buffer) handed to ``pd.read_csv``.
        size: Number of rows per chunk (``chunksize`` of ``pd.read_csv``).
        sep: Field delimiter; defaults to ``";"``.

    Returns:
        A single DataFrame with a fresh 0..n-1 index built from all chunks.
    """
    # The chunked reader is a context manager (pandas >= 1.2); `with` closes
    # the underlying file even when a chunk fails to parse or convert.
    with pd.read_csv(inpath, chunksize=size, sep=sep) as reader:
        # The generator is consumed inside the `with`, while the file is open.
        return pd.concat(
            (reduce_mem_usage(chunk) for chunk in reader),
            ignore_index=True,
        )

# Chunked load of the same CSV via split_reader, then report the footprint.
inpath = "./data/AirQualityUCI.csv"  # path to the CSV file
size = 1_000_000  # number of records to read per chunk
df = split_reader(inpath, size)
df.head()
# Sum the per-column byte counts and express the total in units of 2**20 bytes.
memory_size = df.memory_usage().sum() / 2**20
print(f"Memory usage of DataFrame: {memory_size:.2f} MB")

Memory usage of DataFrame: 0.79 MB
