## 大きな DataFrame の保存

- CSV
- Pickle

In [1]:
import pandas as pd
from sklearn.datasets import fetch_openml

# Download the OpenML "mnist_784" dataset (version 1) and unpack it into
# features (df_X) and target (df_y), requesting pandas objects via as_frame.
df_X, df_y = fetch_openml(
    name='mnist_784',
    version=1,
    return_X_y=True,
    as_frame=True,
)

# Print the frame summary, including its in-memory footprint.
df_X.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Columns: 784 entries, pixel1 to pixel784
dtypes: float64(784)
memory usage: 418.7 MB


In [None]:
# Remove output files left over from a previous run so that every
# save below starts from a clean state.
import os

csv_file = 'MNIST_X.csv'
pickle_file = 'MNIST.pkl'
pickle_gz_file = 'MNIST.pkl.gz'

for path in [csv_file, pickle_file, pickle_gz_file]:
    try:
        os.remove(path)
    except FileNotFoundError:
        # Nothing to clean up for this file. Any other failure (e.g. a
        # permission error) is allowed to surface, because a stale file
        # would silently distort the size/timing results that follow.
        pass

In [None]:
%%time

# Write the feature frame out as CSV.
df_X.to_csv(path_or_buf=csv_file)

# Report the size of the written file in MB (bytes / 1024 / 1024).
csv_bytes = os.path.getsize(csv_file)
print('{:.2f}MB'.format(csv_bytes / (1024 * 1024)))

In [None]:
%%time

# Load the frame back from CSV.
# The file was written by to_csv() with its default index=True, so the first
# column holds the row index. index_col=0 restores it as the index; without it
# the index would come back as an extra "Unnamed: 0" data column.
df = pd.read_csv(csv_file, index_col=0)

In [None]:
%%time

# Serialize the feature frame to an uncompressed pickle file.
df_X.to_pickle(path=pickle_file)

# Report the size of the written file in MB (bytes / 1024 / 1024).
pickle_bytes = os.path.getsize(pickle_file)
print('{:.2f}MB'.format(pickle_bytes / (1024 * 1024)))

In [None]:
%%time

# Restore the frame from the pickle file written above.
# NOTE: only unpickle files you trust; loading a pickle can execute code.
df = pd.read_pickle(filepath_or_buffer=pickle_file)

In [None]:
%%time

# Serialize the feature frame to a gzip-compressed pickle file.
# - compression: method plus its options; compresslevel=1 is an example of
#   a setting chosen for relatively fast saving.
gzip_options = {'method': 'gzip', 'compresslevel': 1, 'mtime': 1}
df_X.to_pickle(pickle_gz_file, compression=gzip_options)

# Report the size of the written file in MB (bytes / 1024 / 1024).
pickle_gz_bytes = os.path.getsize(pickle_gz_file)
print('{:.2f}MB'.format(pickle_gz_bytes / (1024 * 1024)))

In [None]:
%%time

# Restore the frame from the gzip-compressed pickle file.
# compression='infer' (the default) picks gzip from the ".gz" extension.
df = pd.read_pickle(pickle_gz_file, compression='infer')