# メモリ使用量削減サンプル

In [1]:
import numpy as np
import pandas as pd

## 10万レコードの乱数データを生成

In [2]:
db = pd.DataFrame(columns=["real", "integer"])
db["real"] = np.random.random(100000).reshape(-1)
db["integer"]= np.random.randint(low = 0, high=100, size=(100000))*1.0

In [3]:
db.head()

Unnamed: 0,real,integer
0,0.030393,65.0
1,0.043192,36.0
2,0.628632,56.0
3,0.898832,83.0
4,0.552662,58.0


型の確認とメモリ使用量の確認

In [4]:
db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
real       100000 non-null float64
integer    100000 non-null float64
dtypes: float64(2)
memory usage: 1.5 MB


### カラムのデータをfloat32型とint32型に変換

In [5]:
db["real"] = db["real"].astype(np.float32)
db["integer"] = db["integer"].astype(np.int32)

型の確認とメモリ使用量の確認

In [6]:
db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
real       100000 non-null float32
integer    100000 non-null int32
dtypes: float32(1), int32(1)
memory usage: 781.3 KB


In [7]:
db = pd.DataFrame(columns=["int_category"])
db["int_category"]= np.random.randint(low = 0, high=25000, size=(100000))
db["int_category"] = db["int_category"].astype("category")
db2 = db.copy()

In [8]:
%%time
db = pd.get_dummies(db)

Wall time: 25.1 s


In [9]:
db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 24514 entries, int_category_0 to int_category_24999
dtypes: uint8(24514)
memory usage: 2.3 GB


In [10]:
%%time
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
enc.fit(db2["int_category"].astype(int).values.reshape(-1,1))
result = enc.transform(db2["int_category"].astype(int).values.reshape(-1,1))

Wall time: 255 ms


In [22]:
import sys
print ("Sparse matrixのサイズ:", sys.getsizeof(result), "byte")
print ("numpy配列に変換:", sys.getsizeof(result.toarray().astype(np.int8))/(1024*1024*1024), "Giga byte") # GB

Sparse matrixのサイズ: 56 byte
numpy配列に変換: 2.2830442637205124 Giga byte


In [21]:
print("pandas dataframeのサイズ:", sys.getsizeof(db)/(1024*1024*1024), "Giga byte")

pandas dataframeのサイズ: 2.283044256269932 Giga byte


In [13]:
db.head()

Unnamed: 0,int_category_0,int_category_1,int_category_2,int_category_3,int_category_4,int_category_5,int_category_6,int_category_7,int_category_8,int_category_9,...,int_category_24990,int_category_24991,int_category_24992,int_category_24993,int_category_24994,int_category_24995,int_category_24996,int_category_24997,int_category_24998,int_category_24999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
