* This works around memory issues when dealing with a large numerical dataset in pandas. The dtypes that pandas chooses automatically can be much wider than the data requires, so we pick the most compact dtype that fits each feature. I am using a laptop with only 8GB RAM, so this saved my life.

* What is nice is that this code only needs to be run once if the dataset is static or large enough to be representative of unseen data. We can save the optimal dtypes and use them the next time we read in data.

* Another thing that helped: in Windows, manually set the pagefile to maximum on an SSD. The minimal size should be 1.5 times the actual RAM, and the maximal size should be 4.5 times the actual RAM.

In [1]:
import numpy as np
import pandas as pd
import warnings; warnings.simplefilter('ignore')

In [2]:
def _smallest_fitting_dtype(lo, hi, candidates, limits_of):
    """Return the first dtype in ``candidates`` whose range holds [lo, hi], else None."""
    for candidate in candidates:
        limits = limits_of(candidate)
        # Inclusive bounds: a value equal to the dtype's min/max still fits.
        # NaN bounds (empty or all-NaN column) compare False, giving None.
        if limits.min <= lo and hi <= limits.max:
            return candidate
    return None


def reduce_memory(data):
    """Downcast the numeric columns of ``data`` to the narrowest fitting dtype.

    Signed-integer, unsigned-integer and float columns are examined; the
    column's min and max decide the smallest dtype of the same family that
    can hold them. Every other column (object, bool, datetime, timedelta,
    complex, pandas extension dtypes) is left untouched, because casting
    those to a float dtype would either corrupt them or raise.

    The memory usage in MB is printed before and after the conversion.

    NOTE: only the value *range* is checked for floats. Casting to float16
    or float32 keeps the magnitude but can drop significant digits.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with downcast columns.
    """
    before = data.memory_usage().sum() / 1024**2
    print(f"Memory usage before: {before} MB")

    signed = (np.int8, np.int16, np.int32, np.int64)
    unsigned = (np.uint8, np.uint16, np.uint32, np.uint64)
    floats = (np.float16, np.float32)

    for col in data.columns:
        col_type = data[col].dtype
        # Only plain NumPy integer/float dtypes can be downcast safely here.
        if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
            continue

        c_min = data[col].min()
        c_max = data[col].max()

        if col_type.kind == "f":
            target = _smallest_fitting_dtype(c_min, c_max, floats, np.finfo)
            if target is None:
                # Out of float32 range, infinite, or NaN-only: keep full width.
                target = np.float64
        else:
            candidates = signed if col_type.kind == "i" else unsigned
            target = _smallest_fitting_dtype(c_min, c_max, candidates, np.iinfo)
            if target is None:
                # Empty column (min/max are NaN): nothing to decide on.
                continue

        if np.dtype(target) != col_type:
            data[col] = data[col].astype(target)

    after = data.memory_usage().sum() / 1024**2
    print(f"Memory usage after: {after} MB")
    return data

In [3]:
def getdtype(filename):
    """Read the CSV at ``filename`` and return its memory-reduced dtypes.

    The first CSV column is used as the index. The frame is passed through
    ``reduce_memory`` and the resulting per-column dtypes are returned as a
    pandas Series indexed by column name.
    """
    frame = pd.read_csv(filename, index_col=0)
    return reduce_memory(frame).dtypes

In [None]:
# Work out the reduced dtypes once from the training data ...
dtype_init = getdtype('train.csv')
# ... and store them as a single-row table (one column per feature).
dtype_init.to_frame().transpose().to_csv('dtypes.csv')

The next time we read in the data, we can skip all of the above and only do the following.

In [None]:
# Load the saved single-row dtype table (one column per feature).
dtype_init = pd.read_csv('dtypes.csv', index_col=0)
# Turn its only row into a {column name: dtype string} mapping.
dtype_init = dtype_init.iloc[0].to_dict()
# Read the data directly with the compact dtypes.
df = pd.read_csv('train.csv', index_col=0, dtype=dtype_init)