- https://www.kaggle.com/datasets/robikscube/ubiquant-parquet?sort=recent-comments&select=investment_ids

#  UMP: Create Pickle Dataset
## Import Packages

In [1]:
import os
import pandas as pd
import numpy as np
import gc
import math

## Utilities

In [2]:
def reduce_memory_usage(df, features):
    """Downcast the given columns of ``df`` to ``np.float16`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Each listed column is overwritten with a float16
        copy of itself; columns not listed are left as they are.
    features : iterable of str
        Labels of the columns to downcast. An empty iterable leaves ``df``
        unchanged.

    Returns
    -------
    None
        The frame is mutated; nothing is returned.

    Raises
    ------
    KeyError
        If a label in ``features`` is not a column of ``df``.

    Notes
    -----
    float16 keeps roughly three significant decimal digits and its largest
    finite value is 65504; larger magnitudes become ``inf`` after the cast.
    """
    # Convert one column at a time so that only a single column-sized
    # temporary exists at any moment.
    for feature in features:
        df[feature] = df[feature].astype(np.float16)

    # A single collection pass once all columns are converted. The previous
    # version ran a full collection after every column, which added cost on
    # each of the iterations without changing the result.
    gc.collect()

## Import dataset

In [7]:
# List the contents of the dataset input directory.
! ls ../data/input/ubiquant-parquet

example_sample_submission.parquet  example_test.parquet  investment_ids


In [5]:
%%time
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
feature_columns = ['investment_id', 'time_id'] + features
train = pd.read_parquet('../data/input/ubiquant-parquet/train_low_mem.parquet', columns=feature_columns + ["target"])
train.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data/input/ubiquant-parquet/train_low_mem.parquet'

## Reducing Memory Usage
There are 3,141,410 records in total, and each record has 303 columns. If every column is stored in a 16-bit type (uint16 for the two id columns, float16 for the features and the target), each value takes 2 bytes, so the training data will occupy $(3141410 \times 303 \times 2) / 1024^3$ GB, which is about 1.8 GB.

In [4]:
# Inspect the column dtypes and memory footprint before downcasting.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3141410 entries, 0 to 3141409
Columns: 303 entries, investment_id to target
dtypes: float32(301), uint16(2)
memory usage: 3.5 GB


In [5]:
%%time
# Downcast every feature column and the target to float16; `train` is modified in place.
reduce_memory_usage(train, features + ["target"])

CPU times: user 1min 15s, sys: 50.9 s, total: 2min 6s
Wall time: 2min 6s


In [6]:
# Re-check the dtypes and memory footprint after the float columns have been downcast.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3141410 entries, 0 to 3141409
Columns: 303 entries, investment_id to target
dtypes: float16(301), uint16(2)
memory usage: 1.8 GB


In [7]:
# Persist the frame as a pickle file in the current working directory.
train.to_pickle("train.pkl")

In [8]:
# Drop the in-memory frame and run a garbage-collection pass before reloading it from disk.
del train
gc.collect()

21

## Read Pickle file

In [9]:
# Reload from the same relative path the frame was written to ("train.pkl").
# The previous hard-coded "/kaggle/working/train.pkl" only resolved when the
# working directory was /kaggle/working.
# NOTE: this pickle is produced by this notebook; never unpickle files from
# untrusted sources, since loading a pickle can execute arbitrary code.
train = pd.read_pickle("train.pkl")
train.head()

Unnamed: 0,investment_id,time_id,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,...,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299,target
0,1,0,0.932617,0.113708,-0.4021,0.378418,-0.203979,-0.413574,0.96582,1.230469,...,-1.095703,0.200073,0.819336,0.941406,-0.086792,-1.086914,-1.044922,-0.287598,0.321533,-0.300781
1,2,0,0.811035,-0.51416,0.742188,-0.616699,-0.194214,1.771484,1.427734,1.133789,...,0.912598,-0.734375,0.819336,0.941406,-0.387695,-1.086914,-0.929688,-0.974121,-0.343506,-0.231079
2,6,0,0.394043,0.615723,0.567871,-0.60791,0.068909,-1.083008,0.979492,-1.125977,...,0.912598,-0.551758,-1.220703,-1.060547,-0.219116,-1.086914,-0.612305,-0.113953,0.243652,0.568848
3,7,0,-2.34375,-0.011871,1.875,-0.606445,-0.586914,-0.815918,0.77832,0.299072,...,0.912598,-0.266357,-1.220703,0.941406,-0.608887,0.104919,-0.783203,1.151367,-0.773438,-1.064453
4,8,0,0.842285,-0.262939,2.330078,-0.583496,-0.618164,-0.742676,-0.946777,1.230469,...,0.912598,-0.741211,-1.220703,0.941406,-0.588379,0.104919,0.753418,1.345703,-0.737793,-0.531738
