### How to read a large csv file with pandas

In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory,
# then print them one per line in the order os.walk yields them.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv




#### references:

1. read file by chunksize
 - https://stackoverflow.com/questions/25962114/how-do-i-read-a-large-csv-file-with-pandas
 - https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
1. convert 64bit numeric values to 32bit values: convert int64 to int32; convert float64 to float32
 - https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html
 - **warning**: if any value is outside the 32-bit range, the conversion gives wrong results (e.g. out-of-range integers silently wrap around), so check the value ranges first
1. save data to parquet file: use compression='GZIP' to further decrease file size
 - https://arrow.apache.org/docs/python/parquet.html
 - https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html?highlight=write_table

#### steps:

1. Read the CSV in chunks, e.g. 100k rows per chunk (`chunksize = 100_000`, i.e. `10e4`).
1. Convert int64 to int32 and float64 to float32; this halves the memory used by those numeric columns.
1. save each chunk to a parquet file


In [1]:
import numpy as np
import pandas as pd
import gc
import copy
import os
import sys

from pathlib import Path
from datetime import datetime, date, time, timedelta
from dateutil import relativedelta

import pyarrow.parquet as pq
import pyarrow as pa

In [None]:
%%time
#https://stackoverflow.com/questions/25962114/how-do-i-read-a-large-csv-file-with-pandas
# Number of CSV rows read (and written to one parquet file) per chunk.
# Kept as an int: pandas documents `chunksize` as an integer, and the float
# literal 10e4 (== 100000.0) only works because pandas coerces whole floats.
chunksize = 100_000
print('chunksize=', chunksize)

def _downcast_64bit_columns(chunk):
    """Downcast float64 -> float32 and int64 -> int32 columns of ``chunk`` in place."""
    import warnings  # local import so the cell does not depend on another cell's imports

    #---convert float64 to float32--------
    for col in chunk.select_dtypes(include=['float64']).columns:
        chunk[col] = chunk[col].astype(np.float32)

    #---convert int64 to int32 (only when every value fits)
    int32_range = np.iinfo(np.int32)
    for col in chunk.select_dtypes(include=['int64']).columns:
        values = chunk[col]
        # An unchecked cast would silently wrap out-of-range values around,
        # so a column is only narrowed when its min and max both fit.
        fits_int32 = values.empty or (
            values.min() >= int32_range.min and values.max() <= int32_range.max
        )
        if fits_int32:
            chunk[col] = values.astype(np.int32)
        else:
            warnings.warn(
                f"column {col!r} has values outside the int32 range; kept as int64",
                stacklevel=2,
            )


def process_big_csv(chunk, dest_file):
    """Shrink one CSV chunk to 32-bit numerics and save it as a parquet file.

    Parameters
    ----------
    chunk : pandas.DataFrame
        One chunk of the CSV. It is modified in place: float64 columns become
        float32, and int64 columns become int32 when all of their values fit
        in the int32 range (otherwise they are left as int64 and a warning is
        emitted). Columns of any other dtype are left untouched.
    dest_file : str or path-like
        Destination of the GZIP-compressed parquet file.

    Returns
    -------
    None
    """
    _downcast_64bit_columns(chunk)

    #-- save to parquet file
    table = pa.Table.from_pandas(chunk)
    pq.write_table(table, dest_file, compression = 'GZIP')

    # Drop the Arrow copy and collect before the next chunk is read,
    # to keep peak memory low while looping over a large file.
    del table
    gc.collect()

# CSV to convert. Note that this path is the competition's *test* file
# (test_data.csv); point it at train_data.csv to convert the training set.
source_csv = '/kaggle/input/amex-default-prediction/test_data.csv'

# Stream the CSV so only one chunk of `chunksize` rows is in memory at a time.
# Each chunk is written to the working directory as 1.parquet, 2.parquet, ...
with pd.read_csv(source_csv, chunksize=chunksize) as reader:
    for chunk_no, chunk in enumerate(reader, start=1):
        dest_file = f'{chunk_no}.parquet'
        process_big_csv(chunk, dest_file)

## check output file

In [11]:
%%time
# Read one of the written chunk files back to check the output.
train = pd.read_parquet('114.parquet')

CPU times: user 669 ms, sys: 359 ms, total: 1.03 s
Wall time: 634 ms


In [12]:
# Print the index range, column count, dtypes and memory usage of the frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63762 entries, 11300000 to 11363761
Columns: 190 entries, customer_ID to D_145
dtypes: float32(185), int32(1), object(4)
memory usage: 47.2+ MB


## display files and names

In [15]:
# Names of the regular files directly inside the working directory
# (first os.walk entry only, so sub-directories are not descended into).
files = next(os.walk('.'))[2]

# Keep only names that END in .parquet; a substring test would also match
# names such as 'x.parquet.tmp'.
parquet_files = [name for name in files if name.endswith('.parquet')]

len(parquet_files), parquet_files[:2]

(114, ['65.parquet', '103.parquet'])

In [10]:
!ls -lh

total 9.0G
-rw-r--r-- 1 root root 81M Jul  6 05:21 1.parquet
-rw-r--r-- 1 root root 81M Jul  6 05:17 10.parquet
-rw-r--r-- 1 root root 81M Jul  6 05:12 100.parquet
-rw-r--r-- 1 root root 87M Jul  6 04:48 101.parquet
-rw-r--r-- 1 root root 81M Jul  6 05:13 102.parquet
-rw-r--r-- 1 root root 80M Jul  6 05:08 103.parquet
-rw-r--r-- 1 root root 81M Jul  6 05:16 104.parquet
-rw-r--r-- 1 root root 81M Jul  6 05:20 105.parquet
-rw-r--r-- 1 root root 81M Jul  6 05:14 106.parquet
-rw-r--r-- 1 root root 87M Jul  6 04:49 107.parquet
-rw-r--r-- 1 root root 81M Jul  6 05:16 108.parquet
-rw-r--r-- 1 root root 81M Jul  6 05:20 109.parquet
-rw-r--r-- 1 root root 80M Jul  6 05:15 11.parquet
-rw-r--r-- 1 root root 81M Jul  6 05:10 110.parquet
-rw-r--r-- 1 root root 81M Jul  6 05:12 111.parquet
-rw-r--r-- 1 root root 81M Jul  6 05:14 112.parquet
-rw-r--r-- 1 root root 81M Jul  6 05:11 113.parquet
-rw-r--r-- 1 root root 50M Jul  6 05:15 114.parquet
-rw-r--r-- 1 root root 81M Jul  6 05:10 12.parquet
-rw-r-

In [2]:
# Directory holding the per-chunk parquet files. Built with os.path.join so
# the separator matches the host OS (on Windows this is still "amex\train").
folder = os.path.join("amex", "train")

In [4]:
%%time
# `folder` is a directory: pandas reads the parquet files inside it
# and returns them as a single DataFrame.
train = pd.read_parquet(folder)

Wall time: 1min 26s


In [5]:
# Print the index range, column count, dtypes and memory usage of the frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11363762 entries, 0 to 11363761
Columns: 190 entries, customer_ID to D_145
dtypes: float32(185), int32(1), object(4)
memory usage: 8.2+ GB


In [6]:
    # Write the whole `train` frame to one GZIP-compressed parquet file.
    # NOTE(review): the raw-string path uses a Windows "\" separator -- confirm
    # this cell is only meant to run on Windows.
    table = pa.Table.from_pandas(train)
    pq.write_table(table, r"amex\amex_train_20220706.parquet", compression = 'GZIP')

In [7]:
%%time
# Load the consolidated parquet file back; %%time reports how long the read takes.
train = pd.read_parquet( r"amex\amex_train_20220706.parquet")

Wall time: 1min 24s
