# IO Lab

In [4]:
import os
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
from appvocai-discover.shared.frameworks.spark.factory import SparkSessionPool
from appvocai-discover.shared.persist.file.io import IOService, TarGzHandler
pd.options.display.max_rows = 999
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 20


## Convert and Compress

In [5]:
fp1 = "data/ext/reviews.pkl"
fp2 = "data/ext/reviews"
fp3 = "data/ext/reviews_2024-07-03_22M.tar.gz"

df1 = IOService.read(fp1)
df1.memory_usage(deep=True).sum().sum()/1024*1024
print(f"Dataset read. It has {df1.shape[0]} rows with memory size of {df1.memory_usage(deep=True).sum().sum()/1024*1024} Mb.")
IOService.write(filepath=fp2, data=df1, row_group_size=1073741824, partition_cols=["category"])
assert os.path.isdir(fp2)
print(f"Created {len(os.listdir(fp2))} parquet files.")
TarGzHandler().compress_directory(directory_path=fp2, tar_gz_path=fp3)
assert os.path.exists(fp3)
print("Conversion complete. Upload to AWS")


Dataset read. It has 22166591 rows with memory size of 16629646140.0 Mb.
Created 14 parquet files.
Compressed data/ext/reviews into data/ext/reviews.tar.gz
Conversion complete. Upload to AWS


## Validate Compression

In [7]:
fp3 = "data/ext/reviews_2024-07-03_22M.tar.gz"
fp4 = "data/ext/reviews2"
TarGzHandler().extract(tar_gz_path=fp3, extract_dir=fp4)
df2 = IOService.read(filepath=fp4)
df2.info(verbose=True, max_cols=20, memory_usage=True, show_counts=True)

Extracted data/ext/reviews_2024-07-03_22M.tar.gz to data/ext/reviews2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22166591 entries, 0 to 22166590
Data columns (total 11 columns):
 #   Column       Non-Null Count     Dtype         
---  ------       --------------     -----         
 0   id           22166591 non-null  string        
 1   app_id       22166591 non-null  string        
 2   app_name     22166591 non-null  string        
 3   category_id  22166591 non-null  object        
 4   author       22166591 non-null  object        
 5   rating       22166591 non-null  float64       
 6   content      22166591 non-null  string        
 7   vote_sum     22166591 non-null  Int64         
 8   vote_count   22166591 non-null  Int64         
 9   date         22166591 non-null  datetime64[ns]
 10  category     22166591 non-null  category      
dtypes: Int64(2), category(1), datetime64[ns](1), float64(1), object(2), string(4)
memory usage: 1.7+ GB
