## Parquet files

    pip install pyarrow pandas


In [1]:
import os

# Keep every generated Parquet file inside a dedicated working directory:
# create it if it is missing, then make it the current directory.
output_dir = "files"
os.makedirs(output_dir, exist_ok=True)
os.chdir(output_dir)

In [2]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

# Source values for a single integer column.
integer_data = [1, 2, 3, 4, 5]

# Table.from_pandas expects a pandas DataFrame, not a plain dict, so the list
# is wrapped in a one-column DataFrame before it is converted to an Arrow table.
df = pd.DataFrame({"integer_column": integer_data})
table = pa.Table.from_pandas(df)

# Persist the Arrow table as a Parquet file in the current directory.
pq.write_table(table, "integer_data.parquet")

In [3]:
# Dump the raw file to the terminal. Parquet is a binary format, so only the
# "PAR1" magic bytes and the embedded pandas/Arrow metadata are readable.
! cat integer_data.parquet

PAR1P@L
   ( 	 	 
<               ,
                                (   
�F &�5 integer_column
��&d                              ,      ,5 schema %integer_column 
&�5 integer_column
��&d                              ,      �
�  ,pandas�{"index_columns": [{"kind": "range", "name": null, "start": 0, "stop": 5, "step": 1}], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "integer_column", "field_name": "integer_column", "pandas_type": "int64", "numpy_type": "int64", "metadata": null}], "creator": {"library": "pyarrow", "version": "11.0.0"}, "pandas_version": "2.0.2"} ARROW:schema�/////4ACAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABBAAQAAAAAAAKAAwAAAAEAAgACgAAAPgBAAAEAAAAAQAAAAwAAAAIAAwABAAIAAgAAAAIAAAAEAAAAAYAAABwYW5kYXMAAMABAAB7ImluZGV4X2NvbHVtbnMiOi

In [4]:
# pandas can write the DataFrame straight to Parquet without building the
# Arrow table by hand; the metadata in the dump below names pyarrow as creator.
df.to_parquet("integer_data2.parquet")

# Raw dump of the file, for comparison with integer_data.parquet.
! cat integer_data2.parquet

PAR1P@L
   ( 	 	 
<               ,
                                (   
�F &�5 integer_column
��&d                              ,      ,5 schema %integer_column 
&�5 integer_column
��&d                              ,      �
�  ,pandas�{"index_columns": [{"kind": "range", "name": null, "start": 0, "stop": 5, "step": 1}], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "integer_column", "field_name": "integer_column", "pandas_type": "int64", "numpy_type": "int64", "metadata": null}], "creator": {"library": "pyarrow", "version": "11.0.0"}, "pandas_version": "2.0.2"} ARROW:schema�/////4ACAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABBAAQAAAAAAAKAAwAAAAEAAgACgAAAPgBAAAEAAAAAQAAAAwAAAAIAAwABAAIAAgAAAAIAAAAEAAAAAYAAABwYW5kYXMAAMABAAB7ImluZGV4X2NvbHVtbnMiOi

In [5]:
# Write the same DataFrame again, this time requesting gzip compression.
df.to_parquet("integer_data_gzip.parquet", compression="gzip")

# In the raw dump the leading data bytes differ from the uncompressed files,
# while the JSON metadata at the end of the file is still plain text.
! cat integer_data_gzip.parquet

PAR1PDL
         
cd� &(��Y�4+� (I��(    >,
                                      
cb``�bdf�pc  C��@   &�5 integer_column
��&h                              ,      ,5 schema %integer_column 
&�5 integer_column
��&h                              ,      �
�  ,pandas�{"index_columns": [{"kind": "range", "name": null, "start": 0, "stop": 5, "step": 1}], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "integer_column", "field_name": "integer_column", "pandas_type": "int64", "numpy_type": "int64", "metadata": null}], "creator": {"library": "pyarrow", "version": "11.0.0"}, "pandas_version": "2.0.2"} ARROW:schema�/////4ACAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABBAAQAAAAAAAKAAwAAAAEAAgACgAAAPgBAAAEAAAAAQAAAAwAAAAIAAwABAAIAAgAAAAIAAAAEAAAAAYAAABwYW5kYXMAAMABAAB7ImluZG

In [6]:
# Write a single column of Python floats to float_data.parquet.
float_data = [1.1, 2.2, 3.3, 4.4, 5.5]

float_columns = {"float_column": float_data}
df = pd.DataFrame(float_columns)
table = pa.Table.from_pandas(df)
pq.write_table(table, where="float_data.parquet")

In [7]:
# Write a single column of text values to string_data.parquet.
string_data = [
    "apple",
    "banana",
    "cherry",
    "date",
    "elderberry",
]

df = pd.DataFrame(data={"string_column": string_data})
table = pa.Table.from_pandas(df)
pq.write_table(table, where="string_data.parquet")

In [8]:
# Write a boolean column to bool_data.parquet. The comparison expressions are
# evaluated when the list is built, so the column holds plain True/False values.
bool_data = [
    True,
    False,
    12 == 12,
    12 != 12,
    True,
]

bool_columns = {"bool_column": bool_data}
df = pd.DataFrame(bool_columns)
table = pa.Table.from_pandas(df)
pq.write_table(table, where="bool_data.parquet")

In [9]:
# Write a column whose cells are themselves lists of integers
# (one inner list per row) to nested_data.parquet.
nested_data = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]

df = pd.DataFrame(data={"nested_column": nested_data})
table = pa.Table.from_pandas(df)
pq.write_table(table, where="nested_data.parquet")

In [10]:
# Write a one-row DataFrame whose single cell is a Python dict
# to dictionary.parquet.
dictionary_cell = {"key1": "value1", "key2": "value2", "key3": "value3"}

df = pd.DataFrame({"dictionary_column": [dictionary_cell]})
table = pa.Table.from_pandas(df)
pq.write_table(table, where="dictionary.parquet")

In [11]:
# Write a one-row DataFrame with a tuple cell and a set-derived cell to
# multiple.parquet. The set is converted to a list before it is stored.
tuple_cell = (1, 2, 3)
set_cell = list({1, 2, 3})

df = pd.DataFrame(
    {
        "tuple_column": [tuple_cell],
        "set_column": [set_cell],
    }
)
table = pa.Table.from_pandas(df)
pq.write_table(table, where="multiple.parquet")

In [12]:
# Write two timestamp columns to date_related.parquet: one built from
# date-only strings and one from strings that also carry a time of day.
date_strings = ["2023-01-01", "2023-02-01", "2023-03-01"]
datetime_strings = [
    "2023-01-01 10:00:00",
    "2023-02-01 12:00:00",
    "2023-03-01 14:00:00",
]

df = pd.DataFrame(
    {
        "date_column": pd.to_datetime(date_strings),
        "datetime_column": pd.to_datetime(datetime_strings),
    }
)
table = pa.Table.from_pandas(df)
pq.write_table(table, where="date_related.parquet")

### Retrieving data back

In [13]:
# Load the Parquet file into an Arrow table, convert it to a pandas
# DataFrame, and display the result.
source_path = "date_related.parquet"
table = pq.read_table(source_path)
df = table.to_pandas()
df

Unnamed: 0,date_column,datetime_column
0,2023-01-01,2023-01-01 10:00:00
1,2023-02-01,2023-02-01 12:00:00
2,2023-03-01,2023-03-01 14:00:00


In [14]:
# Read the file back and reshape it into {column name: list of values}.
table = pq.read_table("date_related.parquet")
loaded_frame = table.to_pandas()
data_dict = loaded_frame.to_dict(orient="list")

data_dict

{'date_column': [Timestamp('2023-01-01 00:00:00'),
  Timestamp('2023-02-01 00:00:00'),
  Timestamp('2023-03-01 00:00:00')],
 'datetime_column': [Timestamp('2023-01-01 10:00:00'),
  Timestamp('2023-02-01 12:00:00'),
  Timestamp('2023-03-01 14:00:00')]}

In [15]:
# Read the file back and flatten it into a list of rows.
# NOTE(review): as the recorded output shows, the timestamps come back as
# integer nanoseconds since the epoch rather than Timestamp objects, because
# the values are taken from the underlying array before being listed.
table = pq.read_table("date_related.parquet")
frame_values = table.to_pandas().values
data_list = frame_values.tolist()

data_list

[[1672531200000000000, 1672567200000000000],
 [1675209600000000000, 1675252800000000000],
 [1677628800000000000, 1677679200000000000]]

In [16]:
# Read the file back, pick out one column by name, and convert only that
# column to pandas.
table = pq.read_table("date_related.parquet")
selected_column = table.column("datetime_column")
datetime_column = selected_column.to_pandas()


datetime_column

0   2023-01-01 10:00:00
1   2023-02-01 12:00:00
2   2023-03-01 14:00:00
Name: datetime_column, dtype: datetime64[ns]

In [17]:
# Open the file through the ParquetFile reader and print the Parquet-level
# schema (physical type and logical annotation of each column).
parquet_file = pq.ParquetFile("date_related.parquet")
parquet_schema = parquet_file.schema
print(parquet_schema)

<pyarrow._parquet.ParquetSchema object at 0x0000029B368FFD00>
required group field_id=-1 schema {
  optional int64 field_id=-1 date_column (Timestamp(isAdjustedToUTC=false, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
  optional int64 field_id=-1 datetime_column (Timestamp(isAdjustedToUTC=false, timeUnit=microseconds, is_from_converted_type=false, force_set_converted_type=false));
}



In [18]:
# Print the file-level metadata of the ParquetFile opened in the previous
# cell: writer, column/row counts, row-group count, and format version.
print(parquet_file.metadata)

<pyarrow._parquet.FileMetaData object at 0x0000029B368EF740>
  created_by: parquet-cpp-arrow version 11.0.0
  num_columns: 2
  num_rows: 3
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 2094
