In [2]:
import os
import pyarrow.parquet as pq

def describe_parquet(file_path):
    """Print a short summary of a Parquet file.

    Prints the on-disk size in bytes, the number of rows, the number of
    columns, and then every column name on its own line.

    Only the Parquet footer (file metadata and schema) is read, so the
    summary is produced without loading the file's row data into memory.

    Args:
        file_path: Path to a single Parquet file.
    """
    file_size = os.path.getsize(file_path)
    print(f"File Size: {file_size} bytes")

    # The footer already records the row count and the schema; reading it
    # avoids materializing the whole table just to describe the file.
    num_rows = pq.read_metadata(file_path).num_rows
    columns  = pq.read_schema(file_path).names

    print(f"Number of rows: {num_rows}")
    print(f"Number of columns: {len(columns)}")

    print("Columns:")
    for column in columns:
        print(column)

describe_parquet("titanic.parquet")

File Size: 40011 bytes
Number of rows: 891
Number of columns: 12
Columns:
PassengerId
Survived
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked


In [3]:
import pandas as pd

# Load the complete Parquet file into a pandas DataFrame
df = pd.read_parquet('titanic.parquet')

# Print a heading followed by the DataFrame's text representation
print("Contents of the Parquet file:", df, sep="\n")


Contents of the Parquet file:
     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                     

In [4]:
import pyarrow.parquet as pq

# Predicate pushdown: to extract only the entries of a Parquet file where a column exactly matches a
# value, without loading the entire file into memory, pass the `filters` argument to PyArrow's
# `read_table` function.  The conditions are used to filter data during the read operation, which can
# significantly reduce memory usage because only the relevant subset of data is loaded.  This is
# useful when dealing with Common Crawl's indexes, because they're huge!

# Each filter is a (column, operator, value) tuple.
# Here we keep only the rows where the 'Sex' column is 'male'.
filters = [('Sex', '=', 'male')]

# Read the Parquet file with the filters applied, so rows that do not match are not loaded
table = pq.read_table(
    'titanic.parquet',
    filters=filters,
)

# Optional: convert the Arrow table to a pandas DataFrame for easier viewing/manipulation
filtered_df = table.to_pandas()

print(filtered_df)

     PassengerId  Survived  Pclass                            Name   Sex  \
0              1         0       3         Braund, Mr. Owen Harris  male   
1              5         0       3        Allen, Mr. William Henry  male   
2              6         0       3                Moran, Mr. James  male   
3              7         0       1         McCarthy, Mr. Timothy J  male   
4              8         0       3  Palsson, Master. Gosta Leonard  male   
..           ...       ...     ...                             ...   ...   
572          884         0       2   Banfield, Mr. Frederick James  male   
573          885         0       3          Sutehall, Mr. Henry Jr  male   
574          887         0       2           Montvila, Rev. Juozas  male   
575          890         1       1           Behr, Mr. Karl Howell  male   
576          891         0       3             Dooley, Mr. Patrick  male   

      Age  SibSp  Parch            Ticket     Fare Cabin Embarked  
0    22.0      1   

In [5]:
import pandas as pd

# Sample records, one tuple per captain, in the field order listed in `captain_fields`
captain_fields = ['Captain', 'Actor', 'Ship', 'Quote']
captain_rows = [
    ('James T. Kirk', 'William Shatner', 'USS Enterprise', 'Beam me up, Scotty!'),
    ('Jean-Luc Picard', 'Patrick Stewart', 'USS Enterprise-D', 'Make it so.'),
    ('Benjamin Sisko', 'Avery Brooks', 'Deep Space 9', "It's a faaaaake!"),
    ('Kathryn Janeway', 'Kate Mulgrew', 'USS Voyager', "There's coffee in that nebula."),
    ('Jonathan Archer', 'Scott Bakula', 'Enterprise NX-01', "We're not out here to play God."),
    ('William T. Riker', 'Jonathan Frakes', 'USS Titan', 'I love surprise parties.'),
    ('Edward Jellico', 'Ronny Cox', 'USS Enterprise-D', 'Get it done.'),
]

# Transpose the records into a column-name -> list-of-values mapping and build the sample DataFrame
data = {
    field: list(values)
    for field, values in zip(captain_fields, zip(*captain_rows))
}
df = pd.DataFrame(data)

# Save the DataFrame as a Parquet file, leaving the index out
df.to_parquet('example.parquet', index=False)

print("Parquet file 'example.parquet' has been created successfully.")

Parquet file 'example.parquet' has been created successfully.
