# Open and Explore Parquet Files

Load the intermediate Parquet data into a pandas DataFrame and inspect its structure: columns, dtypes, and basic statistics.

In [2]:
# Dependencies (installs if missing)
import importlib
import sys
import subprocess

def ensure(package: str, import_name: str | None = None):
    """Import a package; if missing, pip-install it and import again."""
    name = import_name or package
    try:
        return importlib.import_module(name)
    except ImportError:
        print(f"Installing {package} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        return importlib.import_module(name)

# Import pandas and its Parquet engine (pyarrow), installing either on demand.
# Both modules are bound to real names: assigning to `_` would shadow the
# variable IPython uses for the most recent cell output.
pd = ensure("pandas")
pyarrow = ensure("pyarrow")  # Parquet engine

# Print the versions in use so they are recorded in the cell output.
print("pandas:", pd.__version__)
print("pyarrow:", pyarrow.__version__)


pandas: 2.2.3
pyarrow: 14.0.2


In [1]:
# Set the path to the Parquet file (or a directory containing Parquet partitions)
from pathlib import Path

# Location of the Parquet file (or partition directory), relative to the
# current working directory. It is built from individual components so the
# same path works on Windows, Linux and macOS; a backslash-separated string
# literal is only split into components on Windows.
parquetPath = Path("FORK_tr-text-fabric", "data", "intermediate", "tr_complete.parquet")

if parquetPath.is_dir():
    print("Directory detected. Will read as a partitioned Parquet dataset:")
    print(parquetPath.resolve())
elif parquetPath.exists():
    print("File detected:")
    print(parquetPath.resolve())
else:
    # Nothing at the configured location: list the Parquet files reachable
    # from the working directory as a hint, then stop the cell with an error.
    print(f"Not found: {parquetPath.resolve()}")
    print("\nParquet files under the current folder:")
    candidates = list(Path(".").rglob("*.parquet"))
    if not candidates:
        print("  (none found)")
    else:
        # Cap the listing at 200 entries to keep the output readable.
        for p in candidates[:200]:
            print(" -", p)
        if len(candidates) > 200:
            print(f"... and {len(candidates) - 200} more")
    raise FileNotFoundError("Set parquetPath to an existing .parquet file or directory.")


File detected:
D:\Onedrive\GitHub\FORK_tr-text-fabric\data\intermediate\tr_complete.parquet


In [2]:
# Read the Parquet file/dataset into a pandas DataFrame
import pandas as pd

if not parquetPath.is_dir():
    # Anything that is not a directory is handed straight to pandas, which
    # reads it through the pyarrow engine.
    df = pd.read_parquet(parquetPath, engine="pyarrow")
else:
    # A directory is opened as one Parquet dataset, materialised as an Arrow
    # table, and that table is then converted to a pandas DataFrame.
    import pyarrow.dataset as ds
    dataset = ds.dataset(parquetPath, format="parquet")
    table = dataset.to_table()
    df = table.to_pandas()

# Report the frame's dimensions, then display the first ten rows.
print("Loaded.")
print("Rows:", df.shape[0])
print("Columns:", df.shape[1])
df.head(10)


Loaded.
Rows: 140726
Columns: 26


Unnamed: 0,word_id,book,chapter,verse,word_rank,word,morph,strong,lemma,sp,...,parent,clause_id,phrase_id,gloss,aligned,n1904_node_id,nu,ps,source,gn
0,92349,1CO,1,1,1,Παῦλος,N-NSM,G3972,Παῦλος,subs,...,,,2047279.0,Paul,True,90169.0,,,n1904,
1,92350,1CO,1,1,2,κλητὸς,A-NSM,G2822,κλητός,adjv,...,,,2047279.0,"called, summoned, invited",True,90170.0,,,n1904,
2,92351,1CO,1,1,3,ἀπόστολος,N-NSM,G652,ἀπόστολος,subs,...,,,2047279.0,"apostle, messenger",True,90171.0,,,n1904,
3,92352,1CO,1,1,4,Ἰησοῦ,N-GSM,G2424,Ἰησοῦς,subs,...,,,2047279.0,"Jesus, Joshua",True,90173.0,,,n1904,
4,92353,1CO,1,1,5,Χριστοῦ,N-GSM,G5547,Χριστός,subs,...,,,,"Christ, Messiah",False,,s,,nlp,m
5,92354,1CO,1,1,6,διὰ,PREP,G1223,διά,prep,...,,,2047279.0,"through, (with gen.); on account of, because o...",True,90174.0,,,n1904,
6,92355,1CO,1,1,7,θελήματος,N-GSN,G2307,θέλημα,subs,...,,,2047279.0,"will, wish, desire",True,90175.0,,,n1904,
7,92356,1CO,1,1,8,θεοῦ,N-GSM,G2316,θεός,subs,...,,,,God,False,,s,,nlp,m
8,92357,1CO,1,1,9,καὶ,CONJ,G2532,καί,conj,...,,,2047279.0,"and, also, likewise",True,90177.0,,,n1904,
9,92358,1CO,1,1,10,Σωσθένης,N-NSM,G4988,Σωσθένης,subs,...,,,2047279.0,Sosthenes,True,90178.0,,,n1904,


In [3]:
# Quick inspection: df.info() prints the frame's index range and, for every
# column, its non-null count and dtype, which makes sparsely populated
# columns easy to spot.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140726 entries, 0 to 140725
Data columns (total 26 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   word_id        140726 non-null  int64  
 1   book           140726 non-null  object 
 2   chapter        140726 non-null  int64  
 3   verse          140726 non-null  int64  
 4   word_rank      140726 non-null  int64  
 5   word           140726 non-null  object 
 6   morph          140726 non-null  object 
 7   strong         140726 non-null  object 
 8   lemma          140726 non-null  object 
 9   sp             140726 non-null  object 
 10  case           80444 non-null   object 
 11  tense          30069 non-null   object 
 12  voice          30069 non-null   object 
 13  mood           29415 non-null   object 
 14  function       57875 non-null   object 
 15  role           42568 non-null   object 
 16  parent         3581 non-null    float64
 17  clause_id      112118 non-nul

In [4]:
# Names of all columns, in frame order
print("Columns:\n", df.columns.tolist())

# Summary statistics for the numeric columns only, transposed so that each
# column becomes a row and each statistic a column
df.describe(include="number").transpose()


Columns:
 ['word_id', 'book', 'chapter', 'verse', 'word_rank', 'word', 'morph', 'strong', 'lemma', 'sp', 'case', 'tense', 'voice', 'mood', 'function', 'role', 'parent', 'clause_id', 'phrase_id', 'gloss', 'aligned', 'n1904_node_id', 'nu', 'ps', 'source', 'gn']


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
word_id,140726.0,70363.5,40624.241328,1.0,35182.25,70363.5,105544.75,140726.0
chapter,140726.0,10.23479,7.214113,1.0,4.0,9.0,15.0,28.0
verse,140726.0,18.75278,13.582113,1.0,8.0,16.0,26.0,80.0
word_rank,140726.0,10.61429,7.128545,1.0,5.0,9.0,15.0,59.0
parent,3581.0,68455.6,41353.832029,37.0,29696.0,67686.0,104515.0,140726.0
clause_id,112118.0,1020141.0,11348.714701,1000000.0,1010251.25,1020753.0,1030252.0,1038868.0
phrase_id,107178.0,2033579.0,18983.028648,2000000.0,2016927.25,2034572.0,2050556.0,2064622.0
n1904_node_id,124961.0,68804.08,39783.00212,1.0,34476.0,68879.0,103281.0,137779.0
