# Open and Explore Parquet Files

Explore (visualize) the intermediate data (as of Jan 7, 2026).

## Fix dependencies (installs if missing)

In [16]:
import importlib
import sys
import subprocess

def ensure(package: str, import_name: str | None = None):
    """Import a package; if missing, pip-install it and import again."""
    name = import_name or package
    try:
        return importlib.import_module(name)
    except ImportError:
        print(f"Installing {package} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        return importlib.import_module(name)

# Bind both modules straight from ensure() so each one is imported (or
# installed) exactly once. Binding pyarrow by name also avoids assigning to
# `_`, which IPython uses for the previous cell's output.
pd = ensure("pandas")
pyarrow = ensure("pyarrow")  # Parquet engine

print("pandas:", pd.__version__)
print("pyarrow:", pyarrow.__version__)


pandas: 2.2.3
pyarrow: 14.0.2


## Load data into memory

We use a local copy if one is present; otherwise we fetch it straight from GitHub.

In [17]:
from pathlib import Path
import io
import requests
import pandas as pd

# --- GitHub source ---
# Components of the raw.githubusercontent.com URL for the Parquet file; they
# are passed to downloadFromGithubRawToBytes() when no local copy is found.
githubOwner = "mjredmond"  # repository owner
githubRepo = "tr-text-fabric"  # repository name
githubBranch = "main"  # branch the file is fetched from
githubFileRelPathPosix = "data/intermediate/tr_complete.parquet"  # path inside the repo, "/"-separated

def downloadFromGithubRawToBytes(*, owner: str, repo: str, branch: str, fileRelPathPosix: str) -> bytes:
    """Fetch one file from raw.githubusercontent.com and return its content.

    All arguments are keyword-only and are substituted into the raw-content
    URL. The response is streamed in 1 MiB pieces and assembled in memory.

    Raises whatever ``raise_for_status()`` raises for an unsuccessful response.
    """
    url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{fileRelPathPosix}"
    oneMiB = 1024 * 1024

    with requests.get(url, stream=True, timeout=120) as response:
        response.raise_for_status()
        # Empty pieces are dropped while collecting the stream.
        pieces = [piece for piece in response.iter_content(chunk_size=oneMiB) if piece]

    return b"".join(pieces)

def looksLikeGitLfsPointerBytes(data: bytes) -> bool:
    """Report whether the first 400 bytes of ``data`` contain a Git LFS pointer marker.

    Either the LFS spec-version line or an ``oid sha256:`` field is sufficient.
    """
    markers = (
        b"version https://git-lfs.github.com/spec/v1",
        b"oid sha256:",
    )
    prefix = data[:400]
    return any(marker in prefix for marker in markers)

# --- Load to DataFrame (prefer local if present; otherwise download and keep entirely in memory) ---
# Local candidate: the same relative path the file has inside the repository.
# It is defined here so the cell runs on a fresh kernel instead of relying on
# a `parquetPath` left over from an earlier session. Adjust it if your local
# copy lives somewhere else.
parquetPath = Path(githubFileRelPathPosix)

if parquetPath.is_dir():
    # A directory is read as a partitioned Parquet dataset.
    # The import is local so pyarrow.dataset is only loaded when needed.
    import pyarrow.dataset as ds
    print("Directory detected locally. Reading as a partitioned Parquet dataset:")
    print(parquetPath.resolve())
    dataset = ds.dataset(parquetPath, format="parquet")
    table = dataset.to_table()
    df = table.to_pandas()

elif parquetPath.exists():
    # A single local Parquet file.
    print("File detected locally:")
    print(parquetPath.resolve())
    df = pd.read_parquet(parquetPath, engine="pyarrow")

else:
    # Nothing on disk: fetch the file and parse it without writing it anywhere.
    print("Not found locally. Downloading from GitHub into memory...")
    parquetBytes = downloadFromGithubRawToBytes(
        owner=githubOwner,
        repo=githubRepo,
        branch=githubBranch,
        fileRelPathPosix=githubFileRelPathPosix,
    )

    # Stop with an actionable message when the download looks like a Git LFS
    # pointer rather than Parquet data.
    if looksLikeGitLfsPointerBytes(parquetBytes):
        raise RuntimeError(
            "Downloaded a Git LFS pointer file, not the actual Parquet binary.\n"
            "Fix: clone the repo with Git LFS, or download the real Parquet from a release asset URL."
        )

    print(f"Downloaded {len(parquetBytes):,} bytes into memory.")
    df = pd.read_parquet(io.BytesIO(parquetBytes), engine="pyarrow")

print("Loaded.")


Not found locally. Downloading from GitHub into memory...
Downloaded 9,061,493 bytes into memory.
Loaded.


## Print head of dataframe

In [18]:
# Report the frame's dimensions, then preview its first 15 rows.
print("Rows:", df.shape[0])
print("Columns:", df.shape[1])
df.head(n=15)

Rows: 140726
Columns: 42


Unnamed: 0,word_id,book,chapter,verse,word_rank,word,morph,strong,lemma,sp,...,text,normalized,trailer,num,ref,id,cls,trans,domain,typems
0,92349,1CO,1,1,1,Παῦλος,N-NSM,G3972,Παῦλος,subs,...,Παῦλος,Παῦλος,,1,1CO 1:1!1,n46001001001,noun,Paul,93001,proper
1,92350,1CO,1,1,2,κλητὸς,A-NSM,G2822,κλητός,adjv,...,κλητὸς,κλητὸς,,2,1CO 1:1!2,n46001001002,adj,a called,33029,
2,92351,1CO,1,1,3,ἀπόστολος,N-NSM,G652,ἀπόστολος,subs,...,ἀπόστολος,ἀπόστολος,,3,1CO 1:1!3,n46001001003,noun,apostle,53009,common
3,92352,1CO,1,1,4,Ἰησοῦ,N-GSM,G2424,Ἰησοῦς,subs,...,Ἰησοῦ,Ἰησοῦ,,4,1CO 1:1!4,n46001001004,noun,Jesus,93001,proper
4,92353,1CO,1,1,5,Χριστοῦ,N-GSM,G5547,Χριστός,subs,...,Χριστοῦ,Χριστοῦ,,5,1CO 1:1!5,n46001001005,noun,Christ,93001,proper
5,92354,1CO,1,1,6,διὰ,PREP,G1223,διά,prep,...,διὰ,διὰ,,6,1CO 1:1!6,n46001001006,prep,by [the],89012,
6,92355,1CO,1,1,7,θελήματος,N-GSN,G2307,θέλημα,subs,...,θελήματος,θελήματος,,7,1CO 1:1!7,n46001001007,noun,will,30004,common
7,92356,1CO,1,1,8,θεοῦ,N-GSM,G2316,θεός,subs,...,θεοῦ,θεοῦ,,8,1CO 1:1!8,n46001001008,noun,God,12001,common
8,92357,1CO,1,1,9,καὶ,CONJ,G2532,καί,conj,...,καὶ,καὶ,,9,1CO 1:1!9,n46001001009,conj,and,89017,
9,92358,1CO,1,1,10,Σωσθένης,N-NSM,G4988,Σωσθένης,subs,...,Σωσθένης,Σωσθένης,,10,1CO 1:1!10,n46001001010,noun,Sosthenes,93001,proper


## Some additional data structure prints

In [19]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140726 entries, 0 to 140725
Data columns (total 42 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   word_id        140726 non-null  int64  
 1   book           140726 non-null  object 
 2   chapter        140726 non-null  int64  
 3   verse          140726 non-null  int64  
 4   word_rank      140726 non-null  int64  
 5   word           140726 non-null  object 
 6   morph          140726 non-null  object 
 7   strong         140726 non-null  object 
 8   lemma          140726 non-null  object 
 9   sp             140726 non-null  object 
 10  case           80444 non-null   object 
 11  tense          30069 non-null   object 
 12  voice          30069 non-null   object 
 13  mood           29415 non-null   object 
 14  function       57875 non-null   object 
 15  role           107178 non-null  object 
 16  parent         3581 non-null    float64
 17  clause_id      112118 non-nul

## Some stats 

Column list + basic stats.

In [20]:
print("Columns:\n", df.columns.tolist())

# Summary statistics for the numeric columns only, shown one row per column
df.describe(include="number").transpose()


Columns:
 ['word_id', 'book', 'chapter', 'verse', 'word_rank', 'word', 'morph', 'strong', 'lemma', 'sp', 'case', 'tense', 'voice', 'mood', 'function', 'role', 'parent', 'clause_id', 'phrase_id', 'gloss', 'aligned', 'n1904_node_id', 'nu', 'ps', 'source', 'gn', 'translit', 'lemmatranslit', 'unaccent', 'after', 'ln', 'bookshort', 'text', 'normalized', 'trailer', 'num', 'ref', 'id', 'cls', 'trans', 'domain', 'typems']


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
word_id,140726.0,70363.5,40624.241328,1.0,35182.25,70363.5,105544.75,140726.0
chapter,140726.0,10.23479,7.214113,1.0,4.0,9.0,15.0,28.0
verse,140726.0,18.75278,13.582113,1.0,8.0,16.0,26.0,80.0
word_rank,140726.0,10.61429,7.128545,1.0,5.0,9.0,15.0,59.0
parent,3581.0,68455.6,41353.832029,37.0,29696.0,67686.0,104515.0,140726.0
clause_id,112118.0,1020141.0,11348.714701,1000000.0,1010251.25,1020753.0,1030252.0,1038868.0
phrase_id,107178.0,2033579.0,18983.028648,2000000.0,2016927.25,2034572.0,2050556.0,2064622.0
n1904_node_id,124961.0,68804.08,39783.00212,1.0,34476.0,68879.0,103281.0,137779.0
num,140726.0,10.61429,7.128545,1.0,5.0,9.0,15.0,59.0
