# Test BHSA Dataset

This notebook loads and explores the BHSA (Biblia Hebraica Stuttgartensia Amstelodamensis) dataset.

In [1]:
import os
from itertools import islice

from cfabric.core.fabric import Fabric

In [2]:
# Load the BHSA dataset.
# The dataset location is configurable through the BHSA_TF_DIR environment
# variable so the notebook is not tied to one machine; when it is unset, fall
# back to the clone location under the current user's home directory.
BHSA_TF_DIR = os.environ.get(
    'BHSA_TF_DIR',
    os.path.expanduser('~/github/etcbc/bhsa/tf/2021'),
)
TF = Fabric(locations=BHSA_TF_DIR)

In [3]:
# Explore available features: list the feature names found in the dataset,
# grouped by kind ('nodes', 'edges', 'configs', 'computeds')
TF.explore()

  0.01s Feature overview: 109 for nodes; 6 for edges; 1 configs; 9 computed


{'computeds': ('__boundary__',
  '__characters__',
  '__levDown__',
  '__levUp__',
  '__levels__',
  '__order__',
  '__rank__',
  '__sections__',
  '__structure__'),
 'configs': ('otext',),
 'edges': ('distributional_parent',
  'functional_parent',
  'mother',
  'omap@2017-2021',
  'omap@c-2021',
  'oslots'),
 'nodes': ('book',
  'book@am',
  'book@ar',
  'book@bn',
  'book@da',
  'book@de',
  'book@el',
  'book@en',
  'book@es',
  'book@fa',
  'book@fr',
  'book@he',
  'book@hi',
  'book@id',
  'book@ja',
  'book@ko',
  'book@la',
  'book@nl',
  'book@pa',
  'book@pt',
  'book@ru',
  'book@sw',
  'book@syc',
  'book@tr',
  'book@ur',
  'book@yo',
  'book@zh',
  'chapter',
  'code',
  'det',
  'dist',
  'dist_unit',
  'domain',
  'freq_lex',
  'freq_occ',
  'function',
  'g_cons',
  'g_cons_utf8',
  'g_lex',
  'g_lex_utf8',
  'g_nme',
  'g_nme_utf8',
  'g_pfm',
  'g_pfm_utf8',
  'g_prs',
  'g_prs_utf8',
  'g_uvf',
  'g_uvf_utf8',
  'g_vbe',
  'g_vbe_utf8',
  'g_vbs',
  'g_vbs_utf8',
  

In [5]:
# Load every feature of the dataset (not just a commonly used subset);
# the returned API object is the entry point used by the rest of the notebook
api = TF.loadAll()

  0.00s Loading from ~/github/etcbc/bhsa/tf/2021/.cfm/1
  2.21s All features loaded from .cfm format


In [6]:
# Bind short aliases for the API members used throughout the notebook
F, E = api.F, api.E            # node features, edge features
T, L, N = api.T, api.L, api.N  # text API, locality API, node navigation

## Genesis 1:1 - בְּרֵאשִׁית

In [7]:
# Get Genesis 1:1: resolve the (book, chapter, verse) section to its verse node.
# gen_1_1 is reused by later cells (word analysis, clause structure).
gen_1_1 = T.nodeFromSection(('Genesis', 1, 1))
print(f"Verse node: {gen_1_1}")
# T.text renders the text belonging to the verse node
print(f"\nHebrew text: {T.text(gen_1_1)}")

Verse node: 1414389

Hebrew text: בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃ 


In [8]:
# Echo the verse node number as the cell's output value
gen_1_1

1414389

In [9]:
# Tabulate the grammatical features of every word in Genesis 1:1
words = L.d(gen_1_1, otype='word')
print("Word-by-word analysis of Genesis 1:1:\n")
print(f"{'Hebrew':<15} {'Lexeme':<12} {'Gloss':<20} {'POS':<8} {'Gender':<8} {'Number'}")
print("-" * 80)
for w in words:
    hebrew = F.g_word_utf8.v(w)
    # Prefer the UTF-8 lexeme; use F.lex when that value is empty
    lexeme = F.lex_utf8.v(w) or F.lex.v(w)
    # The remaining columns fall back to an empty string when a feature has no value
    gloss, pos, gender, number = (
        feature.v(w) or '' for feature in (F.gloss, F.sp, F.gn, F.nu)
    )
    print(f"{hebrew:<15} {lexeme:<12} {gloss:<20} {pos:<8} {gender:<8} {number}")

Word-by-word analysis of Genesis 1:1:

Hebrew          Lexeme       Gloss                POS      Gender   Number
--------------------------------------------------------------------------------
בְּ             ב            in                   prep     NA       NA
רֵאשִׁ֖ית       ראשׁית       beginning            subs     f        sg
בָּרָ֣א         ברא          create               verb     m        sg
אֱלֹהִ֑ים       אלהים        god(s)               subs     m        pl
אֵ֥ת            את           <object marker>      prep     NA       NA
הַ              ה            the                  art      NA       NA
שָּׁמַ֖יִם      שׁמים        heavens              subs     m        pl
וְ              ו            and                  conj     NA       NA
אֵ֥ת            את           <object marker>      prep     NA       NA
הָ              ה            the                  art      NA       NA
אָֽרֶץ          ארץ          earth                subs     unknown  sg


## Psalm 23:1 - The LORD is my shepherd

In [10]:
# Get Psalm 23:1: resolve the section to its verse node and print the verse text
ps_23_1 = T.nodeFromSection(('Psalms', 23, 1))
print(f"Hebrew: {T.text(ps_23_1)}")
print("\nWord analysis:")
# One line per word: surface form and gloss ('?' when the word has no gloss value)
for w in L.d(ps_23_1, otype='word'):
    print(f"  {F.g_word_utf8.v(w):<12} = {F.gloss.v(w) or '?'}")

Hebrew: מִזְמֹ֥ור לְדָוִ֑ד יְהוָ֥ה רֹ֝עִ֗י לֹ֣א אֶחְסָֽר׃ 

Word analysis:
  מִזְמֹ֥ור    = psalm
  לְ           = to
  דָוִ֑ד       = David
  יְהוָ֥ה      = YHWH
  רֹ֝עִ֗י      = pasture
  לֹ֣א         = not
  אֶחְסָֽר     = diminish


## Verb Analysis - First 10 verbs in Genesis

In [11]:
# Find first 10 verbs in Genesis
genesis_book = T.nodeFromSection(('Genesis',))
genesis_words = L.d(genesis_book, otype='word')

# Filter lazily and stop after the 10th match, instead of testing every word
# in the book and discarding all but the first 10 results.
verbs = list(islice((w for w in genesis_words if F.sp.v(w) == 'verb'), 10))

print("First 10 verbs in Genesis:\n")
print(f"{'Hebrew':<12} {'Lexeme':<12} {'Stem':<8} {'Tense':<8} {'Person':<8} {'Gloss'}")
print("-" * 70)
for v in verbs:
    # Same lexeme fallback as the Genesis 1:1 word table; the trailing ''
    # keeps the width format from failing on a missing (None) value.
    lexeme = F.lex_utf8.v(v) or F.lex.v(v) or ''
    print(
        f"{F.g_word_utf8.v(v):<12} {lexeme:<12} "
        f"{F.vs.v(v) or '':<8} {F.vt.v(v) or '':<8} {F.ps.v(v) or '':<8} "
        f"{F.gloss.v(v) or ''}"
    )

First 10 verbs in Genesis:

Hebrew       Lexeme       Stem     Tense    Person   Gloss
----------------------------------------------------------------------
בָּרָ֣א      ברא          qal      perf     p3       create
הָיְתָ֥ה     היה          qal      perf     p3       be
מְרַחֶ֖פֶת   רחף          piel     ptca     unknown  shake
יֹּ֥אמֶר     אמר          qal      wayq     p3       say
יְהִ֣י       היה          qal      impf     p3       be
יְהִי        היה          qal      wayq     p3       be
יַּ֧רְא      ראה          qal      wayq     p3       see
טֹ֑וב        טוב          qal      perf     p3       be good
יַּבְדֵּ֣ל   בדל          hif      wayq     p3       separate
יִּקְרָ֨א    קרא          qal      wayq     p3       call


## The Shema - Deuteronomy 6:4-5

In [12]:
# The Shema: look up the verse nodes for Deuteronomy 6:4 and 6:5
deut_6_4 = T.nodeFromSection(('Deuteronomy', 6, 4))
deut_6_5 = T.nodeFromSection(('Deuteronomy', 6, 5))

# Heading line, then the text of each verse
print("שְׁמַע יִשְׂרָאֵל - Hear, O Israel!\n")
print(f"Deut 6:4: {T.text(deut_6_4)}")
print(f"Deut 6:5: {T.text(deut_6_5)}")

שְׁמַע יִשְׂרָאֵל - Hear, O Israel!

Deut 6:4: שְׁמַ֖ע יִשְׂרָאֵ֑ל יְהוָ֥ה אֱלֹהֵ֖ינוּ יְהוָ֥ה׀ אֶחָֽד׃ 
Deut 6:5: וְאָ֣הַבְתָּ֔ אֵ֖ת יְהוָ֣ה אֱלֹהֶ֑יךָ בְּכָל־לְבָבְךָ֥ וּבְכָל־נַפְשְׁךָ֖ וּבְכָל־מְאֹדֶֽךָ׃ 


In [13]:
# The Shema
# NOTE(review): this cell is a verbatim copy of the preceding cell (same
# lookups, same prints, same output) — consider deleting one of the two.
deut_6_4 = T.nodeFromSection(('Deuteronomy', 6, 4))
deut_6_5 = T.nodeFromSection(('Deuteronomy', 6, 5))

print("שְׁמַע יִשְׂרָאֵל - Hear, O Israel!\n")
print(f"Deut 6:4: {T.text(deut_6_4)}")
print(f"Deut 6:5: {T.text(deut_6_5)}")

שְׁמַע יִשְׂרָאֵל - Hear, O Israel!

Deut 6:4: שְׁמַ֖ע יִשְׂרָאֵ֑ל יְהוָ֥ה אֱלֹהֵ֖ינוּ יְהוָ֥ה׀ אֶחָֽד׃ 
Deut 6:5: וְאָ֣הַבְתָּ֔ אֵ֖ת יְהוָ֣ה אֱלֹהֶ֑יךָ בְּכָל־לְבָבְךָ֥ וּבְכָל־נַפְשְׁךָ֖ וּבְכָל־מְאֹדֶֽךָ׃ 


## Clause Structure - Genesis 1:1

In [14]:
# Walk Genesis 1:1 top-down: each clause, then each phrase inside that clause
clauses = L.d(gen_1_1, otype='clause')
print(f"Clauses in Genesis 1:1: {len(clauses)}\n")

for clause in clauses:
    clause_type = F.typ.v(clause)
    print(f"Clause (type={clause_type}):")

    phrases = L.d(clause, otype='phrase')
    for phrase in phrases:
        # F.typ is read for both clauses and phrases; F.function is read for phrases only
        phrase_type, phrase_func = F.typ.v(phrase), F.function.v(phrase)
        # Surface form of each word contained in the phrase
        phrase_words = list(map(F.g_word_utf8.v, L.d(phrase, otype='word')))
        print(f"  Phrase [{phrase_func}] ({phrase_type}): {' '.join(phrase_words)}")

Clauses in Genesis 1:1: 1

Clause (type=xQtX):
  Phrase [Time] (PP): בְּ רֵאשִׁ֖ית
  Phrase [Pred] (VP): בָּרָ֣א
  Phrase [Subj] (NP): אֱלֹהִ֑ים
  Phrase [Objc] (PP): אֵ֥ת הַ שָּׁמַ֖יִם וְ אֵ֥ת הָ אָֽרֶץ


## Corpus Statistics

In [15]:
# Corpus statistics: how many nodes of each type the corpus contains
from collections import Counter

# Tally the otype value of every node yielded by N.walk()
otype_counts = Counter(F.otype.v(n) for n in N.walk())

print("BHSA Corpus Statistics:\n")
print(f"{'Node Type':<20} {'Count':>10}")
print("-" * 32)
# most_common() yields (type, count) pairs from most to least frequent;
# ties keep first-seen order, as the previous stable sort on -count did
for otype, count in otype_counts.most_common():
    print(f"{otype:<20} {count:>10,}")

BHSA Corpus Statistics:

Node Type                 Count
--------------------------------
word                    426,590
phrase_atom             267,532
phrase                  253,203
subphrase               113,850
clause_atom              90,704
clause                   88,131
sentence_atom            64,514
sentence                 63,717
half_verse               45,179
verse                    23,213
lex                       9,230
chapter                     929
book                         39
