In [1]:
# Import Text-Fabric and load a dataset
from tf.app import use

# Load the Hebrew Bible (ETCBC BHSA dataset).
# hoist=globals() asks Text-Fabric to place its API handles into this
# notebook's global namespace; later cells use F, L and T without defining them.
A = use('etcbc/bhsa', hoist=globals())

**Locating corpus resources ...**

   |       51s T oslots               from ~/text-fabric-data/github/etcbc/bhsa/tf/2021
   |     0.01s T book@fa              from ~/text-fabric-data/github/etcbc/bhsa/tf/2021
   |     5.29s T g_word               from ~/text-fabric-data/github/etcbc/bhsa/tf/2021
   |     0.01s T book@ko              from ~/text-fabric-data/github/etcbc/bhsa/tf/2021
   |     4.94s T g_lex_utf8           from ~/text-fabric-data/github/etcbc/bhsa/tf/2021
   |     0.25s T book                 from ~/text-fabric-data/github/etcbc/bhsa/tf/2021
   |     0.00s T book@ru              from ~/text-fabric-data/github/etcbc/bhsa/tf/2021
   |     0.00s T book@el              from ~/text-fabric-data/github/etcbc/bhsa/tf/2021
   |     0.00s T book@pa              from ~/text-fabric-data/github/etcbc/bhsa/tf/2021
   |     5.18s T phono                from ~/text-fabric-data/github/etcbc/phono/tf/2021
   |     4.34s T trailer_utf8         from ~/text-fabric-data/github/etcbc/bhsa/tf/2021
   |     4.99s T voc_lex_utf8  

Name,# of nodes,# slots / node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


In [29]:
from tqdm.notebook import tqdm
from collections import defaultdict

def get_scroll_and_chapter_info(word):
    """Split the section reference of a word node into its parts.

    The reference is obtained from ``A.sectionStrFromNode(word)`` and is
    expected to look like ``"<scroll> <label>:<line>"`` (e.g. ``"Genesis 1:2"``).

    Args:
        word: Node number of the word to look up.

    Returns:
        A 4-tuple ``(scroll_and_chapter, scroll, frag_label, frag_line_num)``
        of strings: the full reference, the part before the last space, and
        the two parts around the colon.

    Raises:
        ValueError: If the reference contains no space, or if the part after
            the last space does not contain exactly one colon.
    """
    scroll_and_chapter = A.sectionStrFromNode(word)
    # Split on the LAST space only, so a scroll/book name that itself contains
    # spaces no longer breaks the two-way unpacking.
    scroll, chapter_info = scroll_and_chapter.rsplit(" ", 1)
    frag_label, frag_line_num = chapter_info.split(":")
    return scroll_and_chapter, scroll, frag_label, frag_line_num

def get_morphological_features(word):
    """Collect the morphological annotations of one word node.

    Every output key is filled with ``F.<feature>.v(word)``. The key ``"cl"``
    is read from the ``ls`` feature; every other key is read from the feature
    that carries the same name as the key.

    Args:
        word: Node number of the word.

    Returns:
        dict with the keys ``sp, cl, ps, gn, nu, st, vs, vt`` (in that order).
    """
    # (output key, feature name) pairs, listed in output order.
    # The "md" feature is currently not extracted.
    key_to_feature = (
        ("sp", "sp"),
        ("cl", "ls"),
        ("ps", "ps"),
        ("gn", "gn"),
        ("nu", "nu"),
        ("st", "st"),
        ("vs", "vs"),
        ("vt", "vt"),
    )
    return {key: getattr(F, feature).v(word) for key, feature in key_to_feature}

def process_word(scroll_node, word_line_num, sub_word_num):
    """Build one entry per word contained in a scroll (book) node.

    Args:
        scroll_node: Node whose words are visited via
            ``L.d(scroll_node, otype="word")``.
        word_line_num: Accepted for backward compatibility; it does not
            influence the returned data.
        sub_word_num: Accepted for backward compatibility; it does not
            influence the returned data.

    Returns:
        A ``defaultdict(list)`` mapping ``F.book.v(scroll_node)`` to the list
        of word entries, in the order delivered by ``L.d``. Each entry is a
        dict with the keys ``frag_label``, ``frag_line_num``, ``transcript``,
        ``lex``, ``parsed_morph`` and ``after``. The mapping is empty when the
        node contains no words.
    """
    filtered_data = defaultdict(list)
    # The name depends only on the scroll node, so it is looked up once
    # instead of once per word.
    scroll_name = F.book.v(scroll_node)

    for word in L.d(scroll_node, otype="word"):
        _, _, frag_label, frag_line_num = get_scroll_and_chapter_info(word)
        transcript = T.text(word)
        filtered_data[scroll_name].append(
            {
                "frag_label": frag_label,
                "frag_line_num": frag_line_num,
                "transcript": transcript,
                "lex": F.lex.v(word),
                "parsed_morph": get_morphological_features(word),
                # True when the text ends with a space. No trailing space
                # means the word is written attached to what follows
                # (e.g. a prefixed ו or ב). Empty text counts as False.
                "after": bool(transcript) and transcript.endswith(" "),
            }
        )
    return filtered_data


# Collect the word entries of the first two book nodes, keyed by book name.
filtered_data = defaultdict(list)
first_books = F.otype.s('book')[:2]
for scroll_node in tqdm(first_books):
    # Both counters start at 1 for every book.
    word_line_num, sub_word_num = 1, 1
    scroll_data = process_word(scroll_node, word_line_num, sub_word_num)
    filtered_data.update(scroll_data)




  0%|          | 0/2 [00:00<?, ?it/s]

In [30]:
# Show entries 10-29 of the Genesis list built above.
filtered_data["Genesis"][10:30]

[{'frag_label': '1',
  'frag_line_num': '1',
  'transcript': 'אָֽרֶץ׃ ',
  'lex': '>RY/',
  'parsed_morph': {'sp': 'subs',
   'cl': 'none',
   'ps': 'NA',
   'gn': 'unknown',
   'nu': 'sg',
   'st': 'a',
   'vs': 'NA',
   'vt': 'NA'},
  'after': True},
 {'frag_label': '1',
  'frag_line_num': '2',
  'transcript': 'וְ',
  'lex': 'W',
  'parsed_morph': {'sp': 'conj',
   'cl': 'none',
   'ps': 'NA',
   'gn': 'NA',
   'nu': 'NA',
   'st': 'NA',
   'vs': 'NA',
   'vt': 'NA'},
  'after': False},
 {'frag_label': '1',
  'frag_line_num': '2',
  'transcript': 'הָ',
  'lex': 'H',
  'parsed_morph': {'sp': 'art',
   'cl': 'none',
   'ps': 'NA',
   'gn': 'NA',
   'nu': 'NA',
   'st': 'NA',
   'vs': 'NA',
   'vt': 'NA'},
  'after': False},
 {'frag_label': '1',
  'frag_line_num': '2',
  'transcript': 'אָ֗רֶץ ',
  'lex': '>RY/',
  'parsed_morph': {'sp': 'subs',
   'cl': 'none',
   'ps': 'NA',
   'gn': 'unknown',
   'nu': 'sg',
   'st': 'a',
   'vs': 'NA',
   'vt': 'NA'},
  'after': True},
 {'frag_label'

In [20]:
from collections import Counter

# Gather every word node into a list and tally how often each node occurs.
# NOTE(review): the nodes are presumably distinct, in which case every count
# is 1 — confirm what this tally is meant to show.
l = list(F.otype.s("word"))
counter = Counter(l)
    

In [23]:
# Inspect the otype feature object itself (only its repr is displayed).
F.otype

<tf.core.otypefeature.OtypeFeature at 0x14b305420>