This notebook studies the variation in the ordering of the diacritics in UHB

Author: J de Joode

- Date: June 16, 2021
- Updated: Nov 30, 2021

In [1]:
import glob
from collections import Counter
import unicodedata
import pandas as pd
import re

This notebook assumes that you have a data directory setup one level up.

In [2]:
files = glob.glob('../data/hbo_uhb/*.usfm')
contents = []
for f in files:
    with open(f) as ipf:
        contents.append(ipf.read())

In [3]:
len(contents)

39

Read all the contents

In [4]:
ot = '\n\n'.join(contents)

In [5]:
ot_clean = ' '.join(ot.split()) # make whitespace consistent
lemmata = re.findall(r'\|lemma=".*?"', ot_clean)  # extract the lemmata
ot_no_lemmas = re.sub(r'\|lemma=".*?"', ' ', ot_clean)  # remove the lemmata

Count the individual characters, this also includes characters that are part of the USFM markup

In [6]:
counter = Counter(ot_no_lemmas).most_common()
counter

[(' ', 1594684),
 ('"', 1314480),
 ('w', 721749),
 ('r', 714614),
 ('o', 663254),
 ('=', 657240),
 ('\\', 645779),
 ('H', 602627),
 ('s', 508488),
 ('m', 496752),
 ('t', 490341),
 ('p', 487672),
 ('e', 405596),
 ('c', 356648),
 ('*', 353967),
 ('x', 353635),
 ('h', 352865),
 ('-', 350977),
 ('n', 337408),
 (':', 325226),
 ('g', 314892),
 ('/', 309852),
 (',', 306488),
 ('3', 218887),
 ('ּ', 170579),
 ('0', 169603),
 ('1', 168347),
 ('a', 168270),
 ('\u2060', 163919),
 ('ָ', 151785),
 ('N', 150550),
 ('ְ', 150189),
 ('י', 138624),
 ('b', 137297),
 ('2', 134176),
 ('5', 131928),
 ('ו', 130383),
 ('8', 124999),
 ('i', 120458),
 ('d', 117358),
 ('ַ', 117124),
 ('4', 113204),
 ('ִ', 108848),
 ('6', 103838),
 ('ה', 102232),
 ('7', 99510),
 ('א', 95957),
 ('9', 88586),
 ('ל', 88565),
 ('l', 76021),
 ('ֶ', 75199),
 ('V', 74009),
 ('ֹ', 73403),
 ('ר', 68297),
 ('ב', 65458),
 ('R', 64492),
 ('ת', 63481),
 ('ֵ', 58969),
 ('ש', 58387),
 ('מ', 57844),
 ('C', 57668),
 ('q', 56862),
 ('T', 54582),
 (

# Extract unicode metadata

We'll be using the unicodedata module to gain more insight into the actual form/shape of the characters

In [7]:
information = 'char frequency name category bidirectional decomposition mirrored'.split()

output = []
for char in counter:
    c = char[0]
    freq = char[1]
    tmp = [c, 
           freq, 
           unicodedata.name(c), 
           unicodedata.category(c), 
           unicodedata.bidirectional(c), 
           unicodedata.decomposition(c),
           unicodedata.mirrored(c),
          ]
    output.append(tmp)
           

We can now convert them to a dataframe

In [8]:
df = pd.DataFrame(output, columns=information)

This dataframe now as the character, its frequency, its name, category, whether it is bidir, its decomposition where possible, and whether it has a mirror equivalent

In [9]:
df

Unnamed: 0,char,frequency,name,category,bidirectional,decomposition,mirrored
0,,1594684,SPACE,Zs,WS,,0
1,"""",1314480,QUOTATION MARK,Po,ON,,0
2,w,721749,LATIN SMALL LETTER W,Ll,L,,0
3,r,714614,LATIN SMALL LETTER R,Ll,L,,0
4,o,663254,LATIN SMALL LETTER O,Ll,L,,0
...,...,...,...,...,...,...,...
154,ׅ,3,HEBREW MARK LOWER DOT,Mn,NSM,,0
155,X,1,LATIN CAPITAL LETTER X,Lu,L,,0
156,[,1,LEFT SQUARE BRACKET,Ps,ON,,1
157,],1,RIGHT SQUARE BRACKET,Pe,ON,,1


NB: note that the dataframe above shows that there is a case where the shin and shin dot are combined into a single character.

**This needs a fix**
    
To find other possible cases that can be decomposed, we run the following query

In [10]:
df.loc[-(df.decomposition == '')]

Unnamed: 0,char,frequency,name,category,bidirectional,decomposition,mirrored
158,שׁ,1,HEBREW LETTER SHIN WITH SHIN DOT,Lo,R,05E9 05C1,0


We can now extract all the Hebrew characters by filtering on the bidirectional column

In [11]:
hebrew = df.loc[df.bidirectional.isin(('NSM', 'R'))].char.tolist()
hebrew

['ּ',
 'ָ',
 'ְ',
 'י',
 'ו',
 'ַ',
 'ִ',
 'ה',
 'א',
 'ל',
 'ֶ',
 'ֹ',
 'ר',
 'ב',
 'ת',
 'ֵ',
 'ש',
 'מ',
 'ׁ',
 'ע',
 '־',
 'ם',
 '֣',
 'ֽ',
 'נ',
 '֖',
 'כ',
 '֥',
 'ד',
 'ח',
 'ֲ',
 '֔',
 '׃',
 '֑',
 '֙',
 'פ',
 'ק',
 'ן',
 'ך',
 '֗',
 '֤',
 'ׂ',
 'צ',
 '֨',
 'ג',
 'ס',
 'ז',
 '֛',
 'ט',
 'ֱ',
 'ֻ',
 '֜',
 'ץ',
 '֧',
 '֝',
 '֭',
 'ף',
 '׀',
 '֩',
 '֞',
 '֕',
 '֮',
 '֠',
 '֚',
 '֒',
 '֡',
 'ֳ',
 'ֺ',
 '֫',
 '֪',
 '֘',
 '֬',
 'ׄ',
 '֓',
 '֟',
 '֦',
 '׆',
 'ֿ',
 'ׅ',
 'שׁ']

This also makes it possible to get rid of all the markup and only keep the Hebrew.
We'll disregard the weird spacing as that's not the focus of our analysis

In [12]:
tokens = ot_no_lemmas.split()
tokens_hb = []
for token in tokens:
    tmp = ''.join(char for char in token if char in hebrew)
    tokens_hb.append(tmp)
hb = ' '.join(tokens_hb)

In [13]:
print(hb[:100])

                            וּבִשְׁנַ֣ת    אַחַ֗ת    לְכ֨וֹרֶשׁ֙     מֶ֣לֶךְ     פָּרַ֔ס     לִכְל֥ו


In [14]:
hb = ' '.join(hb.split())
hb[:100]

'וּבִשְׁנַ֣ת אַחַ֗ת לְכ֨וֹרֶשׁ֙ מֶ֣לֶךְ פָּרַ֔ס לִכְל֥וֹת דְּבַר ־ יְהוָ֖ה מִפִּ֣י יִרְמְיָ֑ה הֵעִ֣יר'

# Extract the different categories: punctuation, vowels, accents, etc.

Now we can start to focus on the different categories 

In [15]:
df.loc[df.char.isin(hebrew)].category.value_counts()

Mn    48
Lo    28
Po     3
Pd     1
Name: category, dtype: int64

Vowels and accents are part of the same category

In [16]:
df.loc[df.char.isin(hebrew) & (df.category=="Mn")]

Unnamed: 0,char,frequency,name,category,bidirectional,decomposition,mirrored
24,ּ,170579,HEBREW POINT DAGESH OR MAPIQ,Mn,NSM,,0
29,ָ,151785,HEBREW POINT QAMATS,Mn,NSM,,0
31,ְ,150189,HEBREW POINT SHEVA,Mn,NSM,,0
40,ַ,117124,HEBREW POINT PATAH,Mn,NSM,,0
42,ִ,108848,HEBREW POINT HIRIQ,Mn,NSM,,0
50,ֶ,75199,HEBREW POINT SEGOL,Mn,NSM,,0
52,ֹ,73403,HEBREW POINT HOLAM,Mn,NSM,,0
57,ֵ,58969,HEBREW POINT TSERE,Mn,NSM,,0
64,ׁ,46010,HEBREW POINT SHIN DOT,Mn,NSM,,0
69,֣,41281,HEBREW ACCENT MUNAH,Mn,NSM,,0


Consonants are holy, they are set apart

In [17]:
df.loc[df.char.isin(hebrew) & (df.category=="Lo")]

Unnamed: 0,char,frequency,name,category,bidirectional,decomposition,mirrored
32,י,138624,HEBREW LETTER YOD,Lo,R,,0
36,ו,130383,HEBREW LETTER VAV,Lo,R,,0
44,ה,102232,HEBREW LETTER HE,Lo,R,,0
46,א,95957,HEBREW LETTER ALEF,Lo,R,,0
48,ל,88565,HEBREW LETTER LAMED,Lo,R,,0
53,ר,68297,HEBREW LETTER RESH,Lo,R,,0
54,ב,65458,HEBREW LETTER BET,Lo,R,,0
56,ת,63481,HEBREW LETTER TAV,Lo,R,,0
58,ש,58387,HEBREW LETTER SHIN,Lo,R,,0
59,מ,57844,HEBREW LETTER MEM,Lo,R,,0


In [18]:
df.loc[df.char.isin(hebrew) & (df.category=="Po")]

Unnamed: 0,char,frequency,name,category,bidirectional,decomposition,mirrored
81,׃,23193,HEBREW PUNCTUATION SOF PASUQ,Po,R,,0
113,׀,2273,HEBREW PUNCTUATION PASEQ,Po,R,,0
150,׆,9,HEBREW PUNCTUATION NUN HAFUKHA,Po,R,,0


In [19]:
df.loc[df.char.isin(hebrew) & (df.category=="Pd")]

Unnamed: 0,char,frequency,name,category,bidirectional,decomposition,mirrored
67,־,42579,HEBREW PUNCTUATION MAQAF,Pd,R,,0


Let's create three columns to capture the metadata: point, accent, and vowel

In [20]:
df['point'] = False
df.loc[df.char.isin(hebrew) & (df.category=="Mn") & (df.name.str.contains(r'POINT')), 'point'] = True

In [21]:
df['accent'] = False
df.loc[df.char.isin(hebrew) & (df.category=="Mn") & (df.name.str.contains(r'ACCENT')), 'accent'] = True

In [22]:
vowels = ['HEBREW POINT QAMATS',
     'HEBREW POINT SHEVA',
     'HEBREW POINT PATAH',
     'HEBREW POINT HIRIQ',
     'HEBREW POINT SEGOL',
     'HEBREW POINT HOLAM',
     'HEBREW POINT TSERE',
     'HEBREW POINT HATAF PATAH',
     'HEBREW POINT HATAF SEGOL',
     'HEBREW POINT QUBUTS',
     'HEBREW POINT HATAF QAMATS',
     'HEBREW POINT HOLAM HASER FOR VAV',]

In [23]:
df['vowel'] = False
df.loc[(df.point) & (df.name.isin(vowels)), 'vowel'] = True

This allows us to create specific queries

In [24]:
df.loc[(-df.vowel) & (df.point)]  # - vowel, + point

Unnamed: 0,char,frequency,name,category,bidirectional,decomposition,mirrored,point,accent,vowel
27,ּ,170575,HEBREW POINT DAGESH OR MAPIQ,Mn,NSM,,0,True,False,False
65,ׁ,46010,HEBREW POINT SHIN DOT,Mn,NSM,,0,True,False,False
71,ֽ,41262,HEBREW POINT METEG,Mn,NSM,,0,True,False,False
91,ׂ,12069,HEBREW POINT SIN DOT,Mn,NSM,,0,True,False,False
151,ֿ,5,HEBREW POINT RAFE,Mn,NSM,,0,True,False,False


We'll save them in separate lists for easy retrieval later

In [25]:
V = df.loc[(df.vowel)].name.tolist()
V

['HEBREW POINT QAMATS',
 'HEBREW POINT SHEVA',
 'HEBREW POINT PATAH',
 'HEBREW POINT HIRIQ',
 'HEBREW POINT SEGOL',
 'HEBREW POINT HOLAM',
 'HEBREW POINT TSERE',
 'HEBREW POINT HATAF PATAH',
 'HEBREW POINT HATAF SEGOL',
 'HEBREW POINT QUBUTS',
 'HEBREW POINT HATAF QAMATS',
 'HEBREW POINT HOLAM HASER FOR VAV']

In [26]:
P = df.loc[(df.point) & (-df.vowel)].name.tolist()
P

['HEBREW POINT DAGESH OR MAPIQ',
 'HEBREW POINT SHIN DOT',
 'HEBREW POINT METEG',
 'HEBREW POINT SIN DOT',
 'HEBREW POINT RAFE']

In [27]:
A = df.loc[df.accent].name.tolist()
A

['HEBREW ACCENT MUNAH',
 'HEBREW ACCENT TIPEHA',
 'HEBREW ACCENT MERKHA',
 'HEBREW ACCENT ZAQEF QATAN',
 'HEBREW ACCENT ETNAHTA',
 'HEBREW ACCENT PASHTA',
 'HEBREW ACCENT REVIA',
 'HEBREW ACCENT MAHAPAKH',
 'HEBREW ACCENT QADMA',
 'HEBREW ACCENT TEVIR',
 'HEBREW ACCENT GERESH',
 'HEBREW ACCENT DARGA',
 'HEBREW ACCENT GERESH MUQDAM',
 'HEBREW ACCENT DEHI',
 'HEBREW ACCENT TELISHA QETANA',
 'HEBREW ACCENT GERSHAYIM',
 'HEBREW ACCENT ZAQEF GADOL',
 'HEBREW ACCENT ZINOR',
 'HEBREW ACCENT TELISHA GEDOLA',
 'HEBREW ACCENT YETIV',
 'HEBREW ACCENT SEGOL',
 'HEBREW ACCENT PAZER',
 'HEBREW ACCENT OLE',
 'HEBREW ACCENT YERAH BEN YOMO',
 'HEBREW ACCENT ZARQA',
 'HEBREW ACCENT ILUY',
 'HEBREW ACCENT SHALSHELET',
 'HEBREW ACCENT QARNEY PARA',
 'HEBREW ACCENT MERKHA KEFULA']

In [28]:
C = df.loc[df.char.isin(hebrew) & (df.category=="Lo")].name.tolist()
C

['HEBREW LETTER YOD',
 'HEBREW LETTER VAV',
 'HEBREW LETTER HE',
 'HEBREW LETTER ALEF',
 'HEBREW LETTER LAMED',
 'HEBREW LETTER RESH',
 'HEBREW LETTER BET',
 'HEBREW LETTER TAV',
 'HEBREW LETTER SHIN',
 'HEBREW LETTER MEM',
 'HEBREW LETTER AYIN',
 'HEBREW LETTER FINAL MEM',
 'HEBREW LETTER NUN',
 'HEBREW LETTER KAF',
 'HEBREW LETTER DALET',
 'HEBREW LETTER HET',
 'HEBREW LETTER PE',
 'HEBREW LETTER QOF',
 'HEBREW LETTER FINAL NUN',
 'HEBREW LETTER FINAL KAF',
 'HEBREW LETTER TSADI',
 'HEBREW LETTER GIMEL',
 'HEBREW LETTER SAMEKH',
 'HEBREW LETTER ZAYIN',
 'HEBREW LETTER TET',
 'HEBREW LETTER FINAL TSADI',
 'HEBREW LETTER FINAL PE',
 'HEBREW LETTER SHIN WITH SHIN DOT']

# Transform the HB into a CVAPX chain

Let's transform our dear Hebrew Bible (almost sounds like the start of a letter) into a sequence 

In [29]:
output = []
for char in hb:
    name = unicodedata.name(char)
    if name in V:
        output.append('V')
    elif name in A:
        output.append('A')
    elif name in C:
        output.append('C')
    elif name in P:
        output.append('P')
    else:
        output.append('X')

In [30]:
len(output) == len(hb)

True

In [31]:
outputstr = ''.join(output)

This sequence now can be used to study the order of the characters

In [32]:
outputstr[:100]

'CPCVCPVCVACXCVCVACXCVCACVCVCPAXCVACVCVXCPVCVACXCVCVCACVCXCPVCVCXXXCVCCVACXCVCPVACXCVCVCVCVACXCVCVACC'

In [33]:
hb[:100]

'וּבִשְׁנַ֣ת אַחַ֗ת לְכ֨וֹרֶשׁ֙ מֶ֣לֶךְ פָּרַ֔ס לִכְל֥וֹת דְּבַר ־ יְהוָ֖ה מִפִּ֣י יִרְמְיָ֑ה הֵעִ֣יר'

Because the two sequences are of the same length we can zip them

In [34]:
list(zip(outputstr[:100], hb[:100]))

[('C', 'ו'),
 ('P', 'ּ'),
 ('C', 'ב'),
 ('V', 'ִ'),
 ('C', 'ש'),
 ('P', 'ׁ'),
 ('V', 'ְ'),
 ('C', 'נ'),
 ('V', 'ַ'),
 ('A', '֣'),
 ('C', 'ת'),
 ('X', ' '),
 ('C', 'א'),
 ('V', 'ַ'),
 ('C', 'ח'),
 ('V', 'ַ'),
 ('A', '֗'),
 ('C', 'ת'),
 ('X', ' '),
 ('C', 'ל'),
 ('V', 'ְ'),
 ('C', 'כ'),
 ('A', '֨'),
 ('C', 'ו'),
 ('V', 'ֹ'),
 ('C', 'ר'),
 ('V', 'ֶ'),
 ('C', 'ש'),
 ('P', 'ׁ'),
 ('A', '֙'),
 ('X', ' '),
 ('C', 'מ'),
 ('V', 'ֶ'),
 ('A', '֣'),
 ('C', 'ל'),
 ('V', 'ֶ'),
 ('C', 'ך'),
 ('V', 'ְ'),
 ('X', ' '),
 ('C', 'פ'),
 ('P', 'ּ'),
 ('V', 'ָ'),
 ('C', 'ר'),
 ('V', 'ַ'),
 ('A', '֔'),
 ('C', 'ס'),
 ('X', ' '),
 ('C', 'ל'),
 ('V', 'ִ'),
 ('C', 'כ'),
 ('V', 'ְ'),
 ('C', 'ל'),
 ('A', '֥'),
 ('C', 'ו'),
 ('V', 'ֹ'),
 ('C', 'ת'),
 ('X', ' '),
 ('C', 'ד'),
 ('P', 'ּ'),
 ('V', 'ְ'),
 ('C', 'ב'),
 ('V', 'ַ'),
 ('C', 'ר'),
 ('X', ' '),
 ('X', '־'),
 ('X', ' '),
 ('C', 'י'),
 ('V', 'ְ'),
 ('C', 'ה'),
 ('C', 'ו'),
 ('V', 'ָ'),
 ('A', '֖'),
 ('C', 'ה'),
 ('X', ' '),
 ('C', 'מ'),
 ('V', 'ִ'),
 ('C', 'פ'),

We can count the number of cases where we have a vowel followed by a point. This is quite a lot

In [35]:
outputstr.count('VP')

36009

Original:

In [35]:
# don't run
outputstr.count('VP')

50278

# Analysis

In what follows we will focus on sequence of two successive characters

In [36]:
bigrams = list(zip(output, output[1:]))
bigrams_hb = list(zip(hb, hb[1:]))

We'll create a new dataframe

In [37]:
b = pd.DataFrame([bigrams, bigrams_hb]).T

In [38]:
b.loc[b[0]==('V', 'P'), 1].value_counts()

(ָ, ֽ)    11294
(ִ, ֽ)     7456
(ֶ, ֽ)     4735
(ֵ, ֽ)     4370
(ַ, ֽ)     3947
(ֹ, ֽ)     3364
(ְ, ֽ)      614
(ֻ, ֽ)      134
(ֲ, ֽ)       53
(ֱ, ֽ)       30
(ֺ, ֽ)       12
Name: 1, dtype: int64

In [39]:
hb[:100]

'וּבִשְׁנַ֣ת אַחַ֗ת לְכ֨וֹרֶשׁ֙ מֶ֣לֶךְ פָּרַ֔ס לִכְל֥וֹת דְּבַר ־ יְהוָ֖ה מִפִּ֣י יִרְמְיָ֑ה הֵעִ֣יר'

In [40]:
def reverse(input):
    return ''.join(list(reversed(input)))

These are the cases where a point follows a vowel

AFTER the change:

In [41]:
print('{:>15} {:>15} {:>35} {:>35}'.format('Frequency', 'Raw data', 'Point', 'Vowel'))
for row in b.loc[b[0]==('V', 'P'), 1].value_counts().items():
    print('{:>15} {:>15} {:>35} {:>35}'.format(row[1], str(row[0]), unicodedata.name(row[0][1]), unicodedata.name(row[0][0])))

      Frequency        Raw data                               Point                               Vowel
          11294      ('ָ', 'ֽ')                  HEBREW POINT METEG                 HEBREW POINT QAMATS
           7456      ('ִ', 'ֽ')                  HEBREW POINT METEG                  HEBREW POINT HIRIQ
           4735      ('ֶ', 'ֽ')                  HEBREW POINT METEG                  HEBREW POINT SEGOL
           4370      ('ֵ', 'ֽ')                  HEBREW POINT METEG                  HEBREW POINT TSERE
           3947      ('ַ', 'ֽ')                  HEBREW POINT METEG                  HEBREW POINT PATAH
           3364      ('ֹ', 'ֽ')                  HEBREW POINT METEG                  HEBREW POINT HOLAM
            614      ('ְ', 'ֽ')                  HEBREW POINT METEG                  HEBREW POINT SHEVA
            134      ('ֻ', 'ֽ')                  HEBREW POINT METEG                 HEBREW POINT QUBUTS
             53      ('ֲ', 'ֽ')                  HEBREW POINT ME

In [41]:
# original
# do not run
print('{:>15} {:>15} {:>35} {:>35}'.format('Frequency', 'Raw data', 'Point', 'Vowel'))
for row in b.loc[b[0]==('V', 'P'), 1].value_counts().items():
    print('{:>15} {:>15} {:>35} {:>35}'.format(row[1], str(row[0]), unicodedata.name(row[0][1]), unicodedata.name(row[0][0])))

      Frequency        Raw data                               Point                               Vowel
          11046      ('ָ', 'ֽ')                  HEBREW POINT METEG                 HEBREW POINT QAMATS
           7177      ('ִ', 'ֽ')                  HEBREW POINT METEG                  HEBREW POINT HIRIQ
           4607      ('ֶ', 'ֽ')                  HEBREW POINT METEG                  HEBREW POINT SEGOL
           4306      ('ֵ', 'ֽ')                  HEBREW POINT METEG                  HEBREW POINT TSERE
           3888      ('ַ', 'ֽ')                  HEBREW POINT METEG                  HEBREW POINT PATAH
           3314      ('ֹ', 'ֽ')                  HEBREW POINT METEG                  HEBREW POINT HOLAM
           2901      ('ָ', 'ּ')        HEBREW POINT DAGESH OR MAPIQ                 HEBREW POINT QAMATS
           2268      ('ִ', 'ּ')        HEBREW POINT DAGESH OR MAPIQ                  HEBREW POINT HIRIQ
           2134      ('ְ', 'ּ')        HEBREW POINT DAGESH OR MA

Are there any VOWEL to VOWEL combinations?

In [42]:
print('{:>15} {:>15} {:>35} {:>35}'.format('Frequency', 'Raw data', 'Vowel', 'Vowel'))
for row in b.loc[b[0]==('V', 'V'), 1].value_counts().items():
    print('{:>15} {:>15} {:>35} {:>35}'.format(row[1], str(row[0]), unicodedata.name(row[0][1]), unicodedata.name(row[0][0])))

      Frequency        Raw data                               Vowel                               Vowel
            208      ('ַ', 'ִ')                  HEBREW POINT HIRIQ                  HEBREW POINT PATAH
            142      ('ָ', 'ִ')                  HEBREW POINT HIRIQ                 HEBREW POINT QAMATS
             24      ('ִ', 'ַ')                  HEBREW POINT PATAH                  HEBREW POINT HIRIQ
             18      ('ִ', 'ָ')                 HEBREW POINT QAMATS                  HEBREW POINT HIRIQ
              2      ('ַ', 'ְ')                  HEBREW POINT SHEVA                  HEBREW POINT PATAH
              1      ('ְ', 'ִ')                  HEBREW POINT HIRIQ                  HEBREW POINT SHEVA
              1      ('ֹ', 'ָ')                 HEBREW POINT QAMATS                  HEBREW POINT HOLAM
              1      ('ָ', 'ֹ')                  HEBREW POINT HOLAM                 HEBREW POINT QAMATS
              1      ('ֹ', 'ְ')                  HEBREW POINT SH

In [57]:
cases = b.loc[b[0]==('V', 'V'), 1].value_counts().index.tolist()
cases

[('ַ', 'ִ'),
 ('ָ', 'ִ'),
 ('ִ', 'ַ'),
 ('ִ', 'ָ'),
 ('ַ', 'ְ'),
 ('ְ', 'ִ'),
 ('ֹ', 'ָ'),
 ('ָ', 'ֹ'),
 ('ֹ', 'ְ')]

In [71]:
for case in cases:
    print(case)
    print(unicodedata.name(case[0]))
    print(unicodedata.name(case[1]))
    print(re.findall(r'..........{}...........'.format(reverse(''.join(case))), hb))
    print()

('ַ', 'ִ')
HEBREW POINT PATAH
HEBREW POINT HIRIQ
[' יְרוּשָׁלִַ֔ם וְאֵ֖ת ע', 'לִירוּשָׁלִַ֛ם וּלְצִיּ', 'לִירוּשָׁלִַ֨ם֙ בְּֽרַח', 'יְר֣וּשָׁלִַ֔ם לִרְא֥וֹ', ' יְרוּשָׁלִַ֔ם מֵרֹ֥ב א', ' יְרוּשָׁלִַ֨ם֙ יֹשֶׁ֣ב', ' יְרוּשָׁלִַ֨ם֙ עִ֣יר ־', ' יְרוּשָׁלִַ֖ם וְאֶת ־ ', ' יְרוּשָׁלִַ֔ם הִנֵּ֤ה ', 'ִיר֣וּשָׁלִַ֔ם וְנִכְרְ', ' יְרוּשָׁלִַ֧ם סַף ־ רַ', ' יְרוּשָׁלִַ֜ם אֶ֤בֶן מ', ' יְרוּשָׁלִַ֔ם בַּיהוָ֥', ' יְרוּשָׁלִַ֥ם ע֛וֹד תּ', ' יְרוּשָׁלִַ֖ם עַל ־ יְ', ' יְרוּשָׁלִַ֔ם וְהָיָ֞ה', ' יְרוּשָׁלִַ֗ם ר֤וּחַ ח', 'ִיר֣וּשָׁלִַ֔ם כְּמִסְפ', ' יְרוּשָׁלִַם֮ לַמִּלְח', ' יְרוּשָׁלִַם֮ מִקֶּדֶם', 'ִיר֣וּשָׁלִַ֔ם חֶצְיָ֗ם', ' יְרוּשָׁלִַ֖ם לָבֶֽטַח', 'יְר֣וּשָׁלִַ֔ם לְהִֽשְׁ', 'ִּירוּשָׁלִַ֜ם וּבִֽיהו']

('ָ', 'ִ')
HEBREW POINT QAMATS
HEBREW POINT HIRIQ
[' יְרוּשָׁלִָֽם ׃ ע֣וֹד ', 'ִּירוּשָׁלִָֽם ׃ס וָאֶש', 'ִּירוּשָׁלִָֽם ׃ הַ֥ס כ', 'ִירֽוּשָׁלִָ֑ם הֲל֧וֹא ', 'יְרֽוּשָׁלִָ֑ם וְנִקְרְ', ' יְרוּשָׁלִָ֑ם וְאִ֧ישׁ', ' יְרוּשָׁלִָ֑ם וְהָיוּ ', 'ִּירוּשָׁלִָ֑ם וּלְחַלּ', ' יְרוּשָׁלִָֽם ׃ וְהָיָ', 'ִּירוּ