In [1]:
# Install a pip package in the current Jupyter kernel
# import sys
# !{sys.executable} -m pip install pandas

In [2]:
import pandas as pd
import utils.db_utils as db
import utils.file_utils as file
import utils.bible_utils as bible

# Local aliases for attributes of the db helper module; original_words_table is
# later passed to db.fetchRecords (presumably these are SQLite table names — confirm in utils/db_utils).
original_words_table = db.original_words_table
target_words_table = db.target_words_table
alignment_table = db.alignment_table
# Relative paths to the alignment database file and the original-language JSON resources.
dbPath = './data/alignmentsData.sqlite'
origLangPathGreek = './data/OrigLangJson/ugnt/v0.14'
origLangPathHebrew = './data/OrigLangJson/uhb/v2.1.15'

# Initialise the alignment database at dbPath; the returned handle is what later
# cells pass to db.fetchRecords.
connection = db.initAlignmentDB(dbPath)

# Search option flags.
# NOTE(review): none of these flags is referenced in the visible cells — confirm they are still needed.
searchOriginal = True
searchTarget = False
searchLemma = True
caseInsensitive = True



Connection to SQLite DB successful


In [3]:
#################

# Fetch every original-language word record whose morph code starts with 'Gr'.
items_orig_greek = db.fetchRecords(connection, original_words_table, "morph LIKE 'Gr%'")
print(f"{len(items_orig_greek)} greek words in original_words_table")
original_df = pd.DataFrame(items_orig_greek)
morphs_greek = original_df['morph']

# Tally how many times each distinct morph code occurs (most frequent first).
frequency_greek = morphs_greek.value_counts()
print("Frequency of greek morph usage:", frequency_greek, sep="\n")

# The tally's index holds the distinct morph codes; list them alphabetically.
morphs_list_greek = sorted(frequency_greek.index)
print("Alphabetized greek morph list:", morphs_list_greek, sep="\n")

51861 greek words in original_words_table
Frequency of greek morph usage:
Gr,CC,,,,,,,,    4898
Gr,D,,,,,,,,,    2035
Gr,P,,,,,A,,,    1660
Gr,P,,,,,D,,,    1331
Gr,P,,,,,G,,,    1260
                 ... 
Gr,ER,,,,ANP,       1
Gr,RI,,,,VMP,       1
Gr,V,ILA1,,S,       1
Gr,V,PEA,NFP,       1
Gr,V,PEA,ANP,       1
Name: morph, Length: 923, dtype: int64
Alphabetized greek morph list:
['Gr,AA,,,,AFP,', 'Gr,AA,,,,AFPC', 'Gr,AA,,,,AFS,', 'Gr,AA,,,,AFSC', 'Gr,AA,,,,AFSS', 'Gr,AA,,,,AMP,', 'Gr,AA,,,,AMPC', 'Gr,AA,,,,AMS,', 'Gr,AA,,,,AMSC', 'Gr,AA,,,,ANP,', 'Gr,AA,,,,ANPC', 'Gr,AA,,,,ANS,', 'Gr,AA,,,,ANSC', 'Gr,AA,,,,DFP,', 'Gr,AA,,,,DFS,', 'Gr,AA,,,,DMP,', 'Gr,AA,,,,DMPC', 'Gr,AA,,,,DMS,', 'Gr,AA,,,,DMSC', 'Gr,AA,,,,DMSS', 'Gr,AA,,,,DNP,', 'Gr,AA,,,,DNS,', 'Gr,AA,,,,GFP,', 'Gr,AA,,,,GFS,', 'Gr,AA,,,,GMP,', 'Gr,AA,,,,GMS,', 'Gr,AA,,,,GNP,', 'Gr,AA,,,,GNPC', 'Gr,AA,,,,GNS,', 'Gr,AA,,,,NFP,', 'Gr,AA,,,,NFS,', 'Gr,AA,,,,NMP,', 'Gr,AA,,,,NMPC', 'Gr,AA,,,,NMS,', 'Gr,AA,,,,NMSC', 'Gr,AA,,,,NNP,', '

In [4]:
#################

# Distinct Greek morph codes, in the order produced by value_counts() above.
unique_morph_list = list(frequency_greek.keys())
# First entry of that list, displayed as a sample morph code.
morph = unique_morph_list[0]
morph

'Gr,CC,,,,,,,,'

In [5]:
import utils.morph_utils as mu

# Greek role codes as returned by the project's morph_utils helper; displayed for reference.
roles = mu.getGreekRoles()
roles

['N', 'A', 'E', 'R', 'V', 'I', 'P', 'D', 'C', 'T']

In [12]:
def extract(text, match, start, end):
    """Report whether the slice ``text[start:end]`` equals ``match``.

    Args:
        text: Sequence to slice (a morph-code string in this notebook).
        match: Value the slice is compared against.
        start: Inclusive start index of the slice.
        end: Exclusive end index of the slice.

    Returns:
        True when the slice equals ``match``, otherwise False. A slice that
        runs past the end of ``text`` is simply shorter (possibly empty), so
        out-of-range indices compare unequal instead of raising.
    """
    # The comparison already yields the boolean; no if/else wrapper is needed.
    return text[start:end] == match

def filterSyntacticalRole(sequence, role):
    """Lazily select the entries of ``sequence`` whose role character is ``role``.

    Each entry is tested with ``extract(entry, role, 3, 4)``, i.e. the single
    character at index 3 must equal ``role`` (the 'V' in 'Gr,V,IAA3,,S,').

    Args:
        sequence: Iterable of morph-code strings.
        role: Role value to keep.

    Returns:
        A ``filter`` iterator over the matching entries; wrap it in ``list()``
        to materialise the result.
    """
    return filter(lambda code: extract(code, role, 3, 4), sequence)

# Every distinct morph code whose role character (index 3) is 'V'.
verbs = list(filterSyntacticalRole(unique_morph_list, 'V'))

In [23]:
def morphToDict(morph):
    """Split a morph code into one single-character entry per grammatical field.

    The full code is stored under 'morph'; every other field is the one
    character found at a fixed index of the code (role at 3 through number at
    11). Characters are returned as-is, so a ',' in the code comes back as ','.
    A code shorter than a field's index yields '' for that field.

    Args:
        morph: Morph-code string such as 'Gr,V,IAA3,,S,'.

    Returns:
        dict with keys 'morph', 'role', 'type', 'mood', 'tense', 'voice',
        'person', 'case', 'gender', 'number', in that order.
    """
    # (field name, index of its character within the code)
    field_offsets = (
        ('role', 3),
        ('type', 4),
        ('mood', 5),
        ('tense', 6),
        ('voice', 7),
        ('person', 8),
        ('case', 9),
        ('gender', 10),
        ('number', 11),
    )
    parsed = {'morph': morph}
    for name, offset in field_offsets:
        # One-character slice rather than indexing, so short codes give ''.
        parsed[name] = morph[offset:offset + 1]
    return parsed

# Worked example: parse a single verb morph code and display the resulting dict.
morphToDict('Gr,V,IAA3,,S,')

{'morph': 'Gr,V,IAA3,,S,',
 'role': 'V',
 'type': ',',
 'mood': 'I',
 'tense': 'A',
 'voice': 'A',
 'person': '3',
 'case': ',',
 'gender': ',',
 'number': 'S'}

In [42]:
# Expand every verb morph code into its per-field dictionary.
verb_list_ = [morphToDict(verb_morph) for verb_morph in verbs]
print(f"Number of verb types: {len(verb_list_)}")

verbs_frame = pd.DataFrame(verb_list_)

fields = ['role', 'type', 'mood', 'tense', 'voice', 'person', 'case', 'gender', 'number']

# For each field, report how many distinct verb codes carry each value.
for field in fields:
    field_frequency = verbs_frame[field].value_counts()
    print(f"\nFrequency of {field}:", field_frequency, sep="\n")

Number of verb types: 332

Frequency of role:
V    332
Name: role, dtype: int64

Frequency of type:
,    332
Name: type, dtype: int64

Frequency of mood:
P    165
I     89
S     35
M     24
O     10
N      9
Name: mood, dtype: int64

Frequency of tense:
P    123
A    103
E     63
F     22
I     14
L      7
Name: tense, dtype: int64

Frequency of voice:
A    128
P    107
M     97
Name: voice, dtype: int64

Frequency of person:
,    174
3     63
2     54
1     41
Name: person, dtype: int64

Frequency of case:
,    167
A     46
N     44
G     39
D     29
V      7
Name: case, dtype: int64

Frequency of gender:
,    167
M     69
F     48
N     48
Name: gender, dtype: int64

Frequency of number:
S    172
P    151
,      9
Name: number, dtype: int64


In [43]:
# Number of distinct verb morph codes per mood value, kept in its own variable and displayed.
mood = verbs_frame['mood'].value_counts()
mood

P    165
I     89
S     35
M     24
O     10
N      9
Name: mood, dtype: int64

In [44]:
# Role code examined by the findFieldsForRole call at the end of this cell.
role = 'N'

# NOTE(review): this list is not referenced again in the visible cells — confirm whether it is still needed.
roles = mu.getGreekRoles()
def findFieldsForRole(unique_morph_list, role):
    """Survey which values each morph field takes among codes of one role.

    The codes in ``unique_morph_list`` are narrowed to those matching ``role``
    (via ``filterSyntacticalRole``), parsed with ``morphToDict``, and the
    distinct values of every field are collected. For each field the values
    are printed in ``value_counts()`` order (most frequent first).

    Args:
        unique_morph_list: Iterable of morph-code strings.
        role: Role character to select, e.g. 'N'.

    Returns:
        dict mapping each field name ('role', 'type', 'mood', 'tense',
        'voice', 'person', 'case', 'gender', 'number') to the sorted list of
        distinct values seen for it. When no code matches ``role`` every
        field maps to an empty list.
    """
    fields = ['role', 'type', 'mood', 'tense', 'voice', 'person', 'case', 'gender', 'number']
    print(f"\nFor role: '{role}'")

    role_records = [
        morphToDict(code) for code in filterSyntacticalRole(unique_morph_list, role)
    ]

    # Name the columns explicitly: with no matching codes a bare
    # pd.DataFrame([]) has no columns and role_frame[field] would raise KeyError.
    role_frame = pd.DataFrame(role_records, columns=['morph', *fields])

    field_data = {}
    for field in fields:
        values_by_frequency = list(role_frame[field].value_counts().index)
        print(f"\nInstances of {field}:")
        # Printed in frequency order; the stored list below is alphabetical.
        print(values_by_frequency)
        field_data[field] = sorted(values_by_frequency)
    return field_data

# Run the field survey for the role selected earlier in this cell and keep the result.
field_data = findFieldsForRole(unique_morph_list, role)


For role: 'N'

Instances of role:
['N']

Instances of type:
[',', 'S', 'P']

Instances of mood:
[',']

Instances of tense:
[',']

Instances of voice:
[',']

Instances of person:
[',']

Instances of case:
['N', 'A', 'G', 'D', 'V']

Instances of gender:
['M', 'N', 'F']

Instances of number:
['S', 'P']
