In [26]:
from typing import List
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.util import ngrams
from utils import lemma_glosses, greek_stopwords, MAPS

In [27]:
CONSTANTS = {
  "letter_columns": ['Book_Chapter_Verse', 'Part_Of_Speech', 'Parsing_Code', 'Text', 'Word', 'Normalized_Word', 'Lemma'],
  "parsing_code_features": ['Person', 'Tense', 'Voice', 'Mood', 'Case', 'Number', 'Gender', 'Degree'],
  "lemma_glosses": lemma_glosses,
  "greek_stopwords": greek_stopwords,
  "classifications": {
    "Romans": 1,
    "First Corinthians": 1,
    "Second Corinthians": 0,
    "Galatians": 1,
    "Ephesians": 0,
    "Phillipians": 1,
    "Colossians": 0,
    "First Thessalonians": 1,
    "Second Thessalonians": 0,
    "First Timothy": 0,
    "Second Timothy": 0,
    "Titus": 0,
    "Philemon": 1,
    "Hebrews": 0,
  }
}

# Import Each Letter and Sanitize

In [28]:
def letter_to_dataframe(file: str):
  letter_df = pd.read_csv(file, sep=" ")
  letter_df.columns = CONSTANTS['letter_columns']

  for i, feature in enumerate(CONSTANTS['parsing_code_features']):
    letter_df[feature] = letter_df['Parsing_Code'].str[i]
  
  del letter_df['Parsing_Code']

  letter_df['Person_Label'] = letter_df['Person'].map(MAPS["Person"])
  letter_df['Tense_Label'] = letter_df['Tense'].map(MAPS["Tense"])
  letter_df['Voice_Label'] = letter_df['Voice'].map(MAPS["Voice"])
  letter_df['Mood_Label'] = letter_df['Mood'].map(MAPS["Mood"])
  letter_df['Case_Label'] = letter_df['Case'].map(MAPS["Case"])
  letter_df['Number_Label'] = letter_df['Number'].map(MAPS["Number"])
  letter_df['Gender_Label'] = letter_df['Gender'].map(MAPS["Gender"])
  letter_df['Degree_Label'] = letter_df['Degree'].map(MAPS["Degree"])

  return letter_df

In [29]:
letters = {
  "Romans": letter_to_dataframe('../data/66-Ro-morphgnt.txt'),
  "First Corinthians": letter_to_dataframe('../data/67-1Co-morphgnt.txt'),
  "Second Corinthians": letter_to_dataframe('../data/68-2Co-morphgnt.txt'),
  "Galatians": letter_to_dataframe('../data/69-Ga-morphgnt.txt'),
  "Ephesians": letter_to_dataframe('../data/70-Eph-morphgnt.txt'),
  "Phillipians": letter_to_dataframe('../data/71-Php-morphgnt.txt'),
  "Colossians": letter_to_dataframe('../data/72-Col-morphgnt.txt'),
  "First Thessalonians": letter_to_dataframe("../data/73-1Th-morphgnt.txt"),
  "Second Thessalonians": letter_to_dataframe("../data/74-2Th-morphgnt.txt"),
  "First Timothy": letter_to_dataframe("../data/75-1Ti-morphgnt.txt"),
  "Second Timothy": letter_to_dataframe("../data/76-2Ti-morphgnt.txt"),
  "Titus": letter_to_dataframe("../data/77-Tit-morphgnt.txt"),
  "Philemon": letter_to_dataframe("../data/78-Phm-morphgnt.txt"),
  "Hebrews": letter_to_dataframe("../data/79-Heb-morphgnt.txt"),
}

# Global Top Lemmas

In [30]:
def get_global_top_lemmas(letter_list: dict[str, pd.DataFrame], lemmas_per_letter: int):
  all_lemmas = np.array([])
  for _, df in letter_list.items():
    stopwords_mask = df['Lemma'].isin(greek_stopwords)
    lemmas = df[~stopwords_mask]['Lemma'].value_counts().head(lemmas_per_letter).index
    all_lemmas = np.append(all_lemmas, lemmas)
  unique_lemmas = set(all_lemmas.flatten())
  return list(unique_lemmas)

global_top_lemmas = get_global_top_lemmas(letters, 10)

# Structural Features

In [31]:
def get_structural_features(df: pd.DataFrame):
  verses = df.groupby("Book_Chapter_Verse")

  # Token count
  token_count = len(df)

  # Verse Count
  verse_count = df['Book_Chapter_Verse'].nunique()

  # Tokens per verse
  tokens_per_verse = verses["Word"].count()

  # Verse length ratios
  long_verse_ratio = (tokens_per_verse > 20).sum() / token_count
  short_verse_ratio = (tokens_per_verse < 10).sum() / token_count

  # Token ratios
  text_type_token_ratio = df['Text'].nunique() / len(df['Text'])
  lemma_type_token_ratio = df['Lemma'].nunique() / len(df['Lemma'])

  # Verbs per Verse
  verbs_df = df[df['Part_Of_Speech'].str.startswith('V')]
  verbs_per_verse = verbs_df.groupby("Book_Chapter_Verse")['Word'].count()

  # Sentence Fragment Ratio
  fragments = 0

  for verse_ref, verse_df in verses:
    mood_series = verse_df[verse_df['Part_Of_Speech'].str.startswith('V')]['Mood']
    if not any(mood in {'I', 'D', 'S', 'O'} for mood in mood_series):
      fragments += 1

  return {
    "Token_Count": token_count,
    "Verse_Count": verse_count,
    "Avg_Verse_Length": tokens_per_verse.mean(),
    "Std_Verse_Length": tokens_per_verse.std(),
    "Long_Verse_Ratio": long_verse_ratio,
    "Short_Verse_Ratio": short_verse_ratio,
    "Text_Type_Token_Ratio": text_type_token_ratio,
    "Lemma_Type_Token_Ratio": lemma_type_token_ratio,
    "Avg_Verbs_Per_Verse": verbs_per_verse.mean(),
    "Std_Verbs_Per_Verse": verbs_per_verse.std(),
    "Sentence_Fragment_Ratio": fragments / len(verses)
  }

# Grammatical Features

In [32]:
def get_frequency_dict(df: pd.DataFrame, col: str):
  frequency_dict = df[col].value_counts(normalize=True).to_dict()

  result = {}

  for code, percent in frequency_dict.items():
    label = MAPS[col].get(code, code)
    result[f"{col}_{label}_Frequency"] = percent
    
  return result

In [33]:
def get_grammatical_features(df: pd.DataFrame):
  frequency_dicts = {}
  
  for code in CONSTANTS['parsing_code_features']:
    frequency_dicts = frequency_dicts | get_frequency_dict(df, code)

  return frequency_dicts

# Lexical and Morphological Features

In [None]:
def get_lexical_features(df: pd.DataFrame):
  verses = df.groupby("Book_Chapter_Verse")

  # Tokens per verse
  tokens_per_verse = verses["Word"].count()

  # Verbs per Verse
  verb_mask = df['Part_Of_Speech'].str.startswith('V')
  verbs_df = df[verb_mask]
  verbs_per_verse = verbs_df.groupby("Book_Chapter_Verse")['Word'].count()

  # Verbs per Token
  verbs_per_token = verbs_per_verse.sum() / tokens_per_verse.sum()

  # Noun-to-Verb Ratio
  noun_mask = df['Part_Of_Speech'].str.startswith('N')
  noun_to_verb_ratio = df[noun_mask]['Word'].count() / df[verb_mask]['Word'].count()

  # Pronoun ratio
  pronoun_mask = df['Part_Of_Speech'].str.startswith(('RP', 'RI'))
  pronoun_ratio = df[pronoun_mask]['Word'].count() / df['Word'].count()

  # Participle ratio
  participle_mask = verbs_df['Mood'] == "P"
  participle_ratio = verbs_df[participle_mask]['Word'].count() / verbs_df['Word'].count()

  # Hapax Legomena Ratio
  lemma_counts = df["Lemma"].value_counts()
  hapax_legomena_ratio = lemma_counts[lemma_counts == 1].count() / df['Lemma'].count()
  

  ## μὴ γένοιτο count
  may_df = df[df['Word'] == 'μὴ']

  may_count = 0

  for i in may_df.index:
    if i+1 < df.shape[0] and df.iloc[i+1]['Word'] == 'γένοιτο':
      may_count += 1

  # Top lemma ratios
  lemma_ratios = {}
  for lemma in global_top_lemmas:
    lemma_ratios[f'Top_Lemma_{lemma}'] = df[df['Lemma'] == lemma]['Lemma'].count() / df['Lemma'].count()

  return {
    "Avg_Verbs_Per_Token": verbs_per_token,
    "Noun_To_Verb_Ratio": noun_to_verb_ratio,
    "Pronoun_Ratio": pronoun_ratio,
    "Participle_Ratio": participle_ratio,
    "Hapax_Legomena_Ratio": hapax_legomena_ratio,
    "μὴ_γένοιτο_Count": may_count
  } | lemma_ratios

In [35]:
X = []

for letter_name, letter_df in letters.items():
  features = {}
  features['Classification'] = CONSTANTS['classifications'][letter_name]
  features["letter"] = letter_name
  structural_features = get_structural_features(letter_df)
  grammatical_features = get_grammatical_features(letter_df)
  lexical_features = get_lexical_features(letter_df)
  
  features = features | structural_features | grammatical_features | lexical_features
  X.append(features)

X_df = pd.DataFrame(X).set_index("letter").fillna(0)
X_df
  

Unnamed: 0_level_0,Classification,Token_Count,Verse_Count,Avg_Verse_Length,Std_Verse_Length,Long_Verse_Ratio,Short_Verse_Ratio,Text_Type_Token_Ratio,Lemma_Type_Token_Ratio,Avg_Verbs_Per_Verse,...,Top_Lemma_νόμος,Top_Lemma_ἀδελφός,Top_Lemma_ἔργον,Top_Lemma_θεός,Top_Lemma_κατά,Top_Lemma_ἔχω,Top_Lemma_οἶδα,Top_Lemma_πίστις,Top_Lemma_σάρξ,Degree_Superlative_Frequency
letter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Romans,1,7054,430,16.404651,5.957032,0.015736,0.007655,0.371704,0.149277,2.869458,...,0.010491,0.002694,0.002126,0.021123,0.006663,0.003544,0.002268,0.005529,0.003686,0.0
First Corinthians,1,6811,437,15.585812,5.760517,0.012039,0.009397,0.395096,0.139774,3.200489,...,0.001321,0.005579,0.001175,0.015416,0.003524,0.007194,0.003671,0.001028,0.001615,0.000587
Second Corinthians,0,4472,256,17.46875,6.095884,0.016995,0.004696,0.413014,0.17576,3.196653,...,0.0,0.002683,0.000671,0.017665,0.005814,0.004919,0.003578,0.001565,0.00246,0.000447
Galatians,1,2225,149,14.932886,6.036672,0.011236,0.011685,0.488539,0.233258,2.902098,...,0.014382,0.004944,0.003596,0.013483,0.00764,0.002247,0.001348,0.009888,0.00809,0.0
Ephesians,0,2415,155,15.580645,5.318274,0.010352,0.006625,0.443478,0.218634,2.239726,...,0.000414,0.000828,0.001656,0.012836,0.009938,0.003313,0.00207,0.003313,0.003727,0.0
Phillipians,1,1625,104,15.625,5.720322,0.008,0.008615,0.496615,0.271385,2.612245,...,0.001846,0.005538,0.001846,0.014154,0.006769,0.006154,0.003692,0.003077,0.003077,0.0
Colossians,0,1579,95,16.621053,6.112733,0.017099,0.006966,0.497783,0.272324,2.537634,...,0.0,0.003167,0.0019,0.0133,0.008866,0.004433,0.002533,0.003167,0.0057,0.0
First Thessalonians,1,1472,89,16.539326,7.492506,0.018342,0.01019,0.454484,0.245245,2.858824,...,0.0,0.012908,0.001359,0.024457,0.0,0.005435,0.008832,0.005435,0.0,0.0
Second Thessalonians,0,819,47,17.425532,5.678888,0.015873,0.003663,0.533578,0.304029,2.75,...,0.0,0.010989,0.002442,0.021978,0.004884,0.001221,0.003663,0.006105,0.0,0.0
First Timothy,0,1590,113,14.070796,5.224798,0.008176,0.015094,0.595597,0.337736,2.865385,...,0.001258,0.001887,0.003774,0.013836,0.003774,0.008805,0.001887,0.01195,0.000629,0.0


In [36]:
X_df.columns

Index(['Classification', 'Token_Count', 'Verse_Count', 'Avg_Verse_Length',
       'Std_Verse_Length', 'Long_Verse_Ratio', 'Short_Verse_Ratio',
       'Text_Type_Token_Ratio', 'Lemma_Type_Token_Ratio',
       'Avg_Verbs_Per_Verse', 'Std_Verbs_Per_Verse', 'Sentence_Fragment_Ratio',
       'Person_Unspecified_Frequency', 'Person_Third_Frequency',
       'Person_First_Frequency', 'Person_Second_Frequency',
       'Tense_Unspecified_Frequency', 'Tense_Present_Frequency',
       'Tense_Aorist_Frequency', 'Tense_Future_Frequency',
       'Tense_Perfect_Frequency', 'Tense_Imperfect_Frequency',
       'Tense_Pluperfect_Frequency', 'Voice_Unspecified_Frequency',
       'Voice_Active_Frequency', 'Voice_Passive_Frequency',
       'Voice_Middle_Frequency', 'Mood_Unspecified_Frequency',
       'Mood_Indicative_Frequency', 'Mood_Participle_Frequency',
       'Mood_Infinitive_Frequency', 'Mood_Subjunctive_Frequency',
       'Mood_Imperative_Frequency', 'Mood_Optative_Frequency',
       'Case_Unspecifi

In [37]:
X_df.to_pickle("../features/features.pkl")
X_df.to_csv("../features/features.csv")