## Load JSON File

In [26]:
import json

def read_jsonfile(play):
    with open('data/'+play+'.json', 'r', encoding='utf-8') as f:
        play_file = json.load(f)

    return play_file

## Get all lines from json

In [27]:
def get_lines_from_json(play):
    json = read_jsonfile(play)
    all_lines = list()
    for said in json['content']:
        line = {'id':said['id'],'name':said['name'], 'text':[]}
        for verse in said['text']:
            line['text'].append(verse['text'])
        all_lines.append(line)
    return all_lines

## Clean (and lemmatize?) text

In [28]:
from cltk import NLP

nlp = NLP("grc")

‎𐤀 CLTK version '1.1.6'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`.


In [32]:
from unicodedata import normalize

def clean_texts(list_of_texts, lemma, detailed, diacritics):
    purge_char = [",", ".", ";", ":", "—", "†"]
    if lemma is False: 
        purge_char.append("'")

    clean_texts = list()
    for text in list_of_texts:
        ##Basic preprocessing
        clean_text = text[:]
        clean_text = clean_text.replace("\n", " ")
        for char in purge_char:
            clean_text = clean_text.replace(char, "")
        
        ##Lemmatization
        if lemma is True:
            cltk_doc = nlp.analyze(clean_text)
            clean_text = " ".join([word.lemma for word in cltk_doc.words]) 
        
        #Give every info about word
        if detailed is True:
            cltk_doc = nlp.analyze(clean_text)
            clean_text = [clean_word(word) for word in cltk_doc.words] 
        
        ##Diacritics removal
        if diacritics is False and detailed is False:
            clean_text = normalize("NFKD", clean_text).translate({ord(c): None for c in "̓̔́̀͂̈ͅ"})
            
        clean_texts.append(clean_text)
    return clean_texts

In [33]:
import copy

def clean_lines(list_of_lines, lemma=False, detailed=False, diacritics=True):
    clean_lines = list()
    for line in list_of_lines:
        clean_text = clean_texts(line['text'], lemma, detailed, diacritics)
        clean_line = copy.deepcopy(line)
        clean_line['text'] = clean_text
        clean_lines.append(clean_line)
    return clean_lines

In [34]:
def clean_word(word):
    clean_word = dict()
    word_object = vars(word)
    for key in word_object.keys():
        if word_object[key] is not None:
            clean_word[key] = word_object[key]
    return clean_word

## Find diminutives with Regex

In [43]:
import re

In [42]:
neuter_declension_endings = "(ον|ου|ῳ|α|ων|οις)"

In [41]:
dim_regex_no_lemma = "(\w*(ι|ί)(ον|ου|ῳ|α|ων|οις)|\w*(ι|ί)σκος)"

In [40]:
dim_regex_lemma = "(\w*(ι|ί)ον|\w*(ι|ί)σκος)"

In [39]:
def find_matches(regex, lines):
    matches = list()
    for line in lines:
        for text in line['text']:
            match = re.search(regex, text)
            if match is not None:
                matches.append({'matches':match, 'line_id':line['id']})
    return matches      

## Find diminutives

In [44]:
play_results = dict()

In [77]:
for play in play_names:
    play_results[play] = find_matches(dim_regex_lemma,clean_plays[play])
    text = play + ' \n\n'
    
    for line in play_results[play]:
        matches = line['matches']
        text += matches.group(0) + ' '
        text += str(line['line_id']) + ' '
        text += '\n'
    
    with open('results/'+play+'.txt', 'w', encoding="utf-8") as f:
        f.write(text)

In [46]:
play_results

{'Acharnians': [{'matches': <re.Match object; span=(30, 36), match='ὄρθιον'>,
   'line_id': 1},
  {'matches': <re.Match object; span=(11, 17), match='ἔλαιον'>, 'line_id': 1},
  {'matches': <re.Match object; span=(21, 31), match='τριπλάσιον'>,
   'line_id': 31},
  {'matches': <re.Match object; span=(27, 36), match='Σιβύρτιον'>,
   'line_id': 47},
  {'matches': <re.Match object; span=(2, 6), match='Διον'>, 'line_id': 82},
  {'matches': <re.Match object; span=(22, 26), match='Διον'>, 'line_id': 82},
  {'matches': <re.Match object; span=(25, 29), match='Διον'>, 'line_id': 93},
  {'matches': <re.Match object; span=(20, 28), match='τρύβλιον'>,
   'line_id': 94},
  {'matches': <re.Match object; span=(28, 41), match='ταραξικάρδιον'>,
   'line_id': 110},
  {'matches': <re.Match object; span=(24, 36), match='βουλευτήριον'>,
   'line_id': 133},
  {'matches': <re.Match object; span=(22, 30), match='ἐπύλλιον'>,
   'line_id': 140},
  {'matches': <re.Match object; span=(11, 17), match='ῥάκιον'>,
   '

## Get lines by character

In [88]:
def get_lines_by_character(character, play):
    char_lines = list()
    for line in play:
        if line["name"] == character:
            char_lines.append(line)
    return char_lines

In [147]:
def get_lines_all_characters(play):
    all_char_lines = dict()
    for line in all_lines:
        if line['name'] in all_char_lines.keys():
            all_char_lines[line['name']].append(line)
        else:
            all_char_lines[line['name']] = [line]
    return all_char_lines

## Get lines by gender

In [157]:
women = ['Λυσιστράτη', 'Καλονίκη', 'Μυρρίνη', 'Λαμπιτώ', 'ΑλληΑλλήΑλλη', 'Πᾶσαι', 'Χορὸς Γυναικῶν', 'Γυνὴ Α', 'Γυνὴ Β', 'Γυνὴ Γ', 'Γυνὴ Α.', 'Γυνὴ Β.', 'Γυνὴ Γ.', 'Γυνὴ Ξ.', 'Γυνὴ Δ', 'Γυνή', 'Γυνὴ']
men = ['Χορὸς γερόντων', 'Πρόβουλοσ', 'Γέρων', 'Κινησίασ', 'Κῆρυξ Λακεδαιμονίων', 'Λάκων', 'ΑθηναῖοσΑθηναῖοσ', 'Αθηναῖος Α.Αθηναῖος Α.', 'Αθηναῖος ΒΑθηναῖος Β', 'Αθηναῖος Β.Αθηναῖος Β.', 'Χορὸς Λακεδαιμονίων', 'Χορὸς Αθηναίων']

In [158]:
def get_lines_by_gender(play):
    all_lines = {'men':[],'women':[]}
    for line in play:
        if line['name'] in play['women']:
            all_lines['women'].append(line)
        
        if line['name'] in play['men']:
            all_lines['men'].append(line)
    return all_lines