In [1]:
import json
import re

from Utils import get_lines_from_json
from Utils import clean_lines

from Utils import get_character_gender

from Utils import find_matches

‎𐤀 CLTK version '1.1.6'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`.


In [2]:
# Load the diminutive lookup data; get_certainty reads its 'confirmed' and
# 'exceptions' entries.
with open('dimDict.json', 'r', encoding='utf-8') as f:
    dim_dict = json.loads(f.read())

## Import and clean plays

In [3]:
## Titles of the plays to process; they are used as dictionary keys and as
## the argument passed to the loading helpers.
play_names = [
    'Acharnians',
    'Birds',
    'Clouds',
    'Ecclesiazusae',
    'Frogs',
    'Knights',
    'Lysistrata',
    'Peace',
    'Plutus',
    'Thesmophoriazusae',
    'Wasps',
]

In [58]:
def import_plays_from_scratch():
    """Rebuild the cleaned-play cache from the source files.

    Every play in ``play_names`` is read with ``get_lines_from_json`` and
    cleaned with ``clean_lines(..., lemma=True, diacritics=False)``. The
    result is written to ``fullPlays.json``, the file that
    ``import_plays_from_dump`` reads back.

    Returns:
        dict: the cleaned plays keyed by play name (the same data that was
        written to disk), so the result can be used directly like the return
        value of the other ``import_plays_*`` helpers.
    """
    clean_plays = {
        play: clean_lines(get_lines_from_json(play), lemma=True, diacritics=False)
        for play in play_names
    }

    # Explicit encoding so the dump is written the same way it is read back.
    with open("fullPlays.json", "w", encoding="utf-8") as outfile:
        outfile.write(json.dumps(clean_plays, indent=4))

    return clean_plays

In [12]:
def import_plays_from_dump():
    """Read the cached plays back from ``fullPlays.json``.

    Returns the parsed JSON content of that file (opened as UTF-8 from the
    current working directory).
    """
    with open('fullPlays.json', 'r', encoding='utf-8') as dump_file:
        cached_plays = json.loads(dump_file.read())
    return cached_plays

In [8]:
def import_plays_from_xml():
    """Load the uncleaned lines of every play listed in ``play_names``.

    Each play is read through ``get_lines_from_json`` (despite the ``xml`` in
    this function's name); no cleaning step is applied here.

    Returns a dict mapping each play name to whatever that helper returns.
    """
    return {title: get_lines_from_json(title) for title in play_names}

In [9]:
# Uncleaned lines of every play, loaded directly from the source files.
plays = import_plays_from_xml()
# Cleaned lines, read from the cached fullPlays.json dump.
clean_plays = import_plays_from_dump()
# Adds a "gender" key to every entry of clean_plays, in place.
# NOTE(review): add_gender_to_plays is defined in a later cell ("Add gender to
# lines"). On a fresh kernel that cell must be run before this one, otherwise
# this call raises NameError.
add_gender_to_plays()

## Add gender to lines

In [5]:
def add_gender_to_plays():
    """Tag every entry of the global ``clean_plays`` with its speaker's gender.

    For each play in ``play_names``, each entry gets a ``"gender"`` key holding
    the result of ``get_character_gender(play, entry["name"])``. The entries
    are modified in place and nothing is returned.
    """
    for title in play_names:
        for entry in clean_plays[title]:
            entry["gender"] = get_character_gender(title, entry["name"])

## Find diminutives

In [9]:
##Regex to determine all possible diminutives, with false positives "(\w*(ι|ί)ον|\w*ισκι?(ο(ς|ν)|η|α))"
##Raw strings are used so that "\w" reaches the regex engine as written instead
##of being an invalid Python string escape (a warning on recent Python versions).
##dim_regex_lemma targets lemmatised forms; dim_regex_no_lemma also lists
##inflected endings.
dim_regex_lemma = r"(\w*(ι|ί)ον|\w*ισκι?(ο(ς|ν)|η))"
dim_regex_no_lemma = r"(\w*(ι|ί)(ον|ου|ῳ|α|ων|οις)|\w*ισκ(ο(ς|ν)|η))"

In [35]:
# Write one report per play to results/<play>.txt listing every candidate
# diminutive with its certainty label, line id and speaker gender.
# NOTE(review): get_play_gender_ratio and get_certainty are defined in later
# cells; on a fresh kernel those cells must be run before this one.
play_results = dict()
for play in play_names:
    # Candidate diminutives found in the cleaned lines of this play.
    play_results[play] = find_matches(dim_regex_lemma, clean_plays[play])

    # Header: play title plus gender percentages, a blank line, then the
    # column names (emitted in French, as in the report format).
    rows = [
        play + " " + str(get_play_gender_ratio(play)) + "\n",
        'Lemme, Certitude, ID de ligne, genre',
    ]

    for line in play_results[play]:
        lemma = line["matches"].group(0)
        certainty = get_certainty(lemma)
        rows.append(', '.join([
            lemma,
            str(certainty),
            str(line['line']['id']),
            line["line"]["gender"],
        ]))

    # Every row, including the last one, is terminated by a newline.
    text = '\n'.join(rows) + '\n'

    with open('results/'+play+'.txt', 'w', encoding="utf-8") as f:
        f.write(text)

## Get diminutive certainty

In [10]:
##Raw strings are used below so that "\w" reaches the regex engine as written
##instead of being an invalid Python string escape.

##Identify possible false positives
##Uncertainty factors: vowel before -ιον (for diphthong), -τήριον ending
possible_exception_regex = r"(\w*(α|ο|ε|τηρ)ιον)"

##Try to identify true positives
##Possible true positive factors: -διον, -λιον, -φιον
possible_dim_regex = r"(\w*(δ|λ|φ)ιον)"

##Certainty factors: has suffix in list compiled from Chantraine/Monteil/Petersen OR has -ισκ-
certain_dim_regex = r"(\w*(ιδ|υλ(λ?)|α(κ|σ|φ|ρ))ιον|\w*ισκ\w*|\w*ισκι?(ο(ς|ν)|η|α))"

In [11]:
def get_certainty(word):
    """Classify a candidate diminutive and return a certainty label (in French).

    Decision order:
      1. ``word`` listed in ``dim_dict['confirmed']``  -> "Diminutif confirmé"
      2. ``word`` listed in ``dim_dict['exceptions']`` -> "Faux positif confirmé"
      3. Regex heuristics, where a later check overrides an earlier one:
         ``possible_dim_regex``       -> "Diminutif possible"
         ``certain_dim_regex``        -> "Diminutif confirmé"
         ``possible_exception_regex`` -> "Faux positif possible"
      4. "???" when nothing applies.

    The heuristics describe word endings, so the whole word must match
    (``re.fullmatch``). ``re.match`` only anchors the start and accepted any
    matching prefix: e.g. "διονυσιον" was reported as a possible diminutive
    because of its leading "διον" although it ends in "-σιον".
    """
    # Explicit dictionary entries take precedence over every heuristic.
    if word in dim_dict['confirmed']:
        return "Diminutif confirmé"

    if word in dim_dict['exceptions']:
        return "Faux positif confirmé"

    certainty = "???"

    if re.fullmatch(possible_dim_regex, word):
        certainty = "Diminutif possible"
    if re.fullmatch(certain_dim_regex, word):
        certainty = "Diminutif confirmé"

    # Checked last on purpose: a suspicious ending downgrades the verdict even
    # when one of the diminutive heuristics matched.
    if re.fullmatch(possible_exception_regex, word):
        certainty = "Faux positif possible"

    return certainty

In [32]:
def get_play_gender_ratio(play):
    """Return the percentage of entries spoken by men and by women in a play.

    Args:
        play: key into the global ``clean_plays``; each entry must carry a
            ``"gender"`` key.

    Returns:
        dict: ``{"men": <pct>, "women": <pct>}``, each rounded to two decimals
        and computed over all entries of the play. Entries whose gender is
        neither "man" nor "woman" count towards the total only, so the two
        values need not add up to 100. A play without entries yields 0.0 for
        both instead of raising ZeroDivisionError.
    """
    lines = clean_plays[play]
    total = len(lines)

    # Nothing to count: avoid dividing by zero.
    if total == 0:
        return {"men": 0.0, "women": 0.0}

    men = sum(1 for line in lines if line["gender"] == "man")
    women = sum(1 for line in lines if line["gender"] == "woman")

    return {
        "men": round(men / total * 100, 2),
        "women": round(women / total * 100, 2),
    }

In [15]:
def get_play_line(play, lineID):
    """Return the first entry of ``plays[play]`` whose ``"id"`` equals ``lineID``.

    Looks the play up in the global ``plays`` dict. Returns ``None`` when no
    entry has that id.
    """
    matching_entries = (entry for entry in plays[play] if entry["id"] == lineID)
    return next(matching_entries, None)

In [37]:
# Spot check: display the entry with id 97 of Ecclesiazusae, looked up in the
# global `plays` dict.
get_play_line("Ecclesiazusae",97)

{'id': 97,
 'name': 'Χορός',
 'text': ['ὅρα δʼ ὅπως ὠθήσομεν τούσδε τοὺς ἐξ ἄστεως',
  'ἥκοντας, ὅσοι πρὸ τοῦ',
  'μέν, ἡνίκʼ ἔδει λαβεῖν ',
  'ἐλθόντʼ ὀβολὸν μόνον,',
  'καθῆντο λαλοῦντες ',
  'ἐν τοῖς στεφανώμασιν,',
  'νυνὶ δʼ ἐνοχλοῦσʼ ἄγαν.',
  'ἀλλʼ οὐχί, Μυρωνίδης',
  'ὅτʼ ἦρχεν ὁ γεννάδας,',
  'οὐδεὶς ἂν ἐτόλμα',
  'τὰ τῆς πόλεως διοικεῖν',
  'ἀργύριον φέρων·',
  'ἀλλʼ ἧκεν ἕκαστος',
  'ἐν ἀσκιδίῳ φέρων',
  'πιεῖν ἅμα τʼ ἄρτον αὑτῷ',
  'καὶ δύο κρομμύω',
  'καὶ τρεῖς ἂν ἐλάας.',
  'νυνὶ δὲ τριώβολον',
  'ζητοῦσι λαβεῖν, ὅταν',
  'πράττωσί τι κοινὸν ὥσ-',
  'περ πηλοφοροῦντες.']}