In [1]:
from stanza.server import CoreNLPClient

In [2]:
def gender_processing(sentence_groups):
    """Replaces each group's accumulated gender score with a gender label, in place

    Input:
        sentence_groups : dictionary mapping a tag to a dictionary holding at least
            'gender' (summed score of the counted mentions) and
            'number' (how many mentions were counted)

    Output:
        None; sentence_groups[tag]['gender'] is overwritten with "MALE", "FEMALE"
        or "UNKNOWN" for every tag. Calling the function again on the same
        dictionary leaves the labels unchanged.
    """
    def _label(tag):
        gender_sum = sentence_groups[tag]['gender']
        number = sentence_groups[tag]['number']
        # No counted mention: nothing to average.
        if number == 0:
            return "UNKNOWN"
        # Already converted by an earlier call: keep the label instead of
        # trying to divide a string by the mention count.
        if isinstance(gender_sum, str):
            return gender_sum
        print(tag, gender_sum, number)
        score = gender_sum/number
        # The mean score must be beyond +/- 1/3 before a gender is assigned.
        if score < -(1/3):
            return "FEMALE"
        if score > (1/3):
            return "MALE"
        return "UNKNOWN"

    for tag in sentence_groups.keys():
        sentence_groups[tag]['gender'] = _label(tag)

In [3]:
def sentence_selection_mod(text, annotations):
    """Groups the sentences based on animate entities

    Input:
        text : the story (not read by this function; kept so existing calls keep working)
        annotations : annotation of text; its corefChain and sentence fields are read

    Output:
        sentence_groups : dictionary keyed by the lower-cased head word of each kept
            chain's representative mention; each value holds
            'gender'    : label written by gender_processing
            'number'    : number of mentions that were counted for the gender score
            'sentences' : set of sentence indices in which the entity is mentioned
    """
    # Score contributed by one counted mention; any other gender value
    # (e.g. UNKNOWN or NEUTRAL) contributes 0 instead of being treated as female.
    gender_score = {"MALE": 1, "FEMALE": -1}

    sentence_groups = dict()

    # Read the chains from the `annotations` argument, not from a notebook global.
    for chain in annotations.corefChain:
        rep_mention = chain.mention[chain.representative]
        # Skip chains whose representative mention is inanimate or a pronoun.
        if rep_mention.animacy == "INANIMATE" or rep_mention.mentionType == "PRONOMINAL":
            continue
        head_token = annotations.sentence[rep_mention.sentenceIndex].token[rep_mention.headIndex]
        tag = head_token.word.lower()
        print(tag, rep_mention.mentionType, chain.chainID)
        # Chains whose representative mentions share a head word are merged into one group.
        if tag not in sentence_groups:
            sentence_groups[tag] = {'gender': 0, 'number': 0, 'sentences': set()}
        group = sentence_groups[tag]
        for mention in chain.mention:
            # Count a mention when it is a singular pronoun or carries a known gender.
            if (mention.mentionType == "PRONOMINAL" and mention.number == "SINGULAR") or not (mention.gender == "UNKNOWN"):
                group['gender'] += gender_score.get(mention.gender, 0)
                group['number'] += 1
            group['sentences'].add(mention.sentenceIndex)
    gender_processing(sentence_groups)
    return sentence_groups

In [20]:
# Read the whole story into memory. errors='ignore' silently drops bytes that
# cannot be decoded. The `with` block closes the file even if reading fails.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
with open(r'C:\Users\Sourav\Desktop\BTP\btp-1\Event_Segmentation\BTP Code\Panchatantra\the_story_of_the_merchant_son.txt', errors='ignore') as file:
    text = file.read()

In [21]:
import time
# Time the whole annotation run in wall-clock seconds; the client is created
# inside the timed region, so its start-up is included.
t1 = time.time()
# The client is used as a context manager and only lives for this `with` block.
# NOTE(review): `properties` sets 'annotators' to just 'coref' while the
# `annotators` argument lists the full pipeline - confirm which one takes effect.
# NOTE(review): outputFormat is 'json' here, but the saved server log shows
# "-outputFormat serialized" and later cells read the result through attributes
# (ann.sentence, ann.corefChain) - confirm the format actually returned.
with CoreNLPClient(annotators = ['tokenize','ssplit','pos','lemma','ner', 'parse', 'depparse','coref'],
    properties={'annotators': 'coref', 'coref.algorithm' : 'neural'},
    memory='5G', be_quiet=True, outputFormat = 'json', max_char_length=500000, timeout=36000000) as client:
    ann = client.annotate(text)
t2 = time.time()
# Elapsed seconds for server start-up plus annotation.
print(t2-t1)

2021-02-16 13:35:50 INFO: Writing properties to tmp file: corenlp_server-4c94d95d3edf468b.props
2021-02-16 13:35:50 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Sourav\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-4c94d95d3edf468b.props -annotators tokenize,ssplit,pos,lemma,ner,parse,depparse,coref -preload -outputFormat serialized


78.06968951225281


In [22]:
# Build the per-entity groups for the annotated story. The call also prints one
# line per kept coreference chain and one line per tag that gets a gender score.
grps = sentence_selection_mod(text, ann)

merchant NOMINAL 320
king PROPER 323
son NOMINAL 133
princess NOMINAL 325
sagardatta PROPER 6
man NOMINAL 329
daughter NOMINAL 333
attendant NOMINAL 81
bride LIST 337
watchman NOMINAL 346
son NOMINAL 31
nail NOMINAL 230
father NOMINAL 176
stranger NOMINAL 243
princess LIST 307
prince NOMINAL 182
lover NOMINAL 183
friends LIST 247
groom NOMINAL 248
merchant 1 1
king 9 29
son 14 23
princess -23 32
sagardatta 2 2
man 31 31
daughter -14 14
attendant 3 3
watchman 14 14
nail 0 2
father 2 2
stranger 2 2
prince 8 8
groom 2 2


In [23]:
# Number of sentences in the annotation; sentence indices stored in grps index into this list.
len(ann.sentence)

66

In [24]:
# Show the groups: gender label, counted mentions and sentence indices per tag.
print(grps)

{'merchant': {'gender': 'MALE', 'number': 1, 'sentences': {0, 3, 13, 24, 62}}, 'king': {'gender': 'UNKNOWN', 'number': 29, 'sentences': {41, 43, 44, 46, 47, 48, 50, 52, 53, 54, 57, 58, 59, 60, 61, 62, 63}}, 'son': {'gender': 'MALE', 'number': 23, 'sentences': {0, 1, 3, 4, 5, 13, 14, 15, 16, 17, 18, 19, 20, 24}}, 'princess': {'gender': 'FEMALE', 'number': 32, 'sentences': {10, 11, 12, 14, 17, 18, 19, 20, 21, 22, 23, 25, 26, 31, 51, 56, 57, 59, 60, 63}}, 'sagardatta': {'gender': 'MALE', 'number': 2, 'sentences': {0, 1}}, 'man': {'gender': 'MALE', 'number': 31, 'sentences': {6, 7, 8, 9, 27, 28, 30, 31, 32, 33, 36, 38, 39, 41, 44, 52, 53, 54, 55, 63}}, 'daughter': {'gender': 'FEMALE', 'number': 14, 'sentences': {64, 33, 34, 37, 38, 51, 56, 57, 58}}, 'attendant': {'gender': 'MALE', 'number': 3, 'sentences': {11, 12, 13}}, 'bride': {'gender': 'UNKNOWN', 'number': 0, 'sentences': {64}}, 'watchman': {'gender': 'MALE', 'number': 14, 'sentences': {64, 65, 33, 34, 51, 56, 57, 58, 29}}, 'nail': {'

In [25]:
import pickle
# Persist the groups computed for this story. The `with` block closes the file
# even if pickling fails part-way.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
with open(r'C:\Users\Sourav\Desktop\BTP\btp-1\Event_Segmentation\BTP Code\Panchatantra\the_story_of_the_merchant_son_ann.gpickle', 'wb') as file:
    pickle.dump(grps, file)

In [134]:
#metrics
def metrics(obv, data):
    """Scores observed character groups against reference groups

    Input:
        obv : observed groups, dictionary mapping a character tag to a dictionary
            with a 'gender' entry
        data : reference groups in the same layout

    Output:
        (caccuracy, gaccuracy) where
            caccuracy : fraction of the characters in data that also appear in obv
            gaccuracy : fraction of those shared characters whose 'gender' values are equal
        A value is 0.0 when its denominator would be zero (empty data, or no
        shared characters) instead of raising ZeroDivisionError.
    """
    # Only characters present on both sides can be compared.
    keys = set(obv.keys()).intersection(set(data.keys()))
    gtp = sum(1 for char in keys if obv[char]["gender"] == data[char]["gender"])
    gaccuracy = gtp/len(keys) if keys else 0.0
    caccuracy = len(keys)/len(data) if data else 0.0
    return (caccuracy, gaccuracy)

In [135]:
import pickle
# Load the stored groups that the extracted ones are compared against.
# NOTE(review): pickle.load can execute arbitrary code contained in the file -
# only load pickles from a trusted source.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
with open(r'C:\Users\Sourav\BTP Code\Panchatantra\foolish_weaver.gpickle', 'rb') as file:
    data = pickle.load(file)
print(data)

{'weaver': {'gender': 'MALE', 'sentences': [1]}, 'wife': {'gender': 'FEMALE', 'sentences': [3, 4]}}


In [136]:
# Compare the extracted groups with the loaded ones; displays (caccuracy, gaccuracy).
# NOTE(review): grps must come from the same story as data - confirm the cells were run in that order.
metrics(grps, data)

(1.0, 0.5)

In [80]:
# Story: the monkey and the crocodile
# NOTE(review): the saved output is a 5-tuple, but metrics() as defined in this
# notebook returns 2 values - presumably left over from an earlier revision; re-run to refresh.
metrics(grps, data)

(1.0, 1.0, 0.9836065573770492, 0.5405405405405406, 0.6976744186046512)

In [88]:
# Story: the_story_of_the_merchant_son
# NOTE(review): the saved output is a 5-tuple, but metrics() as defined in this
# notebook returns 2 values - presumably left over from an earlier revision; re-run to refresh.
metrics(grps, data)

(0.9090909090909091,
 0.7,
 0.7317073170731707,
 0.5504587155963303,
 0.6282722513089005)

In [31]:
# Story: the_thief_and_the_brahmins
# NOTE(review): the saved output is a 5-tuple, but metrics() as defined in this
# notebook returns 2 values - presumably left over from an earlier revision; re-run to refresh.
metrics(grps, data)

(0.8333333333333334,
 0.8333333333333334,
 0.9047619047619048,
 0.7808219178082192,
 0.8382352941176471)

In [66]:
# Story: the_monkey_the_wedge
# NOTE(review): the saved output is a 5-tuple, but metrics() as defined in this
# notebook returns 2 values - presumably left over from an earlier revision; re-run to refresh.
metrics(grps, data)

(0.42857142857142855, 1.0, 0.875, 1.0, 0.9333333333333333)

In [27]:
# List every sentence next to its index, sliced out of the story by the
# character offsets stored on the sentence.
for i, s in enumerate(ann.sentence):
    print(f"{i}: {text[s.characterOffsetBegin:s.characterOffsetEnd]}")

0: Sagardatta was a merchant who had a handsome young son.
1: One day, he observed that his young son had bought a costly book.
2: But the book contained only one verse: "You get what is destined for you!"
3: When the merchant realized that his son was a fool to buy a book with so much cost, but only one verse, he grew very angry.
4: He said, "How can you do well in business, when you can buy a book which contains only one verse!
5: Get out of my house, and never show me your face again!
6: "
 
The young man was utterly dejected for being driven out by his father; he started travelling with only his book along with him.
7: He learnt the verse well, and kept repeating it, all the way.
8: On his way, he arrived in a village.
9: The villagers asked him his name and he replied from the book, "You get what is destined for you"
 
Thus, the young man became known as 'You-get-what-is-destined-for-you'.
10: One day, the princess of the country visited a festival and saw a handsome prince, who w