In [1]:
from stanza.server import CoreNLPClient

In [2]:
def gender_processing(sentence_groups):
    """Turn each group's numeric gender tally into a categorical label, in place.

    Input:
        sentence_groups : dict mapping tag -> dict holding a numeric 'gender'
                          tally and a 'number' count of votes

    Output:
        None. Every ``sentence_groups[tag]['gender']`` is overwritten with
        "MALE", "FEMALE" or "UNKNOWN". Groups with a non-zero vote count
        also print one trace line: tag, tally, vote count.
    """
    # The mean vote must lie strictly beyond +/- one third to be decisive.
    cutoff = 1 / 3

    for tag, entry in sentence_groups.items():
        tally = entry['gender']
        votes = entry['number']

        if votes != 0:
            print(tag, tally, votes)
            mean_vote = tally / votes
            if mean_vote > cutoff:
                label = "MALE"
            elif mean_vote < -cutoff:
                label = "FEMALE"
            else:
                label = "UNKNOWN"
        else:
            # No votes at all: nothing to average.
            label = "UNKNOWN"

        entry['gender'] = label

In [3]:
def sentence_selection_mod(text, annotations):
    """Group sentence indices by the animate entity they mention.

    Every coreference chain in ``annotations`` is visited. A chain is skipped
    when its representative mention is inanimate or a pronoun. Each remaining
    chain is keyed by the lower-cased head word of its representative mention,
    so chains that share a head word are merged into one group.

    Input:
        text : the story (not used by the function; kept for existing callers)
        annotations : annotated document exposing ``corefChain`` and
                      ``sentence`` (presumably the object returned by
                      ``CoreNLPClient.annotate`` -- confirm against caller)

    Output:
        sentence_groups : dict mapping tag -> {
            'gender'    : label written by gender_processing,
            'number'    : count of mentions that took part in the gender vote,
            'sentences' : set of sentence indices in which the entity is mentioned}
    """
    # Only MALE and FEMALE carry a vote. Any other value (UNKNOWN, or e.g.
    # NEUTRAL) contributes 0 instead of being counted as FEMALE.
    gender_vote = {"MALE": 1, "FEMALE": -1}
    sentence_groups = dict()

    # Read from the `annotations` argument, not from a notebook-level global.
    for chain in annotations.corefChain:
        rep_mention = chain.mention[chain.representative]
        if rep_mention.animacy == "INANIMATE" or rep_mention.mentionType == "PRONOMINAL":
            continue

        head_token = annotations.sentence[rep_mention.sentenceIndex].token[rep_mention.headIndex]
        tag = head_token.word.lower()
        # Trace line: one per kept chain.
        print(tag, rep_mention.mentionType, chain.chainID)

        group = sentence_groups.setdefault(
            tag, {'gender': 0, 'number': 0, 'sentences': set()})

        for mention in chain.mention:
            singular_pronoun = (mention.mentionType == "PRONOMINAL"
                                and mention.number == "SINGULAR")
            # A mention votes when its gender is known, or when it is a
            # singular pronoun (which then dilutes the tally with a 0 vote).
            if singular_pronoun or mention.gender != "UNKNOWN":
                group['gender'] += gender_vote.get(mention.gender, 0)
                group['number'] += 1
            group['sentences'].add(mention.sentenceIndex)

    gender_processing(sentence_groups)
    return sentence_groups

In [8]:
# Read the whole story into `text`; undecodable bytes are dropped (errors='ignore').
# The context manager closes the handle even if read() raises.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
with open(r'C:\Users\Sourav\Desktop\BTP\btp-1\Event_Segmentation\BTP Code\Red_Riding_Hood.txt', errors='ignore') as story_file:
    text = story_file.read()

In [9]:
import time
# Wall-clock timing of server start-up plus annotation; the printed value is in seconds.
t1 = time.time()
# Annotate `text` with the listed pipeline; `ann` is read by later cells.
# NOTE(review): `properties` sets 'annotators' to 'coref' only, while the
# `annotators` argument lists the full pipeline -- confirm which one wins.
# NOTE(review): outputFormat='json' is passed here, but the recorded start
# command shows `-outputFormat serialized` and later cells read `ann` through
# attributes (ann.corefChain, ann.sentence) -- confirm this keyword has any effect.
# NOTE(review): timeout=36000000 is presumably milliseconds -- confirm.
with CoreNLPClient(annotators = ['tokenize','ssplit','pos','lemma','ner', 'parse', 'depparse','coref'],
    properties={'annotators': 'coref', 'coref.algorithm' : 'neural'},
    memory='5G', be_quiet=True, outputFormat = 'json', max_char_length=500000, timeout=36000000) as client:
    ann = client.annotate(text)
t2 = time.time()
print(t2-t1)

2021-02-16 11:04:26 INFO: Writing properties to tmp file: corenlp_server-979966754df14732.props
2021-02-16 11:04:26 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Sourav\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-979966754df14732.props -annotators tokenize,ssplit,pos,lemma,ner,parse,depparse,coref -preload -outputFormat serialized


112.24005460739136


In [10]:
# Build the per-character groups from the annotated story. The call prints one
# trace line per kept coreference chain, then one line per tag with its gender
# tally and vote count.
grps = sentence_selection_mod(text, ann)

girl NOMINAL 194
grandma PROPER 104
wolf NOMINAL 233
wolf PROPER 204
woodsman NOMINAL 236
grandmother PROPER 240
mother NOMINAL 53
granny PROPER 123
girl -8 8
grandma -6 6
wolf 27 37
woodsman 5 7
grandmother -10 10
mother -4 4
granny -4 4


In [17]:
# Number of sentences in the annotated document; the sentence indices stored in
# `grps` index into this same list.
len(ann.sentence)

60

In [12]:
# Show the final groups: tag -> gender label, vote count and sentence-index set.
print(grps)

{'girl': {'gender': 'FEMALE', 'number': 8, 'sentences': {0, 1, 5, 46}}, 'grandma': {'gender': 'FEMALE', 'number': 6, 'sentences': {16, 17, 20, 6}}, 'wolf': {'gender': 'MALE', 'number': 37, 'sentences': {12, 14, 15, 17, 20, 21, 22, 23, 24, 26, 28, 29, 31, 37, 40, 43, 46, 47, 49, 52, 58}}, 'woodsman': {'gender': 'MALE', 'number': 7, 'sentences': {51, 52, 53, 54, 57, 58}}, 'grandmother': {'gender': 'FEMALE', 'number': 10, 'sentences': {32, 2, 4, 38, 41, 44, 45, 47, 52, 59}}, 'mother': {'gender': 'FEMALE', 'number': 4, 'sentences': {3, 10, 2, 6}}, 'granny': {'gender': 'FEMALE', 'number': 4, 'sentences': {24, 22, 23}}}


In [13]:
import pickle
# Persist the grouped result next to the story file. The context manager closes
# the handle even if pickle.dump raises.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
with open(r'C:\Users\Sourav\Desktop\BTP\btp-1\Event_Segmentation\BTP Code\Red_Riding_Hood_ann.gpickle', 'wb') as out_file:
    pickle.dump(grps, out_file)

In [134]:
#metrics
def metrics(obv, data, sentence_scores=False):
    """Score predicted character groups against gold annotations.

    Input:
        obv  : predicted groups, tag -> {'gender': label, 'sentences': iterable
               of sentence indices}
        data : gold groups with the same layout ('sentences' may be a list)
        sentence_scores : when True, also return sentence-level precision,
               recall and F1, computed over the tags present in both inputs

    Output:
        (caccuracy, gaccuracy) by default, where
            caccuracy = share of gold tags that were also predicted
            gaccuracy = share of shared tags whose gender label matches
        With sentence_scores=True:
            (caccuracy, gaccuracy, sprecision, srecall, sF1_score)
        Any ratio whose denominator is zero is reported as 0.0 rather than
        raising ZeroDivisionError.
    """
    def _ratio(numerator, denominator):
        # A zero denominator means there was nothing to score.
        return numerator / denominator if denominator else 0.0

    keys = set(obv.keys()).intersection(set(data.keys()))
    gtp = sum(1 for char in keys if obv[char]["gender"] == data[char]["gender"])
    caccuracy = _ratio(len(keys), len(data.keys()))
    gaccuracy = _ratio(gtp, len(keys))
    if not sentence_scores:
        return (caccuracy, gaccuracy)

    stp = sprec_den = sfn = 0
    for char in keys:
        # Sets on both sides so duplicate indices in a gold list are not
        # counted as extra false negatives.
        predicted = set(obv[char]["sentences"])
        gold = set(data[char]["sentences"])
        hits = len(predicted & gold)
        stp += hits
        sprec_den += len(predicted)
        sfn += len(gold) - hits
    sprecision = _ratio(stp, sprec_den)
    srecall = _ratio(stp, stp + sfn)
    sF1_score = _ratio(2 * sprecision * srecall, sprecision + srecall)
    return (caccuracy, gaccuracy, sprecision, srecall, sF1_score)

In [135]:
import pickle
# Load the gold annotations used by metrics(). The context manager closes the
# handle even if unpickling raises.
# SECURITY: pickle.load can execute arbitrary code -- only load files you created yourself.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
with open(r'C:\Users\Sourav\BTP Code\Panchatantra\foolish_weaver.gpickle', 'rb') as gold_file:
    data = pickle.load(gold_file)
print(data)

{'weaver': {'gender': 'MALE', 'sentences': [1]}, 'wife': {'gender': 'FEMALE', 'sentences': [3, 4]}}


In [136]:
# NOTE(review): `data` holds the foolish_weaver gold labels, while the `grps`
# printed earlier in this notebook was built from Red_Riding_Hood.txt and
# shares no tags with it. The recorded result therefore came from a different
# `grps` in the kernel -- re-run with a matching story/gold pair.
metrics(grps, data)

(1.0, 0.5)

In [80]:
# Story: the monkey and the crocodile.
# NOTE(review): the recorded output has five values, but metrics() as defined
# in this notebook returns two -- this result predates the current definition.
metrics(grps, data)

(1.0, 1.0, 0.9836065573770492, 0.5405405405405406, 0.6976744186046512)

In [88]:
# Story: the_story_of_the_merchant_son.
# NOTE(review): the recorded output has five values, but metrics() as defined
# in this notebook returns two -- this result predates the current definition.
metrics(grps, data)

(0.9090909090909091,
 0.7,
 0.7317073170731707,
 0.5504587155963303,
 0.6282722513089005)

In [31]:
# Story: the_thief_and_the_brahmins.
# NOTE(review): the recorded output has five values, but metrics() as defined
# in this notebook returns two -- this result predates the current definition.
metrics(grps, data)

(0.8333333333333334,
 0.8333333333333334,
 0.9047619047619048,
 0.7808219178082192,
 0.8382352941176471)

In [66]:
# Story: the_monkey_the_wedge.
# NOTE(review): the recorded output has five values, but metrics() as defined
# in this notebook returns two -- this result predates the current definition.
metrics(grps, data)

(0.42857142857142855, 1.0, 0.875, 1.0, 0.9333333333333333)