In [1]:
from stanza.server import CoreNLPClient

In [2]:
def gender_processing(sentence_groups):
    def gender(tag):
        gender_sum = sentence_groups[tag]['gender']
        number = sentence_groups[tag]['number']
        if number == 0:
            return "UNKNOWN"
        print(tag, gender_sum, number)
        gender = gender_sum/number
        if gender<-(1/3):
            return "FEMALE"
        elif gender>(1/3):
            return "MALE"
        else:
            return "UNKNOWN"
    for tag in sentence_groups.keys():
        sentence_groups[tag]['gender'] = gender(tag)

In [3]:
def sentence_selection_mod(text, annotations):
    """Groups the sentences based on animate entities
    
    Input:
        text : the story
        annotations : annotators of text
        
    Output:
        sentence_groups : dictionary containing sentences belonging to different animate entities    
    """    
    
    sentence_groups = dict()
    tag_dict = dict()
                
    for chain in ann.corefChain:
        sentences = set()
        rep_mention = chain.mention[chain.representative]
        if rep_mention.animacy == "INANIMATE" or rep_mention.mentionType == "PRONOMINAL":
            continue
        sI = rep_mention.sentenceIndex
        hI = rep_mention.headIndex
        if (sI, hI) not in tag_dict:
            tag_dict[(sI, hI)] = ann.sentence[sI].token[hI].word
        tag = tag_dict[(sI, hI)].lower()
        print(tag, rep_mention.mentionType, chain.chainID)
        if tag not in sentence_groups:
            sentence_groups[tag] = dict()
            sentence_groups[tag]['gender'] = 0
            sentence_groups[tag]['number'] = 0
            sentence_groups[tag]['sentences'] = set()
        for mention in chain.mention:
            if (mention.mentionType == "PRONOMINAL" and mention.number == "SINGULAR") or not (mention.gender == "UNKNOWN"):
                sentence_groups[tag]['gender'] += (0 if mention.gender == "UNKNOWN" else (1 if mention.gender == "MALE" else -1))
                sentence_groups[tag]['number'] +=1
            sentence_groups[tag]['sentences'].add(mention.sentenceIndex)
#         sentence_groups[tag]['sentences'] = sentence_groups[tag]['sentences'].union(sentences.copy())
    gender_processing(sentence_groups)
    return sentence_groups

In [4]:
file = open(r'C:\Users\Sourav\BTP Code\Panchatantra\premchand11.txt', errors='ignore')
text = file.read()
file.close()

In [5]:
import time
t1 = time.time()
with CoreNLPClient(annotators = ['tokenize','ssplit','pos','lemma','ner', 'parse', 'depparse','coref'],
    properties={'annotators': 'coref', 'coref.algorithm' : 'neural'},
    memory='5G', be_quiet=True, outputFormat = 'json', max_char_length=500000, timeout=36000000) as client:
    ann = client.annotate(text)
t2 = time.time()
print(t2-t1)

2021-02-11 16:31:09 INFO: Writing properties to tmp file: corenlp_server-dc026bc7d4384b60.props
2021-02-11 16:31:09 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Sourav\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-dc026bc7d4384b60.props -annotators tokenize,ssplit,pos,lemma,ner,parse,depparse,coref -preload -outputFormat serialized


1430.514174938202


In [6]:
grps = sentence_selection_mod(text, ann)

fellows NOMINAL 5123
gentleman NOMINAL 3093
widows LIST 6172
men NOMINAL 6178
hai PROPER 6182
man NOMINAL 4135
man NOMINAL 1066
mahatma NOMINAL 1068
˜friend PROPER 1075
husband NOMINAL 6197
man NOMINAL 5174
pandey PROPER 55
moteram PROPER 1084
driver NOMINAL 3132
children NOMINAL 2110
boys NOMINAL 2116
police NOMINAL 3140
god PROPER 4164
man NOMINAL 5192
people NOMINAL 4172
butcher NOMINAL 2135
brahmans PROPER 2136
widow NOMINAL 91
chintamani PROPER 1116
drivers NOMINAL 3165
man NOMINAL 3167
chintamani PROPER 1123
others NOMINAL 4203
sharecroppers NOMINAL 2159
parching NOMINAL 130
men NOMINAL 1159
chintamani PROPER 1162
servants NOMINAL 141
pan NOMINAL 2192
men NOMINAL 1173
this NOMINAL 156
widow NOMINAL 6302
˜jhingur NOMINAL 2212
drivers NOMINAL 3240
master LIST 3246
councillors NOMINAL 6318
members NOMINAL 6319
widow NOMINAL 6322
people NOMINAL 183
pandit PROPER 1209
these NOMINAL 4300
driver NOMINAL 3279
jhingur PROPER 2266
those NOMINAL 1246
˜just PROPER 4319
buddhu PROPER 2272
wom

In [7]:
ann.corefChain

[chainID: 5123
mention {
  mentionID: 5117
  mentionType: "NOMINAL"
  number: "PLURAL"
  gender: "UNKNOWN"
  animacy: "ANIMATE"
  beginIndex: 0
  endIndex: 3
  headIndex: 2
  sentenceIndex: 1028
  position: 1
}
mention {
  mentionID: 5123
  mentionType: "PRONOMINAL"
  number: "PLURAL"
  gender: "UNKNOWN"
  animacy: "ANIMATE"
  beginIndex: 3
  endIndex: 4
  headIndex: 3
  sentenceIndex: 1029
  position: 5
}
representative: 0
, chainID: 1032
mention {
  mentionID: 1032
  mentionType: "NOMINAL"
  number: "SINGULAR"
  gender: "NEUTRAL"
  animacy: "INANIMATE"
  beginIndex: 1
  endIndex: 3
  headIndex: 2
  sentenceIndex: 225
  position: 1
}
mention {
  mentionID: 955
  mentionType: "NOMINAL"
  number: "SINGULAR"
  gender: "NEUTRAL"
  animacy: "INANIMATE"
  beginIndex: 11
  endIndex: 13
  headIndex: 12
  sentenceIndex: 207
  position: 4
}
representative: 1
, chainID: 1036
mention {
  mentionID: 995
  mentionType: "PRONOMINAL"
  number: "SINGULAR"
  gender: "UNKNOWN"
  animacy: "ANIMATE"
  beg

In [8]:
print(grps)

{'fellows': {'gender': 'UNKNOWN', 'number': 0, 'sentences': {1028, 1029}}, 'gentleman': {'gender': 'MALE', 'number': 17, 'sentences': {640, 628, 631, 632, 633, 634, 637, 639}}, 'widows': {'gender': 'UNKNOWN', 'number': 0, 'sentences': {1280}}, 'men': {'gender': 'FEMALE', 'number': 1, 'sentences': {1280, 673, 674, 675, 678, 679, 680, 237, 238, 1008, 241, 690, 248, 249, 251}}, 'hai': {'gender': 'MALE', 'number': 15, 'sentences': {1280, 1281, 1272, 1273, 1274, 1275, 1276, 1278, 1279}}, 'man': {'gender': 'MALE', 'number': 69, 'sentences': {655, 1040, 1044, 287, 288, 290, 176, 827, 828, 829, 325, 838, 202, 857, 861, 862, 864, 866, 867, 228, 869, 870, 871, 232, 873, 872, 875, 868, 877, 878, 879, 880, 1019, 1020}}, 'mahatma': {'gender': 'UNKNOWN', 'number': 2, 'sentences': {233}}, '˜friend': {'gender': 'UNKNOWN', 'number': 0, 'sentences': {200, 234}}, 'husband': {'gender': 'MALE', 'number': 2, 'sentences': {1283, 1284}}, 'pandey': {'gender': 'MALE', 'number': 2, 'sentences': {9}}, 'moteram': 

In [134]:
#metrics
def metrics(obv, data):
    stp = sprec_den = sfn = 0
    gtp = 0
    keys = set(obv.keys()).intersection(set(data.keys()))
    for char in keys:
        stp += len(obv[char]["sentences"].intersection(data[char]["sentences"]))
        gtp += 1 if (obv[char]["gender"]==data[char]["gender"]) else 0
        sprec_den += len(obv[char]["sentences"])
        sfn += len(data[char]["sentences"]) - len(obv[char]["sentences"].intersection(data[char]["sentences"]))
#     sprecision = stp/sprec_den
#     srecall = stp/(stp+sfn)
#     sF1_score = 2*sprecision*srecall/(sprecision+srecall)
    gaccuracy = gtp/len(keys)
    caccuracy = len(keys)/len(data.keys())
    return (caccuracy, gaccuracy)

In [135]:
import pickle
file = open(r'C:\Users\Sourav\BTP Code\Panchatantra\foolish_weaver.gpickle', 'rb')
data = pickle.load(file)
file.close()
print(data)

{'weaver': {'gender': 'MALE', 'sentences': [1]}, 'wife': {'gender': 'FEMALE', 'sentences': [3, 4]}}


In [136]:
metrics(grps, data)

(1.0, 0.5)

In [80]:
#the monkey and the crocodile
metrics(grps, data)

(1.0, 1.0, 0.9836065573770492, 0.5405405405405406, 0.6976744186046512)

In [88]:
#the_story_of_the_merchant_son
metrics(grps, data)

(0.9090909090909091,
 0.7,
 0.7317073170731707,
 0.5504587155963303,
 0.6282722513089005)

In [31]:
#the_thief_and_the_brahmins
metrics(grps, data)

(0.8333333333333334,
 0.8333333333333334,
 0.9047619047619048,
 0.7808219178082192,
 0.8382352941176471)

In [66]:
#the_monkey_the_wedge
metrics(grps, data)

(0.42857142857142855, 1.0, 0.875, 1.0, 0.9333333333333333)