In [1]:
from empath import Empath
import requests
import json
from nltk.stem import WordNetLemmatizer

In [2]:
def create_category(self,name,seeds,model="fiction",size=100,write=True):
    resp = requests.post(self.backend_url + "/create_category", json={"terms":seeds,"size":size,"model":model})
    results = json.loads(resp.text)
    lemma_words = list()
    lemmatizer = WordNetLemmatizer() 
    for word in results:
        lemma_words.append(lemmatizer.lemmatize(word))
    self.cats[name] = list(set(lemma_words))
    if write:
        with open(self.base_dir+"/data/user/"+name+".empath","w") as f:
            f.write("\t".join([name]+results))

Empath.create_lemma_category = create_category

In [3]:
import nltk
from nltk.corpus import wordnet
lexicon = Empath()
lemmatizer = WordNetLemmatizer()
from stanza.server import CoreNLPClient

In [13]:
def create_lexicons(rb,lv,fp,ct):
    lexicon.create_lemma_category("religious_buildings", ["church","mosque", "temple"], model="fiction", size = rb)
    lexicon.create_lemma_category("loc_verbs", ["arrive", "visit", "travel", "return"], model = "fiction", size= lv)
    lexicon.create_lemma_category("fictional_places", ["place","buildings"], model ="fiction", size =fp)
    lexicon.create_lemma_category("custom_times", ["once_upon_a_time", "next_day","that_evening"], size = ct)

In [14]:
create_lexicons(30,14,300,300)

In [5]:
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

In [6]:
def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [7]:
story_names = []
file = open("D:\Jupyter\BTP\Panchatantra\Storynames.txt")
file_story_names = file.readlines()
for name in file_story_names:
    story_names.append(name.strip('\n'))
file.close()
print(story_names)

['the_story_of_the_merchant_son', 'the_aged_mother', 'the_thief_and_the_brahmins', 'the_monkey_and_the_crocodile', 'the_monkey_the_wedge']


In [8]:
def open_story(Storyname):
    file = open("D:\Jupyter\BTP\Panchatantra\\"+Storyname+'.txt')
    text = file.read()
    file.close()
    return text
def annotate_story(text):
    with CoreNLPClient(annotators = ['tokenize','ssplit'],
        memory='5G', be_quiet=True, outputFormat = 'json', max_char_length=500000, timeout=36000000) as client:
        annotated_story = client.annotate(text)
    return annotated_story
def open_and_annotate(Storyname):
    text = open_story(Storyname)
    annotated_story = annotate_story(text)
    return text, annotated_story

In [9]:
def events_by_location_and_time(text,ann):
    """
    Non-hierarchy model
    """
    #This function finds sum of dictionary returned by lexicon.analyze i.e., it finds the presence of location words.
    def sum_of_locs_dict(dictionary):
        sum_ = 0
        for key in dictionary.keys():
            sum_ = sum_ + dictionary[key]
        return sum_
       
    lexicon = Empath()   #Part of code used to bring Empath in
    locations_dict = dict()     #Dictionary that holds
    location = "Unknown"    #The variable place will hold latest location word.
                            #It is initilized to "unknown" beacuse till now we haven't encountered any location word.
    loaction_by_sentence = []
    location_to_number = dict() # Convert location words to numbers for better representation
    loc_num = 0 # Will be used to put location words as numbers in the location_to_number dict
    total_sentences = 0
    
    #Take each sentence of the story one by one (ann.sentence returns individual sentences of the story as objects)
    for i, sentence in enumerate(ann.sentence):
        # Remove comma and fullstop beacuse lexicon.analyze cannot identify words if they are followd by a fullstop or comma.
        # text[characterOffsetBegin:characterOffserEnd] is the actual sentence (as a string) of the sentence object returned
        sentence_for_empath = text[sentence.characterOffsetBegin:sentence.characterOffsetEnd].replace(", "," ").replace(".","").replace("-"," ").replace("?","").replace("!","").replace(":"," ")
        #Lemmatize the words you encounter for better identification when being analysed by lexicon.analyze
        #May be commented out because lexicon.create_category does not give good words when singular words are used
        sentence_for_empath = lemmatize_sentence(sentence_for_empath) # Sentences are all lemmatized now
        # Analyze the things
        lexicon_locations_dict = lexicon.analyze(sentence_for_empath,
                                                categories=["religious_buildings", "loc_verbs", "fictional_places", "custom_times"])
        
        s = sum_of_locs_dict(lexicon_locations_dict)
        if s>0:
            words = sentence_for_empath.split(" ")
            # Find if place is same as previous
            for word in words:
                # If the word is a location word
                if sum_of_locs_dict(lexicon.analyze(word,
                                                   categories=["religious_buildings", "loc_verbs", "fictional_places", "custom_times"]))>0:
                    #if new location word encountered is the same as the last location word encountered
                    if word == location:
                        break
                    else:
                        location = word
                        if location in locations_dict:
                            location+="1"
                        locations_dict[location]=[i]
                        location_to_number[location] = loc_num
                        loc_num += 1
        else: 
            if location not in locations_dict:
                locations_dict[location] = list()
            locations_dict[location].append(i)
        total_sentences = i
    return locations_dict, location_to_number, total_sentences

In [10]:
def construct_event_list(locations_dict):
    events= []
    for location in locations_dict:
        events.append(locations_dict[location][0])
    events.sort()
    events.append(total_sentences)
    while events[0] == 0:
        del events[0]
    return set(events)

In [11]:
my_list_of_events = []
for i,name in enumerate(story_names):
    text , annotated_story = open_and_annotate(name)
    locations_dict, location_number_map, total_sentences = events_by_location_and_time(text, annotated_story)
    my_list_of_events.append(construct_event_list(locations_dict))

2021-03-16 01:38:34 INFO: Writing properties to tmp file: corenlp_server-8416e896f90440c7.props
2021-03-16 01:38:35 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Giri\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-8416e896f90440c7.props -annotators tokenize,ssplit -preload -outputFormat serialized
2021-03-16 01:38:36 INFO: Writing properties to tmp file: corenlp_server-2b0f74dd77aa4ae5.props
2021-03-16 01:38:36 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Giri\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-2b0f74dd77aa4ae5.props -annotators tokenize,ssplit -preload -outputFormat serialized
2021-03-16 01:38:38 INFO: Writing properties to tmp file: corenlp_server-2dfa9d027d3b4e23.props
2021-03-16 01:38:38 INFO: 

In [12]:
my_list_of_events

[{6, 7, 8, 10, 15, 27, 30, 33, 46, 51, 62, 65},
 {1, 10, 11, 15, 17, 23, 28, 30, 54},
 {2, 8, 16, 18, 23, 25, 48},
 {9, 24, 47, 56, 60},
 {1, 2, 4, 11}]