In [None]:
from empath import Empath
import requests
import json
from nltk.stem import WordNetLemmatizer

In [None]:
import nltk
from nltk.corpus import wordnet
lexicon = Empath()
lemmatizer = WordNetLemmatizer()
from stanza.server import CoreNLPClient

In [None]:
# lexicon.create_category("custom_times_3", ["when", "next_day","one_time"], size = 300)

In [None]:
def create_lexicons(rb,lv,fp,ct):
    lexicon.create_category("religious_buildings", ["church","mosque", "temple"], model="fiction", size = rb)
    lexicon.create_category("loc_verbs", ["arrive", "visit", "travel", "return"], model = "fiction", size= lv)
    lexicon.create_category("fictional_places", ["place","buildings"], model ="fiction", size =fp)
    lexicon.create_category("custom_times", ["once_upon_a_time", "next_day","that_evening"], size = ct)

In [None]:
create_lexicons(30,14,300,300)

In [6]:
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

In [7]:
def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [8]:
story_names = []
file = open("D:\Jupyter\BTP\Panchatantra\Storynames_old.txt")
file_story_names = file.readlines()
for name in file_story_names:
    story_names.append(name.strip('\n'))
file.close()
print(story_names)

['the_story_of_the_merchant_son', 'the_thief_and_the_brahmins', 'the_monkey_and_the_crocodile', 'the_monkey_the_wedge']


In [9]:
def open_story(Storyname):
    file = open("D:\Jupyter\BTP\Panchatantra\\"+Storyname+'.txt')
    text = file.read()
    file.close()
    return text
def annotate_story(text):
    with CoreNLPClient(annotators = ['tokenize','ssplit'],
        memory='5G', be_quiet=True, outputFormat = 'json', max_char_length=500000, timeout=36000000) as client:
        annotated_story = client.annotate(text)
    return annotated_story
def open_and_annotate(Storyname):
    text = open_story(Storyname)
    annotated_story = annotate_story(text)
    return text, annotated_story

In [10]:
no_splitting = [[4, 14, 15, 16, 21, 22, 30, 46, 47, 52, 53],
[15, 16, 19, 20, 25, 26, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 45, 46],
[4, 5, 16, 17, 20, 26, 27, 28, 29, 39, 40, 44, 45, 46, 47, 53, 54, 55, 56, 57],
[]]
no_splitting_alternate = [[4, 5, 14, 15, 16, 17, 21, 22, 23, 30, 31, 46, 47, 48, 52, 53, 54],
[15, 16, 17, 19, 20, 21, 25, 26, 27, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 45, 46, 47],
[4, 5, 6, 16, 17, 18, 20, 21, 26, 27, 28, 29, 30, 39, 40, 41, 44, 45, 46, 47, 48, 53, 54, 55, 56, 57, 58],
[]]

In [11]:
def events_by_location_and_time(text,ann):
    """
    Non-hierarchy model
    """
    #This function finds sum of dictionary returned by lexicon.analyze i.e., it finds the presence of location words.
    def sum_of_locs_dict(dictionary):
        sum_ = 0
        for key in dictionary.keys():
            sum_ = sum_ + dictionary[key]
        return sum_
       
    lexicon = Empath()   #Part of code used to bring Empath in
    locations_dict = dict()     #Dictionary that holds
    location = "Unknown"    #The variable place will hold latest location word.
                            #It is initilized to "unknown" beacuse till now we haven't encountered any location word.
    loaction_by_sentence = []
    location_to_number = dict() # Convert location words to numbers for better representation
    loc_num = 0 # Will be used to put location words as numbers in the location_to_number dict
    total_sentences = 0
    
    #Take each sentence of the story one by one (ann.sentence returns individual sentences of the story as objects)
    for i, sentence in enumerate(ann.sentence):
        # Remove comma and fullstop beacuse lexicon.analyze cannot identify words if they are followd by a fullstop or comma.
        # text[characterOffsetBegin:characterOffserEnd] is the actual sentence (as a string) of the sentence object returned
        sentence_for_empath = text[sentence.characterOffsetBegin:sentence.characterOffsetEnd].replace(", "," ").replace(".","").replace("-"," ").replace("?","").replace("!","").replace(":"," ")
        #Lemmatize the words you encounter for better identification when being analysed by lexicon.analyze
        #May be commented out because lexicon.create_category does not give good words when singular words are used
        sentence_for_empath = lemmatize_sentence(sentence_for_empath) # Sentences are all lemmatized now
        # Analyze the things
        lexicon_locations_dict = lexicon.analyze(sentence_for_empath,
                                                categories=["religious_buildings", "loc_verbs", "fictional_places", "custom_times"])
        
        s = sum_of_locs_dict(lexicon_locations_dict)
        if s>0:
            words = sentence_for_empath.split(" ")
            # Find if place is same as previous
            for word in words:
                # If the word is a location word
                if sum_of_locs_dict(lexicon.analyze(word,
                                                   categories=["religious_buildings", "loc_verbs", "fictional_places", "custom_times"]))>0:
                    #if new location word encountered is the same as the last location word encountered
                    if word == location:
                        break
                    else:
                        location = word
                        if location in locations_dict:
                            location+="1"
                        locations_dict[location]=[i]
                        location_to_number[location] = loc_num
                        loc_num += 1
        else: 
            if location not in locations_dict:
                locations_dict[location] = list()
            locations_dict[location].append(i)
        total_sentences = i
    return locations_dict, location_to_number, total_sentences

In [12]:
def construct_event_list(locations_dict, no_split):
    events= []
    for location in locations_dict:
        if locations_dict[location][0] not in no_split:
            events.append(locations_dict[location][0])
    events.sort()
    events.append(total_sentences)
    while events[0] == 0:
        del events[0]
    return set(events)

In [13]:
my_list_of_events = []
for i,name in enumerate(story_names):
    text , annotated_story = open_and_annotate(name)
    locations_dict, location_number_map, total_sentences = events_by_location_and_time(text, annotated_story)
    my_list_of_events.append(construct_event_list(locations_dict, no_splitting[i]))

2021-03-30 04:20:29 INFO: Writing properties to tmp file: corenlp_server-a6310a731f75453a.props
2021-03-30 04:20:29 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Giri\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-a6310a731f75453a.props -annotators tokenize,ssplit -preload -outputFormat serialized
2021-03-30 04:20:32 INFO: Writing properties to tmp file: corenlp_server-e9629631fa9047ed.props
2021-03-30 04:20:32 INFO: Starting server with command: java -Xmx5G -cp C:\Users\Giri\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 36000000 -threads 5 -maxCharLength 500000 -quiet True -serverProperties corenlp_server-e9629631fa9047ed.props -annotators tokenize,ssplit -preload -outputFormat serialized
2021-03-30 04:20:34 INFO: Writing properties to tmp file: corenlp_server-4a0c52c4c1ba42e7.props
2021-03-30 04:20:34 INFO: 

In [14]:
my_list_of_events

[{1, 6, 7, 8, 10, 19, 27, 28, 31, 33, 41, 51, 58, 62, 65},
 {2, 8, 18, 22, 23, 28, 48},
 {1, 2, 9, 24, 33, 60},
 {1, 2, 4, 11}]

In [15]:
my_list_of_events_2 = []
for i,event in enumerate(my_list_of_events):    
    x = sorted(list(event))
    my_list_of_events_2.append(x)
print(my_list_of_events_2)

[[1, 6, 7, 8, 10, 19, 27, 28, 31, 33, 41, 51, 58, 62, 65], [2, 8, 18, 22, 23, 28, 48], [1, 2, 9, 24, 33, 60], [1, 2, 4, 11]]
