### Setup

In [170]:
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import re
from spacy.symbols import nsubj, VERB, amod, acomp
import spacy
import string

## load spacy and neural coref
nlp = spacy.load("en_core_web_sm")
trf_nlp = spacy.load("en_core_web_trf")
nlp.max_length = 2000000

In [171]:
## Load AllenNLP models
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
from allennlp_models import pretrained

coref_predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz")
sentiment_predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/stanford-sentiment-treebank-roberta.2021-03-11.tar.gz")
open_info_predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz")

In [374]:
## Load story
story = "Mother's_Day"
file_name = "../static/data/story/"+ story + "/"+ story +".txt"
book = open(file_name, encoding='utf-8-sig').read()
book[:50]

'Mother’s Day\nWorking on any Sunday was bad enough.'

In [375]:
book = book.replace('“','"').replace('”','"')

In [376]:
book1 = " ".join(book.split())

### Split using regex and coreference resolution
##### We only need to run this one time and save the clusters in a json file. The json file is then fed into the interface to assign names to the cluster and merge them.

In [377]:
import re

In [378]:
## regex to split text into parts and chapters
regex1 = "PART\s+(?:ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT.*)"
regex2 = "Chapter\s+\d+"
# regex2 = "Chapter\s+?(IX|IV|V?I{0,3}).*"

In [379]:
def get_cluster_range(book):
    all_clusters = []
    for m in re.finditer(regex1, book):
        part_start = m.start()
        ## start of a part
        all_clusters.append({"start_char": part_start})

    for i, part in enumerate(all_clusters):
        chapters = []
        if i == (len(all_clusters) -1):
            part['end_char'] = len(book)
            for m in re.finditer(regex2, book[part['start_char']:]):
                ## start of a chapter
                chapter_start = m.end()
                chapters.append({'start_char':chapter_start + part['start_char'], 'span':m.span()})
#         else:
#             part['end_char'] = all_clusters[i + 1]['start_char']
#             for m in re.finditer(regex2, book[part['start_char']: all_clusters[i + 1]['start_char']]):
#                 chapter_start = m.start()
#                 chapters.append({'start_char':chapter_start + part['start_char']})

        all_clusters[i]['chapters'] = chapters
        
    ## get the end of the chapters
    for i, part in enumerate(all_clusters):
        for j, chapter in enumerate(part['chapters']):
            if j == (len(part['chapters']) -1):
                chapter['end_char'] = part['end_char']   
            else:
                chapter['end_char'] = part['chapters'][j+1]['span'][0]

    return all_clusters

def get_default_range(book, split_equally = False):
    d = {}
    d["start_char"] = 0
    d['end_char'] = len(book)
    d['chapters'] = [{"start_char": 0, "end_char": len(book)}]
    
    return [d]
    

def get_coref_clusters2(book):
    '''
        This function detects coref_clusters by splitting 
        the story to parts and clusters
    '''
    all_clusters = get_cluster_range(book)
    
    if not len(all_clusters):
        print("no chapters detect.")
        all_clusters = get_default_range(book, split_equally = False)

    for i, part in enumerate(all_clusters):
        for j, chapter in enumerate(part['chapters']):
            print(i,j)
            coref_res = coref_predictor.predict(book[chapter['start_char']:chapter['end_char']])
            #chapter['clusters'] =  coref_res['clusters']
            s = chapter['start_char']
            e = chapter['end_char']
            local_doc = nlp(book[s:e])
            token_offset = []
            for token in local_doc:
                token_offset.append(token.idx)
                
            adjusted_cluster = []
            for cl in coref_res['clusters']:
                new_cl = []
                for c in cl:
                    cs = token_offset[c[0]]+ s
                    ce = token_offset[c[1] + 1]+ s
                    new_cl.append([cs, ce, c[0], c[1] + 1])
                adjusted_cluster.append(new_cl)
                
            chapter['clusters'] = adjusted_cluster

    return all_clusters

In [346]:
all_clusters = get_coref_clusters2(book)

no chapters detect.
0 0


In [347]:
# all_clusters[0]['chapters'][36]

In [348]:
import json
with open('../static/data/story/'+story+ '/' + story +'_annotation.json', 'w') as outfile:
    outfile.write(json.dumps(all_clusters, indent=2))

### Load annotation and characters from json

In [380]:
import json
f = open( '../static/data/story/'+story+ '/' + story + '_annotation.json')
all_clusters = json.load(f)

In [381]:
f = open( '../static/data/story/'+story+ '/' + story + '_characters.json')
data = json.load(f)

characters = {}
tags = {}
for k in data:
    if data[k]['type'] == 'character':
        characters[data[k]['name']] = data[k]['data']
    elif data[k]['type'] == 'tag':
        tags[data[k]['name']] = data[k]['data']

In [382]:
# characters.keys()

### Sentiment and Emotion analysis

In [383]:
from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
from scipy.special import softmax
import csv
import urllib.request
import numpy as np

tokenizer = AutoTokenizer.from_pretrained(f"cardiffnlp/twitter-roberta-base-emotion")
model = AutoModelForSequenceClassification.from_pretrained(f"cardiffnlp/twitter-roberta-base-emotion")

In [384]:
# download label mapping
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

In [385]:
def get_emotion(text):
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    ranking = np.argsort(scores)
    ranking = ranking[::-1]

    l = labels[ranking[0]]
        
    return l

In [386]:
sentence_mapping = []
global_ind = 0
for i, part in enumerate(all_clusters):
    for j, chapter in enumerate(part['chapters']):
        print(i,j)
#         if j < 17:
        s = chapter['start_char']
        e = chapter['end_char']
        local_doc = nlp(book[s:e])
        local_ind = 0
        for sent in local_doc.sents:
            d = {}
            d['text'] = sent.text
            d['clean_text'] = " ".join(sent.text.split())
            d['sentiment'] = sentiment_predictor.predict(d['clean_text'])['probs'][0]
            d['emotion'] = get_emotion(d['clean_text'])
            d['start_char'] = s + sent.start_char
            d['end_char'] = s + sent.end_char
            d['part'] = i
            d['chapter'] = j
            d['global_sent_id'] = global_ind
            d['local_sent_id'] = local_ind ## index within a chapter

            d['mentions'] = []
            for c in characters:
                for mention in characters[c]:
                    if (mention[0] >= (sent.start_char + s)) and (mention[1] < (sent.end_char + s)):
                        d['mentions'].append({'start_char': mention[0], 
                                              'end_char': mention[1],
                                              'character': c})

            sentence_mapping.append(d)
            global_ind += 1
            local_ind += 1

0 0


In [387]:
sentence_mapping[-1]

{'text': '\n"The person you are calling has blocked your number."',
 'clean_text': '"The person you are calling has blocked your number."',
 'sentiment': 0.0003652545274235308,
 'emotion': 'anger',
 'start_char': 22955,
 'end_char': 23009,
 'part': 0,
 'chapter': 0,
 'global_sent_id': 319,
 'local_sent_id': 319,
 'mentions': [{'start_char': 22996, 'end_char': 23001, 'character': 'Sophie'}]}

In [10]:
with open( '../static/data/story/'+story+ '/' + story + '_annotation.json', 'w') as outfile:
    outfile.write(json.dumps(all_clusters, indent=2))

In [391]:
f = open( '../static/data/story/'+story+ '/' + story + '_annotation.json')
all_clusters = json.load(f)

In [388]:
sentiment_emotion_data = []

for sent_id, sent in enumerate(sentence_mapping):
    character_dict = {}
    for mention in sent['mentions']:
        ## only need one mention in a sentence, not all of them.
        if mention['character'] not in character_dict:
            d = {"part": sent['part'], "chapter": sent['chapter'],
                 "character": mention['character'],
                 "start_char": sent['start_char'],
                 "end_char": sent['end_char'],
                 "global_sent_id": sent['global_sent_id'],
                 "local_sent_id": sent['local_sent_id'],
                 "sentiment": sent['sentiment'],
                 "emotion": sent['emotion'],
                 "presence": 1}

            sentiment_emotion_data.append(d)
            character_dict[mention['character']] = 1

In [389]:
data = {"sentiment_emotion_data": sentiment_emotion_data, 
        "characters": list(characters.keys()),
        "characters_map": characters,
        "sentence_mapping": sentence_mapping}

In [392]:
data['chapters_map'] = all_clusters

### presence of tags

In [393]:
tags_presence = []
for sent_id, sent in enumerate(sentence_mapping):   
    for t in tags:
        flag = False
        for mention in tags[t]:
            if (mention[0] >= sent['start_char']) and (mention[1] <= sent['end_char']):
                d = {"part": sent['part'], "chapter": sent['chapter'],
                     "tag": t,
                     "start_char": sent['start_char'],
                     "end_char": sent['end_char'],
                     "global_sent_id": sent['global_sent_id'], 
                     "local_sent_id": sent['local_sent_id'],
                     "presence": 1}
                tags_presence.append(d)
                break;
                        

In [394]:
data["tags_map"] = tags
data["tags_presence"] = tags_presence

In [187]:
with open('../static/data/story/'+story+ '/' + story +'.json', 'w') as outfile:
    outfile.write(json.dumps(data, indent=2))

### identify verbs

In [188]:
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))

from nltk.stem import SnowballStemmer
stemmer = nltk.stem.SnowballStemmer('english')

In [189]:
import numpy as np
from numpy.linalg import norm
from scipy.spatial.distance import cosine 
trf_nlp = spacy.load("en_core_web_trf")

In [350]:
import json
f = open( '../static/data/story/'+story+ '/' + story + '.json')
data = json.load(f)
sentence_mapping = data['sentence_mapping']

In [331]:
sentence_mapping[-1]

{'text': '\n\n\n\n\n',
 'clean_text': '',
 'sentiment': 0.47096481919288635,
 'emotion': 'anger',
 'start_char': 259993,
 'end_char': 259998,
 'part': 0,
 'chapter': 16,
 'global_sent_id': 3138,
 'local_sent_id': 175,
 'mentions': []}

In [190]:
def create_frame(arg_type, arg_text, s, e, c=False):
    d = {
        'arg_type': arg_type,
        'arg_text': arg_text , 
        'start_char': s,
        'start_end': e
    }
    if c:
        d['character'] = c
        
    return d

In [89]:
from numpy import linalg as LA
import numpy as np
from scipy.spatial.distance import cosine 
def get_bert_embedding(text, v):
    try:
        temp_doc = trf_nlp(text)
        v_ind = -1
        for i, t in enumerate(temp_doc):
            if t.text == v:
                v_ind = i
    #             print("#### matched")
    #             print(text, v)

        if v_ind >= 0:
            #print(v_ind)
            bert_tokens = temp_doc._.trf_data.align[v_ind].data
            #print(bert_tokens)
            embds = temp_doc._.trf_data.tensors[0]
            embds = embds.reshape(-1, embds.shape[-1])
            a = np.zeros(768)
            for i, t in enumerate(bert_tokens):
                a = np.add(a, embds[t])

            return a/LA.norm(a)
        else:
            print("#### did not matched")
            print(text, v)

        return []
    except Exception as e:
        print("Exception")
        print(text, v)
        
        return []

In [137]:
## function for getting embedding for a verb using GLOVE
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

wv = api.load('glove-wiki-gigaword-50')

def get_glove_embedding(text, v):
    try:
        return wv[v]
    except Exception as e:
        print("Exception")
        print(text, v)
        
        return []

In [91]:
# get_verb_embedding("Build something", "Build")

In [395]:
frames = []
frame = ""

for sent_id, sent in enumerate(sentence_mapping): 
    sent_doc = nlp(sent['text'])
    
    token_offset = []
    tokens = []     
    c = 0
    for tok in sent_doc:
        if "\n" in tok.text:
            c += 1
        if "\n" not in tok.text:
            #print(tok.text, c)
            token_offset.append([tok.idx, c])
            tokens.append(tok)

    ## predict oie
    oie = open_info_predictor.predict(sent['text'])
    words = oie['words']

    for res in oie['verbs']:
        if ((res['verb'] in stops)):
            continue

        ## initialize object for current frame
        frame = {"part": sent['part'], "chapter": sent['chapter'],
                 "global_sent_id": sent['global_sent_id'], "local_sent_id": sent['local_sent_id'],
                 'verb': res['verb'], 'lemmatized_verb':stemmer.stem(res['verb']), 
                 'args': [], 'got_character_as_agent': []}
        temp = ""

        for t_id, t in enumerate(res['tags']):
            if t != 'O':
                spl = t.split('-')
                if ((spl[0] == 'B')): ## tag starts with a 'B' and has ARG0, ARG1, etc. start of a potential match
                    if len(temp): ## there is something to save
                        tok_start = token_offset[arg_s][0] +  sent['start_char']
                        tok_end = token_offset[t_id][0] +  sent['start_char']
                        for mention in sent['mentions']:
                            if ( (mention['start_char'] >= tok_start) and \
                                     (mention['end_char'] <= tok_end) and \
                                     ('ARG0' in saved_type )):

                                    frame['got_character_as_agent'].append(mention['character'])
                                    break;

                        if abs(t_id - arg_s) < 12:
                            frame['args'].append(create_frame(saved_type, temp, tok_start, tok_end))

                    ## save the word, argument type, and index
                    temp = words[t_id]
                    saved_type = spl[1:]
                    arg_s = t_id
                    #print("1", temp, saved_type, arg_s)

                ## if tag starts with an "I", we are inside 
                ## a continuous argument
                elif ((spl[0] == 'I')):

                    ## append to existing argument
                    temp += ' '+words[t_id]

            else:
                ## check for any unsaved frame
                if len(temp): ## there is something to save
                    tok_start = token_offset[arg_s][0]  + sent['start_char']
                    tok_end = token_offset[t_id][0]  + sent['start_char']
                    for mention in sent['mentions']:
                        if ( (mention['start_char'] >= tok_start) and \
                                 (mention['end_char'] <= tok_end) and \
                                 ('ARG0' in saved_type )):

                                frame['got_character_as_agent'].append(mention['character'])
                                break;

                    if abs(t_id - arg_s) < 12:
                        frame['args'].append(create_frame(saved_type, temp, tok_start, tok_end))    

                ## reinitialize temp to empty string
                temp = ""


        if ( (len(frame['args'])) and (len(frame['got_character_as_agent'])) and (len(frame['verb']) >=4) ):
            emd = get_glove_embedding("", frame['verb'])
            if len(emd):
                frame['embedding'] = emd
            
                frames.append(frame)

                if len(frames) % 100 == 0:
                    print(len(frames))

Exception
 Glancing
100
Exception
 Enjoying
200
Exception
 Dumping


In [396]:
for f in frames:
    f['embedding'] = f['embedding'].tolist()

In [397]:
# for f in frames:
#     f['embedding'] = get_glove_embedding("", f['verb']).tolist()

In [398]:
data['frames'] = frames

In [399]:
# frames[0]

### Bert embedding

In [76]:
text = '''It was a very fine November day, and the Miss Musgroves came through
the little grounds, and stopped for no other purpose than to say, that
they were going to take a long walk, and therefore concluded Mary could
not like to go with them; and when Mary immediately replied, with some
jealousy at not being supposed a good walker, "Oh, yes, I should like
to join you very much, I am very fond of a long walk;" Anne felt
persuaded, by the looks of the two girls, that it was precisely what
they did not wish, and admired again the sort of necessity which the
family habits seemed to produce, of everything being to be
communicated, and everything being to be done together, however
undesired and inconvenient.'''
v = "persuaded"
emd = get_verb_embedding(text, "persuaded")

In [99]:
# frames = data['frames']

In [400]:
chapters = data['chapters_map'][0]['chapters']

In [401]:
def filter_frames(part, chapter, c):
    res = [frame for frame in frames if frame['part'] == part \
           and frame['chapter'] == chapter \
           and (c in frame['got_character_as_agent']) and (len(frame['embedding']))]
    
    return res

In [402]:
def get_count_matrix(curr_frames, prev_frames):
    l1 = len(curr_frames)
    l2 = len(prev_frames)
    m = np.zeros((l1, l2))
    for i1 in range(l1):
        for j1 in range(l2):
            try:
                d = cosine(curr_frames[i1]['embedding'], prev_frames[j1]['embedding'])
                if d > 1:
                    d = 1
                m[i1][j1] = d
            except Exception as e:
                print(i1,j1)
                
    return m

In [361]:
curr_frames = filter_frames(0,  1, "Sir Walter Elliot")
prev_frames = filter_frames(0,  0, "Sir Walter Elliot")

In [364]:
m = get_count_matrix(curr_frames, prev_frames)

In [371]:
m[8][7]

0.41403920161581365

In [373]:
curr_frames[8]['verb']

'remove'

In [403]:
actions = []
for i, part in enumerate(all_clusters):
    for j, chapter in enumerate(part['chapters']):
#         if j < 17:
            for c in characters:
                #print(i, j, c)
                if j > 0:
                    curr_frames = filter_frames(i,  j, c)
                    prev_frames = filter_frames(i,  j-1, c)
                    if (j == 0) or (len(curr_frames) == 0) or (len(prev_frames) == 0):
                        d = {"part": i, "chapter":j, "character": c, 
                             'start_char': chapters[j]['start_char'], 'end_char': chapters[j]['end_char'],
                             'change_in_action': 0}
                        actions.append(d)
                    else:
                        m = get_count_matrix(curr_frames, prev_frames)
                        # Convert it into a 1D array
                        a_1d = m.flatten()

                        # Find the indices in the 1D array
                        idx_1d = a_1d.argsort()[::-1]

                        # convert the idx_1d back into indices arrays for each dimension
                        x_idx, y_idx = np.unravel_index(idx_1d, m.shape)

                        _, idx = np.unique(x_idx, return_index=True)
                        x_idx = x_idx[np.sort(idx)]

                        for i1, index in enumerate(x_idx):
                            curr_frames[index]['sorted_idx'] = i1
    #                         curr_frames[index]['embedding'] = []

                        _, idx = np.unique(y_idx, return_index=True)
                        y_idx = y_idx[np.sort(idx)]

                        for i1, index in enumerate(y_idx):
                            prev_frames[index]['sorted_idx'] = i1
    #                         prev_frames[index]['embedding'] = []
    #                         prev_frames[index].pop('embedding')

                        d = {"part": i, "chapter":j, "character": c, 
                             'start_char': chapters[j]['start_char'], 'end_char': chapters[j]['end_char'],
                             'change_in_action': float(np.mean(m)),
                             'curr_frames': curr_frames, 'prev_frames': prev_frames }
                        actions.append(d)
                    

In [404]:
data['actions'] = actions

### Actions only

In [405]:
frames = data['frames']

In [406]:
actions_only = []
for f in frames:
    sent = sentence_mapping[f['global_sent_id']]
    for arg in f['args']:
        if 'V' in arg['arg_type']:
            actions_only.append({
                "actions_only": arg['arg_text'],
                "sent_start_char": sent['start_char'],
                "sent_end_char": sent['end_char'],
                "character": f['got_character_as_agent'][0],
                "action_idx":(arg['start_char'], arg['start_end'])})
    #adj[t.text] = tmp #h.text
    

In [407]:
from sklearn.cluster import KMeans

In [408]:
all_emd = [f['embedding'] for f in frames]
all_words = [f['verb'] for f in frames]

In [409]:
import numpy as np
X = np.array(all_emd)

In [410]:
km = KMeans(
    n_clusters=5, init='random',
    n_init=10, max_iter=300, 
    tol=1e-04, random_state=0
)
y_km = km.fit_predict(X)

In [411]:
for i, y in enumerate(y_km):
    actions_only[i]['category'] = int(y)

In [412]:
actions_only[:10]

[{'actions_only': 'checked',
  'sent_start_char': 234,
  'sent_end_char': 268,
  'character': 'Sophie',
  'action_idx': (251, 259),
  'category': 0},
 {'actions_only': 'busied',
  'sent_start_char': 777,
  'sent_end_char': 847,
  'character': 'Sophie',
  'action_idx': (781, 788),
  'category': 4},
 {'actions_only': 'resolving',
  'sent_start_char': 777,
  'sent_end_char': 847,
  'character': 'Sophie',
  'action_idx': (807, 817),
  'category': 2},
 {'actions_only': 'survive',
  'sent_start_char': 777,
  'sent_end_char': 847,
  'character': 'Sophie',
  'action_idx': (827, 835),
  'category': 2},
 {'actions_only': 'liked',
  'sent_start_char': 848,
  'sent_end_char': 1157,
  'character': 'Sophie',
  'action_idx': (855, 861),
  'category': 3},
 {'actions_only': 'found',
  'sent_start_char': 848,
  'sent_end_char': 1157,
  'character': 'Sophie',
  'action_idx': (938, 944),
  'category': 0},
 {'actions_only': 'loved',
  'sent_start_char': 848,
  'sent_end_char': 1157,
  'character': 'Sophie'

In [413]:
data['actions_only'] = actions_only

### Direct quotes

In [414]:
regex3 = '"([^"]*)"'
regex4 = '"I\s+|\s+I\s+|\s+me\s+|\s+my\s+|\s+me(;|.|,|!|\?)'

ds = []

for i, part in enumerate(all_clusters):
    for j, chapter in enumerate(part['chapters']):
#         if j < 17:
            s = chapter['start_char']
            e = chapter['end_char']
            ds_dict = {}
            for m in re.finditer(regex3, book[s:e]):
                m_span = m.span()
                #print("......",m.group())
                d = {'part':i, 'chapter':j,
                       'start_char': m_span[0] + s,
                       'end_char': m_span[1] + s,
                       'discourse_text': m.group(),
                       'direct_discourse': 1}

                for sub_m in re.finditer(regex4, m.group()):
                    sub_m_span = sub_m.span()
                    for c in characters:
                        for mention in characters[c]:
                            if (mention[0] >= (s + m_span[0] + sub_m_span[0] - 2)) \
                                and (mention[1] < (s + m_span[0] + sub_m_span[1] + 2)) :

                                if m_span not in ds_dict:
                                    d['character'] =  c
                                    d['mention_start'] = mention[0]
                                    d['mention_end'] =  mention[1]
                                    ds_dict[m_span] = c
                                    break;
#                 if 'character' in d:
                ds.append(d)       

In [415]:
len(ds)

54

In [416]:
ds

[{'part': 0,
  'chapter': 0,
  'start_char': 2529,
  'end_char': 2550,
  'discourse_text': '"It’s for my mother,"',
  'direct_discourse': 1},
 {'part': 0,
  'chapter': 0,
  'start_char': 2605,
  'end_char': 2663,
  'discourse_text': '"I’m going\nto make the cupcakes myself, all from scratch."',
  'direct_discourse': 1},
 {'part': 0,
  'chapter': 0,
  'start_char': 2664,
  'end_char': 2670,
  'discourse_text': '"Aww,"',
  'direct_discourse': 1},
 {'part': 0,
  'chapter': 0,
  'start_char': 2720,
  'end_char': 2740,
  'discourse_text': '"What a Mama’s boy."',
  'direct_discourse': 1},
 {'part': 0,
  'chapter': 0,
  'start_char': 2741,
  'end_char': 2760,
  'discourse_text': '"Isn’t that sweet?"',
  'direct_discourse': 1},
 {'part': 0,
  'chapter': 0,
  'start_char': 2780,
  'end_char': 2856,
  'discourse_text': '"I’ve never even seen you make that kind of effort\nfor your fiancée before."',
  'direct_discourse': 1},
 {'part': 0,
  'chapter': 0,
  'start_char': 4373,
  'end_char': 4388,
 

In [417]:
data['direct_discourse_data'] = ds

### Adjectives

In [418]:
def adjectives(sent):
    doc = nlp(sent['text'])
    adj = []
    for t in doc:
        if t.pos_ == "ADJ":
            h = t.head
            found = False
            #print("t: ", t.text, t.pos_)
            while not found:
                #print("h: ", h.text, h.pos_)
                if h.pos_=="VERB":
                    for c in h.children:
                        #print("child: ", c.text, c.dep_)
                        if str(c.dep_)=="nsubj":
                            h = c
                            noun = c.text
                            noun_st_idx = c.idx
                            if len(list(c.children))>0:
                                noun = " ".join([d.text for d in c.children]) + " " + noun
                                noun_st_idx = list(c.children)[0].idx
#                             print("Found! ",t.text, h.text)
                            adj.append({"adjective":t.text,
                                        "embedding": get_glove_embedding("", t.text),
                                        "adjective_idx":(t.idx + sent['start_char'], t.idx+ sent['start_char'] +len(t)), 
                                        "subject":noun, 
                                        "subject_idx":(noun_st_idx + sent['start_char'], noun_st_idx+ sent['start_char'] +len(noun))})
                            #adj[t.text] = tmp #h.text
                            found = True
                            break

                if h.text == h.head.text:
#                     print("BREAK: ", h.text, h.head.text)
                    break
                h = h.head

#             if found==False:
#                 print(t.text, " - Not Found!")

    #print(adj)
    return adj

In [419]:
adjective_data = []

for sent_id, sent in enumerate(sentence_mapping):
    for adj in adjectives(sent):
        for mention in sent['mentions']:
#             print(mention, adj['subject_idx'], adj['subject'])
            try:
                if len(adj['embedding']):
                    if (adj['subject_idx'][0] >= mention['start_char']) and (adj['subject_idx'][1] <= mention['end_char']):
                        adj['character'] = mention['character']
                        adj["part"] = sent['part']
                        adj["chapter"] =  sent['chapter']
                        adj["sent_start_char"] = sent['start_char']
                        adj["sent_end_char"] = sent['end_char']
                        adj["global_sent_id"] =  sent['global_sent_id']
                        adj["local_sent_id"] = sent['local_sent_id']
                        adj['embedding'] = adj['embedding'].tolist()
                        adjective_data.append(adj)
            except Exception as e:
                print(adj['embedding'])

Exception
 English
[0.6050599813461304, 0.18190999329090118, 0.16393999755382538, -0.17321999371051788, 0.39002999663352966, 0.14191000163555145, -0.18749000132083893, 0.22791999578475952, -0.13433000445365906, -0.03973900154232979, -0.2924799919128418, -0.11434999853372574, -0.612280011177063, -0.11084000021219254, -0.08680599927902222, 0.4319700002670288, 0.46268999576568604, -0.9127900004386902, 0.0032685999758541584, -0.8772199749946594, -0.4365600049495697, 0.41110000014305115, 0.30052998661994934, -0.47099998593330383, 0.4814099967479706, -1.6452000141143799, -0.00877629965543747, 1.0281000137329102, 0.6123899817466736, -0.057787999510765076, 3.1659998893737793, -0.06929799914360046, -0.6797299981117249, -0.7535099983215332, -0.038444001227617264, -0.42412999272346497, 0.178849995136261, 0.2813200056552887, -0.38947999477386475, -0.2701700031757355, 0.2206300050020218, -0.018678000196814537, 0.18915000557899475, 0.16609999537467957, -0.20282000303268433, 0.10659000277519226, -0.3

In [420]:
adjective_data[10]

{'adjective': 'single',
 'embedding': [-0.22134000062942505,
  0.4932299852371216,
  0.5309500098228455,
  0.03496599942445755,
  0.8447200059890747,
  1.2430000305175781,
  -0.6565399765968323,
  0.11010000109672546,
  0.5758799910545349,
  0.5105100274085999,
  0.42149001359939575,
  -0.11423999816179276,
  -0.32943999767303467,
  1.2482999563217163,
  0.2196899950504303,
  -0.7825899720191956,
  0.011030999943614006,
  -0.25457999110221863,
  -0.2687399983406067,
  -0.4099299907684326,
  -0.2869499921798706,
  -0.7466400265693665,
  0.08694600313901901,
  0.47804000973701477,
  -0.13755999505519867,
  -1.0214999914169312,
  -0.5839899778366089,
  0.0527539998292923,
  0.6379299759864807,
  -0.5322800278663635,
  3.6308999061584473,
  -0.6694999933242798,
  0.10321000218391418,
  0.6220999956130981,
  0.3382500112056732,
  -0.19047999382019043,
  1.2381000518798828,
  -0.18893000483512878,
  -0.32234999537467957,
  -0.365090012550354,
  -0.041478998959064484,
  -0.608269989490509,
  

In [421]:
all_emd = [f['embedding'] for f in adjective_data]
all_words = [f['adjective'] for f in adjective_data]

X = np.array(all_emd)
y_km = km.fit_predict(X)

In [422]:
for i, y in enumerate(y_km):
    adjective_data[i]['category'] = int(y)

In [423]:
data['adjective_data'] = adjective_data

In [424]:
data.keys()

dict_keys(['sentiment_emotion_data', 'characters', 'characters_map', 'sentence_mapping', 'chapters_map', 'tags_map', 'tags_presence', 'frames', 'actions', 'actions_only', 'direct_discourse_data', 'adjective_data'])

In [425]:
# sent = "The King was young, handsome, and wealthy; the Queen had a nature as good and gentle as her face was beautiful; and they adored one another, having married for love—which among kings and queens is not always the rule."
# adjectives(sent)

### Export Json file

In [40]:
## combine all features (agency, sentiment) together
for i, entry in enumerate(data['analyzed_data']):
    a = agency[i]
    d = {}
    if ((a['part'] == entry['part']) and (a['chapter'] == entry['chapter']) and (a['character'] == entry['character'])):
        entry['agency'] = a['agency']
        
        

In [241]:
import json
f = open( '../static/data/story/'+story+ '/' + story + '.json')
data = json.load(f)

In [246]:
data['sentiment_emotion_data'] = sentiment_emotion_data
data['sentence_mapping'] = sentence_mapping

In [82]:
data['chapters_map'] = all_clusters

In [426]:
import json
with open('../static/data/story/'+story+ '/'+ story +'.json', 'w') as outfile:
    outfile.write(json.dumps(data, indent=2))