In [1]:
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
import spacy
import math
import difflib
from spacy.symbols import nsubj, VERB
from num2words import num2words
import gensim
import string
word2vec = gensim.models.Word2Vec
from os.path import join

In [2]:
GLOVE_DIR = '/home/vishesh/TUM/Thesis/glove6B'

In [46]:
glove_model = gensim.models.KeyedVectors.load_word2vec_format(join(GLOVE_DIR, 'glove.6B.50d.w2vformat.txt'), binary=False)

In [4]:
INPUT_DIM = 1237
HIDDEN_DIM1 = 1000
HIDDEN_DIM2 = 500
HIDDEN_DIM3 = 500
OUTPUT_DIM = 2

In [5]:
class FFNN(nn.Module):
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, hidden_dim3, output_dim):
        super(FFNN, self).__init__()

        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.relu1 = nn.ReLU()

        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.relu2 = nn.ReLU()

        self.fc3 = nn.Linear(hidden_dim2, hidden_dim3)
        self.relu3 = nn.ReLU()

        self.fc4 = nn.Linear(hidden_dim2, output_dim)
    
    def forward(self, x):
        #x = x.view(-1, 1337)
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        out = self.relu3(out)
        out = self.fc4(out)
        
        return out

In [6]:
model = FFNN(INPUT_DIM, HIDDEN_DIM1, HIDDEN_DIM2, HIDDEN_DIM3, OUTPUT_DIM)

In [106]:
model = torch.load('/home/vishesh/TUM/Thesis/Coreference-Resolution/models/bestModel.pt')

In [8]:
input_dev = np.load('/home/vishesh/TUM/Thesis/Coreference-Resolution/data/processed/ffnn_input_dev.npy')

In [9]:
output_dev = np.load('/home/vishesh/TUM/Thesis/Coreference-Resolution/data/processed/ffnn_output_dev.npy')

In [10]:
i = (torch.from_numpy(input_dev[9].reshape(INPUT_DIM))).float()

In [11]:
o = torch.from_numpy(output_dev[7])

In [12]:
output = model(Variable(i))
nnjk, predicted = torch.max(output.data, 0)

In [13]:
predicted


 0
[torch.LongTensor of size 1]

In [14]:
nlp = spacy.load('en')

In [15]:
sentence = 'Standing tall on the Taihang Mountain is the Monument to the Hundred Regiments Offensive.'
s2 = 'The child is running very fast.'
s3 = 'This map reflected the European battlefield situation.'
s4 = 'From one side, it siezed an important city in China called Yichang.'
s5 = 'It was this year that the Japanese Army developed the strategy.'

In [16]:
def get_mentions(sentence):
    '''
    This function returns all the possible mentions for any given sentence.
    
    Args:
    sentence: The sentence for which we want to find the mentions.
    
    Returns:
    mentions: list of all the mentions in the sentence.
    '''
    verbs = []
    mentions = []
    tokens = nlp(sentence)
    for token, np in zip(tokens, tokens.noun_chunks):
        if token.pos_ == 'PRON':
            mentions.append(token.text)
            if token.dep == nsubj and token.head.pos == VERB:
                mentions.append(token.head.text)
        if np.text not in mentions:
            mentions.append(np.text)
    return mentions
    
    

In [17]:
get_mentions(s5)

['It', 'was', 'the Japanese Army', 'the strategy']

In [18]:
import sys
sys.path.insert(0,'/home/vishesh/TUM/Thesis/huggingface/neuralcoref') 
from neuralcoref import Coref

In [19]:
coref = Coref()

Loading spacy model

    [93mInfo about model en_core_web_sm[0m

    lang               en             
    pipeline           ['tagger', 'parser', 'ner']
    accuracy           {'token_acc': 99.8698372794, 'ents_p': 84.9664503965, 'ents_r': 85.6312524451, 'uas': 91.7237657538, 'tags_acc': 97.0403350292, 'ents_f': 85.2975560875, 'las': 89.800872413}
    name               core_web_sm    
    license            CC BY-SA 3.0   
    author             Explosion AI   
    url                https://explosion.ai
    vectors            {'keys': 0, 'width': 0, 'vectors': 0}
    sources            ['OntoNotes 5', 'Common Crawl']
    version            2.0.0          
    spacy_version      >=2.0.0a18     
    parent_package     spacy          
    speed              {'gpu': None, 'nwords': 291344, 'cpu': 5122.3040471407}
    email              contact@explosion.ai
    description        English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vect

In [20]:
mention_list = []
clusters = coref.one_shot_coref(utterances=s5)
mention_list.append(coref.get_mentions())

In [21]:
mention_list

[[It, this year, the Japanese Army, the strategy]]

In [22]:
def get_pre_words(passage, mention):
    '''
    This function returns the previous 5 words of the mention from the passage
    
    Args:
    passage: The passage from which the previous words have to be chosen.
    mention: The word whose previous words have to be found.
    
    Returns:
    pre_words: Previous 5 words from the passage with respect to mention
    '''

    tokens = nlp(passage)
    mention_tokens = mention.split()
    for i in range(0, len(tokens)):
        if str(tokens[i]) == mention_tokens[0]:
            flag = True
            c = i
            for m in range(0, len(mention_tokens)):
                if mention.split()[m] != tokens[c].text:
                    flag = False
                c += 1
            if flag == True:
                break
    pre_words = []
    for p in range(i-1, i-6, -1):
        pre_words.append(str(tokens[p]))
        if p == 0:
            break
    return pre_words
    

In [23]:
def get_next_words(passage, mention):
    '''
    This function returns the next 5 words of the mention from the passage.
    
    Args:
    passage: The passage from which the next words have to be chosen.
    mention: The word whose next words have to be found.
    
    Returns:
    next_words: Next 5 words from the passage with respect to mention.
    '''
    tokens = nlp(passage)
    mention_tokens = mention.split()
    for i in range(0, len(tokens)):
        if str(tokens[i]) == mention_tokens[0]:
            flag = True
            for m in range(0, len(mention_tokens)):
                if mention.split()[m] != tokens[i].text:
                    flag = False
                i += 1
            if flag == True:
                break
    next_words = []
    for p in range(i, i + 5):
        next_words.append(str(tokens[p]))
        if p == len(tokens) -1:
            break
    return next_words

In [24]:
def get_mention_sentence(passage, mention):
    '''
    This function returns the sentence in which the mention occurs.
    
    Args:
    passage: The passage from which the next words have to be chosen.
    mention: The word whose next words have to be found
    
    Returns:
    sentence: The sentence in which the mention occurs.
    s: Index of the sentence in the passage.
    '''
    sentences = splitParagraphIntoSentences(passage)
    mention_tokens = mention.split()
    flag = False
    for s in range(0, len(sentences)):
        s_tokens = nlp(sentences[s])
        for i in range(0, len(s_tokens)):
            if str(s_tokens[i]) == mention_tokens[0]:
                flag = True
                for m in range(0, len(mention_tokens)):
                    if mention_tokens[m] != str(s_tokens[i]):
                        flag = False
                    i += 1
                if flag == True:
                    break
        if flag == True:
            break
        
    return sentences[s], s+1

In [26]:
# Inspired from this post.
# https://stackoverflow.com/questions/8465335/a-regex-for-extracting-sentence-from-a-paragraph-in-python
def splitParagraphIntoSentences(paragraph):
    import re
    sentenceEnders = re.compile(r"""
        # Split sentences on whitespace between them.
        (?:               # Group for two positive lookbehinds.
          (?<=[.!?])      # Either an end of sentence punct,
        | (?<=[.!?]['"])  # or end of sentence punct and quote.
        )                 # End group of two positive lookbehinds.
        (?<!  Mr\.   )    # Don't end sentence on "Mr."
        (?<!  Mrs\.  )    # Don't end sentence on "Mrs."
        (?<!  Jr\.   )    # Don't end sentence on "Jr."
        (?<!  Dr\.   )    # Don't end sentence on "Dr."
        (?<!  Prof\. )    # Don't end sentence on "Prof."
        (?<!  Sr\.   )    # Don't end sentence on "Sr."
        \s+               # Split on whitespace between sentences.
        """, 
        re.IGNORECASE | re.VERBOSE)
    sentenceList = sentenceEnders.split(paragraph)
    return sentenceList

In [27]:
get_mention_sentence('It was this year that the Japanese Army developed the strategy. It was great. I loved it.', 'great')

('It was great.', 2)

In [28]:


# pronoun: [1, 0, 0, 0]
# proper:  [0, 1, 0, 0]
# nominal(common noun): [0, 0, 1, 0]
# list:    [0, 0, 0, 1]
def mention_type(doc, mention):
    # pos 0: pronoun, pos 1: proper noun, pos 2: common noun
    '''
    This function returns the type of mention in the above annotated form.
    '''
    token_type = [0, 0, 0]
    for token in doc:
        if token.pos_ == 'PRON':
            token_type[0] += 1
        elif token.pos_ == 'PROPN':
            token_type[1] += 1
        elif token.pos_ == 'NOUN':
            token_type[2] += 1   
    m = max(token_type)
    a = [i for i, j in enumerate(token_type) if j == m]  
    is_dominant = m >= len(mention.split())/2 
    if is_dominant:
        if a[0] == 0:
            return np.array([1, 0, 0, 0])
        if a[0] == 1:
            return np.array([0, 1, 0, 0])
        if a[0] == 2:
            return np.array([0, 0, 1, 0])
    else:
        return np.array([0, 0, 0, 1])

In [29]:
def get_mention_position(mentions, m):
    '''
    This function returns the position of the mention m relative to the all the other mentions.
    
    Args:
    mentions: all the mentions in the passage.
    m: the mention whose relative index is to be found.
    
    Returns:
    position: position of mention m relative to all the other mentions.
    '''
    num_mentions = len(mentions)
    for index in range(num_mentions):
        if mentions[index] == m:
            break
    position = num_mentions/(index + 1)
    return position

In [30]:
def mention_contain(mentions, m):
    '''
    This function checks whether the mention m is contained inside any other mention or not.
    
    Args:
    mentions: all the mentions in the passage.
    m: the mention whose relative index is to be found.
    
    Returns:
    True or False
    '''
    flag = False
    for mention in mentions:
        if m in mention:
            if m == mention:
                continue
            flag = True
            break
    return flag

In [31]:
mentions = get_mentions(sentence)
print (mentions)
mention_contain(mentions, 'Monumentt')

['the Taihang Mountain', 'the Monument', 'the Hundred Regiments Offensive']


False

In [32]:
def get_mention_length(mention):
    '''
    This function returns the length of the mention in words.
    
    Args:
    mention: The mention whose length in words is required.
    
    Returns:
    len_in_words: Length in words like one, two, etc.
    '''
    mention_words = mention.split()
    mention_len = len(mention_words)
    len_in_words = num2words(mention_len)
    return len_in_words

In [33]:
def distance(a):
    '''
    This function takes in an integer and returns a 10 dimentional vector.
    
    Args: 
    a(int): The integer whose vector has to be calculated.
    
    Returns:
    list: a 10 dim vector.
    '''
    d = np.zeros((10))
    d[a == 0, 0] = 1
    d[a == 1, 1] = 1
    d[a == 2, 2] = 1
    d[a == 3, 3] = 1
    d[a == 4, 4] = 1
    d[(5 <= a) & (a < 8), 5] = 1
    d[(8 <= a) & (a < 16), 6] = 1
    d[(16 <= a) & (a < 32), 7] = 1
    d[(a >= 32) & (a < 64), 8] = 1
    d[a >= 64, 9] = 1
    return d.tolist()

In [34]:
def extract_features(passage):
    '''
    This function takes in the passage for which the mention and its features have to be found and 
    returns all the features about each mention in the passage.
    
    Args:
    passage: The passage whose mention and features have to be found.
    
    Returns:
    mention_info: A list with all the mentions along with its features.
    '''
    mention_info = []
    mentions = get_mentions(passage)
    count = 1
    for m in mentions:
        mention_dict = {}
        doc = nlp(m)
        mention_dict['id'] = count
        mention_dict['mention'] = m
        mention_dict['first_word'] = str(doc[0])
        mention_dict['last_word'] = str(doc[-1])
        if m.isdigit() or m == 'its' or m.lower() == 'that' or m.lower() == 'this':
            mention_dict['head_word'] = ''
        else:
            if len(list(doc.noun_chunks)) > 0:
                mention_dict['head_word'] = list(doc.noun_chunks)[0].root.head.text
            else:
                mention_dict['head_word'] = ''
        mention_dict['pre_words'] = get_pre_words(passage, m)
        mention_dict['next_words'] = get_next_words(passage, m)
        sentence, sen_index = get_mention_sentence(passage,m)
        mention_dict['mention_sentence'] = sentence
        mention_dict['sentence_index'] = sen_index
        mention_dict['mention_type'] = mention_type(doc, m)
        mention_dict['speaker'] = 'Speaker#1'
        mention_dict['mention_position'] = get_mention_position(mentions, m)
        mention_dict['contained'] = mention_contain(mentions, m)
        mention_dict['mention_length'] = get_mention_length(m)
        mention_info.append(mention_dict)
        count += 1
    return mention_info

In [35]:
def overlap(mention1, mention2):
    '''
    This function checks if mention2 overlaps the mention1 or not.
    
    Args:
    mention1: Mention
    mention2: Mention
    
    Returns:
    True or False
    '''
    flag = 0
    if mention2 in mention1:
        flag = 1

    return flag

In [36]:
def make_pairs(passage):
    '''
    This function takes in the passage whose mentions have to be found, extracts the mention features 
    and returns a list with the pair of mentions.
    
    Args:
    passage: The text which has to be parsed for mentions
    
    Returns:
    pairs_list: List of pairs of mentions which have to checked for coreference.
    '''
    mentions_features = extract_features(passage)
    pairs_list = []
    num_mentions = len(mentions_features)
    for i in range(num_mentions - 1):
        for j in range(i+1, num_mentions):
            pairs = []
            pairs.append(mentions_features[i])
            pairs.append(mentions_features[j])
            pairs_list.append(pairs)
    return pairs_list
            

In [37]:
def get_pair_features(passage):
    '''
    
    '''
    pairs = make_pairs(passage)
    for p in pairs:
        
        #print (p)
        seq=difflib.SequenceMatcher(None, p[0]['mention'],p[1]['mention'])
        score = seq.ratio()
        p.append({'mention_distance': distance(abs(p[0]['id'] - p[1]['id']))})
        p.append({'sentence_distance': distance(abs(p[0]['sentence_index'] - p[1]['sentence_index']))})
        p.append({'overlap': overlap(p[0]['mention'], p[1]['mention'])})
        p.append({'speaker': 1})
        if p[1]['head_word'] == p[0]['head_word']:
            p.append({'head_match': 1})
        else:
            p.append({'head_match': 0})
        if p[1]['mention'] == p[0]['mention']:
            p.append({'mention_exact_match': 1})
        else:
            p.append({'mention_exact_match': 0})
        if score > 0.6:
            p.append({'mention_partial_match': 1})
        else:
            p.append({'mention_partial_match': 0})
        
    return pairs        

In [38]:
p = get_pair_features(sentence)
p

[[{'contained': False,
   'first_word': 'the',
   'head_word': 'Mountain',
   'id': 1,
   'last_word': 'Mountain',
   'mention': 'the Taihang Mountain',
   'mention_length': 'three',
   'mention_position': 3.0,
   'mention_sentence': 'Standing tall on the Taihang Mountain is the Monument to the Hundred Regiments Offensive.',
   'mention_type': array([0, 1, 0, 0]),
   'next_words': ['is', 'the', 'Monument', 'to', 'the'],
   'pre_words': ['on', 'tall', 'Standing'],
   'sentence_index': 1,
   'speaker': 'Speaker#1'},
  {'contained': False,
   'first_word': 'the',
   'head_word': 'Monument',
   'id': 2,
   'last_word': 'Monument',
   'mention': 'the Monument',
   'mention_length': 'two',
   'mention_position': 1.5,
   'mention_sentence': 'Standing tall on the Taihang Mountain is the Monument to the Hundred Regiments Offensive.',
   'mention_type': array([0, 1, 0, 0]),
   'next_words': ['to', 'the', 'Hundred', 'Regiments', 'Offensive'],
   'pre_words': ['is', 'Mountain', 'Taihang', 'the', '

## Make vector

In [47]:
def get_vector(word):
    '''
    This function takes in the word for which we want to find the GLoVe vector and returns the 
    50 dim representation of the word.
    
    Args:
    word(string): The word for which we want to find the vector.
    
    Returns:
    vector: A 50 dim representation of the word as per GLoVe.    
    '''
    
    table = str.maketrans({key: None for key in string.punctuation})
    word = word.lower()
    if len(word) > 1:
        word = word.translate(table)
    try:
        vec = glove_model[word]
    except:
        vec = np.zeros((50, 1))
    return vec.reshape((50, 1))

In [49]:
#get_vector('hello')

In [50]:
def get_average_vector(word_list):
    '''
    This function takes in a list of words and returns the average GLoVe vector for all the words.
    
    Args:
    word_list(list of string): The list of words whose average vector has to be calculated.
    
    Returns:
    average_vector: A 50 dim GLoVe representation of the average of the list of words.
    '''
    sum = np.zeros((50, 1))
    for i in range(0, len(word_list)):
        sum += get_vector(word_list[i])
    average_vector = sum/(i+1)
    return average_vector

In [51]:
#get_vector('hello')

In [95]:
# p: previous, n: next, w: words, a: average, s: sentence
def get_mention_features_vector(mention_features, passage):
    '''
    This function takes in all the features per mention and returns a vector of those mentions features.
    
    Args:
    mention_features: All the features of the mention which has to be converted to a vector.
    passage: The passage to which the mention belongs.
    
    Returns:
    features: A vector unilizing all the features of the mention.
    '''
    first_w = get_vector(mention_features['first_word'])
    last_w = get_vector(mention_features['last_word'])
    mention_length = get_vector(mention_features['mention_length'])
    mention_type = np.array(mention_features['mention_type']).reshape((4, 1))
    mention_position = np.array(mention_features['mention_position']).reshape((1, 1))
    
    if mention_features['contained'] == False:
        mention_contain = np.zeros((1, 1))
    else:
        mention_contain = np.ones((1, 1))
    if len(mention_features['pre_words']) > 0:
        mention_p_w1 = get_vector(mention_features['pre_words'][0])
    else:
        mention_p_w1 = np.zeros((50, 1))
    
    if len(mention_features['pre_words']) > 1:
        mention_p_w2 = get_vector(mention_features['pre_words'][1])
    else:
        mention_p_w2 = np.zeros((50, 1))
    if len(mention_features['next_words']) > 0:
        mention_n_w1 = get_vector(mention_features['next_words'][0])
    else:
        mention_n_w1 = np.zeros((50, 1))
    if len(mention_features['next_words']) > 1:
        mention_n_w2 = get_vector(mention_features['next_words'][1])
    else:
        mention_n_w2 = np.zeros((50, 1))
    if len(mention_features['pre_words']) > 0:
        mention_p_w_a = get_average_vector(mention_features['pre_words'])
    else:
        mention_p_w_a = np.zeros((50, 1))
    if len(mention_features['next_words']) > 0:
        mention_n_w_a = get_average_vector(mention_features['next_words'])
    else:
        mention_n_w_a = np.zeros((50, 1))
        
        
    mention_s_a = get_average_vector(mention_features['mention_sentence'].split())
    
    doc_avg = get_average_vector(passage)
    
    features = np.concatenate((first_w, last_w, mention_p_w1, mention_p_w2, mention_p_w_a, \
                              mention_n_w1, mention_n_w2, mention_n_w_a, mention_s_a, mention_length, \
                              mention_type, mention_position, mention_contain, doc_avg))
    return features

In [96]:
x = get_mention_features_vector(p[0][0], sentence)

In [97]:
def get_pair_features_vector(features):
    '''
    This function gets all the list of pair features, and returns the vector of the pair features.
    
    Args:
    features: All the features of mentions and pair features.
    
    Returns:
    pair_features: A vector with all the pair features.
    '''
    # distance features
    mention_dist = np.array(features[2]['mention_distance']).reshape((10, 1))
    s_dist = np.array(features[3]['sentence_distance']).reshape((10, 1))
    overlap = np.array(features[4]['overlap']).reshape((1, 1))
    
    # speaker feature
    speaker = np.array(features[5]['speaker']).reshape((1, 1))
    
    # string matching features
    head_match = np.array(features[6]['head_match']).reshape((1, 1))
    mention_exact_match = np.array(features[7]['mention_exact_match']).reshape((1, 1))
    mention_partial_match = np.array(features[8]['mention_partial_match']).reshape((1, 1))
    
    pair_features = np.concatenate((mention_dist, s_dist, overlap, speaker, head_match, \
                                   mention_exact_match, mention_partial_match))
    return pair_features

In [142]:
def make_feature_input_vector(feature_list_all_mentions, passage):
    '''
    This function takes in all the features of mentions(individual features and pair features) and the 
    passage of the mention, and returns the list of vectors for each mention pair.
    '''
    
    input_feature_list = []
    i = 0
    for m in feature_list_all_mentions:
        input_feature_vector = []
        mention_avg = get_average_vector(m[1]['mention'].split())
        antecedent_avg = get_average_vector(m[0]['mention'].split())
        mention_features = get_mention_features_vector(m[1], passage)
        antecedent_features = get_mention_features_vector(m[1], passage)
        pair_features = get_pair_features_vector(m)
        
        input_feature_vector.append(antecedent_avg)
        input_feature_vector.append(antecedent_features)
        input_feature_vector.append(mention_avg)
        input_feature_vector.append(mention_features)
        input_feature_vector.append(pair_features)
        input_feature_list.append(input_feature_vector)
        
    return input_feature_list

In [143]:
def make_input_vector(passage, features):
    '''
    This function takes in the passage and returns the vector for all the mentions to be
    tested for coreference.
    
    Args:
    passage: The passage in which the coreference have to be resolved.
    
    Returns:
    feature_input: The vector which can be fed into the neural network to resolve coreferences.
    '''
    
    #features = get_pair_features(passage)
    feature_input = make_feature_input_vector(features, passage)
    input_vector_list = []
    for f_input in feature_input:
        con = np.concatenate((f_input[0], f_input[1], f_input[2], f_input[3], f_input[4]))
        input_vector_list.append(con)
        del con
    return input_vector_list

In [153]:
#coref_list = make_input_vector(s5)
#len(coref_list)


In [145]:
(coref_list[0]).shape

(1237, 1)

In [186]:
output_dev[:10]

array([[1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0]])

In [190]:
i = (torch.from_numpy(input_dev[4].reshape(INPUT_DIM))).float()

In [191]:
output = model(Variable(i))
_, predicted = torch.max(output.data, 0)

In [192]:
(int(predicted.numpy()))

0

In [173]:
def find_coref_pairs(passage):
    '''
    This function takes in the passage for which we want to resolve the coreferences, and returns a list containing 
    the coreference pairs.
    
    Args:
    passage: The passage/sentence for which we want to resolve the coreferences.
    
    Returns:
    list: A list containing all the possibly resolved coreferences.
    '''
    coreferences = []
    features = get_pair_features(passage)
    vector = make_input_vector(passage, features)
    print (len(features))
    print (len(vector))
    for v, f in zip(vector, features):
        input_to_NN = (torch.from_numpy(input_dev[1].reshape(INPUT_DIM))).float()
        output_from_NN = model(Variable(i))
        _, predicted = torch.max(output.data, 0)
        if int(predicted.numpy()) == 1:
            coreferences.append(({'antecedent': f[0]['mention']}, {'mention': f[1]['mention']}))
    return coreferences 
    

In [175]:
find_coref_pairs('John is in the school. He enjoys it.')

6
6


[({'antecedent': 'John'}, {'mention': 'the school'}),
 ({'antecedent': 'John'}, {'mention': 'He'}),
 ({'antecedent': 'John'}, {'mention': 'it'}),
 ({'antecedent': 'the school'}, {'mention': 'He'}),
 ({'antecedent': 'the school'}, {'mention': 'it'}),
 ({'antecedent': 'He'}, {'mention': 'it'})]