In [1]:
import re
import sys
import spacy
import numpy as np
import json
import difflib
import math
from num2words import num2words
#sys.path.insert(0,'/home/vishesh/TUM/Thesis/huggingface/neuralcoref') 
#from neuralcoref import Coref
from random import randint

In [2]:
from os.path import join
from numpy import array
import string 
import gensim
import time
import os
word2vec = gensim.models.Word2Vec

In [3]:
# Directory that holds the GloVe vectors converted to word2vec text format;
# joined with the vector file name when the embedding model is loaded.
GLOVE_DIR = "/glove6B"

In [4]:
# Word-embedding lookup used by get_vector(); loaded from a text (non-binary)
# word2vec-format file. The file name indicates 50-dimensional GloVe vectors,
# which matches the (50, 1) shapes hard-coded in the feature functions.
model = gensim.models.KeyedVectors.load_word2vec_format(join(GLOVE_DIR, 'glove.6B.50d.w2vformat.txt'), binary=False)

In [5]:
# spaCy pipeline used in train_dictionary() for POS tags and noun chunks.
# NOTE(review): the 'en' shortcut name presumably only resolves on older spaCy
# releases -- confirm against the installed version.
nlp = spacy.load('en')

In [6]:
def train_file_to_list(file):
    '''
    Materialise an iterable of lines (typically an open text file) as a list.

    Every element is kept exactly as yielded, including its trailing newline.
    '''
    return [line for line in file]

In [7]:
def get_documents(train_file):
    '''
    Split the training file into documents.

    Returns a list with one entry per document; each entry is the list of
    sentence strings of that document. A blank line closes the current
    sentence, a '#begin'/'#end' line closes the current document (if it has
    any sentences). Tokens are taken from the fourth column and joined with
    single spaces, except that "'s", '.', ',' and '?' are glued to the
    preceding token.
    '''
    glue_to_previous = ('\'s', '.', ',', '?')
    documents = []
    sentences = []
    current = ''
    for line in train_file_to_list(train_file):
        if line == '\n':
            # Sentence boundary.
            sentences.append(current)
            current = ''
        else:
            fields = line.split()
            if fields[0] in ('#begin', '#end'):
                # Document boundary: flush only when something was collected.
                if sentences:
                    documents.append(sentences)
                    sentences = []
            elif fields[3] in glue_to_previous:
                current = current.strip() + fields[3] + ' '
            else:
                current += fields[3] + ' '
    return documents

In [8]:
def create_mention_cluster_list(cluster_start, start_pos, cluster_end, end_pos):
    '''
    Pair every mention opening with its closing position.

    Parameters
    ----------
    cluster_start, start_pos : parallel lists
        Cluster id and line position of every opening bracket.
    cluster_end, end_pos : parallel lists
        Cluster id and line position of every closing bracket.
    Both pairs of lists are expected in document order (non-decreasing
    position), which is how get_mention() produces them.

    Returns
    -------
    list of [cluster_id, start_position, end_position], in the order of the
    openings. Openings that never get a closing bracket are left out, because
    a span without an end cannot be used downstream.

    Notes
    -----
    * The input lists are not modified.
    * A closing bracket is matched with the most recent still-open mention of
      the same cluster, so nested mentions of one cluster are paired like
      brackets (outer with outer, inner with inner) instead of crossing.
    '''
    clusters = [[cluster_id, pos] for cluster_id, pos in zip(cluster_start, start_pos)]
    matched_end = [None] * len(clusters)
    open_mentions = {}  # cluster id -> stack of indices into `clusters`
    next_start = 0
    for cluster_id, closing_pos in zip(cluster_end, end_pos):
        # Everything that starts at or before this closing position is open by now.
        while next_start < len(clusters) and clusters[next_start][1] <= closing_pos:
            open_mentions.setdefault(clusters[next_start][0], []).append(next_start)
            next_start += 1
        stack = open_mentions.get(cluster_id)
        if stack:
            matched_end[stack.pop()] = closing_pos
    return [cluster + [end] for cluster, end in zip(clusters, matched_end) if end is not None]
        

In [9]:
def get_mention(train_list):
    '''
    Scan the coreference column (last column) of every line and collect the
    opening and closing brackets of the mentions.

    Returns four parallel lists: cluster ids of the openings, their 1-based
    line numbers, cluster ids of the closings, and their 1-based line numbers.
    A cluster id is "<second column>_<cluster number>".
    '''
    cluster_start, start_pos = [], []
    cluster_end, end_pos = [], []
    for line_no, line in enumerate(train_list, start=1):
        if line in ('\n', '-'):
            continue
        fields = line.split()
        part_number = fields[1]
        coref_col = fields[-1]
        for offset, char in enumerate(coref_col):
            if char == '(':
                # The cluster number is the first number after the bracket.
                number = re.findall(r'\d+', coref_col[offset + 1:])[0]
                cluster_start.append(str(part_number) + '_' + number)
                start_pos.append(line_no)
            elif char == ')':
                # The cluster number is the last number before the bracket.
                number = re.findall(r'\d+', coref_col[:offset])[-1]
                cluster_end.append(str(part_number) + '_' + number)
                end_pos.append(line_no)
    return cluster_start, start_pos, cluster_end, end_pos

In [10]:
def get_mention_words(train_file_as_list, pos1, pos2):
    '''
    Return the tokens (fourth column) of lines pos1..pos2 (1-based,
    inclusive) joined by single spaces.
    '''
    tokens = [train_file_as_list[line_no].split()[3]
              for line_no in range(pos1 - 1, pos2)]
    return ' '.join(tokens)

In [11]:
def get_preceding_words(list, pos, max_words=5):
    '''
    Get up to `max_words` (default 5) words that precede the mention.

    Parameters
    ----------
    list : sequence of str
        Lines of the training file.
    pos : int
        1-based line number of the first word of the mention.
    max_words : int, optional
        Maximum number of words to collect.

    Returns
    -------
    list of str
        Preceding tokens (fourth column), nearest word first. The search
        stops at a '#begin'/'#end' line or at the top of the file; only lines
        whose second column equals that of the mention line are used.
    '''
    word_part = list[pos - 1].split()[1]
    words = []
    # Walk upwards with an explicit lower bound so the index can never become
    # negative and silently wrap around to the end of the file.
    for idx in range(pos - 2, -1, -1):
        if len(words) >= max_words:
            break
        cols = list[idx].split()
        if not cols:
            # blank separator line between sentences
            continue
        if cols[0] == '#begin' or cols[0] == '#end':
            break
        if cols[1] == word_part:
            words.append(cols[3])
    return words

In [12]:
def get_next_words(list, pos, max_words=5):
    '''
    Get up to `max_words` (default 5) words that follow the mention.

    Parameters
    ----------
    list : sequence of str
        Lines of the training file.
    pos : int
        1-based line number of the last word of the mention.
    max_words : int, optional
        Maximum number of words to collect.

    Returns
    -------
    list of str
        Following tokens (fourth column) in reading order. The search stops
        at a '#begin'/'#end' line or at the end of the file; only lines whose
        second column equals that of the mention line are used.
    '''
    anchor = pos - 1
    word_part = list[anchor].split()[1]
    words = []
    # Bounded by the list length so a file without a closing '#end' line does
    # not end in an IndexError.
    for idx in range(anchor + 1, len(list)):
        if len(words) >= max_words:
            break
        cols = list[idx].split()
        if not cols:
            # blank separator line between sentences
            continue
        if cols[0] == '#begin' or cols[0] == '#end':
            break
        if cols[1] == word_part:
            words.append(cols[3])
    return words

In [13]:
def mention_sentence(train_list, pos):
    '''
    Return the sentence that contains the mention starting at line `pos`
    (1-based), as a space-joined string of its tokens.

    The sentence starts after the closest preceding blank line or '#begin'
    line and ends before the next blank line.
    '''
    anchor = pos - 1

    # Walk upwards to the line that delimits the sentence start.
    boundary = anchor - 1
    while True:
        line = train_list[boundary]
        if line == '\n' or line.split()[0] == '#begin':
            break
        boundary -= 1
    # boundary is a 0-based index; +2 gives the 1-based number of the next line.
    first_line = boundary + 2

    # Walk downwards to the blank line that closes the sentence. Its 0-based
    # index equals the 1-based number of the last token line.
    closing = anchor + 1
    while train_list[closing] != '\n':
        closing += 1

    return get_mention_words(train_list, first_line, closing)

In [14]:
def document_dictionary(train_file):
    '''
    Map a running document number (0, 1, ...) to the full text of that
    document, i.e. all of its sentences concatenated.
    '''
    return {doc_no: ''.join(sentences)
            for doc_no, sentences in enumerate(get_documents(train_file))}

In [15]:
def get_mention_length(mention):
    '''
    Number of whitespace-separated words in the mention, spelled out in
    words via num2words.
    '''
    return num2words(len(mention.split()))

In [16]:
# pronoun: [1, 0, 0, 0]
# proper:  [0, 1, 0, 0]
# nominal(common noun): [0, 0, 1, 0]
# list:    [0, 0, 0, 1]
def mention_type(doc, mention):
    '''
    One-hot mention type: [pronoun, proper noun, common noun, list].

    The tokens of `doc` are counted by POS tag (PRON / PROPN / NOUN). If the
    most frequent of the three covers at least half of the words of
    `mention`, that type is returned (ties go to the earlier slot);
    otherwise the mention is classified as a list.
    '''
    slot_of_tag = {'PRON': 0, 'PROPN': 1, 'NOUN': 2}
    counts = [0, 0, 0]
    for token in doc:
        slot = slot_of_tag.get(token.pos_)
        if slot is not None:
            counts[slot] += 1

    top = max(counts)
    one_hot = [0, 0, 0, 0]
    if top >= len(mention.split()) / 2:
        one_hot[counts.index(top)] = 1
    else:
        one_hot[3] = 1
    return np.array(one_hot)

In [17]:
def check_mention_contain(newlist):
    '''
    Flag mentions that lie inside, or start inside, another mention.

    For every ordered pair (outer, inner) with different spans:
      * inner['contained'] = outer['id'] when inner lies fully inside outer;
      * inner['overlap']   = outer['id'] when inner starts inside outer.
    Afterwards a mention without a 'contained' key gets contained = False
    and, if still missing, overlap = False. The dictionaries are updated in
    place and the same list is returned.
    '''
    for outer in newlist:
        o_start = outer['mention_start']
        o_end = outer['mention_end']
        for inner in newlist:
            i_start = inner['mention_start']
            i_end = inner['mention_end']
            if i_start == o_start and i_end == o_end:
                # identical span (including the mention itself)
                continue
            if i_start >= o_start and i_end <= o_end:
                inner['contained'] = outer['id']
            if o_start <= i_start <= o_end:
                inner['overlap'] = outer['id']

    for entry in newlist:
        # 'overlap' is only defaulted for entries that had no 'contained' key.
        if 'contained' not in entry:
            entry['contained'] = False
            if 'overlap' not in entry:
                entry['overlap'] = False
    return newlist

In [18]:
def random_with_N_digits(n):
    '''Return a random integer that has exactly n decimal digits.'''
    return randint(10 ** (n - 1), 10 ** n - 1)

In [19]:
# not used
def get_all_mentions(train_file):
    '''
    Run the external coreference model over every document of the training
    file and collect the mentions it reports, one entry per document.

    NOTE(review): relies on a global `coref` object; the import that would
    provide it is commented out at the top of the notebook, so this function
    cannot run as the notebook stands.
    '''
    mention_list = []
    for sentences in get_documents(train_file):
        each_doc = ''.join(sentences)
        print (each_doc)
        coref.one_shot_coref(utterances=each_doc)
        mention_list.append(coref.get_mentions())
    return mention_list

In [20]:
#not used
def get_all_mention_cluster(file_path, train_file):
    '''
    Locate the mentions returned by get_all_mentions() in the training file.

    For every mention, the lines of `train_file` are scanned for the first
    position where the mention's tokens match the token column and the second
    column equals the document index. Each located span is returned as
    [id, start_line, end_line] with 1-based line numbers, where id is
    "<document index>_<random 10-digit number>". A span that was already
    recorded is not recorded again.

    NOTE(review): `file_path` is handed to get_all_mentions(), which passes it
    on to get_documents() as if it were an open file / iterable of lines --
    confirm what callers are expected to pass.
    '''
    mention_list = get_all_mentions(file_path)
    train_list = train_file_to_list(train_file)
    # Spans recorded so far, used to reject duplicates.
    start_index = []
    end_index = []
    mention_cluster = []
    for doc_num in range(0, len(mention_list)):
        for men in mention_list[doc_num]:
            # presumably `men` is a sequence of tokens (str(men[0]) is its first word) -- TODO confirm
            for i in range(0, len(train_list)):
                # Skip sentence separators and document markers.
                if train_list[i] != '\n' and train_list[i].split()[0] != '#begin' and train_list[i].split()[0] != '#end':
                    # Candidate: first token matches and the line belongs to this document.
                    if train_list[i].split()[3] == str(men[0]) and train_list[i].split()[1] == str(doc_num):
                        len_mention = len(men)
                        flag = True
                        # Compare the remaining tokens; separator/marker lines inside the window are ignored.
                        for j, k in zip(men, train_list[i:i+len_mention]):
                            if k != '\n' and k.split()[0] != '#begin' and k.split()[0] != '#end':
                                if str(j) != k.split()[3]:
                                    flag = False
                        # Convert to 1-based, inclusive line numbers.
                        start = i+1
                        end = i+len_mention
                        # Reject a span that has already been recorded.
                        for s, e in zip(start_index, end_index):
                            if s == start and e == end:
                                flag = False

                        if flag == True:
                            start_index.append(start)
                            end_index.append(end)
                            dummy_list = []
                            dummy_list.append(str(doc_num)+'_' + str(random_with_N_digits(10)))
                            dummy_list.append(start)
                            dummy_list.append(end)
                            mention_cluster.append(dummy_list)
                            # Only the first accepted occurrence of this mention is kept.
                            break
    return mention_cluster


In [21]:
def get_index(mention_info):
    '''
    Add the position of every mention within its document.

    Sets m['index'] (1-based running number inside the current run of
    mentions sharing a document prefix in m['id']) and m['mention_position']
    (index divided by the size of that run). The list is expected to be
    grouped by document; it is updated in place and returned.
    '''
    # First pass: running index plus the size of every run of equal doc ids.
    run_sizes = []
    current_doc = '0'
    running = 0
    for m in mention_info:
        doc_id = m['id'].split('_')[0]
        if doc_id != current_doc:
            run_sizes.append(running)
            current_doc = doc_id
            running = 0
        running += 1
        m['index'] = running
    run_sizes.append(running)

    # Second pass: replay the same run boundaries to normalise the index.
    current_doc = '0'
    run_no = 0
    for m in mention_info:
        doc_id = m['id'].split('_')[0]
        if doc_id != current_doc:
            current_doc = doc_id
            run_no += 1
        m['mention_position'] = m['index'] / run_sizes[run_no]

    return mention_info

In [22]:
def train_dictionary(train_file):
    '''
    Build one feature dictionary per gold mention of the training file.

    Each dictionary holds the cluster id, start/end line, mention text, first,
    last and head word, surrounding words, the enclosing sentence, mention
    type, spelled-out length and speaker. The result is sorted by start line
    and then extended with containment/overlap flags and position features.
    '''
    train_list = train_file_to_list(train_file)
    cluster_start, start_pos, cluster_end, end_pos = get_mention(train_list)
    spans = create_mention_cluster_list(cluster_start, start_pos, cluster_end, end_pos)

    mention_info = []
    for m in spans:
        cluster_id = m[0]
        first_line = m[1]
        last_line = m[2]

        words = get_mention_words(train_list, first_line, last_line)
        doc = nlp(words)
        tokens = words.split()
        first_word = tokens[0]
        last_word = tokens[-1]

        # Bare numbers, 'its', 'that' and 'this' get no head word.
        no_head = words.isdigit() or words == 'its' or words.lower() in ('that', 'this')
        chunks = [] if no_head else list(doc.noun_chunks)
        head_word = chunks[0].root.head.text if len(chunks) > 0 else ''

        mention_info.append({
            'id': cluster_id,
            'mention_start': first_line,
            'mention_end': last_line,
            'mention': words,
            'first_word': first_word,
            'last_word': last_word,
            'head_word': head_word,
            'pre_words': get_preceding_words(train_list, first_line),
            'next_words': get_next_words(train_list, last_line),
            'mention_sentence': mention_sentence(train_list, first_line),
            'mention_type': mention_type(doc, words).tolist(),
            'mention_length': get_mention_length(words),
            # tenth column of the line on which the mention starts
            'speaker': train_list[first_line - 1].split()[9],
        })

    mention_info.sort(key=lambda entry: entry['mention_start'])
    return get_index(check_mention_contain(mention_info))

In [23]:
def distance(a):
    '''
    One-hot encode a scalar distance into 10 buckets:
    0, 1, 2, 3, 4, 5-7, 8-15, 16-31, 32-63, 64+.
    A value that matches no bucket (e.g. a negative one) yields all zeros.
    '''
    bucket_hit = [
        a == 0,
        a == 1,
        a == 2,
        a == 3,
        a == 4,
        5 <= a < 8,
        8 <= a < 16,
        16 <= a < 32,
        32 <= a < 64,
        a >= 64,
    ]
    d = np.zeros((10))
    for slot, hit in enumerate(bucket_hit):
        if hit:
            d[slot] = 1
    return d.tolist()

In [24]:
def get_mention_pairs(train_file):
    '''
    Build (mention, antecedent) training pairs.

    Every mention is paired with each earlier mention of the same document.
    Pairs with equal cluster id are labelled {'coref': 1}. Non-coreferent
    pairs are labelled {'coref': 0}, but are dropped whenever the position of
    the antecedent in the mention list is divisible by 2, 3, 5, 7 or 11.
    The pair features of get_sentence_dist() are appended to every pair.
    '''
    mention_info = train_dictionary(train_file)
    mention_pair_list = []
    for i in range(1, len(mention_info)):
        mention = mention_info[i]
        mention_doc = mention['id'].split('_')[0]
        for j in range(i):
            antecedent = mention_info[j]
            if antecedent['id'].split('_')[0] != mention_doc:
                continue
            if mention['id'] == antecedent['id']:
                label = 1
            elif any(j % prime == 0 for prime in (2, 3, 5, 7, 11)):
                # sub-sample the negatives
                continue
            else:
                label = 0
            mention_pair_list.append([mention, antecedent, {'coref': label}])

    return get_sentence_dist(mention_pair_list, train_file)

In [25]:
def get_sentence_dist(mention_pair_list, train_file):
    '''
    Append the pair features to every [mention, antecedent, label] pair.

    Parameters
    ----------
    mention_pair_list : list of lists
        Pairs as produced by get_mention_pairs(); element 0 is the mention,
        element 1 the antecedent.
    train_file : file object or iterable of lines
        The training file the mentions were read from.

    Returns
    -------
    The same list; each pair is extended in place, in this order, with
    'sentence_dist_count', 'mention_dist_count', 'overlap', 'speaker',
    'head_match', 'mention_exact_match' and 'mention_partial_match'.

    Notes
    -----
    * The sentence distance is the number of blank lines between the start
      lines of the two mentions, whichever comes first. Pairs are built with
      the later mention in slot 0, so a one-directional comparison would
      report a distance of 0 for every pair.
    * The caller has normally read the file to its end already, so a seekable
      file is rewound before it is read again.
    '''
    if hasattr(train_file, 'seekable') and train_file.seekable():
        train_file.seek(0)
    train_list = train_file_to_list(train_file)

    # blanks_before[k] = number of blank lines among the first k lines, so the
    # count for any span is a difference of two entries.
    blanks_before = [0]
    for line in train_list:
        blanks_before.append(blanks_before[-1] + (1 if line == '\n' else 0))
    n_lines = len(train_list)

    for m in mention_pair_list:
        first_line, last_line = sorted((m[0]['mention_start'], m[1]['mention_start']))
        # 1-based lines first_line..last_line; clamped so short input cannot raise.
        lo = min(max(first_line - 1, 0), n_lines)
        hi = min(max(last_line, 0), n_lines)
        count = blanks_before[hi] - blanks_before[lo]

        seq = difflib.SequenceMatcher(None, m[0]['mention'], m[1]['mention'])
        score = seq.ratio()

        m.append({'sentence_dist_count': distance(count)})
        m.append({'mention_dist_count': distance(m[0]['index'] - m[1]['index'])})
        m.append({'overlap': 1 if m[1]['overlap'] == m[0]['id'] else 0})
        m.append({'speaker': 1 if m[1]['speaker'] == m[0]['speaker'] else 0})
        m.append({'head_match': 1 if m[1]['head_word'] == m[0]['head_word'] else 0})
        m.append({'mention_exact_match': 1 if m[1]['mention'] == m[0]['mention'] else 0})
        # similarity ratio above 0.6 counts as a partial string match
        m.append({'mention_partial_match': 1 if score > 0.6 else 0})
    return mention_pair_list

In [26]:
# Build the mention pairs for a single sample file. The context manager makes
# sure the file handle is closed once the pairs have been extracted.
with open('/cctv_0001.gold_conll', 'r') as train_file:
    pairs = get_mention_pairs(train_file)

In [27]:
#with open('/home/vishesh/TUM/Thesis/coref-json/trainfile1.json', 'w') as outfile:
#    json.dump(pairs, outfile)

In [28]:
#with open('/home/vishesh/TUM/Thesis/coref-json/documents.json', 'w') as outfile:
#    json.dump(doc_dict, outfile)

## Create input vector using the dictionary created above.

In [30]:
def get_vector(word):
    '''
    Convert the word into a 50-dim GloVe vector of shape (50, 1).

    The word is lower-cased and, when longer than one character, stripped of
    punctuation before the lookup. If the word does not exist in the GloVe
    embedding, a vector of zeros is returned.
    '''
    table = str.maketrans({key: None for key in string.punctuation})
    word = word.lower()
    if len(word) > 1:
        word = word.translate(table)
    try:
        vec = model[word]
    except KeyError:
        # Only an out-of-vocabulary word falls back to zeros; any other
        # failure (e.g. the embedding model not being loaded) must surface.
        vec = np.zeros((50, 1))
    return vec.reshape((50, 1))

In [31]:
def get_average_vector(word_list):
    '''
    Return the average of the word embeddings of the list of words as a
    (50, 1) array. An empty list yields a vector of zeros.
    '''
    total = np.zeros((50, 1))
    if len(word_list) == 0:
        # Nothing to average; avoids dividing by an undefined word count.
        return total
    for word in word_list:
        total += get_vector(word)
    return total / len(word_list)

In [32]:
def calculate_docs_average(doc_dict):
    '''
    Average word embedding of every document, in the iteration order of
    doc_dict. Each document text is split on whitespace.
    '''
    return [get_average_vector(doc_dict[key].split()) for key in doc_dict]

In [33]:
def get_pair_features(feature_list):
    '''
    Stack the pair-level features into a single (25, 1) column vector:
    mention distance (10), sentence distance (10), overlap, speaker,
    head match, exact match and partial match (1 each).

    `feature_list` is one pair as produced by get_sentence_dist(); the
    features are read from positions 3 to 9.
    '''
    def column(slot, key, rows):
        return np.array(feature_list[slot][key]).reshape((rows, 1))

    parts = (
        # distance features
        column(4, 'mention_dist_count', 10),
        column(3, 'sentence_dist_count', 10),
        column(5, 'overlap', 1),
        # speaker feature
        column(6, 'speaker', 1),
        # string matching features
        column(7, 'head_match', 1),
        column(8, 'mention_exact_match', 1),
        column(9, 'mention_partial_match', 1),
    )
    return np.concatenate(parts)

In [34]:
# p: previous, n: next, w: words, a: average, s: sentence
def get_mention_features(mention, doc_average):
    '''
    Turn the feature dictionary of one mention into a single column vector.

    Concatenated in this order: first word, last word, two preceding words
    and their average, two following words and their average, sentence
    average, mention length (embedding of the spelled-out number), mention
    type (4), relative position (1), contained flag (1), document average.
    Missing context words are represented by zero vectors.
    '''
    def nth_word_vector(words, k):
        # embedding of words[k], or zeros when the context is too short
        return get_vector(words[k]) if len(words) > k else np.zeros((50, 1))

    def context_average(words):
        return get_average_vector(words) if len(words) > 0 else np.zeros((50, 1))

    first_w = get_vector(mention['first_word'])
    last_w = get_vector(mention['last_word'])
    mention_length = get_vector(mention['mention_length'])
    type_one_hot = np.array(mention['mention_type']).reshape((4, 1))
    mention_position = np.array(mention['mention_position']).reshape((1, 1))
    mention_contain = np.zeros((1, 1)) if mention['contained'] == False else np.ones((1, 1))

    before = mention['pre_words']
    after = mention['next_words']
    mention_p_w1 = nth_word_vector(before, 0)
    mention_p_w2 = nth_word_vector(before, 1)
    mention_n_w1 = nth_word_vector(after, 0)
    mention_n_w2 = nth_word_vector(after, 1)
    mention_p_w_a = context_average(before)
    mention_n_w_a = context_average(after)

    mention_s_a = get_average_vector(mention['mention_sentence'].split())
    # the part before '_' in the id selects the document average
    doc_avg = doc_average[int(mention['id'].split('_')[0])]

    return np.concatenate((first_w, last_w, mention_p_w1, mention_p_w2, mention_p_w_a,
                           mention_n_w1, mention_n_w2, mention_n_w_a, mention_s_a, mention_length,
                           type_one_hot, mention_position, mention_contain, doc_avg))

In [35]:
def make_feature_input(pairs, doc_dict):
    '''
    For every pair, collect the five feature groups
    [antecedent average, antecedent features, mention average,
     mention features, pair features].
    '''
    docs_avg = calculate_docs_average(doc_dict)
    input_feature_list = []
    for pair in pairs:
        mention, antecedent = pair[0], pair[1]
        mention_avg = get_average_vector(mention['mention'].split())
        antecedent_avg = get_average_vector(antecedent['mention'].split())
        mention_features = get_mention_features(mention, docs_avg)
        antecedent_features = get_mention_features(antecedent, docs_avg)
        pair_features = get_pair_features(pair)
        input_feature_list.append([antecedent_avg, antecedent_features,
                                   mention_avg, mention_features, pair_features])
    return input_feature_list

In [36]:
def make_input_vector(pairs, doc_dict):
    '''
    Flatten the five feature groups of every pair into one column vector and
    return the list of these vectors.
    '''
    return [
        np.concatenate((groups[0], groups[1], groups[2], groups[3], groups[4]))
        for groups in make_feature_input(pairs, doc_dict)
    ]

In [37]:
def make_output_vector(pairs):
    '''
    Column vector of shape (len(pairs), 1) holding the 'coref' label of
    every pair.
    '''
    labels = [pair[2]['coref'] for pair in pairs]
    return np.array(labels).reshape((len(pairs), 1))

In [38]:
# Root directories that are searched for *.gold_conll files.
# NOTE(review): both names point at the same directory, and path_to_dev_files
# is not referenced by any cell visible in this notebook -- confirm which
# directory the development run is meant to use.
path_to_train_files = '/english/annotations/'
path_to_dev_files = '/english/annotations/'

In [39]:
#train_file = open(path_to_train_file, 'r')
#doc_dict = document_dictionary(train_file)

In [39]:
# Collect the full path of every *.gold_conll file below the training directory.
list_of_conll_files = []
for path, subdirs, files in os.walk(path_to_train_files):
    for name in filter(lambda file_name: file_name.endswith(".gold_conll"), files):
        list_of_conll_files.append(os.path.join(path, name))

In [40]:
def train_network_data(path):
    '''
    Build the network input and label vectors for one CoNLL file.

    Parameters
    ----------
    path : str
        Path of the *.gold_conll file.

    Returns
    -------
    (input_vector, output_vector)
        The list of per-pair input column vectors and the (n_pairs, 1) label
        array.
    '''
    # Read the file once and close it right away; the list of lines can be
    # iterated several times, unlike an already consumed file handle.
    with open(path, 'r') as train_file:
        lines = train_file.readlines()
    doc_dict = document_dictionary(lines)
    pairs = get_mention_pairs(lines)
    input_vector = make_input_vector(pairs, doc_dict)
    output_vector = make_output_vector(pairs)
    return input_vector, output_vector

In [54]:
%%time
# Run this cell for getting training files
# The list of CoNLL files is cut into FILES consecutive chunks. Every file that
# yields at least one pair is processed, and each time the running number of
# such files reaches a multiple of 5 the vectors collected for the current
# chunk are flattened and written with np.save.
# NOTE(review): the output directory is a hard-coded absolute path.
FILES = 100
num_train_files = len(list_of_conll_files)
count = 0      # files that produced at least one pair (never reset)
num = 0        # files processed so far (progress output only)
file_num = 1   # suffix of the next pair of output files
for i in range (0, FILES):
    # NOTE(review): these buffers are reset for every chunk while `count` keeps
    # running, so a save only contains files of the current chunk and files
    # gathered since the last multiple of 5 in an earlier chunk are dropped --
    # confirm this is intended.
    input_files_vector = []
    output_files_vector = []
    for j in range(math.ceil((i/FILES) * num_train_files), math.ceil(((i+1)/FILES) * num_train_files)):
        num += 1
        print ('file num: ' + str(num))
        i_vector, o_vector = train_network_data(list_of_conll_files[j])
        if len(i_vector) > 0:
            input_files_vector.append(i_vector)
            output_files_vector.append(o_vector)
            count += 1
            print ('coref: ' + str(count))
            if count % 5 == 0:
                # Flatten the per-file lists into one list of samples / labels.
                ffnn_input = []
                ffnn_output = []
                for inp_vector, out_vector in zip(input_files_vector, output_files_vector):
                    for inp, out in zip(inp_vector, out_vector):
                        ffnn_input.append(inp)
                        ffnn_output.append(out)
                np.save('/home/vishesh/TUM/Thesis/Coreference-Resolution/data/processed/new/ffnn_train_' + str(file_num), ffnn_input, allow_pickle=True, fix_imports=True)    
                np.save('/home/vishesh/TUM/Thesis/Coreference-Resolution/data/processed/new/ffnn_labels_' + str(file_num), ffnn_output, allow_pickle=True, fix_imports=True)
                print ('File  created.')
                file_num += 1
    # Stop after 100 usable files.
    if count == 100:
        break
# NOTE(review): this final save writes whatever ffnn_input / ffnn_output last
# held (they are only assigned inside the `count % 5 == 0` branch above) and
# names the files after the chunk index `i`, not `file_num`, so it can repeat
# or overwrite an earlier export -- confirm the intent.
np.save('/home/vishesh/TUM/Thesis/Coreference-Resolution/data/processed/new/ffnn_train_' + str(i), ffnn_input, allow_pickle=True, fix_imports=True)    
np.save('/home/vishesh/TUM/Thesis/Coreference-Resolution/data/processed/new/ffnn_labels_' + str(i), ffnn_output, allow_pickle=True, fix_imports=True)
print ('File  created.')    

In [None]:
# Run this cell for getting the development files.
# Processes every file in list_of_conll_files and keeps the input / label
# vectors of the files that produced at least one pair.
# NOTE(review): list_of_conll_files is filled from path_to_train_files, so it
# has to be rebuilt for the development directory before this cell is run --
# confirm.
input_files_vector = []
output_files_vector = []
num = 0     # files processed so far (progress output only)
count = 0   # files that produced at least one pair
for j in range(0, len(list_of_conll_files)):
    num += 1
    print ('file num: ' + str(num))
    i_vector, o_vector = train_network_data(list_of_conll_files[j])
    if len(i_vector) > 0:
        input_files_vector.append(i_vector)
        output_files_vector.append(o_vector)
        count += 1
        print ('coref: ' + str(count))

In [45]:
# Flatten the per-file lists collected above into one list of input vectors
# and one list of labels, keeping both in the same order.
ffnn_input = []
ffnn_output = []
for inp_vector, out_vector in zip(input_files_vector, output_files_vector):
    for inp, out in zip(inp_vector, out_vector):
        ffnn_input.append(inp)
        ffnn_output.append(out)

In [None]:
# Write the flattened development inputs to disk.
# NOTE(review): hard-coded absolute output path.
np.save('/home/vishesh/TUM/Thesis/Coreference-Resolution/data/processed/ffnn_input_dev', ffnn_input, allow_pickle=True, fix_imports=True)

In [48]:
# Write the flattened development labels to disk.
# NOTE(review): hard-coded absolute output path.
np.save('/home/vishesh/TUM/Thesis/Coreference-Resolution/data/processed/ffnn_output_dev', ffnn_output, allow_pickle=True, fix_imports=True)