# 4. AttributionRelationToConll

This notebook links the attribution labels (CUE, SOURCE and CONTENT) that belong together, numbers them, converts the labels to CoNLL format, and writes the result to a separate file per article, mirroring the layout of the input data.

#### Logic
1. Loop over the test files and, within each file, over the tokens; first look for CUE sequences and number them.
2. For each CUE, find the closest SOURCE and CONTENT sequences and give them the same number. The search walks the token list backwards and forwards at the same time, checking the positions `token_idx - k` and `token_idx + k` for increasing distance `k`.
3. Once a SOURCE or CONTENT label has been found, give the same number to the rest of the continuous run of labels of that type.

#### Possible additions
Do not just take the closest SOURCE and CONTENT, but also check whether they are closer to another CUE.
At the end, also check for and number the SOURCE and CONTENT labels that were not linked to any CUE.

In [1]:
# Make sure pandas is installed in the kernel's environment (the version is not pinned).
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
import csv
import numpy as np
import os
import pandas as pd 

# Number of leading characters that are skipped before a label is compared with
# 'CUE' / 'SOURCE' / 'CONTENT', i.e. the length of the "B-" / "I-" prefix.
start_index_labels = 2  # 2 if we use BIO labels, 0 if we don't use BIO labels

# CRF output files to convert; each one is read as a pipe-separated CSV further down.
# Commented-out entries are other result files that are currently not processed.
preprocessed_data_files = [
#     "Result/CRF_out_BIO_features2-dev-removed_merged-merged.csv",
#     "Result/CRF_out_BIO_embedding50-dev-removed_merged-merged.csv",
#     "Result/CRF_out_BIO_embedding50-dev_merged-merged.csv",
#     "Result/CRF_out_BIO_baseline-dev_merged-merged.csv",
    "Result/CRF_out_BIO_baseline-dev_polnear-polnear.csv",
#     "Result/CRF_out_BIO_features-dev_polnear-polnear.csv",
#     "Result/CRF_out_BIO_features2-dev_polnear-polnear.csv",
]


In [3]:
class FilesGetter(object):
    """Group a token-level DataFrame into one list of token tuples per article.

    Rows are grouped on the ``Article_Name`` column. Every row becomes an
    11-tuple holding the values of ``TOKEN_COLUMNS`` in that order, so the
    attribution label is always at index 10.

    Attributes:
        dataset: the DataFrame that was passed in (not modified).
        grouped: pandas Series indexed by article name; each value is the list
            of token tuples of that article.
        files: plain list with the same token lists, in the same order.
        n_files: 1-based position of the file that ``get_next`` returns next.
        empty: always initialised to False; not used by this class itself.
    """

    # Columns copied, in this order, into every token tuple.
    TOKEN_COLUMNS = (
        "Article_Name",
        "Sentence_nr",
        "Nr_in_file",
        "Nr_in_sentence",
        "FromTo",
        "Word",
        "Lemma",
        "POS",
        "Dep_label",
        "Token_dep_head",
        "AR_label",
    )

    def __init__(self, dataset):
        self.n_files = 1
        self.dataset = dataset
        self.empty = False

        # Iterate over the groups instead of using groupby().apply(): apply()
        # on the grouping column itself is deprecated in recent pandas, while
        # iteration always yields the full sub-frame including "Article_Name".
        article_names = []
        token_lists = []
        for article_name, article_rows in dataset.groupby("Article_Name"):
            columns = [article_rows[column].values.tolist() for column in self.TOKEN_COLUMNS]
            article_names.append(article_name)
            token_lists.append(list(zip(*columns)))

        # Fill an object array element by element so numpy never tries to
        # interpret the nested token lists as extra array dimensions.
        values = np.empty(len(token_lists), dtype=object)
        for position, tokens in enumerate(token_lists):
            values[position] = tokens
        self.grouped = pd.Series(values, index=pd.Index(article_names, name="Article_Name"), dtype=object)
        self.files = list(token_lists)

    def get_next(self):
        """Return the next file's token list, or None once all files were returned."""
        if self.n_files > len(self.files):
            return None
        f = self.files[self.n_files - 1]
        self.n_files += 1
        return f

In [4]:
# testlist = [1, 2, 3, 4, 5, 6]
# start_index = 2
# print(testlist[start_index])
# testlist.reverse()
# start_index = len(testlist) - start_index - 1
# print(start_index)
# print(testlist[start_index])

# for file_index, file in enumerate(file_temp[start_index:]):
#     print(file_index)

In [5]:
def number_continuous_labels(file_temp, start_idx, label_nr, label_type, reverse=False):
    """Extend a label number over the run of same-type labels next to start_idx.

    Starting at the token right after ``start_idx`` (or right before it when
    ``reverse`` is true), every token whose label, with its first
    ``start_index_labels`` characters removed, equals ``label_type`` gets
    ``-<label_nr>`` appended to its label. The walk stops at the first token
    that does not match, which includes tokens that already carry a number.

    Args:
        file_temp: list of 11-item token tuples; updated in place.
        start_idx: index of the token that has already been numbered.
        label_nr: number to append to the labels.
        label_type: label name to match, e.g. 'SOURCE' or 'CONTENT'.
        reverse: walk towards the start of the list instead of the end.

    Returns:
        The same ``file_temp`` list.
    """
    if reverse:
        # Walk from the token before start_idx down to the first token.
        positions = range(min(start_idx - 1, len(file_temp) - 1), -1, -1)
    else:
        # Walk from the token after start_idx up to the last token.
        positions = range(max(start_idx + 1, 0), len(file_temp))

    for position in positions:
        token = file_temp[position]
        tag = token[10]
        if tag[start_index_labels:] != label_type:
            break
        # Tokens are tuples, so build a replacement with the numbered label.
        file_temp[position] = tuple(token[:10]) + (tag + '-' + str(label_nr),)

    return file_temp


# Find the CONTENT or SOURCE label for a CUE and add the number to the label
def number_label(token_idx, file_temp, label_nr, label_type):
    """Number the run of ``label_type`` labels that lies closest to a CUE token.

    The token list is searched outwards from ``token_idx``: at every distance
    the token before the CUE is inspected first and the token after it second,
    so at equal distance the earlier token wins. The first token whose label,
    without its first ``start_index_labels`` characters, equals ``label_type``
    gets ``-<label_nr>`` appended, and ``number_continuous_labels`` then
    extends the number away from the CUE over the rest of that run.

    Args:
        token_idx: index of the CUE token the search starts from.
        file_temp: list of 11-item token tuples; updated in place.
        label_nr: number to append to the labels that are found.
        label_type: label name to look for, e.g. 'CONTENT' or 'SOURCE'.

    Returns:
        The token list, unchanged when no matching label exists.
    """
    token_count = len(file_temp)
    distance = 1

    while token_idx - distance >= 0 or token_idx + distance < token_count:
        before = token_idx - distance
        after = token_idx + distance

        # Backward candidate first: it takes precedence over the forward one.
        candidates = []
        if before >= 0:
            candidates.append((before, True))
        if after < token_count:
            candidates.append((after, False))

        for position, search_backwards in candidates:
            token = file_temp[position]
            tag = token[10]
            if tag[start_index_labels:] == label_type:
                file_temp[position] = tuple(token[:10]) + (tag + '-' + str(label_nr),)
                # Keep numbering in the direction that leads away from the CUE.
                return number_continuous_labels(file_temp, position, label_nr, label_type,
                                                reverse=search_backwards)

        distance += 1

    return file_temp

In [6]:
def label_tokens(files):
    """Number the CUE spans of every file and link SOURCE/CONTENT labels to them.

    A CUE span is a run of directly adjacent tokens whose label, without its
    first ``start_index_labels`` characters, equals 'CUE'. Spans are numbered
    1, 2, 3, ... in order of appearance within a file and every token of a
    span gets ``-<nr>`` appended to its label. For the first token of each
    span, ``number_label`` is called to give the same number to the closest
    CONTENT run and the closest SOURCE run.

    Numbering starts at 1 for the first CUE span, and a CUE span that starts
    on the very first token of a file is linked like any other span.

    Args:
        files: list of files, each a list of 11-item token tuples with the
            label at index 10. The input lists are not modified.

    Returns:
        A tuple ``(files_labels_numbered, files_labels_max_label_nr)``: the
        numbered copy of every file, and per file the highest number used
        (0 when the file contains no CUE).
    """
    files_labels_numbered = []
    files_labels_max_label_nr = []

    for file in files:
        # Work on a copy; CUE detection keeps reading the untouched original.
        file_temp = file.copy()
        label_nr = 0
        previous_cue_index = None

        for token_idx, file_token in enumerate(file):
            label = file_token[10]
            if label[start_index_labels:] != 'CUE':
                continue

            # Only directly adjacent CUE tokens belong to the same span.
            starts_new_cue = previous_cue_index is None or token_idx > previous_cue_index + 1
            if starts_new_cue:
                label_nr += 1

            file_temp[token_idx] = tuple(file_token[:10]) + (label + '-' + str(label_nr),)
            previous_cue_index = token_idx

            if starts_new_cue:
                # Search backwards and forwards in the file for CONTENT and SOURCE labels.
                file_temp = number_label(token_idx, file_temp, label_nr, 'CONTENT')
                file_temp = number_label(token_idx, file_temp, label_nr, 'SOURCE')

        files_labels_numbered.append(file_temp)
        # label_nr is still 0 here when the file has no CUE at all.
        files_labels_max_label_nr.append(label_nr)

    return files_labels_numbered, files_labels_max_label_nr

In [7]:
# files_labels_numbered[1]

In [8]:
def pad_labels(files, max_label_nrs=None):
    """Spread every numbered label over one '_'-padded column per label number.

    A label that ends in number ``k`` in a file whose highest number is ``m``
    becomes ``k - 1`` times ``"_ "``, the label, and ``m - k`` times ``" _"``.
    An 'O' label becomes ``m`` underscores separated by spaces, or stays 'O'
    when the file has no numbered labels (``m == 0``). Labels without a number
    are left as they are.

    Args:
        files: when ``max_label_nrs`` is given, the files with numbered labels
            (lists of 11-item token tuples, label at index 10).
        max_label_nrs: highest label number per file, aligned with ``files``.
            When omitted, the function keeps its historical behaviour: it
            ignores ``files`` and reads the module-level variables
            ``files_labels_numbered`` and ``files_labels_max_label_nr``.

    Returns:
        A new list of files whose token tuples carry the padded label.
    """
    if max_label_nrs is None:
        # Legacy call style pad_labels(files): fall back to the notebook globals.
        numbered_files = files_labels_numbered
        max_label_nrs = files_labels_max_label_nr
    else:
        numbered_files = files

    files_labels_numbered_padded = []
    for file_idx, file in enumerate(numbered_files):
        max_label_nr = max_label_nrs[file_idx]
        file_temp = []
        for file_token in file:
            label = file_token[10]
            if label == 'O':
                if max_label_nr != 0:
                    # For 'O' label, create string of all underscores for the nr of labels in the file.
                    label = ("_ " * (max_label_nr - 1)) + '_'
            else:
                # Strip everything that is not part of the trailing number.
                label_nr_str = label
                for fragment in ('B', 'I', '-', 'SOURCE', 'CONTENT', 'CUE'):
                    label_nr_str = label_nr_str.replace(fragment, '')
                if label_nr_str:
                    try:
                        label_nr = int(label_nr_str)
                    except ValueError:
                        # The label is neither 'O' nor a known label with a number.
                        print('error')
                    else:
                        label = "_ " * (label_nr - 1) + label + " _" * (max_label_nr - label_nr)

            file_temp.append(tuple(file_token[:10]) + (label,))
        files_labels_numbered_padded.append(file_temp)

    return files_labels_numbered_padded

In [9]:
def separate_files_and_sentences(files):
    """Turn token tuples into row dicts with an empty row between sentences.

    For every file the tokens are converted to dicts keyed by column name.
    An all-empty row (every value ``''``) is inserted before a token whenever
    its sentence number differs from the previous token's (the comparison
    starts from sentence number 1) or its article name is 'Art_end'.
    Sentence number, number in file, number in sentence and dependency head
    are converted to ``int`` unless they are NaN.

    Args:
        files: list of files, each a list of 11-item token tuples.

    Returns:
        A list with, per file, the list of row dicts including separators.
    """
    field_names = ("Article_Name", "Sentence_nr", "Nr_in_file", "Nr_in_sentence", "FromTo",
                   "Word", "Lemma", "POS", "Dep_label", "Token_dep_head", "AR_label")
    numeric_fields = ("Sentence_nr", "Nr_in_file", "Nr_in_sentence", "Token_dep_head")

    def int_unless_nan(value):
        # NaN marks a missing value and is passed through untouched.
        return value if np.isnan(value) else int(value)

    files_list = []
    for file in files:
        token_list = []
        last_sentence_nr = 1
        for file_token in file:
            # Add empty line between sentences and at the end of the file.
            if file_token[1] != last_sentence_nr or file_token[0] == 'Art_end':
                token_list.append(dict.fromkeys(field_names, ''))

            row = {name: file_token[position] for position, name in enumerate(field_names)}
            for name in numeric_fields:
                row[name] = int_unless_nan(row[name])

            token_list.append(row)
            last_sentence_nr = row["Sentence_nr"]
        files_list.append(token_list)
    return files_list

In [10]:
def convert(x):
    """Return the text of ``x`` up to its first '.', or ``x`` itself when falsy.

    Truthy values are converted with ``str`` and cut at the first dot, so
    ``1.0`` becomes ``'1'``. Falsy values (``''``, ``0``, ``None``, ...) are
    returned unchanged.
    """
    if not x:
        return x
    return str(x).partition('.')[0]

In [11]:
def create_directory(dir_name):
    """Create ``dir_name`` and any missing parent directories.

    Nothing happens when the directory already exists. If the path exists but
    is not a directory, ``FileExistsError`` is raised instead of being
    silently ignored, because no file could be written below it anyway.

    Args:
        dir_name: path of the directory to create.
    """
    os.makedirs(dir_name, exist_ok=True)

In [12]:
# Show which CSV dialects are registered; the output files are written with 'excel-tab'.
print(csv.list_dialects())

['excel', 'excel-tab', 'unix']


In [13]:
# Order of the columns in every output line.
output_columns = ("Article_Name", "Sentence_nr", "Nr_in_file", "Nr_in_sentence", "FromTo",
                  "Word", "Lemma", "POS", "Dep_label", "Token_dep_head", "AR_label")

for preprocessed_data_file in preprocessed_data_files:
    # Read the pipe-separated result file; labels are forced to str so that a
    # missing label becomes the text 'nan' instead of a float.
    dataset = pd.read_csv(preprocessed_data_file, sep="|", encoding="utf-8")
    dataset['AR_label'] = dataset['AR_label'].apply(str)
    print(dataset.head(2))
    files_getter = FilesGetter(dataset)
    files = files_getter.files
    print(len(files))

    # Add numbers to labels
    files_labels_numbered, files_labels_max_label_nr = label_tokens(files)
    print(len(files_labels_numbered))

    # Add _'s to labels.
    # NOTE: called like this, pad_labels works on the module-level
    # files_labels_numbered / files_labels_max_label_nr assigned just above,
    # not on the argument it is given.
    files_labels_numbered_padded = pad_labels(files)
    print(len(files_labels_numbered_padded))

    # Add empty lines between sentences
    files_list = separate_files_and_sentences(files_labels_numbered_padded)
    print(len(files_list))
    if files_list and len(files_list[0]) > 1:
        print(files_list[0][1])

    # One output directory per result file; create it once instead of per article.
    dir_name = f"CONLL/{preprocessed_data_file.replace('Result/','').replace('.csv','')}/"
    create_directory(dir_name)

    # Write output to file
    for file in files_list:
        # A file whose first row is a separator row has no name and is skipped.
        filename = file[0].get('Article_Name')
        if not filename:
            continue

        output_path = dir_name + filename.replace('.txt.xml', '.xml').replace('.xml', '.xml.conll.features.foreval.out')
        print(output_path)
        # newline="" is required for csv.writer; without it the '\r\n' row
        # terminator is translated again on platforms that use '\r\n'.
        with open(output_path, "w", encoding="utf-8", newline="") as result:
            # NOTE(review): the 'excel-tab' dialect quotes fields that contain a
            # '"' or a tab - confirm that the consumer of these files expects that.
            writer = csv.writer(result, delimiter="\t", dialect='excel-tab')
            for row in file:
                if row.get("Sentence_nr") == '':
                    # Not using df.to_csv to be able to write empty row between the sentences.
                    writer.writerow(())
                else:
                    writer.writerow(tuple(row.get(column) for column in output_columns))

                                        Article_Name  Sentence_nr  Nr_in_file  \
0  west-journal_2016-09-29_gold-star-mom-corners-...          1.0         1.0   
1  west-journal_2016-09-29_gold-star-mom-corners-...          1.0         2.0   

   Nr_in_sentence FromTo  Word Lemma  POS Dep_label  Token_dep_head   AR_label  
0             1.0    0,4  Gold  Gold  NNP  compound             5.0  I-CONTENT  
1             2.0    5,9  Star  Star  NNP  compound             5.0  I-CONTENT  
85
85
85
85
{'Article_Name': 'Art_end', 'Sentence_nr': nan, 'Nr_in_file': nan, 'Nr_in_sentence': nan, 'FromTo': nan, 'Word': 'Sent_end', 'Lemma': nan, 'POS': nan, 'Dep_label': nan, 'Token_dep_head': nan, 'AR_label': 'O'}
CONLL/CRF_out_BIO_baseline-dev_polnear-polnear/breitbart_2016-09-12_stealth-over-health-hillary-clin.xml.conll.features.foreval.out
CONLL/CRF_out_BIO_baseline-dev_polnear-polnear/breitbart_2016-09-15_pat-caddell-democrat-voters-worr.xml.conll.features.foreval.out
CONLL/CRF_out_BIO_baseline-d

In [14]:
# files[0:1]
# [[('Gold', 'I-CONTENT', 1.0, 1.0, '0,4'), ('Star', 'I-CONTENT', 1.0, 2.0, '5,9'), ('mom', 'I-CONTENT', 1.0, 3.0, '10,13'), 
# ('Corners', 'I-CONTENT', 1.0, 4.0, '14,21'), ('Obama', 'I-CONTENT', 1.0, 5.0, '22,27'), ('on', 'I-CONTENT', 1.0, 6.0, '28,30'), 
# ('he', 'I-CONTENT', 1.0, 7.0, '31,34'), ('refusal', 'I-CONTENT', 1.0, 8.0, '35,42'), ('to', 'I-CONTENT', 1.0, 9.0, '43,45'),
# ('use', 'I-CONTENT', 1.0, 10.0, '46,49'), ('the', 'I-CONTENT', 1.0, 11.0, '50,53'), ('word', 'I-CONTENT', 1.0, 12.0, '54,59'), 
# ('`', 'I-CONTENT', 1.0, 13.0, '60,61'), etc