# 2. Data_Preparation_Extra_for_CRF

This notebook extracts the extra features and labels needed to run the CRF models.

In [1]:
import csv

# Extract features and labels

In [2]:
# Name of the evaluation split that is paired with every training file.
eval_data_set = 'test'

# Input files, per corpus: the evaluation split first, then the training split.
preprocessed_data = [
    f'{corpus}_with_BIO_{split}.csv'
    for corpus in ('polnear', 'parc3', 'merged')
    for split in (eval_data_set, 'train')
]

# Suffixes that replace '.csv' in the names of the generated files.
filename_addition = '_extra_data.csv'
removed_filename_addition = f'_removed{filename_addition}'

In [3]:
import networkx as nx

def generate_graph(words):
    """Build an undirected graph of head/dependent links for one sentence.

    Every item of *words* is read by index: column 3 is used as the node id
    of the token and column 9 as the node id of its head. A token whose head
    is the string '0' is taken to be the root; it adds no edge of its own.

    Returns a tuple ``(graph, root_position)``. ``root_position`` is the
    column-3 value of the last root token encountered, or the integer 0 when
    no token has head '0'. A root without any dependents never becomes a node
    of the graph, because nodes are only created through edges.
    """
    root_position = 0
    links = []
    for row in words:
        columns = list(row)
        node, head = columns[3], columns[9]
        if head == '0':
            # root token: remember where it is, but do not link it to '0'
            root_position = node
            continue
        links.append((node, head))

    return nx.Graph(links), root_position

In [4]:
def get_dependency_path(shortest_path, words):
    """Return the dependency labels along *shortest_path* as one string.

    For each position in *shortest_path*, in order, the label in column 8 of
    every item of *words* whose column 3 equals that position is collected.
    The last collected label is dropped and the remaining ones are joined
    with ';'. Positions without a matching word contribute nothing, so the
    result is '' when fewer than two labels are found.
    """
    labels = [
        columns[8]
        for node in shortest_path
        for columns in map(list, words)
        if columns[3] == node
    ]
    # everything except the final label ends up in the path string
    return ';'.join(labels[:-1])

In [5]:
# Lemma list, presumably of attribution cue words (it is not referenced in the
# visible part of this file). 'report' and 'suggest' each occur twice; the
# duplicates are kept so the list value is unchanged.
cue_lemmas = [
    'say', 'be', 'to', 'have',
    'tell', 'call', 'write', 'accord',
    'add', 'ask', 'show', 'support',
    'note', 'report', 'suggest', 'argue',
    'expect', 'report', 'believe', 'agree',
    'think', 'announce', 'cite', 'suggest',
]

In [6]:
# Extract the extra features that CRF.py needs. Sentences without any
# annotation are kept (nothing is removed here).
def _read_sentences(path):
    """Read a tab-separated token file and group its rows into sentences.

    The first line is treated as a header and skipped. A new sentence starts
    at every row whose column 3 is "1", unless column 2 is "1" as well (the
    first token of the file). This is a simplification that works for this
    particular data; other data may need more careful sentence-boundary
    detection.

    Returns a list of sentences; each sentence is a non-empty list of rows and
    each row is a list of strings.
    """
    sentences = []
    current = []
    # newline='' is what the csv module expects for files it reads or writes.
    with open(path, 'r', newline='') as csvinput:
        csvreader = csv.reader(csvinput, delimiter='\t')
        next(csvreader, None)  # skip the header; tolerate a completely empty file
        for row in csvreader:
            if not row:
                continue  # blank line: the reader yields an empty list
            if row[3] == "1" and row[2] != "1":
                if current:
                    sentences.append(current)
                current = []
            current.append(list(row))

    # The final sentence has no following boundary row, so flush it here.
    if current:
        sentences.append(current)
    return sentences


def get_extra_features(filename):
    """Add quote, colon and dependency features to one preprocessed file.

    Reads ``Preprocessed_data/<filename>`` (tab-separated, with a header line)
    and writes a new tab-separated file in the same directory whose name is
    *filename* with '.csv' replaced by the module-level ``filename_addition``.
    Every output row holds the first 11 input columns followed by:

    * In_quote: True from an opening "``" token up to, but not including,
      the closing "''" token. The state carries over sentence boundaries.
    * After_colon: True from a ':' token to the end of that sentence (or up
      to a literal 'EOS' token, should one occur).
    * Dep_distance: number of nodes on the shortest path from the token to
      the sentence root in the graph built by ``generate_graph`` (the root
      itself gets 1); 0 when no path can be computed.
    * Dep_path: the value of ``get_dependency_path`` for that path, or ''.

    Parameters:
        filename: name of the input file inside ``Preprocessed_data``.

    Returns:
        None. The result is written to disk.

    Raises:
        IndexError: if a data row has fewer than 11 columns.
        OSError: if the input cannot be read or the output cannot be written.
    """
    quote_opening = "``"
    quote_closing = "''"
    colon_opening = ':'
    colon_closing = 'EOS'  # explicit end-of-sentence token, if the data has one
    base_column_count = 11

    sents = _read_sentences(f"Preprocessed_data/{filename}")

    in_quote = False
    sents_extra_features = []
    for sent in sents:
        # The after-colon span is meant to end at the end of the sentence.
        # Without this reset the flag would stay True for the rest of the
        # file whenever the data contains no literal 'EOS' token.
        after_colon = False

        sentence_graph, root_position = generate_graph(sent)

        sent_extra_features = []
        for token in sent:
            word = token[5]
            word_position = token[3]

            if word == quote_opening:
                in_quote = True
            elif word == quote_closing:
                in_quote = False

            if word == colon_opening:
                after_colon = True
            elif word == colon_closing:
                after_colon = False

            dependency_distance = 0
            dependency_path = ''
            if word != 'Sent_end':
                try:
                    shortest_path = nx.shortest_path(
                        sentence_graph, source=word_position, target=root_position)
                except (nx.NetworkXNoPath, nx.NodeNotFound):
                    # Token or root is not part of the graph (e.g. a sentence
                    # without a root or a one-token sentence), or the two are
                    # not connected: keep the neutral defaults.
                    pass
                else:
                    dependency_distance = len(shortest_path)
                    dependency_path = get_dependency_path(shortest_path, sent)

            base_columns = tuple(token[i] for i in range(base_column_count))
            sent_extra_features.append(
                base_columns + (in_quote, after_colon, dependency_distance, dependency_path))

        sents_extra_features.append(sent_extra_features)

    # Write file
    output_path = f"Preprocessed_data/{filename.replace('.csv', filename_addition)}"
    headers1 = ["Article_Name", "Sentence_nr", "Nr_in_file", "Nr_in_sentence", "FromTO", "Word", "Lemma", "POS", "Dep_label", "Token_dep_head",
                "AR_label", "In_quote", "After_colon", "Dep_distance", "Dep_path"]
    # newline='' keeps the csv writer's own line terminator from being
    # translated a second time (which doubles the '\r' on Windows).
    with open(output_path, "w", newline='') as result:
        writer = csv.writer(result, delimiter='\t')
        writer.writerow(headers1)
        for sent_rows in sents_extra_features:
            writer.writerows(sent_rows)

# Run the feature extraction for every configured input file. Each call reads
# Preprocessed_data/<filename> and writes a file next to it whose '.csv'
# suffix is replaced by `filename_addition`.
for filename in preprocessed_data:
    get_extra_features(filename)

In [7]:
# # to extract only token and labels, to prepare for using CRF.py, removing unlabeled sentences

# def get_removed_file(filename):
# #     print('something')
#     csvinput = open(f"Preprocessed_data/{filename.replace('.csv', filename_addition)}",'r')
#     csvreader = csv.reader(csvinput,delimiter='\t')
# #     print(csvreader)
#     headers=next(csvreader)
#     sents = []
#     current_sent = []
#     for row in csvreader:
#         #note that this is a simplification that works well for this particular data, in other situations, you may need to do more advanced preprocessing to identify sentence boundaries
#         if row[3] == "1" and row[2] != "1":
#             sents.append(current_sent)
#             current_sent = []            
#         current_sent.append(list(row))
        
# #     Add last empty row
#     sents.append(current_sent)
            
#     sents_removed=[]
#     for sent in sents:
# #         print(sents)
#         sent_label_list=[]
#         for f in sent:
#             label = f[-1]
#             sent_label_list.append(label)
#         if set(sent_label_list)!={'O'}:
#             #print(set(sent))
#             sents_removed.append(sent)
#     with open(f"Preprocessed_data/{filename.replace('.csv', removed_filename_addition)}", "w") as result:
#         headers1 = ["Article_Name", "Sentence_nr", "Nr_in_file", "Nr_in_sentence", "FromTo", "Word", "Lemma", "POS", "Dep_label", "Token_dep_head", "AR_label"]
#         writer = csv.writer(result, delimiter='\t')
#         writer.writerow(headers1)
#         for f in sents_removed:
#             writer.writerows(f)
#     return

# for filename in preprocessed_data:
#     if 'train' in filename:
#         get_removed_file(filename)