# 2. Data_Preparation_Extra_for_CRF

To extract extra features and labels for running the CRF models.

In [1]:
import csv
import pandas as pd

# Extract features and labels

In [2]:
eval_data_set = 'dev'
preprocessed_data=[
    f'polnear_with_BIO_{eval_data_set}.csv', 
    "polnear_with_BIO_train.csv",
    f'parc3_with_BIO_{eval_data_set}.csv', 
    "parc3_with_BIO_train.csv",
    f'merged_with_BIO_{eval_data_set}.csv', 
    "merged_with_BIO_train.csv",
]

# This line after is for FeaturesCRF
filename_addition = '_extra_data.csv'
# This line after is for Baseline CRF and EmbeddingCRF
# filename_addition = '.csv'
removed_filename_addition = '_removed' + filename_addition

In [3]:
class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sentences = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda f: [(a, s, nf, ns, ft, w, l, p, d, td, al) 
                              for a, s, nf, ns, ft, w, l, p, d, td, al 
                              in zip(f["Article_Name"].values.tolist(),
                                     f["Sentence_nr"].values.tolist(),
                                     f["Nr_in_file"].values.tolist(),
                                     f["Nr_in_sentence"].values.tolist(),
                                     f["FromTo"].values.tolist(),
                                     f["Word"].values.tolist(),
                                     f["Lemma"].values.tolist(),
                                     f["POS"].values.tolist(),
                                     f["Dep_label"].values.tolist(),
                                     f["Token_dep_head"].values.tolist(),
                                     f["AR_label"].values.tolist(),
                                    )]
        self.grouped = self.dataset.groupby(["Article_Name", "Sentence_nr"]).apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentences: {}".format(self.n_tokens)]
            self.n_sentences += 1
            return f
        except:
            return None

In [12]:
class SentenceExtraGetter(object):
    
    def __init__(self, dataset):
        self.n_sentences = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda f: [(a, s, nf, ns, ft, w, l, p, d, td, al, q, c, dd, dp) 
                              for a, s, nf, ns, ft, w, l, p, d, td, al, q, c, dd, dp
                              in zip(f["Article_Name"].values.tolist(),
                                     f["Sentence_nr"].values.tolist(),
                                     f["Nr_in_file"].values.tolist(),
                                     f["Nr_in_sentence"].values.tolist(),
                                     f["FromTo"].values.tolist(),
                                     f["Word"].values.tolist(),
                                     f["Lemma"].values.tolist(),
                                     f["POS"].values.tolist(),
                                     f["Dep_label"].values.tolist(),
                                     f["Token_dep_head"].values.tolist(),
                                     f["AR_label"].values.tolist(),
                                     f["In_quote"].values.tolist(),
                                     f["After_colon"].values.tolist(),
                                     f["Dep_distance"].values.tolist(),
                                     f["Dep_path"].values.tolist(),
                                    )]
        self.grouped = self.dataset.groupby(["Article_Name", "Sentence_nr"]).apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentences: {}".format(self.n_tokens)]
            self.n_sentences += 1
            return f
        except:
            return None

In [4]:
import networkx as nx

def generate_graph(words):
    edges = []
    root_position = 0
    for word in words:
#         if word != 'Sent_end':
        word_columns = list(word)
        position = word_columns[3]
        parent = word_columns[9]
        dependency_label = word_columns[8]
#         if dependency_label.lower() != 'root':
        if parent != '0':
            edges.append((position, parent))
        else:
            root_position = position
            
#     print(edges)
    sentence_graph = nx.Graph(edges)
    return sentence_graph, root_position

In [5]:
def get_dependency_path(shortest_path, words):
    dependencies_path = []
    for path_word_position in shortest_path:
#         print(f"path_word_position is {path_word_position}.")
        for word in words:
            word_columns = list(word)
            word_position = word_columns[3]
            if word_position == path_word_position:
                dependency_label = word_columns[8]
                dependencies_path.append(dependency_label)
    return ';'.join(dependencies_path[:-1])

In [6]:
def convert(x):
    if x:
        x = str(x).split('.')[0]
    return x

In [7]:
cue_lemmas = ['say', 'be', 'to', 'have', 'tell', 'call', 'write', 'accord', 'add', 'ask', 'show', 'support', 'note', 'report', 'suggest', 'argue',
             'expect', 'report', 'believe', 'agree', 'think', 'announce', 'cite', 'suggest']

In [8]:
# To mark down this cell for baseline and embedding
# to extract extra features, to prepare for using CRF.py, not removing unlabeled sentences

def get_extra_features(filename):
    quote_opening = "``"
    quote_closing = "''"
    colon_opening = ':'
    colon_closing = '.'

    ### Read file
    input_file = f"Preprocessed_data/{filename}"
#     csvinput = open(f"Preprocessed_data/{filename}", 'r', encoding="utf-8")
#     csvreader = csv.reader(csvinput, delimiter='\t')
# #     print(csvreader)
#     headers = next(csvreader)
#     sents = []
#     current_sent = []
    
#     # Try except for nul byte error
#     try:
#         index = 0
#         for i, row in enumerate(csvreader):
#             index = i
#             current_sent.append(list(row))
#             if row[5] == "Sent_end":
#                 sents.append(current_sent)
#                 current_sent = []
#     except Exception as e:
#         print(e)
#         print(index)
#         pass
    df_input = pd.read_csv(input_file, sep='|', encoding="utf-8")
#     print(df_input.head(2))
#     print(df_input.tail(2))
    getter = SentenceGetter(df_input)
    sents = getter.sentences
#     sents = df_input.to_records(index=False)
#     sents = list(sents)
    print(sents[0])
        
    ### Loop over sentences in file to get extra features for the CRF feature models
    in_quote = False
    after_colon = False
    sents_extra_features = []
    for sent in sents:
#         print(type(sent))

        # Generate graph
        sentence_graph, root_position = generate_graph(sent)  

        # Loop over tokens in sentence
        sent_extra_features = []
        for token_idx, token in enumerate(sent):
            token_extra_features = token
            word = token[5]
            postag = token[7]
            word_position = token[3]
            
            if word == quote_opening:
                in_quote = True
#             if postag == quote_opening:
#                 in_quote = True
            
            if word == colon_opening:
                after_colon = True
            if word == colon_closing:
                after_colon = False
                
            dependency_distance = 0
            dependency_path = ''
            if word != 'Sent_end':
                try:
                    shortest_path = nx.shortest_path(sentence_graph, source=word_position, target=root_position)
#                     print(f"Shortest path is {shortest_path}.")
                    dependency_distance = len(shortest_path)
                    dependency_path = get_dependency_path(shortest_path, sent)
                except Exception as e:
                    pass
#                     print(e)
#                     print(sent)
#                     print('root_position')   
#                     print(root_position)
#                     print('word_position')   
#                     print(word_position)
#                     print(word)
            
            token_extra_features = (token[0], token[1], token[2], token[3], token[4], token[5], 
                                    token[6], token[7], token[8], token[9], token[10], 
                                    in_quote, after_colon, dependency_distance, dependency_path)  # Tuple
            sent_extra_features.append(token_extra_features)
            
            if word == quote_closing:
                in_quote = False
#             if postag == quote_closing:
#                 in_quote = False
            
        # Append row with extra features to new list
        sents_extra_features.append(sent_extra_features)
            
    ### Write file
#     with open(f"Preprocessed_data/{filename.replace('.csv', filename_addition)}", "w", encoding="utf-8") as result:
#         headers1 = ["Article_Name", "Sentence_nr", "Nr_in_file", "Nr_in_sentence", "FromTo", "Word", "Lemma", "POS", "Dep_label", "Token_dep_head", 
#                     "AR_label", "In_quote", "After_colon", "Dep_distance", "Dep_path"]
#         writer = csv.writer(result, delimiter='\t')
#         writer.writerow(headers1)
#         for f in sents_extra_features:
#             writer.writerows(f)
    file_extra_features = [token for tokens in sents_extra_features for token in tokens]
    print(file_extra_features[0:1])
    df_file = pd.DataFrame(file_extra_features, columns=["Article_Name", "Sentence_nr", "Nr_in_file", "Nr_in_sentence", "FromTo", "Word", "Lemma", "POS", 
                                                         "Dep_label", "Token_dep_head", "AR_label", "In_quote", "After_colon", "Dep_distance", "Dep_path"])
    df_file['Sentence_nr'] = df_file['Sentence_nr'].apply(convert)
    df_file['Nr_in_file'] = df_file['Sentence_nr'].apply(convert)
    df_file['Nr_in_sentence'] = df_file['Sentence_nr'].apply(convert)
    df_file['Token_dep_head'] = df_file['Token_dep_head'].apply(convert)
    df_file.head(2)
    df_file.tail(2)
    output_file = f"Preprocessed_data/{filename.replace('.csv', filename_addition)}"
    df_file.to_csv(output_file, sep='|', index=False, encoding="utf-8")

# For every filename, get extra features
for filename in preprocessed_data:  # preprocessed_data_token_label:
    print(filename)
    get_extra_features(filename)  # .replace('.csv', filename_addition))

polnear_with_BIO_dev.csv
[('breitbart_2016-09-12_stealth-over-health-hillary-clin.txt.xml', 1.0, 1.0, 1.0, '0,7', 'Stealth', 'Stealth', 'NNP', 'compound', 3.0, 'O'), ('breitbart_2016-09-12_stealth-over-health-hillary-clin.txt.xml', 1.0, 2.0, 2.0, '8,12', 'Over', 'Over', 'NNP', 'compound', 3.0, 'O'), ('breitbart_2016-09-12_stealth-over-health-hillary-clin.txt.xml', 1.0, 3.0, 3.0, '13,19', 'Health', 'Health', 'NNP', 'root', 0.0, 'O'), ('breitbart_2016-09-12_stealth-over-health-hillary-clin.txt.xml', 1.0, 4.0, 4.0, '19,20', ':', ':', ':', 'punct', 3.0, 'O'), ('breitbart_2016-09-12_stealth-over-health-hillary-clin.txt.xml', 1.0, 5.0, 5.0, '21,28', 'Hillary', 'Hillary', 'NNP', 'compound', 6.0, 'O'), ('breitbart_2016-09-12_stealth-over-health-hillary-clin.txt.xml', 1.0, 6.0, 6.0, '29,36', 'Clinton', 'Clinton', 'NNP', 'nsubjpass', 8.0, 'O'), ('breitbart_2016-09-12_stealth-over-health-hillary-clin.txt.xml', 1.0, 7.0, 7.0, '37,40', 'Was', 'be', 'VBD', 'auxpass', 8.0, 'O'), ('breitbart_2016-09-1

In [13]:
# to prepare for using CRF.py, removing unlabeled sentences

def get_removed_file(filename):
    ### Read file
    
    # This line after is for FeaturesCRF
#     csvinput = open(f"Preprocessed_data/{filename.replace('.csv', filename_addition)}",'r', encoding="utf-8")
    # This line after is for Baseline CRF and EmbeddingCRF
    csvinput = open(f"Preprocessed_data/{filename}",'r')

#     csvreader = csv.reader(csvinput,delimiter='\t')
# #     print(csvreader)
#     headers=next(csvreader)
#     sents = []
#     current_sent = []
#     for row in csvreader:
#         current_sent.append(list(row))
#         if row[5] == "Sent_end":
#             sents.append(current_sent)
#             current_sent = []
    df_input = pd.read_csv(csvinput, sep='|', encoding="utf-8")
    # This line after is for FeaturesCRF
#     getter = SentenceExtraGetter(df_input)
    # This line after is for Baseline CRF and EmbeddingCRF
    getter = SentenceGetter(df_input)
    sents = getter.sentences
    print(sents[0])
            
    ### Create a new list with only the labeled sentences
    sents_removed = []
    for sent in sents:
        sent_label_list = []
        for f in sent:
            label = f[10]
            sent_label_list.append(label)
        if set(sent_label_list) != {'O'}:
            sents_removed.append(sent)
            
    ### Write file
#     with open(f"Preprocessed_data/{filename.replace('.csv', removed_filename_addition)}", "w", encoding="utf-8") as result:
#         #this line after is for featuresCRF
#         headers1 = ["Article_Name", "Sentence_nr", "Nr_in_file", "Nr_in_sentence", "FromTo", "Word", "Lemma", "POS", "Dep_label", "Token_dep_head", 
#                     "AR_label", "In_quote", "After_colon", "Dep_distance", "Dep_path"]
#         #this line after is for baseline and embedding
# #         headers1 = ["Article_Name", "Sentence_nr", "Nr_in_file", "Nr_in_sentence", "FromTo", "Word", "Lemma", "POS", "Dep_label", "Token_dep_head", 
# #                     "AR_label"]        
#         writer = csv.writer(result, delimiter='\t')
#         writer.writerow(headers1)
#         for f in sents_removed:
#             writer.writerows(f)
    # Transform the list of sentences back to a list of tokens again.
    file_sents_removed = [token for tokens in sents_removed for token in tokens]
    # This line after is for FeaturesCRF
    df_file = pd.DataFrame(file_sents_removed, columns=["Article_Name", "Sentence_nr", "Nr_in_file", "Nr_in_sentence", "FromTo", "Word", "Lemma", "POS", 
                                                         "Dep_label", "Token_dep_head", "AR_label", "In_quote", "After_colon", "Dep_distance", "Dep_path"])
    # This line after is for Baseline CRF and EmbeddingCRF
    df_file = pd.DataFrame(file_extra_features, columns=["Article_Name", "Sentence_nr", "Nr_in_file", "Nr_in_sentence", "FromTo", "Word", "Lemma", "POS", 
                                                         "Dep_label", "Token_dep_head", "AR_label"])
    df_file['Sentence_nr'] = df_file['Sentence_nr'].apply(convert)
    df_file['Nr_in_file'] = df_file['Sentence_nr'].apply(convert)
    df_file['Nr_in_sentence'] = df_file['Sentence_nr'].apply(convert)
    df_file['Token_dep_head'] = df_file['Token_dep_head'].apply(convert)
    output_file = f"Preprocessed_data/{filename.replace('.csv', removed_filename_addition)}"
    df_file.to_csv(output_file, sep='|', index=False, encoding="utf-8")


for filename in preprocessed_data:
    if 'train' in filename:
        get_removed_file(filename)

[('breitbart_2015-11-11_the-ninth-circle-the-hellish-vie.txt.xml', 1, 1, 1, '0,3', 'The', 'the', 'DT', 'det', 3.0, 'O', False, False, 3, 'det'), ('breitbart_2015-11-11_the-ninth-circle-the-hellish-vie.txt.xml', 1, 1, 1, '4,9', 'Ninth', 'Ninth', 'NNP', 'compound', 3.0, 'O', False, False, 3, 'compound'), ('breitbart_2015-11-11_the-ninth-circle-the-hellish-vie.txt.xml', 1, 1, 1, '10,16', 'Circle', 'Circle', 'NNP', 'root', 0.0, 'O', False, False, 2, nan), ('breitbart_2015-11-11_the-ninth-circle-the-hellish-vie.txt.xml', 1, 1, 1, '16,17', ':', ':', ':', 'punct', 3.0, 'O', False, True, 3, 'punct'), ('breitbart_2015-11-11_the-ninth-circle-the-hellish-vie.txt.xml', 1, 1, 1, '18,21', 'The', 'the', 'DT', 'det', 7.0, 'O', False, True, 4, 'det;dep'), ('breitbart_2015-11-11_the-ninth-circle-the-hellish-vie.txt.xml', 1, 1, 1, '22,29', 'Hellish', 'hellish', 'JJ', 'amod', 7.0, 'O', False, True, 4, 'amod;dep'), ('breitbart_2015-11-11_the-ninth-circle-the-hellish-vie.txt.xml', 1, 1, 1, '30,34', 'View', 

# NB. In removal some Art_end markers are disappearing because the last sentence is an unlabeled sentence.