In [95]:
import spacy
import xml.etree.ElementTree as ET
from spacy.symbols import nsubj, dobj, pobj, iobj, neg, xcomp, VERB
from gensim.parsing.preprocessing import strip_multiple_whitespaces
import pandas as pd
import re
import os
import sys

def read_lines(inputparsed):    
    """takes input from CoreNLP sentence parsed file and returns sentences"""
    #parse all lines from CoreNLP sentence split
    parsed = open(inputparsed, encoding = "utf-8")
    parsedfile = parsed.readlines()
    parsedlines = []

    #Only keep those lines which have Sentence #n in the line before
    for idx, text in enumerate(parsedfile):
        if text.startswith("Sentence #"):
            parsedlines.append(parsedfile[idx+1].replace('\n','').strip())
    
    return parsedlines

def gen_poss(line, verb_match, pre_dict):
    """generates all possibilities of patterns that a multi word line implies,
    by extracting partial patterns and resolving placeholder words"""
    poss = []

    #replace special tokens in text that are clear at this point
    line = line.replace("*", verb_match)
    line = line.replace("- ", "")
    line = line.replace("+","")
    line = line.replace("%","")
    line = line.replace("^","")
    line = line.replace("$","")

    #split line by possibility indicators and code (always ends possibility)
    #example.: "- $ * (P ON KILLING (P OF + [010] #  COMMENT <ELH 07 May 2008>"
    poss_split = re.split("\(P |\[.*]",line) 

    if len(poss_split) > 2: #2 is if no (P in the line
        #only combining the first (P, as they share the same code 
        #and the longer version will never be contained in a text if the shorter isnt
        poss.append(strip_multiple_whitespaces(" ".join(poss_split[:2])).lower().rstrip().lstrip())
    else: 
        poss.append(strip_multiple_whitespaces(poss_split[0].lower().rstrip().lstrip()))

    cleaned = []
    for text in list(set(poss)):
        c = 0
        for tag in list(pre_dict.keys()):
            if tag in text:
                for replacement in pre_dict[tag]:
                    cleaned.append(text.replace(tag, replacement))
                    c += 1
        if c == 0:
            cleaned.append(text)

    return cleaned
    

def verb_code_dict(pico_path, verb_path):
    """reads coding ontology and verb lists, 
    directly matches verbs to their CAMEO codes and returns this verbs:codes dictionairy.
    verb with codes that cannot be read are printed out as full line of the file"""
    #read PETRARCH Internal Coding Ontology (= pico)
    pico_path = os.path.join(os.getcwd(), pico_path)
    pico_file = open(pico_path, 'r')
    pico_lines = pico_file.readlines()

    #get all 20 codes with their respective code
    main_codes = {}                             #we run one iteration for all the main codes, only main codes contain relation name
    for line in pico_lines:
        line = line.split('#')
        if line[0] == "" or line[0] == "\n":    #only intro comments and empty lines
            continue
        else: 
            code_split = line[0].split(":")     #splits into CAMEO code and related hex
            if len(line) > 1 and code_split[0][2] == "0":      #only main categories have 0 in 3rd idx, [cat_num 0] -> [010]
                main_codes[code_split[0][:2]] = line[-1].replace("\n","")
    
    #map code to code we want to use in the training
    map_codes = {"DiplomaticCoop" : "Engage In Diplomatic Cooperation", 
                "MaterialCoop" : "Engage In Material Cooperation",
                "ProvideAid" : "Provide Aid",
                "Exhibit Force Posture": "Exhibit Military Posture",
                "Use Unconventional Mass Violence" : "Engage In Unconventional Mass Violence"}
    main_codes = {k: (map_codes[v] if v in map_codes else v) for k, v in main_codes.items()}
    
    #read single word patterns and match their code to the relation extracted in main_codes
    verb_path = os.path.join(os.getcwd(), verb_path)
    verb_file = open(verb_path, 'r')
    verb_lines = verb_file.readlines()
    
    verb_dict = {}
    for line in verb_lines:
        if line[0] == "#":
            continue
        elif line.startswith("---"):    #main verbs have a lead code, which is applied to all very in the section
                                        #unless a separate code is specified for a specific verb in section
            try: cur_main_code = re.split("\[|\]|---", line)[2].replace(":","")[:2]  #we only need main codes which are first two numbers
                                                                                #sometimes code starts with ":", e.g.: ---  OFFEND   [:110]  ---
                                                                                #we just remove those to get the main code
            except:                     #depending on chosen verb dictionairy, there may be main verbs without lead codes
                print("couldn't finde code in: ", line.replace("\n","")) 
                cur_main_code == "--"
            if cur_main_code == "": cur_main_code = "--"
        elif line == "\n":              #skip empty lines
            continue
        elif line[0] == "-" or line[0] == "~" or line[0] == "+" or line[0] == "&": #removes all special structures we cannot use
            continue
        else:
            if len(re.split("\[|\]", line)) > 1:    #verbs with their own code, e.g.: AFFIRM [051] 
                code = re.split("\[|\]", line)[1].replace(":","")[:2]
                if code != "--":
                    if "{" in line:         #conjugated verbs, e.g. "APPLY {APPLYING APPLIED APPLIES } [020]"
                        line_s = re.split("\{|\}", line)    #split at { and }
                        verb_dict[line_s[0].lower()] = main_codes[code] 
                        for word in line_s[1].split():
                            verb_dict[word.lower()] = main_codes[code]
                    else:
                        word = re.split("\[|\]", line)[0]
                        verb_dict[word.lower()] = main_codes[code]
            else:
                if cur_main_code != "--":
                    if "{" in line:         #e.g. "HURRY {HURRIES HURRYING HURRIED }" 
                        line_s = re.split("\{|\}", line)    #split at { and }
                        verb_dict[line_s[0].lower()] = main_codes[cur_main_code]
                        for word in line_s[1].split():
                            verb_dict[word.lower()] = main_codes[cur_main_code]
                    else:                   #only single words with sometimes comments, e.g.: CENSURE  # JON 5/17/95
                        word = line.split("#")[0].rstrip()    #gets part before "#", removes all whitespaces to the right
                        verb_dict[word.lower()] = main_codes[cur_main_code]

    #read multi word patterns and create a dictionary for their code

    #get filler words that occur in multi word patterns
    verb_file = open(verb_path, 'r')
    verb_lines = verb_file.readlines()

    pre_dict = {}
    filter_list = []
    for line in verb_lines:
        if line.startswith("&"):
            cur_filter = line.rstrip()
        elif line.startswith("\n") and "cur_filter" in locals():
            pre_dict[cur_filter.lower()] = filter_list
            cur_filter = ""
            filter_list = []
        elif line.startswith("+") and cur_filter != "":
            filter_list.append(line.rstrip()[1:].replace("_", "").lower())
    del pre_dict[""]

    #generate dictionaries for multi word patterns
    verb_file = open(verb_path, 'r')
    verb_lines = verb_file.readlines()

    spec_dict = {}
    spec_code = {}

    count = 0
    for line in verb_lines:
        if line.startswith("- "):
            #get main verb as dict key
            try: 
                verb_match = re.search("# *\w+", line).group()
                verb_match = re.search("\w+", verb_match).group()
                verb_match = verb_match.replace("_", " ").lower()
            except: 
                count += 1

            #get code for line
            try:
                code = re.search("\[.*]", line).group()[1:3]
                if code != "--":
                    #get all possibility that the line indicates
                    poss = gen_poss(line, verb_match, pre_dict)
                    for pattern in poss:
                        spec_code[pattern] = main_codes[code]
                    spec_dict[verb_match] = poss
            except:
                count += 1

    print(f"{count} patterns could not be loaded")        

    return verb_dict, spec_dict, spec_code


def get_triples(sentence, verb_dict, spec_dict, spec_code, nlp):
    """create triplet structure for training from text input, 
    verb_dict needs to be loaded before,
    spacy model needs to be initialized before """
    doc = nlp(sentence)
    verbs = []
    dict = {}


    for possible_verb in doc:
        if possible_verb.pos == VERB:
            if neg in [child.dep for child in possible_verb.children]: continue
            else: 
                for possible_subject in possible_verb.children: 
                    if possible_subject.dep == xcomp:   #subj / obj of composed verb should also be subj / obj of main verb
                        main_verb = possible_subject
                        main_idx = possible_subject.idx
                        for token in doc.ents:
                            if token.label_ in ["GPE", "NORP", "EVENTS", "FAC", "LAW", "ORG", "PERSON"]:
                                if token.root.dep_ == "poss":
                                    if token.root.head.head.idx == possible_verb.idx:
                                        verbs.append([main_idx, main_verb.lemma_, token.text, token.root.head.dep_])
                                        if main_idx in dict.keys(): dict[main_idx] += 1
                                        else: dict[main_idx] = 1
                                else:
                                    if token.root.head.idx == possible_verb.idx:
                                        verbs.append([main_idx, main_verb.lemma_, token.text, token.root.dep_])
                                        if possible_verb.idx in dict.keys(): dict[possible_verb.idx] += 1
                                        else: dict[possible_verb.idx] = 1

                for token in doc.ents:
                    if token.label_ in ["GPE", "NORP", "EVENTS", "FAC", "LAW", "ORG", "PERSON"]:
                        if token.root.dep_ == "poss":
                            if token.root.head.head.idx == possible_verb.idx:
                                verbs.append([possible_verb.idx, possible_verb.lemma_, token.text, token.root.head.dep_])
                                if possible_verb.idx in dict.keys(): dict[possible_verb.idx] += 1
                                else: dict[possible_verb.idx] = 1
                        else:
                            if token.root.head.idx == possible_verb.idx:
                                verbs.append([possible_verb.idx, possible_verb.lemma_, token.text, token.root.dep_])
                                if possible_verb.idx in dict.keys(): dict[possible_verb.idx] += 1
                                else: dict[possible_verb.idx] = 1

    trip_idx = [key for key in dict if dict[key] > 1]

    # doc = nlp(sentence)
    # verbs = []
    # dict = {}

    # for possible_verb in doc:           #parses through all words in sentence
    #     if possible_verb.pos == VERB:   #we only care about verbs
    #         if neg in [child.dep for child in possible_verb.children]: continue #we exclude all negated verbs
    #         else: 
    #             for candidate in possible_verb.children: #for composed verbs of verb (e.g. "want to join" -> "want join")
    #                 if candidate.dep == xcomp:   #subj / obj of composed verb should also be subj / obj of main verb
    #                     main_verb = candidate    
    #                     main_idx = candidate.idx
    #                     for chunk in doc.noun_chunks:   #chunks are noun-groups (e.g.: "78 out of 100 people" instead of "people")
    #                         if chunk.root.head.idx == possible_verb.idx:    #if chunk applies to xcomp (want),
    #                                                                         #treat it like it aplles to main verb ("join")
    #                             verbs.append([main_idx, main_verb.lemma_, chunk.text, chunk.root.dep_])
    #                             if main_idx in dict.keys(): dict[main_idx] += 1 #count how often verb is used
    #                             else: dict[main_idx] = 1

    #             for chunk in doc.noun_chunks:       #for normal verbs, check chunks directly
    #                 if chunk.root.head.idx == possible_verb.idx:
    #                     verbs.append([possible_verb.idx, possible_verb.lemma_, chunk.text, chunk.root.dep_])
    #                     if possible_verb.idx in dict.keys(): dict[possible_verb.idx] += 1
    #                     else: dict[possible_verb.idx] = 1
    
    # trip_idx = [key for key in dict if dict[key] > 1]   #if verbs used more than once, its candidate for triplet

    #priority for subj-relation-obj triplets
    mapper = {"nsubj":1,"dobj":2, "pobj":2, "iobj":2}

    #create df from verbs extracted 
    df = pd.DataFrame(verbs, columns = ["idx", "verb", "noun", "noun_type"])
    df["noun_map"] = df.noun_type.map(mapper)  #turn noun_types into priority 
    return df

    # #create groups that resolve around same word
    # gb = df.groupby('idx')    
    # #only keep groups if verb idx was identified as potential triplet before, sort by priority for structure
    # df_l = [gb.get_group(x).sort_values("noun_map") for x in gb.groups if gb.get_group(x).idx.iloc[0] in dict]
    # matches = [merge_trip(group) for group in df_l if not merge_trip(group) == None] #get groups into triplet structure
    
    # #turn matches into triples by only keeping those with coded verbs, return code instead of verb
    # triples = []
    # for match in matches:
    #     if match[1].lower() in spec_dict:
    #         for poss_pattern in spec_dict[match[1].lower()]:
    #             if set(poss_pattern.split()).intersection(sentence.split()) == set(poss_pattern.split()):
    #                 triples.append(f"<triplet> {match[0]} <subj> {match[2]} <obj> {spec_code[poss_pattern]}")
                    
    #     elif match[1].lower() in verb_dict:
    #         triples.append(f"<triplet> {match[0]} <subj> {match[2]} <obj> {verb_dict[match[1].lower()]}")
    #     else: print(f"couldn't match {match[1].lower()}")

    # #triples = [f"<triplet> {match[0]} <subj> {match[2]} <obj> {verb_dict[match[1].lower()]}" for match in matches if match[1].lower() in verb_dict]

    # return triples

def merge_trip(df):
    """helper function to turn two rows of a pandas groupby into subj, verb, obj"""
    if df.shape[0] == 2:
        if df.noun_type.iloc[0] != df.noun_type.iloc[1]:
            return [df.iloc[0].noun, df.iloc[0].verb, df.iloc[1].noun]
    elif df.shape[0] > 2:
        for i in range(df.shape[0] - 1):
            if df.noun_type.iloc[i] != df.noun_type.iloc[i+1]:
                return [df.iloc[0].noun, df.iloc[0].verb, df.iloc[1].noun]

    

In [96]:
pico_path = r"C:/Users/svawe/Thesis_RelationExtraction_PoliticsNews/soft_data/src/add_labels/dictionaries/PETR.Internal.Coding.Ontology.txt"
verb_path = r"C:/Users/svawe/Thesis_RelationExtraction_PoliticsNews/soft_data/src/add_labels/dictionaries/newdict.txt"

In [97]:
nlp = spacy.load('en_core_web_lg')
read = read_lines("data/out_data/articles_url_coref2.csv.xml.out")

In [98]:
verb_dict, spec_dict, spec_code = verb_code_dict(pico_path, verb_path)

couldn't finde code in:  --- DEFEND  ###
couldn't finde code in:  --- REVOKE_   ###
couldn't finde code in:  --- SEND   ###
couldn't finde code in:  --- COLLAPSE  ###
22 patterns could not be loaded


In [99]:
dfs_2 = [[sentence, get_triples(sentence, verb_dict, spec_dict, spec_code, nlp)] for sentence in read]

KeyboardInterrupt: 

In [None]:
len(dfs_2)

21395

In [None]:
dfs_2[:5]

[['AdvertisingRead moreThis live page is no longer being updated.',
  Empty DataFrame
  Columns: [idx, verb, noun, noun_type, noun_map]
  Index: []],
 ['For more on\xa0our coverage of the war in Ukraine, click here.11:44pm:\xa0UN grain coordinator\xa0expects loaded ships to depart Ukraine on ThursdayThe UN coordinator for the Ukraine Black Sea grain deal said UN grain coordinator expects loaded ships to depart Ukrainian ports on ThursdayThe.',
     idx    verb     noun noun_type  noun_map
  0  113  depart  Ukraine      dobj         2],
 ['“Exports of grain and foodstuffs from Ukraine need to continue.',
  Empty DataFrame
  Columns: [idx, verb, noun, noun_type, noun_map]
  Index: []],
 ['Although no movements of vessels are planned for 2 November under the #BlackSeaGrainInitiative, we expect loaded ships to sail on ThursdayThe,” UN coordinator Amir Abdulla posted on Twitter.',
     idx  verb          noun noun_type  noun_map
  0  172  post  Amir Abdulla     nsubj         1],
 ['Exports 

In [None]:
dfs_2[13]

['The UN Secretariat at coordination centreThere reports that the Ukrainian, Turkish and UN delegations agreed not to plan any movement of vessels in the Black Sea Grain Initiative for 2 November," "The UN Secretariat at the Joint Coordination Centre said Tuesday, referring to the July deal brokered by Turkey and the UN.6:15pm:\xa0Russian President Vladimir Putin tells Erdogan Russian President Vladimir Putin wants \'real guarantees\' from Kyiv on grain deal, says Russian President Vladimir Putin told Erdogan Tuesday that Russian President Vladimir Putin wanted "real guarantees" from Kyiv before Kyiv potentially rejoined grain deal.',
    idx    verb                noun noun_type  noun_map
 0   47  report  The UN Secretariat     nsubj         1
 1  361    tell      Vladimir Putin     nsubj         1
 2  408    want      Vladimir Putin     nsubj         1
 3  495    tell      Vladimir Putin     nsubj         1
 4  495    tell             Erdogan      dobj         2
 5  554    want      

In [None]:
candidats_2 = [df for df in dfs_2 if df[1].shape[0] >= 2]

In [None]:
len(candidats_2)

6454

In [None]:
dfs_2[13][1]["idx"]

0     47
1    361
2    408
3    495
4    495
5    554
6    613
Name: idx, dtype: int64

In [None]:
trips_2 = []
for can in candidats_2:
    for idx in can[1]["idx"]:
        if can[1]["idx"].to_list().count(idx) > 1:
            trips_2.append([can[0], can[1]])
            break

In [None]:
len(trips_2)


2395

In [None]:
matches_2 = []
for idx, df in enumerate(trips_2):
    #create groups that resolve around same word
    gb = df[1].groupby('idx')  
    #only keep groups if verb idx was identified as potential triplet before, sort by priority for structure
    for x in gb.groups:
        group = gb.get_group(x).sort_values("noun_map")
        if group.shape[0] == 2:
            if group.noun_type.iloc[0] != group.noun_type.iloc[1]:
                matches_2.append([df[0], group.iloc[0].noun, group.iloc[0].verb, group.iloc[1].noun])
        elif group.shape[0] > 2:
            for i in range(group.shape[0] - 1):
                if group.noun_type.iloc[i] != group.noun_type.iloc[i+1]:
                    matches_2.append([df[0], group.iloc[0].noun, group.iloc[0].verb, group.iloc[1].noun])

In [None]:
triples_2 = []
ma_df = pd.DataFrame(matches_2, columns = ["text","subj", "verb","obj"])
for row in ma_df.iterrows():
    if row[1]["verb"] in spec_dict:
        for poss_pattern in spec_dict[row[1]["verb"]]:
            if set(poss_pattern.split()).intersection(row[1]["text"].split()) == set(poss_pattern.split()):
                triples_2.append([row[1]["text"], row[1]['subj'], row[1]['obj'] , spec_code[poss_pattern]])
    elif row[1]["verb"] in verb_dict:
            triples_2.append([row[1]["text"], row[1]['subj'], row[1]['obj'], verb_dict[row[1]['verb']]])

couldn't match evaluate
couldn't match co
couldn't match roll
couldn't match love
couldn't match feel
couldn't match have
couldn't match convince
couldn't match teach
couldn't match soak
couldn't match think
couldn't match feel
couldn't match bind
couldn't match recruit
couldn't match classify
couldn't match leak
couldn't match begrudge
couldn't match lose
couldn't match become
couldn't match have
couldn't match have
couldn't match renew
couldn't match shift
couldn't match stump
couldn't match modify
couldn't match exert
couldn't match position
couldn't match know
couldn't match suspend
couldn't match pack
couldn't match know
couldn't match have
couldn't match exhaust
couldn't match turn
couldn't match have
couldn't match type
couldn't match post
couldn't match upload
couldn't match pack
couldn't match turn
couldn't match spare
couldn't match to
couldn't match position
couldn't match do
couldn't match evaluate
couldn't match download
couldn't match repopulate
couldn't match discredit
c

In [None]:
len(triples_2)

226

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
nli_model = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
probs_l = []
for row in huh.iterrows():
    premise = row[1]["text"]
    subj = row[1]["subj"]
    rel = row[1]["label"]
    obj =  row[1]["obj"]

    hypothesis = f'{subj} does {rel} towards {obj}.'

    # run through model pre-trained on MNLI
    x = tokenizer.encode(premise, hypothesis, return_tensors='pt',
                        truncation_strategy='only_first')
    logits = nli_model(x)[0]

    # we throw away "neutral" (dim 1) and take the probability of
    # "entailment" (2) as the probability of the label being true 
    entail_contradiction_logits = logits[:,[0,2]]
    probs = entail_contradiction_logits.softmax(dim=1)
    prob_label_is_true = probs[:,1]

    probs_l.append([row[0], prob_label_is_true.item()])
    if row[0] % 100 == 0: print(row[0])

# Retrial with chunks instead of tokens

In [5]:
import spacy
import xml.etree.ElementTree as ET
from spacy.symbols import nsubj, dobj, pobj, iobj, neg, xcomp, VERB
from gensim.parsing.preprocessing import strip_multiple_whitespaces
import pandas as pd
import re
import os
import sys

def read_lines(inputparsed):    
    """takes input from CoreNLP sentence parsed file and returns sentences"""
    #parse all lines from CoreNLP sentence split
    parsed = open(inputparsed, encoding = "utf-8")
    parsedfile = parsed.readlines()
    parsedlines = []

    #Only keep those lines which have Sentence #n in the line before
    for idx, text in enumerate(parsedfile):
        if text.startswith("Sentence #"):
            parsedlines.append(parsedfile[idx+1].replace('\n','').strip())
    
    return parsedlines

def gen_poss(line, verb_match, pre_dict):
    """generates all possibilities of patterns that a multi word line implies,
    by extracting partial patterns and resolving placeholder words"""
    poss = []

    #replace special tokens in text that are clear at this point
    line = line.replace("*", verb_match)
    line = line.replace("- ", "")
    line = line.replace("+","")
    line = line.replace("%","")
    line = line.replace("^","")
    line = line.replace("$","")

    #split line by possibility indicators and code (always ends possibility)
    #example.: "- $ * (P ON KILLING (P OF + [010] #  COMMENT <ELH 07 May 2008>"
    poss_split = re.split("\(P |\[.*]",line) 

    if len(poss_split) > 2: #2 is if no (P in the line
        #only combining the first (P, as they share the same code 
        #and the longer version will never be contained in a text if the shorter isnt
        poss.append(strip_multiple_whitespaces(" ".join(poss_split[:2])).lower().rstrip().lstrip())
    else: 
        poss.append(strip_multiple_whitespaces(poss_split[0].lower().rstrip().lstrip()))

    cleaned = []
    for text in list(set(poss)):
        c = 0
        for tag in list(pre_dict.keys()):
            if tag in text:
                for replacement in pre_dict[tag]:
                    cleaned.append(text.replace(tag, replacement))
                    c += 1
        if c == 0:
            cleaned.append(text)

    return cleaned
    

def verb_code_dict(pico_path, verb_path):
    """reads coding ontology and verb lists, 
    directly matches verbs to their CAMEO codes and returns this verbs:codes dictionairy.
    verb with codes that cannot be read are printed out as full line of the file"""
    #read PETRARCH Internal Coding Ontology (= pico)
    pico_path = os.path.join(os.getcwd(), pico_path)
    pico_file = open(pico_path, 'r')
    pico_lines = pico_file.readlines()

    #get all 20 codes with their respective code
    main_codes = {}                             #we run one iteration for all the main codes, only main codes contain relation name
    for line in pico_lines:
        line = line.split('#')
        if line[0] == "" or line[0] == "\n":    #only intro comments and empty lines
            continue
        else: 
            code_split = line[0].split(":")     #splits into CAMEO code and related hex
            if len(line) > 1 and code_split[0][2] == "0":      #only main categories have 0 in 3rd idx, [cat_num 0] -> [010]
                main_codes[code_split[0][:2]] = line[-1].replace("\n","")
    
    #map code to code we want to use in the training
    map_codes = {"DiplomaticCoop" : "Engage In Diplomatic Cooperation", 
                "MaterialCoop" : "Engage In Material Cooperation",
                "ProvideAid" : "Provide Aid",
                "Exhibit Force Posture": "Exhibit Military Posture",
                "Use Unconventional Mass Violence" : "Engage In Unconventional Mass Violence"}
    main_codes = {k: (map_codes[v] if v in map_codes else v) for k, v in main_codes.items()}
    
    #read single word patterns and match their code to the relation extracted in main_codes
    verb_path = os.path.join(os.getcwd(), verb_path)
    verb_file = open(verb_path, 'r')
    verb_lines = verb_file.readlines()
    
    verb_dict = {}
    for line in verb_lines:
        if line[0] == "#":
            continue
        elif line.startswith("---"):    #main verbs have a lead code, which is applied to all very in the section
                                        #unless a separate code is specified for a specific verb in section
            try: cur_main_code = re.split("\[|\]|---", line)[2].replace(":","")[:2]  #we only need main codes which are first two numbers
                                                                                #sometimes code starts with ":", e.g.: ---  OFFEND   [:110]  ---
                                                                                #we just remove those to get the main code
            except:                     #depending on chosen verb dictionairy, there may be main verbs without lead codes
                print("couldn't finde code in: ", line.replace("\n","")) 
                cur_main_code == "--"
            if cur_main_code == "": cur_main_code = "--"
        elif line == "\n":              #skip empty lines
            continue
        elif line[0] == "-" or line[0] == "~" or line[0] == "+" or line[0] == "&": #removes all special structures we cannot use
            continue
        else:
            if len(re.split("\[|\]", line)) > 1:    #verbs with their own code, e.g.: AFFIRM [051] 
                code = re.split("\[|\]", line)[1].replace(":","")[:2]
                if code != "--":
                    if "{" in line:         #conjugated verbs, e.g. "APPLY {APPLYING APPLIED APPLIES } [020]"
                        line_s = re.split("\{|\}", line)    #split at { and }
                        verb_dict[line_s[0].lower()] = main_codes[code] 
                        for word in line_s[1].split():
                            verb_dict[word.lower()] = main_codes[code]
                    else:
                        word = re.split("\[|\]", line)[0]
                        verb_dict[word.lower()] = main_codes[code]
            else:
                if cur_main_code != "--":
                    if "{" in line:         #e.g. "HURRY {HURRIES HURRYING HURRIED }" 
                        line_s = re.split("\{|\}", line)    #split at { and }
                        verb_dict[line_s[0].lower()] = main_codes[cur_main_code]
                        for word in line_s[1].split():
                            verb_dict[word.lower()] = main_codes[cur_main_code]
                    else:                   #only single words with sometimes comments, e.g.: CENSURE  # JON 5/17/95
                        word = line.split("#")[0].rstrip()    #gets part before "#", removes all whitespaces to the right
                        verb_dict[word.lower()] = main_codes[cur_main_code]

    #read multi word patterns and create a dictionary for their code

    #get filler words that occur in multi word patterns
    verb_file = open(verb_path, 'r')
    verb_lines = verb_file.readlines()

    pre_dict = {}
    filter_list = []
    for line in verb_lines:
        if line.startswith("&"):
            cur_filter = line.rstrip()
        elif line.startswith("\n") and "cur_filter" in locals():
            pre_dict[cur_filter.lower()] = filter_list
            cur_filter = ""
            filter_list = []
        elif line.startswith("+") and cur_filter != "":
            filter_list.append(line.rstrip()[1:].replace("_", "").lower())
    del pre_dict[""]

    #generate dictionaries for multi word patterns
    verb_file = open(verb_path, 'r')
    verb_lines = verb_file.readlines()

    spec_dict = {}
    spec_code = {}

    count = 0
    for line in verb_lines:
        if line.startswith("- "):
            #get main verb as dict key
            try: 
                verb_match = re.search("# *\w+", line).group()
                verb_match = re.search("\w+", verb_match).group()
                verb_match = verb_match.replace("_", " ").lower()
            except: 
                count += 1

            #get code for line
            try:
                code = re.search("\[.*]", line).group()[1:3]
                if code != "--":
                    #get all possibility that the line indicates
                    poss = gen_poss(line, verb_match, pre_dict)
                    for pattern in poss:
                        spec_code[pattern] = main_codes[code]
                    spec_dict[verb_match] = poss
            except:
                count += 1

    print(f"{count} patterns could not be loaded")        

    return verb_dict, spec_dict, spec_code


def get_triples(sentence, verb_dict, spec_dict, spec_code, nlp):
    """create triplet structure for training from text input, 
    verb_dict needs to be loaded before,
    spacy model needs to be initialized before """
    doc = nlp(sentence)
    verbs = []
    dict = {}


    for possible_verb in doc:
        if possible_verb.pos == VERB:
            if neg in [child.dep for child in possible_verb.children]: continue
            else: 
                for possible_subject in possible_verb.children: 
                    if possible_subject.dep == xcomp:   #subj / obj of composed verb should also be subj / obj of main verb
                        main_verb = possible_subject
                        main_idx = possible_subject.idx
                        
                        for chunk in doc.noun_chunks:
                            if chunk.root.dep_ == "poss":
                                if chunk.root.head.head.idx == possible_verb.idx:
                                    verbs.append([main_idx, main_verb.lemma_, chunk.text, chunk.root.head.dep_])
                                    if main_idx in dict.keys(): dict[main_idx] += 1
                                    else: dict[main_idx] = 1
                            else:
                                if chunk.root.head.idx == possible_verb.idx:
                                    verbs.append([main_idx, main_verb.lemma_, chunk.text, chunk.root.dep_])
                                    if possible_verb.idx in dict.keys(): dict[possible_verb.idx] += 1
                                    else: dict[possible_verb.idx] = 1

                for chunk in doc.noun_chunks:       #for normal verbs, check chunks directly
        #                 if chunk.root.head.idx == possible_verb.idx:
                    if chunk.root.head.dep_ == "poss":
                        if chunk.root.head.head.idx == possible_verb.idx:
                            verbs.append([possible_verb.idx, possible_verb.lemma_, chunk.text, chunk.root.head.dep_])
                            if possible_verb.idx in dict.keys(): dict[possible_verb.idx] += 1
                            else: dict[possible_verb.idx] = 1
                    else:
                        if chunk.root.head.idx == possible_verb.idx:
                            verbs.append([possible_verb.idx, possible_verb.lemma_, chunk.text, chunk.root.dep_])
                            if possible_verb.idx in dict.keys(): dict[possible_verb.idx] += 1
                            else: dict[possible_verb.idx] = 1

    trip_idx = [key for key in dict if dict[key] > 1]

    # doc = nlp(sentence)
    # verbs = []
    # dict = {}

    # for possible_verb in doc:           #parses through all words in sentence
    #     if possible_verb.pos == VERB:   #we only care about verbs
    #         if neg in [child.dep for child in possible_verb.children]: continue #we exclude all negated verbs
    #         else: 
    #             for candidate in possible_verb.children: #for composed verbs of verb (e.g. "want to join" -> "want join")
    #                 if candidate.dep == xcomp:   #subj / obj of composed verb should also be subj / obj of main verb
    #                     main_verb = candidate    
    #                     main_idx = candidate.idx
    #                     for chunk in doc.noun_chunks:   #chunks are noun-groups (e.g.: "78 out of 100 people" instead of "people")
    #                         if chunk.root.head.idx == possible_verb.idx:    #if chunk applies to xcomp (want),
    #                                                                         #treat it like it aplles to main verb ("join")
    #                             verbs.append([main_idx, main_verb.lemma_, chunk.text, chunk.root.dep_])
    #                             if main_idx in dict.keys(): dict[main_idx] += 1 #count how often verb is used
    #                             else: dict[main_idx] = 1

    #             for chunk in doc.noun_chunks:       #for normal verbs, check chunks directly
    #                 if chunk.root.head.idx == possible_verb.idx:
    #                     verbs.append([possible_verb.idx, possible_verb.lemma_, chunk.text, chunk.root.dep_])
    #                     if possible_verb.idx in dict.keys(): dict[possible_verb.idx] += 1
    #                     else: dict[possible_verb.idx] = 1
    
    # trip_idx = [key for key in dict if dict[key] > 1]   #if verbs used more than once, its candidate for triplet

    #priority for subj-relation-obj triplets
    mapper = {"nsubj":1,"dobj":2, "pobj":2, "iobj":2}

    #create df from verbs extracted 
    df = pd.DataFrame(verbs, columns = ["idx", "verb", "noun", "noun_type"])
    df["noun_map"] = df.noun_type.map(mapper)  #turn noun_types into priority 
    return df

    # #create groups that resolve around same word
    # gb = df.groupby('idx')    
    # #only keep groups if verb idx was identified as potential triplet before, sort by priority for structure
    # df_l = [gb.get_group(x).sort_values("noun_map") for x in gb.groups if gb.get_group(x).idx.iloc[0] in dict]
    # matches = [merge_trip(group) for group in df_l if not merge_trip(group) == None] #get groups into triplet structure
    
    # #turn matches into triples by only keeping those with coded verbs, return code instead of verb
    # triples = []
    # for match in matches:
    #     if match[1].lower() in spec_dict:
    #         for poss_pattern in spec_dict[match[1].lower()]:
    #             if set(poss_pattern.split()).intersection(sentence.split()) == set(poss_pattern.split()):
    #                 triples.append(f"<triplet> {match[0]} <subj> {match[2]} <obj> {spec_code[poss_pattern]}")
                    
    #     elif match[1].lower() in verb_dict:
    #         triples.append(f"<triplet> {match[0]} <subj> {match[2]} <obj> {verb_dict[match[1].lower()]}")
    #     else: print(f"couldn't match {match[1].lower()}")

    # #triples = [f"<triplet> {match[0]} <subj> {match[2]} <obj> {verb_dict[match[1].lower()]}" for match in matches if match[1].lower() in verb_dict]

    # return triples

def merge_trip(df):
    """helper function to turn two rows of a pandas groupby into subj, verb, obj"""
    if df.shape[0] == 2:
        if df.noun_type.iloc[0] != df.noun_type.iloc[1]:
            return [df.iloc[0].noun, df.iloc[0].verb, df.iloc[1].noun]
    elif df.shape[0] > 2:
        for i in range(df.shape[0] - 1):
            if df.noun_type.iloc[i] != df.noun_type.iloc[i+1]:
                return [df.iloc[0].noun, df.iloc[0].verb, df.iloc[1].noun]

    

In [6]:
dfs = [[sentence, get_triples(sentence, verb_dict, spec_dict, spec_code, nlp)] for sentence in read]

In [7]:
len(dfs)

21395

In [8]:
candidats = [df for df in dfs if df[1].shape[0] >= 2]

In [9]:
len(candidats)

16590

In [10]:
trips = []
for can in candidats:
    for idx in can[1]["idx"]:
        if can[1]["idx"].to_list().count(idx) > 1:
            trips.append([can[0], can[1]])
            break

In [11]:
len(trips)


12594

In [12]:
matches = []
for idx, df in enumerate(trips):
    #create groups that resolve around same word
    gb = df[1].groupby('idx')    
    #only keep groups if verb idx was identified as potential triplet before, sort by priority for structure
    for x in gb.groups:
        if gb.get_group(x).shape[0] == 2:
            if gb.get_group(x).noun_type.iloc[0] != gb.get_group(x).noun_type.iloc[1]:
                matches.append([df[0], gb.get_group(x).iloc[0].noun, gb.get_group(x).iloc[0].verb, gb.get_group(x).iloc[1].noun])
        elif gb.get_group(x).shape[0] > 2:
            for i in range(gb.get_group(x).shape[0] - 1):
                if gb.get_group(x).noun_type.iloc[i] != gb.get_group(x).noun_type.iloc[i+1]:
                    matches.append([df[0], gb.get_group(x).iloc[0].noun, gb.get_group(x).iloc[0].verb, gb.get_group(x).iloc[1].noun])

In [13]:
len(matches)

20489

In [14]:
# triples = []
# ma_df = pd.DataFrame(matches, columns = ["text","subj", "verb","obj"])
# for row in ma_df.iterrows():
#     if row[1]["verb"] in spec_dict:
#         for poss_pattern in spec_dict[row[1]["verb"]]:
#             if set(poss_pattern.split()).intersection(row[1]["text"].split()) == set(poss_pattern.split()):
#                 triples.append([row[1]["text"], f"<triplet> {row[1]['subj']} <subj> {row[1]['obj']} <obj> {spec_code[poss_pattern]}"])
#     elif row[1]["verb"] in verb_dict:
#             triples.append([row[1]["text"], f"<triplet> {row[1]['subj']} <subj> {row[1]['obj']} <obj> {verb_dict[row[1]['verb']]}"])

triples = []
ma_df = pd.DataFrame(matches, columns = ["text","subj", "verb","obj"])
for row in ma_df.iterrows():
    if row[1]["verb"] in spec_dict:
        for poss_pattern in spec_dict[row[1]["verb"]]:
            if set(poss_pattern.split()).intersection(row[1]["text"].split()) == set(poss_pattern.split()):
                triples.append([row[1]["text"], row[1]['subj'], row[1]['obj'] , spec_code[poss_pattern]])
    elif row[1]["verb"] in verb_dict:
            triples.append([row[1]["text"], row[1]['subj'], row[1]['obj'], verb_dict[row[1]['verb']]])

In [53]:
key_l = [key for key in spec_code.keys() if "deport" in key]

In [56]:
key_l

['decide deport',
 'stop deport',
 'deport terrorist to',
 'living deport',
 'deport militant to',
 'rounded up and deport',
 'deport &preposit1',
 'deport linked',
 'deport alert',
 'deport despite',
 'deport from',
 'deport by',
 'deport to',
 'plan to deport',
 'arrive home after deport',
 'resume deportation']

In [54]:
for key in key_l:
    print(spec_code[key])

Coerce
Yield
Coerce
Coerce
Coerce
Coerce
Coerce
Coerce
Coerce
Coerce
Coerce
Coerce
Engage In Material Cooperation
Threaten
Coerce
Coerce


In [52]:
spec_code['arrive home after deport']

'Coerce'

In [81]:
huh = pd.DataFrame(triples, columns = ["text", "subj", "obj", "label"])

In [82]:
huh.text.nunique()

1249

In [83]:
huh.head()

Unnamed: 0,text,subj,obj,label
0,For more on our coverage of the war in Ukraine...,loaded ships,Ukraine,Consult
1,For more on our coverage of the war in Ukraine...,loaded ships,Ukrainian ports,Consult
2,"""Russia must clearly understand that Russia wi...",Russia,a harsh global response,Consult
3,"""Russia must clearly understand that Russia wi...","70,000 people",the Kherson region,Yield
4,Russian President Vladimir Putin told Erdogan ...,President Emmanuel Macron,Ukraine's defence needs,Reject


In [84]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
nli_model = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

In [86]:
probs_l = []
for row in huh.iterrows():
    premise = row[1]["text"]
    subj = row[1]["subj"]
    rel = row[1]["label"]
    obj =  row[1]["obj"]

    hypothesis = f'{subj} does {rel} towards {obj}.'

    # run through model pre-trained on MNLI
    x = tokenizer.encode(premise, hypothesis, return_tensors='pt',
                        truncation_strategy='only_first')
    logits = nli_model(x)[0]

    # we throw away "neutral" (dim 1) and take the probability of
    # "entailment" (2) as the probability of the label being true 
    entail_contradiction_logits = logits[:,[0,2]]
    probs = entail_contradiction_logits.softmax(dim=1)
    prob_label_is_true = probs[:,1]

    probs_l.append([row[0], prob_label_is_true.item()])
    if row[0] % 100 == 0: print(row[0])



0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600


In [87]:
huh = pd.merge(huh.reset_index(), pd.DataFrame(probs_l, columns = ["index", "prob"]), on = "index")

In [88]:
dec = huh[huh.prob < 0.7]

In [89]:
keep = huh[huh.prob > 0.7]

In [90]:
keep.groupby("label").index.count()

label
Appeal                               31
Assault                              10
Coerce                               36
Consult                             128
Disapprove                           28
Engage In Diplomatic Cooperation     29
Engage In Material Cooperation       59
Fight                                 6
Intend                               25
Investigate                          20
Make Public Statement                98
Protest                               2
Provide Aid                          24
Reduce Relations                     10
Reject                                2
Threaten                             13
Yield                                73
Name: index, dtype: int64

In [91]:
dec.groupby("label").index.count()

label
Appeal                                     20
Assault                                    62
Coerce                                     66
Consult                                    92
Demand                                      1
Disapprove                                 46
Engage In Diplomatic Cooperation          118
Engage In Material Cooperation            113
Engage In Unconventional Mass Violence      1
Exhibit Military Posture                    6
Fight                                      38
Intend                                     46
Investigate                                 4
Make Public Statement                     118
Protest                                     9
Provide Aid                                65
Reduce Relations                           53
Reject                                     47
Threaten                                    6
Yield                                     128
Name: index, dtype: int64

In [94]:
keep

Unnamed: 0,index,text,subj,obj,label,prob
0,0,For more on our coverage of the war in Ukraine...,loaded ships,Ukraine,Consult,0.937298
3,3,"""Russia must clearly understand that Russia wi...","70,000 people",the Kherson region,Yield,0.810604
5,5,12:04pm Kremlin accuses UK of ‘directing and c...,coordinating,Nord Stream blastsThe Kremlin,Engage In Material Cooperation,0.898151
7,7,"Swedish, Finnish NATO bidsFinland's Prime Mini...",Finland,the NATO defence alliance,Consult,0.716120
8,8,"Swedish, Finnish NATO bidsFinland's Prime Mini...",Finland,the NATO defence alliance,Consult,0.716120
...,...,...,...,...,...,...
1621,1621,"As a result, Lesotho's has been run by coaliti...",no prime minister,a full five-year term,Engage In Material Cooperation,0.912791
1623,1623,The UK delegation will also include Foreign Se...,The UK delegation,Foreign Secretary James Cleverly,Yield,0.746475
1626,1626,""" a spokesperson for the State Department also...",a spokesperson,remarks,Make Public Statement,0.964414
1628,1628,MUMBAI: Mass layoffs at Twitter and also other...,Mass layoffs,also other companies,Consult,0.930462


In [65]:
dec.iloc[10].text

'the nine people arrested include two managers, two ticket clerks, two contractors and three security guards.'

In [66]:
dec.iloc[10]

index                                                   16
text     the nine people arrested include two managers,...
subj                                       the nine people
obj                                           two managers
label                                                Yield
prob                                              0.064997
Name: 16, dtype: object

In [58]:
premise = dec.iloc[6]["text"]
subj = dec.iloc[6]["subj"]
rel = "Coerce"
obj =  dec.iloc[6]["obj"]

hypothesis = f'{subj} does {rel} towards {obj} .'

# run through model pre-trained on MNLI
x = tokenizer.encode(premise, hypothesis, return_tensors='pt',
                    truncation_strategy='only_first')
logits = nli_model(x)[0]

entail_contradiction_logits = logits[:,[0,2]]
probs = entail_contradiction_logits.softmax(dim=1)
prob_label_is_true = probs[:,1]

prob_label_is_true.item()

0.8217246532440186