In [51]:
import spacy
import xml.etree.ElementTree as ET
from spacy.symbols import nsubj, dobj, pobj, iobj, neg, xcomp, VERB
import pandas as pd
import re
import os

#nlp=spacy.load('en_core_web_lg')

def merge_trip(df):
    if df.shape[0] > 1:
        return [df.iloc[0].noun, df.iloc[0].verb, df.iloc[1].noun]

## make sure to allign relation name between this data and own data

In [52]:
def verb_code_dict(pico_path, verb_path):
    """reads coding ontology and verb lists, 
    directly matches verbs to their CAMEO codes and returns this verbs:codes dictionairy.
    verb with codes that cannot be read are printed out as full line of the file"""
    #read PETRARCH Internal Coding Ontology (= pico)
    pico_path = os.path.join(os.getcwd(), pico_path)
    pico_file = open(pico_path, 'r')
    pico_lines = pico_file.readlines()

    #get all 20 codes with their respective code
    main_codes = {}                             #we run one iteration for all the main codes, only main codes contain relation name
    for line in pico_lines:
        line = line.split('#')
        if line[0] == "" or line[0] == "\n":    #only intro comments and empty lines
            continue
        else: 
            code_split = line[0].split(":")     #splits into CAMEO code and related hex
            if len(line) > 1 and code_split[0][2] == "0":      #only main categories have 0 in 3rd idx, [cat_num 0] -> [010]
                main_codes[code_split[0][:2]] = line[-1].replace("\n","")
    
    #map code to code we want to use in the training
    map_codes = {"DiplomaticCoop" : "Engage In Diplomatic Cooperation", 
                "MaterialCoop" : "Engage In Material Cooperation",
                "ProvideAid" : "Provide Aid",
                "Exhibit Force Posture": "Exhibit Military Posture",
                "Use Unconventional Mass Violence" : "Engage In Unconventional Mass Violence"}
    main_codes = {k: (map_codes[v] if v in map_codes else v) for k, v in main_codes.items()}
    
    #read verbs and match their code to the relation extracted in main_codes
    verb_path = os.path.join(os.getcwd(), verb_path)
    verb_file = open(verb_path, 'r')
    verb_lines = verb_file.readlines()
    
    verb_dict = {}
    for line in verb_lines:
        if line[0] == "#":
            continue
        elif line.startswith("---"):    #main verbs have a lead code, which is applied to all very in the section
                                        #unless a separate code is specified for a specific verb in section
            try: cur_main_code = re.split("\[|\]|---", line)[2].replace(":","")[:2]  #we only need main codes which are first two numbers
                                                                                #sometimes code starts with ":", e.g.: ---  OFFEND   [:110]  ---
                                                                                #we just remove those to get the main code
            except:                     #depending on chosen verb dictionairy, there may be main verbs without lead codes
                print("couldn't finde code in: ", line.replace("\n","")) 
                cur_main_code == "--"
            if cur_main_code == "": cur_main_code = "--"
        elif line == "\n":              #skip empty lines
            continue
        elif line[0] == "-" or line[0] == "~" or line[0] == "+" or line[0] == "&": #removes all special structures we cannot use
            continue
        else:
            if len(re.split("\[|\]", line)) > 1:    #verbs with their own code, e.g.: AFFIRM [051] 
                code = re.split("\[|\]", line)[1].replace(":","")[:2]
                if code != "--":
                    if "{" in line:         #conjugated verbs, e.g. "APPLY {APPLYING APPLIED APPLIES } [020]"
                        line_s = re.split("\{|\}", line)    #split at { and }
                        verb_dict[line_s[0]] = main_codes[code] 
                        for word in line_s[1].split():
                            verb_dict[word.lower()] = main_codes[code]
                    else:
                        word = re.split("\[|\]", line)[0]
                        verb_dict[word.lower()] = main_codes[code]
            else:
                if cur_main_code != "--":
                    if "{" in line:         #e.g. "HURRY {HURRIES HURRYING HURRIED }" 
                        line_s = re.split("\{|\}", line)    #split at { and }
                        verb_dict[line_s[0]] = main_codes[cur_main_code]
                        for word in line_s[1].split():
                            verb_dict[word.lower()] = main_codes[cur_main_code]
                    else:                   #only single words with sometimes comments, e.g.: CENSURE  # JON 5/17/95
                        word = line.split("#")[0].rstrip()    #gets part before "#", removes all whitespaces to the right
                        verb_dict[word.lower()] = main_codes[cur_main_code]

    return verb_dict

In [3]:
#version1

# doc = nlp(text2)
# verbs = []
# dict = {}
# for possible_verb in doc:
#     if possible_verb.pos == VERB:
#         if neg in [child.dep for child in possible_verb.children]: continue
#         else: 
#             for chunk in doc.noun_chunks:
#                 if chunk.root.head.idx == possible_verb.idx:
#                     verbs.append([possible_verb.idx, possible_verb, chunk.text, chunk.root.dep_])
#                     if possible_verb.idx in dict.keys(): dict[possible_verb.idx] += 1
#                     else: dict[possible_verb.idx] = 1
        

# trip_idx = [key for key in dict if dict[key] > 1]
# verbs, trip_idx

In [17]:
#version2 - for text2: technically want join is an xcomp so the child entity of want should be treated as child entity of join
def get_triples(sentence, verb_dict):
    """create triplet structure for training from text input, 
    verb_dict needs to be loaded before,
    spacy model needs to be initialized before """
    doc = nlp(sentence)
    verbs = []
    dict = {}

    for possible_verb in doc:
        if possible_verb.pos == VERB:
            if neg in [child.dep for child in possible_verb.children]: continue
            else: 
                for possible_subject in possible_verb.children: 
                    if possible_subject.dep == xcomp:   #subj / obj of composed verb should also be subj / obj of main verb
                        main_verb = possible_subject
                        main_idx = possible_subject.idx
                        for chunk in doc.noun_chunks:
                            if chunk.root.head.idx == possible_verb.idx:
                                verbs.append([main_idx, main_verb.lemma_, chunk.text, chunk.root.dep_])
                                if main_idx in dict.keys(): dict[main_idx] += 1
                                else: dict[main_idx] = 1

                for chunk in doc.noun_chunks:
                    if chunk.root.head.idx == possible_verb.idx:
                        verbs.append([possible_verb.idx, possible_verb.lemma_, chunk.text, chunk.root.dep_])
                        if possible_verb.idx in dict.keys(): dict[possible_verb.idx] += 1
                        else: dict[possible_verb.idx] = 1
    
    trip_idx = [key for key in dict if dict[key] > 1]

    #priority for subj-relation-obj triplets
    mapper = {"nsubj":1,"dobj":2, "pobj":2, "iobj":2}

    #create df from verbs extracted 
    df = pd.DataFrame(verbs, columns = ["idx", "verb", "noun", "noun_type"])
    df["noun_type"] = df.noun_type.map(mapper)  #turn noun_types into priority 

    #create groups that resolve around same word
    gb = df.groupby('idx')    
    #only keep groups if verb idx was identified as potential triplet before, sort by priority for structure
    df_l = [gb.get_group(x).sort_values("noun_type") for x in gb.groups if gb.get_group(x).idx.iloc[0] in dict]
    matches = [merge_trip(group) for group in df_l if not merge_trip(group) == None] #get groups into triplet structure
    
    #turn matches into triples by only keeping those with coded verbs, return code instead of verb
    triples = [[f"<triplet>{match[0]}<subj>{match[2]}<obj>{verb_dict[match[1]]}"] for match in matches if match[1].lower() in verb_dict]

    return triples

In [6]:
text = "According to a poll by Kyodo news agency released Saturday, 78 of 100 people surveyed opposed the military action in Iraq."
text2 = "I want to join together the feelings of each of us as individuals who oppose the war."
text3 = "I am getting hold of you"
text4 = "Russia ends ties with EU, US, Australia"

In [7]:
verb_dict = verb_code_dict("dictionaries/PETR.Internal.Coding.Ontology.txt", "dictionaries/newdict.txt")

#CAMEO.2.0.txt = 1451 words with relations
#CAMEO.2.0_unsorted.txt = 1452 words with relations
#CAMEO.verbpatterns.150430.txt = 1514 words with relations
#newdict.txt = 1522 words with relations
len(verb_dict)

couldn't finde code in:  --- DEFEND  ###
couldn't finde code in:  --- REVOKE_   ###
couldn't finde code in:  --- SEND   ###
couldn't finde code in:  --- COLLAPSE  ###


1522

In [19]:
for sent in [text, text2, text3, text4]:
    print(get_triples(sent, verb_dict = verb_dict))

[['<triplet>78 of 100 people<subj>the military action<obj>Disapprove']]
[['<triplet>I<subj>the feelings<obj>Consult'], ['<triplet>who<subj>the war<obj>Disapprove']]
[]
[]


In [182]:
from spacy import displacy
displacy.render(nlp(text4),jupyter=True)
for chunk in nlp(text4).noun_chunks:
    print(chunk.text)

Russia
ties
EU
US


In [3]:
inputparsed = r"C:\Users\svawe\Thesis_RelationExtraction_PoliticsNews\soft_data\data\in_data\bbc.csv.out"

#parse all lines from CoreNLP sentence split
parsed = open(inputparsed, encoding = "utf-8")
parsedfile = parsed.readlines()
parsedlines = []

#Only keep those lines which have Sentence #n in the line before
for idx, text in enumerate(parsedfile):
    if text.startswith("Sentence #"):
        parsedlines.append(parsedfile[idx+1].replace('\n','').strip())

In [46]:
import spacy
import xml.etree.ElementTree as ET
from spacy.symbols import nsubj, dobj, pobj, iobj, neg, xcomp, VERB
import pandas as pd
import re
import os
import sys

def read_lines(inputparsed):    
    """takes input from CoreNLP sentence parsed file and returns sentences"""
    #parse all lines from CoreNLP sentence split
    parsed = open(inputparsed, encoding = "utf-8")
    parsedfile = parsed.readlines()
    parsedlines = []

    #Only keep those lines which have Sentence #n in the line before
    for idx, text in enumerate(parsedfile):
        if text.startswith("Sentence #"):
            parsedlines.append(parsedfile[idx+1].replace('\n','').strip())
    
    return parsedlines

def verb_code_dict(pico_path, verb_path):
    """reads coding ontology and verb lists, 
    directly matches verbs to their CAMEO codes and returns this verbs:codes dictionairy.
    verb with codes that cannot be read are printed out as full line of the file"""
    #read PETRARCH Internal Coding Ontology (= pico)
    pico_path = os.path.join(os.getcwd(), pico_path)
    pico_file = open(pico_path, 'r')
    pico_lines = pico_file.readlines()

    #get all 20 codes with their respective code
    main_codes = {}                             #we run one iteration for all the main codes, only main codes contain relation name
    for line in pico_lines:
        line = line.split('#')
        if line[0] == "" or line[0] == "\n":    #only intro comments and empty lines
            continue
        else: 
            code_split = line[0].split(":")     #splits into CAMEO code and related hex
            if len(line) > 1 and code_split[0][2] == "0":      #only main categories have 0 in 3rd idx, [cat_num 0] -> [010]
                main_codes[code_split[0][:2]] = line[-1].replace("\n","")
    
    #map code to code we want to use in the training
    map_codes = {"DiplomaticCoop" : "Engage In Diplomatic Cooperation", 
                "MaterialCoop" : "Engage In Material Cooperation",
                "ProvideAid" : "Provide Aid",
                "Exhibit Force Posture": "Exhibit Military Posture",
                "Use Unconventional Mass Violence" : "Engage In Unconventional Mass Violence"}
    main_codes = {k: (map_codes[v] if v in map_codes else v) for k, v in main_codes.items()}
    
    #read verbs and match their code to the relation extracted in main_codes
    verb_path = os.path.join(os.getcwd(), verb_path)
    verb_file = open(verb_path, 'r')
    verb_lines = verb_file.readlines()
    
    verb_dict = {}
    for line in verb_lines:
        if line[0] == "#":
            continue
        elif line.startswith("---"):    #main verbs have a lead code, which is applied to all very in the section
                                        #unless a separate code is specified for a specific verb in section
            try: cur_main_code = re.split("\[|\]|---", line)[2].replace(":","")[:2]  #we only need main codes which are first two numbers
                                                                                #sometimes code starts with ":", e.g.: ---  OFFEND   [:110]  ---
                                                                                #we just remove those to get the main code
            except:                     #depending on chosen verb dictionairy, there may be main verbs without lead codes
                print("couldn't finde code in: ", line.replace("\n","")) 
                cur_main_code == "--"
            if cur_main_code == "": cur_main_code = "--"
        elif line == "\n":              #skip empty lines
            continue
        elif line[0] == "-" or line[0] == "~" or line[0] == "+" or line[0] == "&": #removes all special structures we cannot use
            continue
        else:
            if len(re.split("\[|\]", line)) > 1:    #verbs with their own code, e.g.: AFFIRM [051] 
                code = re.split("\[|\]", line)[1].replace(":","")[:2]
                if code != "--":
                    if "{" in line:         #conjugated verbs, e.g. "APPLY {APPLYING APPLIED APPLIES } [020]"
                        line_s = re.split("\{|\}", line)    #split at { and }
                        verb_dict[line_s[0]] = main_codes[code] 
                        for word in line_s[1].split():
                            verb_dict[word.lower()] = main_codes[code]
                    else:
                        word = re.split("\[|\]", line)[0]
                        verb_dict[word.lower()] = main_codes[code]
            else:
                if cur_main_code != "--":
                    if "{" in line:         #e.g. "HURRY {HURRIES HURRYING HURRIED }" 
                        line_s = re.split("\{|\}", line)    #split at { and }
                        verb_dict[line_s[0]] = main_codes[cur_main_code]
                        for word in line_s[1].split():
                            verb_dict[word.lower()] = main_codes[cur_main_code]
                    else:                   #only single words with sometimes comments, e.g.: CENSURE  # JON 5/17/95
                        word = line.split("#")[0].rstrip()    #gets part before "#", removes all whitespaces to the right
                        verb_dict[word.lower()] = main_codes[cur_main_code]

    return verb_dict


def get_triples(sentence, verb_dict, nlp):
    """create triplet structure for training from text input, 
    verb_dict needs to be loaded before,
    spacy model needs to be initialized before """
    doc = nlp(sentence)
    verbs = []
    dict = {}

    for possible_verb in doc:           #parses through all words in sentence
        if possible_verb.pos == VERB:   #we only care about verbs
            if neg in [child.dep for child in possible_verb.children]: continue #we exclude all negated verbs
            else: 
                for candidate in possible_verb.children: #for composed verbs of verb (e.g. "want to join" -> "want join")
                    if candidate.dep == xcomp:   #subj / obj of composed verb should also be subj / obj of main verb
                        main_verb = candidate    
                        main_idx = candidate.idx
                        for chunk in doc.noun_chunks:   #chunks are noun-groups (e.g.: "78 out of 100 people" instead of "people")
                            if chunk.root.head.idx == possible_verb.idx:    #if chunk applies to xcomp (want),
                                                                            #treat it like it aplles to main verb ("join")
                                verbs.append([main_idx, main_verb.lemma_, chunk.text, chunk.root.dep_])
                                if main_idx in dict.keys(): dict[main_idx] += 1 #count how often verb is used
                                else: dict[main_idx] = 1

                for chunk in doc.noun_chunks:       #for normal verbs, check chunks directly
                    if chunk.root.head.idx == possible_verb.idx:
                        verbs.append([possible_verb.idx, possible_verb.lemma_, chunk.text, chunk.root.dep_])
                        if possible_verb.idx in dict.keys(): dict[possible_verb.idx] += 1
                        else: dict[possible_verb.idx] = 1
    
    trip_idx = [key for key in dict if dict[key] > 1]   #if verbs used more than once, its candidate for triplet

    #priority for subj-relation-obj triplets
    mapper = {"nsubj":1,"dobj":2, "pobj":2, "iobj":2}

    #create df from verbs extracted 
    df = pd.DataFrame(verbs, columns = ["idx", "verb", "noun", "noun_type"])
    df["noun_type"] = df.noun_type.map(mapper)  #turn noun_types into priority 

    #create groups that resolve around same word
    gb = df.groupby('idx')    
    #only keep groups if verb idx was identified as potential triplet before, sort by priority for structure
    df_l = [gb.get_group(x).sort_values("noun_type") for x in gb.groups if gb.get_group(x).idx.iloc[0] in dict]
    matches = [merge_trip(group) for group in df_l if not merge_trip(group) == None] #get groups into triplet structure
    
    #turn matches into triples by only keeping those with coded verbs, return code instead of verb
    triples = [f"<triplet>{match[0]}<subj>{match[2]}<obj>{verb_dict[match[1]]}" for match in matches if match[1].lower() in verb_dict]

    return triples

def merge_trip(df):
    """helper function to turn two rows of a pandas groupby into subj, verb, obj"""
    if df.shape[0] > 1:
        return [df.iloc[0].noun, df.iloc[0].verb, df.iloc[1].noun]

In [55]:
nlp = spacy.load('en_core_web_lg')
read = read_lines(r"C:\Users\svawe\Thesis_RelationExtraction_PoliticsNews\soft_data\data\out_data\articles_url_coref.csv.xml.out")

verb_dict = verb_code_dict(r"C:\Users\svawe\Thesis_RelationExtraction_PoliticsNews\soft_data\src\add_labels\dictionaries\PETR.Internal.Coding.Ontology.txt",
                r"C:\Users\svawe\Thesis_RelationExtraction_PoliticsNews\soft_data\src\add_labels\dictionaries\newdict.txt")

df = pd.DataFrame([[line, " ".join(get_triples(line, verb_dict, nlp))] for line in read if get_triples(line, verb_dict, nlp) != []],
                    columns = ["text", "label"])

df.to_csv(r"C:\Users\svawe\Thesis_RelationExtraction_PoliticsNews\soft_data\data\out_data\articles_url_coref.csv._out.csv")

couldn't finde code in:  --- DEFEND  ###
couldn't finde code in:  --- REVOKE_   ###
couldn't finde code in:  --- SEND   ###
couldn't finde code in:  --- COLLAPSE  ###


KeyError: 'Beat'

In [58]:
verb_dict["beat"]

'Fight'

In [53]:
def read_lines(inputparsed):    
    """takes input from CoreNLP sentence parsed file and returns sentences"""
    #parse all lines from CoreNLP sentence split
    parsed = open(inputparsed, encoding = "utf-8")
    parsedfile = parsed.readlines()
    parsedlines = []

    #Only keep those lines which have Sentence #n in the line before
    for idx, text in enumerate(parsedfile):
        if text.startswith("Sentence #"):
            parsedlines.append(parsedfile[idx+1].replace('\n','').strip())
    
    return parsedlines

def verb_code_dict(pico_path, verb_path):
    """reads coding ontology and verb lists, 
    directly matches verbs to their CAMEO codes and returns this verbs:codes dictionairy.
    verb with codes that cannot be read are printed out as full line of the file"""
    #read PETRARCH Internal Coding Ontology (= pico)
    pico_path = os.path.join(os.getcwd(), pico_path)
    pico_file = open(pico_path, 'r')
    pico_lines = pico_file.readlines()

    #get all 20 codes with their respective code
    main_codes = {}                             #we run one iteration for all the main codes, only main codes contain relation name
    for line in pico_lines:
        line = line.split('#')
        if line[0] == "" or line[0] == "\n":    #only intro comments and empty lines
            continue
        else: 
            code_split = line[0].split(":")     #splits into CAMEO code and related hex
            if len(line) > 1 and code_split[0][2] == "0":      #only main categories have 0 in 3rd idx, [cat_num 0] -> [010]
                main_codes[code_split[0][:2]] = line[-1].replace("\n","")
    
    #map code to code we want to use in the training
    map_codes = {"DiplomaticCoop" : "Engage In Diplomatic Cooperation", 
                "MaterialCoop" : "Engage In Material Cooperation",
                "ProvideAid" : "Provide Aid",
                "Exhibit Force Posture": "Exhibit Military Posture",
                "Use Unconventional Mass Violence" : "Engage In Unconventional Mass Violence"}
    main_codes = {k: (map_codes[v] if v in map_codes else v) for k, v in main_codes.items()}
    
    #read verbs and match their code to the relation extracted in main_codes
    verb_path = os.path.join(os.getcwd(), verb_path)
    verb_file = open(verb_path, 'r')
    verb_lines = verb_file.readlines()
    
    verb_dict = {}
    for line in verb_lines:
        if line[0] == "#":
            continue
        elif line.startswith("---"):    #main verbs have a lead code, which is applied to all very in the section
                                        #unless a separate code is specified for a specific verb in section
            try: cur_main_code = re.split("\[|\]|---", line)[2].replace(":","")[:2]  #we only need main codes which are first two numbers
                                                                                #sometimes code starts with ":", e.g.: ---  OFFEND   [:110]  ---
                                                                                #we just remove those to get the main code
            except:                     #depending on chosen verb dictionairy, there may be main verbs without lead codes
                print("couldn't finde code in: ", line.replace("\n","")) 
                cur_main_code == "--"
            if cur_main_code == "": cur_main_code = "--"
        elif line == "\n":              #skip empty lines
            continue
        elif line[0] == "-" or line[0] == "~" or line[0] == "+" or line[0] == "&": #removes all special structures we cannot use
            continue
        else:
            if len(re.split("\[|\]", line)) > 1:    #verbs with their own code, e.g.: AFFIRM [051] 
                code = re.split("\[|\]", line)[1].replace(":","")[:2]
                if code != "--":
                    if "{" in line:         #conjugated verbs, e.g. "APPLY {APPLYING APPLIED APPLIES } [020]"
                        line_s = re.split("\{|\}", line)    #split at { and }
                        verb_dict[line_s[0]] = main_codes[code] 
                        for word in line_s[1].split():
                            verb_dict[word.lower()] = main_codes[code]
                    else:
                        word = re.split("\[|\]", line)[0]
                        verb_dict[word.lower()] = main_codes[code]
            else:
                if cur_main_code != "--":
                    if "{" in line:         #e.g. "HURRY {HURRIES HURRYING HURRIED }" 
                        line_s = re.split("\{|\}", line)    #split at { and }
                        verb_dict[line_s[0]] = main_codes[cur_main_code]
                        for word in line_s[1].split():
                            verb_dict[word.lower()] = main_codes[cur_main_code]
                    else:                   #only single words with sometimes comments, e.g.: CENSURE  # JON 5/17/95
                        word = line.split("#")[0].rstrip()    #gets part before "#", removes all whitespaces to the right
                        verb_dict[word.lower()] = main_codes[cur_main_code]

    return verb_dict


def get_triples(sentence, verb_dict, nlp):
    """create triplet structure for training from text input, 
    verb_dict needs to be loaded before,
    spacy model needs to be initialized before """
    doc = nlp(sentence)
    verbs = []
    dict = {}

    for possible_verb in doc:           #parses through all words in sentence
        if possible_verb.pos == VERB:   #we only care about verbs
            if neg in [child.dep for child in possible_verb.children]: continue #we exclude all negated verbs
            else: 
                for candidate in possible_verb.children: #for composed verbs of verb (e.g. "want to join" -> "want join")
                    if candidate.dep == xcomp:   #subj / obj of composed verb should also be subj / obj of main verb
                        main_verb = candidate    
                        main_idx = candidate.idx
                        for chunk in doc.noun_chunks:   #chunks are noun-groups (e.g.: "78 out of 100 people" instead of "people")
                            if chunk.root.head.idx == possible_verb.idx:    #if chunk applies to xcomp (want),
                                                                            #treat it like it aplles to main verb ("join")
                                verbs.append([main_idx, main_verb.lemma_, chunk.text, chunk.root.dep_])
                                if main_idx in dict.keys(): dict[main_idx] += 1 #count how often verb is used
                                else: dict[main_idx] = 1

                for chunk in doc.noun_chunks:       #for normal verbs, check chunks directly
                    if chunk.root.head.idx == possible_verb.idx:
                        verbs.append([possible_verb.idx, possible_verb.lemma_, chunk.text, chunk.root.dep_])
                        if possible_verb.idx in dict.keys(): dict[possible_verb.idx] += 1
                        else: dict[possible_verb.idx] = 1
    
    trip_idx = [key for key in dict if dict[key] > 1]   #if verbs used more than once, its candidate for triplet

    #priority for subj-relation-obj triplets
    mapper = {"nsubj":1,"dobj":2, "pobj":2, "iobj":2}

    #create df from verbs extracted 
    df = pd.DataFrame(verbs, columns = ["idx", "verb", "noun", "noun_type"])
    df["noun_type"] = df.noun_type.map(mapper)  #turn noun_types into priority 

    #create groups that resolve around same word
    gb = df.groupby('idx')    
    #only keep groups if verb idx was identified as potential triplet before, sort by priority for structure
    df_l = [gb.get_group(x).sort_values("noun_type") for x in gb.groups if gb.get_group(x).idx.iloc[0] in dict]
    matches = [merge_trip(group) for group in df_l if not merge_trip(group) == None] #get groups into triplet structure
    
    #turn matches into triples by only keeping those with coded verbs, return code instead of verb
    triples = [f"<triplet>{match[0]}<subj>{match[2]}<obj>{verb_dict[match[1]]}" for match in matches if match[1].lower() in verb_dict]

    return triples

def merge_trip(df):
    """helper function to turn two rows of a pandas groupby into subj, verb, obj"""
    if df.shape[0] > 1:
        return [df.iloc[0].noun, df.iloc[0].verb, df.iloc[1].noun]

In [54]:
df

NameError: name 'df' is not defined

In [49]:
res = [[line, " ".join(get_triples(line, verb_dict, nlp))] for line in read if get_triples(line, verb_dict, nlp) != []]

In [50]:
res

[['I tried to help an injured young man when the third explosion happened.',
  '<triplet>I<subj>an injured young man<obj>Engage In Material Cooperation'],
 ['"The attack comes as Russian President Vladimir Putin is preparing for a signing ceremony in Moscow to annex Zaporizhzhia along with Ukraine\'s Donetsk, Luhansk and Kherson regions.The move follows self-styled referendums in the eastern and southern regions, which have been condemned by Ukraine and the West as a sham.Russia invaded Ukraine on 24 February, and Moscow currently controls the majority of the Zaporizhzhia region, including Europe\'s biggest nuclear power plant there - but not the regional capital.Moscow-installed regional official Vladimir Rogov blamed "Ukrainian militants" for the Zaporizhzhia attack, Russian state-run media reported.In a separate development, one person was killed and five injured in overnight Russian strikes by Iskander missiles on the central city of Dnipro, about 70km (43 miles) north of Zaporizhz

In [35]:
" ".join(" ".join(" ".join(res[0])))

TypeError: sequence item 0: expected str instance, list found

In [44]:
res[8][0]

['<triplet>a bomb attack<subj>-<obj>Fight']

In [None]:
df = pd.DataFrame([[line, " ".join(get_triples(line, verb_dict, nlp))] for line in read if get_triples(line, verb_dict, nlp) != []],
                    columns = ["text", "label"])