In [1]:
import copy
from tqdm.auto import tqdm

In [2]:
trainfile = 'data/en_ewt-up-train.conllu'
devfile = 'data/en_ewt-up-dev.conllu'
testfile = 'data/en_ewt-up-test.conllu'

## Read the files

In [3]:
def read_conll(conllfile):
    """
    This function read and process the conllu file into list of sentences lists.
    """
    with open(conllfile, 'r', encoding='utf8') as infile:
        fulllist, sentlist = [],[]
        for line in infile:
            line = line.strip()
            if (line != '\n') & (line.startswith("#") == False): # Not empty and not commented
                sentlist.append(line.split())
            if line.startswith("#") == True:
                sentlist = [i for i in sentlist if i] # Remove empty list
                fulllist.append(sentlist)
                sentlist = []
                continue
        res = [ele for ele in fulllist if ele != []] # remove empty list
    return res

In [4]:
#read the file into list of sentence lists
trainlist = read_conll(trainfile)
devlist = read_conll(devfile)
testlist = read_conll(testfile)

## Preprocess the data
extract potential features and duplicate sentences

In [5]:
def preprocess_list(conlllist):
    """
    This function preprocess the lists into list of sentences list.
    Each sentence list is a list of token lists. Each token list have 13 columns.
        If a sentence have 0 predicates, the column (list item) 12 and 13 (list[11] and list[12]) are set as None.
        If the sentence have multiple predicates, it will be duplicated to align the column number.
        If a sentence does not have record on line 11, it will be filled with '_'
    """
    sentlist = []
    for sentence in conlllist:
        sents = [ [] for _ in range(50) ] # Initialize a large empty list for multiple predicate sentence    
        
        for x in range(len(sentence)): # replace 'for components in sentence' that brings duplicate removal error
            components = []
            for y in range(len(sentence[x])):
                components.append(str(sentence[x][y]))

            # First 11 lines
            for i in range(0,10):
                try:
                    tokendict = {"ID":components[0], "form":components[1], "lemma":components[2], "upos":components[3], "xpos":components[4], "feats":components[5], "head":components[6], 
                             "deprel":components[7], "deps":components[8], "misc":components[9], "pred":components[10]}
                except IndexError: # Wrong sentence in the dataset that have no column 11
                    tokendict['pred'] = '_'

            # If sentence have no predicate: assign the values '_'
            if len(components) <= 11: 
                tokendict['V'], tokendict['ARG'] ,tokendict['dup'] = '_','_','_'
                sents[0].append(tokendict)

            # Sentence have one or more predicate
            if len(components) > 11: 
                dup = len(components)-11 # Times for dpulication
                for k in range(0, dup):
                    tokendictk = copy.deepcopy(tokendict)
                    tokendictk['dup'] = k
                    ARGV = components[k+11]
                    # Following conditons change 'pred' (and ARG, V also) entry for duplicated sentence
                    if ARGV == 'V':
                        tokendictk['V'],tokendictk['ARG'] = 'V','_'
                        try:
                            tokendictk['pred'] = sentence[int(tokendictk['ID'])-1][10]
                        except IndexError:
                            print('The following sentence contains error:',sentence)
                            continue
                    if (ARGV != 'V') & (ARGV != '_'):
                        tokendictk['ARG'],tokendictk['V'],tokendictk['pred'] = ARGV,'_','_'
                    if ARGV == '_':
                        tokendictk['V'],tokendictk['ARG'],tokendictk['pred'] = '_','_','_'
                    sents[k].append(tokendictk)


        res = [ele for ele in sents if ele != []] # remove empty list
        sentlist += res

    return sentlist

In [6]:
preprocessed_train = preprocess_list(trainlist)
preprocessed_dev = preprocess_list(devlist)
preprocessed_test = preprocess_list(testlist)

The following sentence contains error: [['1', 'I', 'I', 'PRON', 'PRP', 'Case=Nom|Number=Sing|Person=1|PronType=Prs', '2', 'nsubj', '2:nsubj|9.1:nsubj|10:nsubj', '_', '_', 'ARG0', '_', '_'], ['2', 'wish', 'wish', 'VERB', 'VBP', 'Mood=Ind|Tense=Pres|VerbForm=Fin', '0', 'root', '0:root', '_', 'wish.01', 'V', '_', '_'], ['3', 'all', 'all', 'DET', 'DT', '_', '2', 'iobj', '2:iobj', '_', '_', 'ARG2', '_', '_'], ['4', 'happy', 'happy', 'ADJ', 'JJ', 'Degree=Pos', '5', 'amod', '5:amod', '_', '_', '_', 'ARGM-ADJ', '_'], ['5', 'holidays', 'holiday', 'NOUN', 'NNS', 'Number=Plur', '2', 'obj', '2:obj', 'SpaceAfter=No', 'holiday.01', 'ARG1', 'V', '_'], ['6', ',', ',', 'PUNCT', ',', '_', '10', 'punct', '9.1:punct|10:punct', '_', '_', '_', '_', '_'], ['7', 'and', 'and', 'CCONJ', 'CC', '_', '10', 'cc', '9.1:cc|10:cc', '_', '_', '_', '_', '_'], ['8', 'moreso', 'moreso', 'ADV', 'RB', '_', '10', 'orphan', '9.1:advmod', 'SpaceAfter=No', '_', '_', '_', '_'], ['9', ',', ',', 'PUNCT', ',', '_', '10', 'punct', '

## Feature Extraction 

In [7]:
import spacy
nlp = spacy.load("en_core_web_sm") 

In [8]:
# def merge_sent (sent):
#     """
#     merge the token into sentences for parsing
#     return a dict with inform of sentence, the predicate being predicted and its lemma
    
#     sent::sentence list containing token dicts 
#     """

#     merged_dict = {}
#     merged_sent = ""
#     for n, i in enumerate(sent):
#         if n != len(sent) -1:
#             merged_sent += sent[n]['form'] + ' ' 
#         else:
#             merged_sent += sent[n]['form']
            
#         if sent[n]['pred'] != "_":
#             merged_dict["pred"] = sent[n]['form']
#             merged_dict["lemma"] = sent[n]['lemma']
#     merged_dict["sent"] = merged_sent

#     return merged_dict



In [9]:
def merge_sent_token (sent):
    """
    merge the token into sentences for parsing
    return a dict with inform of sentence, the predicate being predicted and its lemma
    
    sent::sentence list containing token dicts 
    """

    merged_dict = {}
    merged_sent = ""
    for n, i in enumerate(sent):
        if n != len(sent) -1:
            merged_sent += sent[n]['form'] + ' ' 
        else:
            merged_sent += sent[n]['form']
            
        if sent[n]['pred'] != "_":
            merged_dict["pred"] = sent[n]['form']
            merged_dict["lemma"] = sent[n]['lemma']

            
    merged_dict["sent"] = merged_sent
    merged_dict["token_l"] = [word["form"] for word in sent]
    
#     check the predicate
    if "pred" not in merged_dict:
        merged_dict["pred"] = None
        merged_dict["lemma"] = None

    return merged_dict



In [11]:
# sentence4 =  preprocessed_train[373] 
# merged_dict4 = merge_sent_token(sentence4)
# print(merged_dict4)

In [12]:
# #merge back into sentence to be parsed, create a dictionary save sentence and the predicate of each sentence
# #merged_dict["pred"] stores the predicate being predicted, merged_dict["lemma"] stores the lemma of the predicate
# merged_l = [] #list of dictionaroies, each dict has merged sent and responding target predicate

# for k, sent in enumerate(preprocessed_try):
#     merged_dict = {}
#     merged_sent = ""
#     for n, i in enumerate(sent):
#         if n != len(sent) -1:
#             merged_sent += sent[n]['form'] + ' ' 
#         else:
#             merged_sent += sent[n]['form']
            
#         if sent[n]['pred'] != "_":
#             merged_dict["pred"] = sent[n]['form']
#             merged_dict["lemma"] = sent[n]['lemma']
#     merged_dict["sent"] = merged_sent
    
#     merged_l.append(merged_dict)
        
        
# print(merged_l)

In [13]:
# def dep_path_2lemma (s, nlp):
#     """
#     create the dependency path from target_token to target_predicate with the predicate lemma
#     return a sentence list with dependency path feature of tokens
    
#     sent::dict of merged sentence
#     nlp:: spacy parse
#     """
#     sent = s["sent"]
#     token_text = sent.split(' ') #token_text is the same with dataset
    
#     doc = nlp (sent)
    
#     #get pred and its lemma of the sentence
#     t_pred = s['pred'] 
#     t_lemma = s['lemma']
    
#     #find the nlp_pred corresponding to pred_text
#     for t in doc:
#         if t.text == t_pred:
#             t_pred = t 
#         else:
#             continue
#     #find the nlp_token corresponding to token_text
#     t_token = None 
    
#     token_deps = []
#     for i, t in enumerate(doc):
#         if t.text == token_text[i]:
#             t_token = t
#         else:
#             print(f"unmatched nlp_token {t} and token_text {token_text[i]}")
#             continue
            
#         #find ancestors of each token
#         token_ancestors=[]
#         for anc in t_token.ancestors:
#             token_ancestors.append((anc, anc.dep_))#record each anc and the dep
            
#         # path from the target token to the ancestors
#         token_ancestors.insert(0,(t_token, t_token.dep_))
            
#         pred_ancestors = []
#         for anc in t_pred.ancestors:
#             pred_ancestors.append((anc, anc.dep_))
#         # path from the target predict to the ancestors
#         pred_ancestors.insert(0,(t_pred, t_pred.dep_))
 
#         # Create path to the first common ancestor
#         common_ancs = set(token_ancestors).intersection(pred_ancestors)
        
#         token_path = []
#         pred_path = []
        
#         for t in token_ancestors: #add token and dep_ until meets the common_anc
#             if t in common_ancs:
#                 break
#             token_path.append(t) 
            
#         for t in pred_ancestors:
#             if t in common_ancs:
#                 break
#             pred_path.append(t)
            
#         # revers pred_path for order "t_taget - t_pred"
#         pred_path.reverse()
        
#         #path list with tuple of each token and dep
#         dep_path_l= token_path + pred_path
        
#         #add lemma of root
#         if dep_path_l == []:
#             dep_path_l.append((t_token,''))
            
#         #path string
#         dep_path = ''
#         for t,p in dep_path_l:
#             dep_path += p + '_' 
#         #add the predicate lemma
#         dep_path += t_lemma
        
#         token_deps.append(dep_path) #list of dep path of each token to t_pred 
#     return token_deps

In [31]:
def dep_path_2lemma_c (s, nlp):
    """
    create the dependency path from target_token to target_predicate with the predicate lemma
    return a sentence list with dependency path feature of tokens
    
    sent::dict of merged sentence
    nlp:: spacy parse
    """
    sent = s["sent"]
    token_text = sent.split(' ') #token_text is the same with dataset
    
    doc = nlp (sent)
    
    #get pred and its lemma of the sentence
    
        
    t_pred = s['pred'] 
    t_lemma = s['lemma']
        
    if t_pred == None and t_lemma == None:
        
        n = len(token_text)
        token_deps = ["Unknown"] *n
        return token_deps
        
    
    #find the nlp_pred corresponding to pred_text
    for t in doc:
        if t.text == t_pred:
            t_pred = t 
        else:
            continue
    #find the nlp_token corresponding to token_text
    t_token = None 
    
    token_deps = []
    for i, t in enumerate(doc):
        try:
            if t.text == token_text[i]:
                t_token = t
        except IndexError:
            print(f"unmatched nlp_token and token_text")
            
            
        
            
        #find ancestors of each token
        token_ancestors=[]
        for anc in t_token.ancestors:
            token_ancestors.append((anc, anc.dep_))#record each anc and the dep
            
        # path from the target token to the ancestors
        token_ancestors.insert(0,(t_token, t_token.dep_))
            
        pred_ancestors = []
        
        try:
            for anc in t_pred.ancestors:
                pred_ancestors.append((anc, anc.dep_))
                
        except AttributeError:
            print(AttributeError)
        # path from the target predict to the ancestors
        pred_ancestors.insert(0,(t_pred, t_pred.dep_))
 
        # Create path to the first common ancestor
        common_ancs = set(token_ancestors).intersection(pred_ancestors)
        
        token_path = []
        pred_path = []
        
        for t in token_ancestors: #add token and dep_ until meets the common_anc
            if t in common_ancs:
                break
            token_path.append(t) 
            
        for t in pred_ancestors:
            if t in common_ancs:
                break
            pred_path.append(t)
            
        # revers pred_path for order "t_taget - t_pred"
        pred_path.reverse()
        
        #path list with tuple of each token and dep
        dep_path_l= token_path + pred_path
        
        #add lemma of root
        if dep_path_l == []:
            dep_path_l.append((t_token,''))
            
        #path string
        dep_path = ''
        for t,p in dep_path_l:
            dep_path += p + '_' 
        #add the predicate lemma
        dep_path += t_lemma
        
        token_deps.append(dep_path) #list of dep path of each token to t_pred 
    return token_deps

In [32]:
s =  preprocessed_train[520]
sent_dict = merge_sent_token (s)
sent = sent_dict["sent"]
token_text = sent.split(' ') #token_text is the same with dataset
    
doc = nlp (sent)
    
    #get pred and its lemma of the sentence
    
        
t_pred = sent_dict['pred'] 
t_lemma = sent_dict['lemma']
        
# if t_pred == None and t_lemma == None:
        
#     n = len(token_text)
#     token_deps = ["Unknown"] *n
#     # return token_deps
        
    
    #find the nlp_pred corresponding to pred_text
for t in doc:
    if t.text == t_pred:
        t_pred = t 
        print(t_pred,type(t_pred))
    else:
        print("unknown")
        continue
    #find the nlp_token corresponding to token_text
# t_token = None 
# print(doc)
# print(token_text)

unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown
unknown


In [33]:
def extract_features_sent(preprocessed_list):
    """
    create extra features of dependency path to the predicate as well as the predicate lemmaremove the festures not needed
    return a list of sentence lists containing all token with wanted features
    
    preprocessed_list:a list of sentence lists containing all token with features from the dataset
    """
    
    feat_4sents = []
    
    for sentence in tqdm(preprocessed_list):
        sent_dict = merge_sent_token (sentence)
        sent = sent_dict["sent"]
        # print(sent)
        # break
        token_deps = dep_path_2lemma_c (sent_dict ,nlp)
        # print(token_deps)

        #write the feature into token_dict
        try:
            for feature, token in zip(token_deps, sentence):
                token["dep_path_lemma"] = feature
        except IndexError: 
            # Handle the case when the zip runs out of items in one of the lists
            for token in sentence:
                if "dep_path_lemma" not in token:
                    token["dep_path_lemma"] = "_"
                    print(f"dep_path not extracted for {token} in {sent}")
    
        feat_4sents.append(sentence)
    return (feat_4sents)

In [34]:
train_feat_4sents = extract_features_sent(preprocessed_train)
test_feat_4sents = extract_features_sent(preprocessed_test)


  0%|          | 0/42466 [00:00<?, ?it/s]

unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token and token_text
unmatched nlp_token 

AttributeError: 'str' object has no attribute 'dep_'

## Extract training features and labels

In [30]:
def extract_feature_label(preprocessed_list, selected_features):
    """
    Extract features and labels for training, keeping only selected features.
    
    Parametser:
    preprocessed_list (list): List of lists containing token dictionaries.
    selected_features (list): List of feature names to be kept in the extracted features.
    
    Returns:
    tuple: A tuple containing two lists - features and labels.
    """
    features = []
    gold_labels = []

    # Flatten the list of lists into a single list of token dictionaries
    token_list = [token_dict for sent_list in preprocessed_list for token_dict in sent_list]

    for token_dict in token_list:
        newdict = {feature: token_dict[feature] for feature in selected_features if feature in token_dict}
        features.append(newdict)
        gold_labels.append(token_dict["ARG"])

    return features, gold_labels


In [31]:
selected_features = ["upos", "dep_path_lemma"]
train_features, train_gold = extract_feature_label(preprocessed_try, selected_features)
test_features, test_gold = extract_feature_label(preprocessed_try, selected_features)

In [33]:
print(training_features[:2], gold_labels[:2])

[{'upos': 'PROPN', 'dep_path_lemma': 'compound_npadvmod_kill'}, {'upos': 'PUNCT', 'dep_path_lemma': 'punct_npadvmod_kill'}] ['_', '_']


## model training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [None]:
def create_log_model(train_features, gold_labels, max_iter):
    logreg = LogisticRegression(max_iter=max_iter)
    vec = DictVectorizer()
    features_vectorized = vec.fit_transform(train_features)
    model = logreg.fit(features_vectorized, train_targets)
    return model, vec

In [None]:
model_single, vec_single = create_log_classifier(training_features, gold_labels, 100)

## model predicting

In [None]:
def classify_data(model, vec, test_features):  
    features = vec.transform(test_features)
    predictions = model.predict(test_features)
    return predictions

In [None]:
single_pred = classify_data(model_single, vec_single, test_features)

## Evaluation

In [None]:
from sklearn.metrics import classification_report


In [None]:
overall_report = classification_report(test_gold, single_pred, digits = 7, target_names = ["True", "False"])
print(overall_report)

In [None]:
label_set = set(sorted(test_gold))
label_report = classification_report(test_gold, single_pred, digits = 7, target_names = label_set)
print(label_report)

In [34]:
for sentence in preprocessed_try:
    sent_dict = merge_sent (sentence)
    sent = sent_dict["sent"]
    # print(sent)
    # break
    token_deps = dep_path_2lemma (sent_dict ,nlp)
    # print(token_deps)
    
    #write the feature into token_dict
    try:
        for feature, token in zip(token_deps, sentence):
            token["dep_path_lemma"] = feature
    except IndexError: 
        # Handle the case when the zip runs out of items in one of the lists
        for token in sentence:
            if "dep_path_lemma" not in token:
                token["dep_path_lemma"] = "_"
                print(f"dep_path not extracted for {token} in {sent}")

    print(sentence)
    
    break
#     new_sent_dict = []
#     for word_dict in sentence:
#         new_word_dict = copy.deepcopy(word_dict)
        



[{'ID': '1', 'form': 'Al', 'lemma': 'Al', 'upos': 'PROPN', 'xpos': 'NNP', 'feats': 'Number=Sing', 'head': '0', 'deprel': 'root', 'deps': '0:root', 'misc': 'SpaceAfter=No', 'pred': '_', 'dup': 0, 'V': '_', 'ARG': '_', 'dep_path_lemma': 'compound_npadvmod_kill'}, {'ID': '2', 'form': '-', 'lemma': '-', 'upos': 'PUNCT', 'xpos': 'HYPH', 'feats': '_', 'head': '1', 'deprel': 'punct', 'deps': '1:punct', 'misc': 'SpaceAfter=No', 'pred': '_', 'dup': 0, 'V': '_', 'ARG': '_', 'dep_path_lemma': 'punct_npadvmod_kill'}, {'ID': '3', 'form': 'Zaman', 'lemma': 'Zaman', 'upos': 'PROPN', 'xpos': 'NNP', 'feats': 'Number=Sing', 'head': '1', 'deprel': 'flat', 'deps': '1:flat', 'misc': '_', 'pred': '_', 'dup': 0, 'V': '_', 'ARG': '_', 'dep_path_lemma': 'npadvmod_kill'}, {'ID': '4', 'form': ':', 'lemma': ':', 'upos': 'PUNCT', 'xpos': ':', 'feats': '_', 'head': '1', 'deprel': 'punct', 'deps': '1:punct', 'misc': '_', 'pred': '_', 'dup': 0, 'V': '_', 'ARG': '_', 'dep_path_lemma': 'punct_kill'}, {'ID': '5', 'form'

In [None]:
print()

In [None]:


sent = s["sent"]
token_text = sent.split(' ')
doc = nlp (sent)
t_pred = s['pred']
print(t_pred)

for t in doc:
    if t.text==t_pred:
        t_pred = t
    else:
        continue
t_pred

In [22]:
for t in doc:
    if t.text==t_pred:
        print(t.text)

killed


In [58]:
# This function generates the dependency path from target_token to target_predicate,
# and calculates the dependency distance of them.
sent_deps = []

for s in merged_l:

    sent = s["sent"]
    token_text = sent.split(' ') #a list of tokens extracted from the original data, also the target token 
    # print(t_token)
    doc = nlp (sent)
    t_pred = s['pred']
    t_lemma = s['lemma']
    #  find the token in doc corresponding to predicate   
    for t in doc:
        if t.text == t_pred:
            print(t)
            t_pred = t  # This line stores the token in t_pred if it matches
        else:
            continue  
            
    # dep_p = t_pred.dep_
    # print(dep_p)
        
            
# find the corresponding token             
    t_token = None
    
    token_deps = []
    for i, t in enumerate(doc):
        if t.text == token_text[i]: #check if the spacy tokenization same with the token_text in the dataset
            t_token = t
            # print(t_token)
        else:
            print(f"different tokenization:{t} and {token_text[i]} in{sent}")
            continue
            
# #     find the dependency label of the current token
#         dep_t = t_token.dep_

# find the common ancestor
# #find all the ancestors of each token
        token_ancestors=[]#tuple contains the ( one ancster and the dependecy label)
        for anc in t_token.ancestors:
            token_ancestors.append((anc, anc.dep_))
#             path from the target token to the ancesters
        token_ancestors.insert(0,(t_token, t_token.dep_))
            
        pred_ancestors = []
        for anc in t_pred.ancestors:
            pred_ancestors.append((anc, anc.dep_))
#             path from the target pred to the ancesters
        pred_ancestors.insert(0,(t_pred, t_pred.dep_))
    
        common_ancs = set(token_ancestors).intersection(pred_ancestors)
#         common_anc = common_ancs[0] #first common anc to generate shortest path
        
#         # Create path to the first common ancestor
        token_path = []
        pred_path = []
        
        for t in token_ancestors: #add token and dep_ until meets the common_anc
            if t in common_ancs:
                break
            token_path.append(t) 
            
        for t in pred_ancestors:
            if t in common_ancs:
                break
            pred_path.append(t)
            
#             revers pred_path for order "t_taget - t_pred"
        pred_path.reverse()
    
        dep_path_l= token_path + pred_path
        if dep_path_l == []:
            dep_path_l.append((t_token,''))
        
        dep_path = ''
        for t,p in dep_path_l:
            dep_path += p + '_' 
        
#         add the predicate lemma
        dep_path += t_lemma
        # print(dep_path)
        # print(type(dep_path))
        token_deps.append(dep_path)
        
    sent_deps.append(token_deps)
    print(sent_deps)
    break
    
    
# # check the number of features and tokens
#     if len(token_deps) == len(token_text):
#         for t, f in zip(token_text, token_deps):
#             print(t, f)
#     else:
#         print("feature amount and token anount not match correctly" )
        
            

        
#     break



killed
[['compound_npadvmod_kill', 'punct_npadvmod_kill', 'npadvmod_kill', 'punct_kill', 'amod_nsubj_kill', 'nsubj_kill', '_kill', 'compound_dobj_kill', 'compound_dobj_kill', 'compound_dobj_kill', 'punct_dobj_kill', 'dobj_kill', 'punct_dobj_kill', 'det_appos_dobj_kill', 'appos_dobj_kill', 'prep_appos_dobj_kill', 'det_pobj_prep_appos_dobj_kill', 'pobj_prep_appos_dobj_kill', 'prep_pobj_prep_appos_dobj_kill', 'det_pobj_prep_pobj_prep_appos_dobj_kill', 'pobj_prep_pobj_prep_appos_dobj_kill', 'prep_pobj_prep_pobj_prep_appos_dobj_kill', 'pobj_prep_pobj_prep_pobj_prep_appos_dobj_kill', 'punct_appos_dobj_kill', 'prep_appos_dobj_kill', 'det_pobj_prep_appos_dobj_kill', 'amod_pobj_prep_appos_dobj_kill', 'pobj_prep_appos_dobj_kill', 'punct_kill']]


In [42]:
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children], [ans for ans in token.ancestors])

Al compound Zaman PROPN [] [Zaman, killed]
- punct Zaman PROPN [] [Zaman, killed]
Zaman npadvmod killed VERB [Al, -] [killed]
: punct killed VERB [] [killed]
American amod forces NOUN [] [forces, killed]
forces nsubj killed VERB [American] [killed]
killed ROOT killed VERB [Zaman, :, forces, Ani, .] []
Shaikh compound Ani PROPN [] [Ani, killed]
Abdullah compound Ani PROPN [] [Ani, killed]
al compound Ani PROPN [] [Ani, killed]
- punct Ani PROPN [] [Ani, killed]
Ani dobj killed VERB [Shaikh, Abdullah, al, -, ,, preacher] [killed]
, punct Ani PROPN [] [Ani, killed]
the det preacher ADJ [] [preacher, Ani, killed]
preacher appos Ani PROPN [the, at, ,, near] [Ani, killed]
at prep preacher ADJ [mosque] [preacher, Ani, killed]
the det mosque NOUN [] [mosque, at, preacher, Ani, killed]
mosque pobj at ADP [the, in] [at, preacher, Ani, killed]
in prep mosque NOUN [town] [mosque, at, preacher, Ani, killed]
the det town NOUN [] [town, in, mosque, at, preacher, Ani, killed]
town pobj in ADP [the, of

In [None]:
sentence, negation cue, token
sentence, predicate, current_token

In [69]:
# for i in range(len(try_l[0])):#try_l[0] is a sentence
#     token_l_list = [] #contain labels of one token
#     for t_l in try_l[0][i]:#token list with the labels, t_l: label
#         token_l_list.append(str(t_l))
    
#     #extract first 11 columns of shared information from dataset
#     try:
#         tokendict = {"ID":token_l_list[0], "form":token_l_list[1], "lemma":token_l_list[2], "upos":token_l_list[3], "xpos":token_l_list[4], "feats":token_l_list[5], "head":token_l_list[6], 
#                              "deprel":token_l_list[7], "deps":token_l_list[8], "misc":token_l_list[9], "pred":token_l_list[10]}
#     except IndexError:
#         tokendict['pred'] = '_'
        
#     #add information depending on the predicate
#     #sents with no predicate 
#     if len(token_l_list) <= 11: 
#         tokendict['V'], tokendict['ARG'] ,tokendict['dup'] = '_','_','_'
#         sents[0].append(tokendict)
        
#     #sents with one or more than one predicate 
#     if len(token_l_list) > 11: 
#         dup = len(token_l_list)-11 # Times for dpulication
#         for k in range(0,dup):
#             tokendictk = copy.deepcopy(tokendict)
#             tokendictk['dup'] = k
#             ARGV = components[k+11]
#             #add the information based on the position of the time of duplication
#             if ARGV == "V":
#                 tokendictk['V'],tokendictk['ARG'] = 'V','_'
#             if (ARGV != 'V') & (ARGV != '_'):
#                 tokendictk['ARG'],tokendictk['V'],tokendictk['pred'] = ARGV,'_','_'
#             if ARGV == '_':
#                 tokendictk['V'],tokendictk['ARG'],tokendictk['pred'] = '_','_','_'
#                 sents[k].append(tokendictk)
                
            
#         res = [ele for ele in sents if ele != []] # remove empty list
#         sentlist += res
        
#         tokendict['V'], tokendict['ARG'] ,tokendict['dup'] = '_','_','_'
#         sents[0].append(tokendict)
#     print(token_l_list)
#     break

['1', 'Al', 'Al', 'PROPN', 'NNP', 'Number=Sing', '0', 'root', '0:root', 'SpaceAfter=No', '_', '_']


In [55]:
# duplicate the sentences with multiple predicates

sentlist = []

for s in try_list:# s is a sentence list of token lists
    sents = [[] for _ in range(50)]

    for x in range(len(s)):#x is the token list
        components = []
        for f in range(len(s[x])):# f is the feature label of the token 
            components.append(str(s[x][f]))

            # #add the first predicate with 11 label
        print(components)
        
        for i in range(0, 10):
            try:
                tokendict = {"ID":components[0], "form":components[1], "lemma":components[2], "upos":components[3], "xpos":components[4], "feats":components[5], "head":components[6], 
                             "deprel":components[7], "deps":components[8], "misc":components[9], "pred":components[10]}
            except IndexError: # Wrong sentence in the dataset that have no column 11
                tokendict['pred'] = '_'
                
            # If sentence have no predicate: assign the values '_'
        if len(components) <= 11: 
            tokendict['V'], tokendict['ARG'] ,tokendict['dup'] = '_','_','_'
            sents[0].append(tokendict)
            
        # Sentence have one or more predicate
        if len(components) > 11: 
            dup = len(components)-11 # Times for dpulication
            for k in range (0, dup):
                    tokendictk = copy.deepcopy(tokendict)
                    tokendictk['dup'] = k
                    ARGV = components[k+11]
                    if ARGV == "V":
                        tokendictk['V'],tokendictk['ARG'] = 'V','_'
                        try:
                            tokendictk['pred'] = s[int(tokendictk['ID'])-1][10]
                        except IndexError:
                            print('The following sentence contains error:',sentence)
                            continue
                    if (ARGV != 'V') & (ARGV != '_'):
                        tokendictk['ARG'],tokendictk['V'],tokendictk['pred'] = ARGV,'_','_'
                    if ARGV == '_':
                        tokendictk['V'],tokendictk['ARG'],tokendictk['pred'] = '_','_','_'
                    sents[k].append(tokendictk)
                    
            
        res = [ele for ele in sents if ele != []] # remove empty list
        sentlist += res                                                                                                                                                                                                                                                                                      


        
       
       


['1', 'Al', 'Al', 'PROPN', 'NNP', 'Number=Sing', '0', 'root', '0:root', 'SpaceAfter=No', '_', '_']
['2', '-', '-', 'PUNCT', 'HYPH', '_', '1', 'punct', '1:punct', 'SpaceAfter=No', '_', '_']
['3', 'Zaman', 'Zaman', 'PROPN', 'NNP', 'Number=Sing', '1', 'flat', '1:flat', '_', '_', '_']
['4', ':', ':', 'PUNCT', ':', '_', '1', 'punct', '1:punct', '_', '_', '_']
['5', 'American', 'american', 'ADJ', 'JJ', 'Degree=Pos', '6', 'amod', '6:amod', '_', '_', '_']
['6', 'forces', 'force', 'NOUN', 'NNS', 'Number=Plur', '7', 'nsubj', '7:nsubj', '_', '_', 'ARG0']
['7', 'killed', 'kill', 'VERB', 'VBD', 'Mood=Ind|Tense=Past|VerbForm=Fin', '1', 'parataxis', '1:parataxis', '_', 'kill.01', 'V']
['8', 'Shaikh', 'Shaikh', 'PROPN', 'NNP', 'Number=Sing', '7', 'obj', '7:obj', '_', '_', 'ARG1']
['9', 'Abdullah', 'Abdullah', 'PROPN', 'NNP', 'Number=Sing', '8', 'flat', '8:flat', '_', '_', '_']
['10', 'al', 'al', 'PROPN', 'NNP', 'Number=Sing', '8', 'flat', '8:flat', 'SpaceAfter=No', '_', '_']
['11', '-', '-', 'PUNCT', 