In [11]:
import copy
from tqdm.auto import tqdm

In [12]:
trainfile = 'data/en_ewt-up-train.conllu'
devfile = 'data/en_ewt-up-dev.conllu'
testfile = 'data/en_ewt-up-test.conllu'

# Read the files

In [13]:
def read_conll(conllfile):
    """
    This function read and process the conllu file into list of sentences lists.
    """
    with open(conllfile, 'r', encoding='utf8') as infile:
        fulllist, sentlist = [],[]
        for line in infile:
            line = line.strip()
            if (line != '\n') & (line.startswith("#") == False): # Not empty and not commented
                sentlist.append(line.split())
            if line.startswith("#") == True:
                sentlist = [i for i in sentlist if i] # Remove empty list
                fulllist.append(sentlist)
                sentlist = []
                continue
        res = [ele for ele in fulllist if ele != []] # remove empty list
    return res

In [14]:
#read the file into list of sentence lists
trainlist = read_conll(trainfile)

testlist = read_conll(testfile)

## Preprocess the data




In [15]:

# extract potential features and duplicate sentences

def preprocess_list(conlllist):
    """
    This function preprocess the lists into list of sentences list.
    Each sentence list is a list of token lists. Each token list have 13 columns.
        If a sentence have 0 predicates, the column (list item) 12 and 13 (list[11] and list[12]) are set as None.
        If the sentence have multiple predicates, it will be duplicated to align the column number.
        If a sentence does not have record on line 11, it will be filled with '_'
    """
    sentlist = []
    for sentence in conlllist:
        sents = [ [] for _ in range(50) ] # Initialize a large empty list for multiple predicate sentence    
        
        for x in range(len(sentence)): # replace 'for components in sentence' that brings duplicate removal error
            components = []
            for y in range(len(sentence[x])):
                components.append(str(sentence[x][y]))

            # First 11 lines
            for i in range(0,10):
                try:
                    tokendict = {"ID":components[0], "form":components[1], "lemma":components[2], "upos":components[3], "xpos":components[4], "feats":components[5], "head":components[6], 
                             "deprel":components[7], "deps":components[8], "misc":components[9], "pred":components[10]}
                except IndexError: # Wrong sentence in the dataset that have no column 11
                    tokendict['pred'] = '_'

            # If sentence have no predicate: assign the values '_'
            if len(components) <= 11: 
                tokendict['V'], tokendict['ARG'] ,tokendict['dup'] = '_','_','_'
                sents[0].append(tokendict)

            # Sentence have one or more predicate
            if len(components) > 11: 
                dup = len(components)-11 # Times for dpulication
                for k in range(0, dup):
                    tokendictk = copy.deepcopy(tokendict)
                    tokendictk['dup'] = k
                    ARGV = components[k+11]
                    # Following conditons change 'pred' (and ARG, V also) entry for duplicated sentence
                    if ARGV == 'V':
                        tokendictk['V'],tokendictk['ARG'] = 'V','_'
                        try:
                            tokendictk['pred'] = sentence[int(tokendictk['ID'])-1][10]
                        except IndexError:
                            print('The following sentence contains error:',sentence)
                            continue
                    if (ARGV != 'V') & (ARGV != '_'):
                        tokendictk['ARG'],tokendictk['V'],tokendictk['pred'] = ARGV,'_','_'
                    if ARGV == '_':
                        tokendictk['V'],tokendictk['ARG'],tokendictk['pred'] = '_','_','_'
                    sents[k].append(tokendictk)


        res = [ele for ele in sents if ele != []] # remove empty list
        sentlist += res

    return sentlist

In [16]:
preprocessed_train = preprocess_list(trainlist)

preprocessed_test = preprocess_list(testlist)

The following sentence contains error: [['1', 'I', 'I', 'PRON', 'PRP', 'Case=Nom|Number=Sing|Person=1|PronType=Prs', '2', 'nsubj', '2:nsubj|9.1:nsubj|10:nsubj', '_', '_', 'ARG0', '_', '_'], ['2', 'wish', 'wish', 'VERB', 'VBP', 'Mood=Ind|Tense=Pres|VerbForm=Fin', '0', 'root', '0:root', '_', 'wish.01', 'V', '_', '_'], ['3', 'all', 'all', 'DET', 'DT', '_', '2', 'iobj', '2:iobj', '_', '_', 'ARG2', '_', '_'], ['4', 'happy', 'happy', 'ADJ', 'JJ', 'Degree=Pos', '5', 'amod', '5:amod', '_', '_', '_', 'ARGM-ADJ', '_'], ['5', 'holidays', 'holiday', 'NOUN', 'NNS', 'Number=Plur', '2', 'obj', '2:obj', 'SpaceAfter=No', 'holiday.01', 'ARG1', 'V', '_'], ['6', ',', ',', 'PUNCT', ',', '_', '10', 'punct', '9.1:punct|10:punct', '_', '_', '_', '_', '_'], ['7', 'and', 'and', 'CCONJ', 'CC', '_', '10', 'cc', '9.1:cc|10:cc', '_', '_', '_', '_', '_'], ['8', 'moreso', 'moreso', 'ADV', 'RB', '_', '10', 'orphan', '9.1:advmod', 'SpaceAfter=No', '_', '_', '_', '_'], ['9', ',', ',', 'PUNCT', ',', '_', '10', 'punct', '

## Feature Extraction

In [17]:
import networkx as nx
def create_tree(sent):
    """
    This function creates a dependency tree between head and target token.

    Para::sent: A list of token dictionaries, each representing a token
                         and its information.
    
    """
    G = nx.Graph()
    for t_dict in sent:
        if t_dict['head'] != 0:
            start, end = int(t_dict['head']), int(t_dict['ID']) #find starting and end point of the tree
            G.add_edge(start,end)
            G.edges[start, end]['deprel'] = t_dict['deprel']
    return G

In [18]:
def path_with_deprel_and_depth(G, tokenID, predID):
    """
    This function finds the shortest dependency path from the target token to the target predicate
    Returns a dependency path str.
    
    Para::G: dependency graph from token to head
    Para::tokenID: target token ID
    para::predID: target predicate ID
    """
    path_string = ""
    path = nx.shortest_path(G, source=tokenID, target=predID)
    for i in range(len(path)-1):
        
        if len(nx.shortest_path(G, source=0, target=path[i])) > len(nx.shortest_path(G, source=0, target=path[i+1])):
            path_string += "U" # UP
        else:
            path_string += "D" # DOWN

        path_string = path_string + G.edges[path[i],path[i+1]]['deprel'] + '_'
        
    return path_string[:-1]

In [19]:
def find_path_to_pred(sent):
    """
    This function 
    1)adds predicate lemma to the end of dependecy path string
    2)add the dependency path feature of tokens to token dicts
    
    Para::sent:A list of token dictionaries, each representing a token and its information.
    """
    dep_tree = create_tree(sent)
    predID = -1
    pred_lema = ''
    for token in sent:
        if token['V'] == 'V':
            predID = int(token['ID'])
            pred_lema = token['lemma']
            break
    
    for token in sent:
        if predID == -1: # no predicate in the sentence
            token['path_to_pred'] = '_'
        else:
            token['path_to_pred'] = path_with_deprel_and_depth(dep_tree,int(token['ID']),predID) + f"_{pred_lema}"
            
    return sent

In [20]:
def extract_feature_label(preprocessed_list, selected_features):
    """
    Extract features and labels for training, keeping only selected features.
    Returns a tuple containing two lists - features and labels.
    
    Para::preprocessed_list (list): List of lists containing token dictionaries.
    Para::selected_features (list): List of feature names to be kept in the extracted features.
    """
    features = []
    gold_labels = []

    # Flatten the list of lists into a single list of token dictionaries
    token_list = [token_dict for sent_list in preprocessed_list for token_dict in sent_list]

    for token_dict in token_list:
        newdict = {feature: token_dict[feature] for feature in selected_features if feature in token_dict}
        features.append(newdict)
        gold_labels.append(token_dict["ARG"])

    return features, gold_labels

In [21]:
# apply to training data and extract training feature
    
selected_features = ["upos", "path_to_pred"]

sents_tr=[]
for sents in tqdm(preprocessed_train):
    sent_all_feat = find_path_to_pred(sents)
    sents_tr.append(sent_all_feat)

train_features, train_gold = extract_feature_label(sents_tr, selected_features)

for f,g in zip(train_features[:10], train_gold[:10]):
    print(f,g)

  0%|          | 0/42466 [00:00<?, ?it/s]

{'upos': 'PROPN', 'path_to_pred': 'Dparataxis_kill'} _
{'upos': 'PUNCT', 'path_to_pred': 'Upunct_Dparataxis_kill'} _
{'upos': 'PROPN', 'path_to_pred': 'Uflat_Dparataxis_kill'} _
{'upos': 'PUNCT', 'path_to_pred': 'Upunct_Dparataxis_kill'} _
{'upos': 'ADJ', 'path_to_pred': 'Uamod_Unsubj_kill'} _
{'upos': 'NOUN', 'path_to_pred': 'Unsubj_kill'} ARG0
{'upos': 'VERB', 'path_to_pred': '_kill'} _
{'upos': 'PROPN', 'path_to_pred': 'Uobj_kill'} ARG1
{'upos': 'PROPN', 'path_to_pred': 'Uflat_Uobj_kill'} _
{'upos': 'PROPN', 'path_to_pred': 'Uflat_Uobj_kill'} _


In [22]:
# apply to test data and extract test feature
sents_te=[]
for sents in tqdm(preprocessed_test):
    sent_all_feat = find_path_to_pred(sents)
    sents_te.append(sent_all_feat)

test_features, test_gold = extract_feature_label(sents_te, selected_features)

for f,g in zip(test_features[:10], test_gold[:10]):
    print(f,g)

  0%|          | 0/5328 [00:00<?, ?it/s]

{'upos': 'PRON', 'path_to_pred': 'Dadvcl_morph'} _
{'upos': 'SCONJ', 'path_to_pred': 'Umark_morph'} _
{'upos': 'PROPN', 'path_to_pred': 'Unsubj_morph'} ARG1
{'upos': 'VERB', 'path_to_pred': '_morph'} _
{'upos': 'ADP', 'path_to_pred': 'Ucase_Uobl_morph'} _
{'upos': 'PROPN', 'path_to_pred': 'Uobl_morph'} ARG2
{'upos': 'PUNCT', 'path_to_pred': 'Upunct_morph'} _
{'upos': 'PRON', 'path_to_pred': 'Dadvcl_expand'} _
{'upos': 'SCONJ', 'path_to_pred': 'Umark_expand'} _
{'upos': 'PROPN', 'path_to_pred': 'Unsubj_expand'} ARG0


## model training

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
def create_log_classifier(train_features, train_targets, max_iter):
    logreg = LogisticRegression(max_iter=max_iter)
    vec = DictVectorizer()
    features_vectorized = vec.fit_transform(train_features)
    model = logreg.fit(features_vectorized, train_targets) 
    return model, vec

In [25]:
model_single, vec_single = create_log_classifier(train_features, train_gold, 100)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## model predicting

In [28]:
def classify_data(model, vec, test_features):  
    features = vec.transform(test_features)
    predictions = model.predict(features)
    return predictions

In [29]:
single_pred = classify_data(model_single, vec_single, test_features)

In [32]:
print(len(test_gold), len(single_pred))

103046 103046


## Evaluation

In [36]:
from sklearn.metrics import classification_report
label_set = set(sorted(test_gold))
overall_report = classification_report(test_gold, single_pred, digits = 7)
print(overall_report)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        ARG0  0.8155668 0.5578704 0.6625430      1728
        ARG1  0.8761944 0.6519320 0.7476072      3235
    ARG1-DSP  0.0000000 0.0000000 0.0000000         4
        ARG2  0.8586327 0.6574978 0.7447236      1127
        ARG3  0.3437500 0.1486486 0.2075472        74
        ARG4  0.5555556 0.1785714 0.2702703        56
        ARG5  0.0000000 0.0000000 0.0000000         1
        ARGA  0.0000000 0.0000000 0.0000000         2
    ARGM-ADJ  0.7777778 0.3407080 0.4738462       226
    ARGM-ADV  0.3459119 0.1108871 0.1679389       496
    ARGM-CAU  0.0000000 0.0000000 0.0000000        46
    ARGM-COM  0.0000000 0.0000000 0.0000000        13
    ARGM-CXN  0.0000000 0.0000000 0.0000000        12
    ARGM-DIR  0.0000000 0.0000000 0.0000000        47
    ARGM-DIS  0.9473684 0.1978022 0.3272727       182
    ARGM-EXT  0.7500000 0.3142857 0.4429530       105
    ARGM-GOL  0.0000000 0.0000000 0.0000000        24
    ARGM-LOC  0.5254237 0.1

  _warn_prf(average, modifier, msg_start, len(result))
