In [1]:
import numpy as np
import torch

In [2]:
import re

def getNewLineIndices(text:str) -> np.array:
    """
    Return the offsets that open each paragraph of `text`.

    Offset 0 is always reported (the first character is assumed to open a
    paragraph), followed by the index of every '\\n' found at position 1 or later.
    """
    offsets = [0]
    newline_at = text.find('\n', 1)
    while newline_at != -1:
        offsets.append(newline_at)
        # resume the search right after the newline just recorded
        newline_at = text.find('\n', newline_at + 1)

    return np.array(offsets)

In [3]:
# parse data from ".ann" UKP 2.0 files
def readAnnotatedFile(ann_path:str) -> (dict, dict, dict, list, list):
    """
    Parse a tab-separated ".ann" annotation file.

    Each non-blank line is "<id>\\t<data>[\\t<text>]" where the first character of <id>
    selects the record type:
      T -- proposition: "<label> <start> <end>"
      A -- stance:      "Stance T<target> <For|Against>"
      R -- link:        "<supports|attacks> Arg1:T<source> Arg2:T<target>"
    Lines of any other type are ignored.

    Returns a 5-tuple:
      propositions -- {T-index: (start, end)} character offsets
      prop_labels  -- {T-index: label}
      prop_stances -- {T-index: stance value}
      supports     -- [(source T-index, target T-index), ...]
      attacks      -- same shape; receives every link whose type is not 'supports'
    """
    propositions, prop_labels, prop_stances, supports, attacks = {}, {}, {}, [], []
    with open(file=ann_path, mode='rt', encoding='utf8') as f:
        for line in f:
            # blank lines (typically a trailing newline at end of file) carry no
            # annotation and would otherwise fail on int('') below
            if not line.strip():
                continue
            delimited = line.split('\t')
            typ = delimited[0][0] # T == proposition , A = Stance, R = link
            inner_index = int(delimited[0][1:])
            data = delimited[1].split()

            if typ == 'T':
                label = data[0] # prop label (Premise, Claim or MajorClaim)
                start, end = int(data[1]), int(data[2]) # proposition offsets
                propositions[inner_index] = (start, end) # represent propositions by their index boundaries
                prop_labels[inner_index] = label

            elif typ == 'A':
                _, target_index, stance_value = data # first column in "A" lines is always "Stance", stance value in {For, Against}
                prop_stances[int(target_index[1:])] = stance_value # drop the leading 'T'

            elif typ == 'R':
                link_typ = data[0] # link type in {supports, attacks}
                # strip the 6-character "ArgN:T" prefix (ex: Arg1:T4 Arg2:T3 -> source == 4, target == 3)
                source, target = int(data[1][6:]), int(data[2][6:])
                link_list = supports if link_typ == 'supports' else attacks
                link_list.append((source,target))

    return propositions, prop_labels, prop_stances, supports, attacks

In [4]:
class ArgDoc(object):
    """
    One UKP essay together with its argument annotations.

    Reads "<base_path>.txt" (essay text) and "<base_path>.ann" (annotations) and
    re-indexes every proposition by its position in the text: index 0 is the
    proposition with the smallest (start, end) offsets, and so on.
    """
    def __init__(self, base_path):
        """
        base_path -- essay path without extension. Its last three characters must be
                     the essay number (e.g. ".../essay095"); int() raises ValueError
                     otherwise.
        """
        self.ess_id = int(base_path[-3:]) # essay id according to UKP naming convention
        self._txt_path = base_path + ".txt" # essay text file path
        self._ann_path = base_path + ".ann" # UKP annotated file path
        # read document's text
        with open(file=self._txt_path, mode='rt', encoding='utf8') as f:
            self.text = f.read()

        # get essay's paragraph indices (separated with '\n')
        self.paragraph_offsets = getNewLineIndices(self.text)

        # read annotated data from file
        propositions, prop_labels, prop_stances, supports, attacks = readAnnotatedFile(self._ann_path)

        # order propositions by their (start, end) offsets; zip(*[]) cannot be unpacked
        # into two names, so an essay without propositions is handled explicitly
        ordered_props = sorted(propositions.items(), key = lambda x: x[1])
        if ordered_props:
            inner_indices, self.prop_offsets = zip(*ordered_props)
        else:
            inner_indices, self.prop_offsets = (), ()

        # paragraph alignment of propositions (ordered by proposition's offsets):
        # index of the last paragraph offset <= start. side='right' keeps a proposition
        # that starts exactly on an offset (notably offset 0) inside that paragraph
        # instead of mapping it to the previous one (or to -1).
        self.prop_paragraphs = [np.searchsorted(self.paragraph_offsets, start, side='right') - 1 for start, _ in self.prop_offsets]

        # map annotation-file indices (the N of "TN") to positions in the sorted order
        new_indices = {k: v for v, k in enumerate(inner_indices)}

        # update fields with the new position-based indices
        self.prop_labels = [prop_labels[inner] for inner in inner_indices]
        self.prop_stances = {new_indices[k]: v for k,v in prop_stances.items()}
        self.supports = [(new_indices[src], new_indices[trg]) for src, trg in supports]
        self.attacks = [(new_indices[src], new_indices[trg]) for src, trg in attacks]
        self.links = self.supports + self.attacks

In [62]:
class ArgMLModel(object):
    """Placeholder for the argument-mining model; nothing is implemented yet."""
    def __init__(self):
        # `self` was missing, which made ArgMLModel() raise TypeError
        # TODO
        pass
    

In [17]:
import os
def _wrapLabelText(text, width=30):
    """
    Split `text` into the lines of a graph-node label.

    At every multiple of `width` not already consumed by an earlier line, the line is
    broken at the next space found at or after that position. If no such space exists
    the remaining text becomes the final line. A leftover consisting of a single word
    is glued to the previous line (when there is one) rather than left on its own.
    Returns the list of lines (empty for empty text).
    """
    lines = []
    start = 0 # offset of the first character not yet placed on a line
    for i_c in range(width, len(text), width):
        if i_c < start:
            # the previous line already extends past this checkpoint
            continue
        space = text.find(" ", i_c)
        if space == -1:
            # no later break point: everything left is the last line
            lines.append(text[start:])
            start = len(text)
            break
        lines.append(text[start:space])
        start = space + 1

    rest = text[start:]
    if rest:
        if lines and len(rest.split()) == 1:
            lines[-1] += " " + rest
        else:
            lines.append(rest)
    return lines


def visualizeUKPArgDoc(doc:ArgDoc, output_path):
    """
    visualise UKP argument essay object and save to png

    doc         -- the essay to draw; its text, prop_offsets, prop_labels, prop_stances,
                   supports, attacks and ess_id attributes are read
    output_path -- directory that receives "essayNNN.png" (NNN = zero-padded essay id)

    All major claims share a single head node. Every other proposition gets its own
    node, claims are connected to the head node by their stance, and support / attack
    links are drawn between proposition nodes.
    """
    import pydot
    arg_graph = pydot.Dot(graph_type='digraph')

    maj_claims = [("! " + doc.text[doc.prop_offsets[i][0]:doc.prop_offsets[i][1]]) for i in range(len(doc.prop_labels)) if doc.prop_labels[i] == 'MajorClaim'] # handle more than 1 major claim for main node
    # add the major claims node
    head_node = pydot.Node('\n'.join(maj_claims),style='filled',
                           fillcolor = '#eeccdd')
    arg_graph.add_node(head_node)

    nodes = {}
    # add the premise and claims nodes
    for i in range(len(doc.prop_labels)):
        if doc.prop_labels[i] == 'MajorClaim':
            continue
        text = doc.text[doc.prop_offsets[i][0]:doc.prop_offsets[i][1]]
        label = _wrapLabelText(text)

        nodes[i] = pydot.Node(i,
                              label = '\n'.join(label),
                              style ='filled',
                              fillcolor = '#ccbbdd' if doc.prop_labels[i] == 'Claim' else '#aabbdd'
                             )
        arg_graph.add_node(nodes[i])

    # add edges
    # add the stances (claims-majorClaims) edges
    for i,val in doc.prop_stances.items():
        stance_color = 'red' if val == "Against" else 'green'
        tmp_edge = pydot.Edge(nodes[i], head_node,
                              label=val,
                              labelfontcolor=stance_color,
                              color = stance_color
                             )
        arg_graph.add_edge(tmp_edge)

    # add the support/attacks edges
    for src,trg in doc.supports:
        tmp_edge = pydot.Edge(nodes[src],nodes[trg])
        arg_graph.add_edge(tmp_edge)

    for src, trg in doc.attacks:
        tmp_edge = pydot.Edge(nodes[src],nodes[trg],
                              style = 'dotted',
                              color = 'red'
                             )
        arg_graph.add_edge(tmp_edge)

    # display and save
    path = os.path.join(output_path,"essay{:03d}.png".format(doc.ess_id))
    arg_graph.write_png(path)
    print ("saved png to {}".format(path))

In [7]:
# Load essay 095 as the running example; later inspection cells read `arg_doc_ex`.
arg_doc_ex = ArgDoc(base_path='/home/yochay/arg_mining_ukp/data/essay095')

In [None]:
# output_path is a required argument of visualizeUKPArgDoc (omitting it raises
# TypeError); render the example essay into the current working directory.
visualizeUKPArgDoc(arg_doc_ex, output_path='.')

In [19]:
# Render the argument graph of every essay (numbers 1..402) into `output_path`.
base = '/home/yochay/ukp_argmining_rnn/data/'
output_path = '/home/yochay/ukp_argmining_rnn/graphs/'

# annotation file paths; the essay number is padded to three digits
essays = [
    os.path.join(base, "essay{:3d}.ann".format(essay_no).replace(" ","0"))
    for essay_no in range(1,403)
]
for fn in essays:
    # ArgDoc takes the path without its ".ann" extension
    argDoc = ArgDoc(os.path.splitext(fn)[0])
    visualizeUKPArgDoc(argDoc,output_path)

saved png to /home/yochay/ukp_argmining_rnn/graphs/essay1.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay2.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay3.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay4.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay5.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay6.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay7.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay8.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay9.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay10.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay11.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay12.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay13.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay14.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay15.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay16.png
s

saved png to /home/yochay/ukp_argmining_rnn/graphs/essay133.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay134.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay135.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay136.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay137.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay138.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay139.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay140.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay141.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay142.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay143.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay144.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay145.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay146.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay147.png
saved png to /home/yochay/ukp_argmining_

saved png to /home/yochay/ukp_argmining_rnn/graphs/essay263.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay264.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay265.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay266.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay267.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay268.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay269.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay270.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay271.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay272.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay273.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay274.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay275.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay276.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay277.png
saved png to /home/yochay/ukp_argmining_

saved png to /home/yochay/ukp_argmining_rnn/graphs/essay392.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay393.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay394.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay395.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay396.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay397.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay398.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay399.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay400.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay401.png
saved png to /home/yochay/ukp_argmining_rnn/graphs/essay402.png


In [7]:
# NOTE(review): leftover debugging cell. In the visible code `delimited` is only assigned
# as a local inside readAnnotatedFile, so this presumably raises NameError on a fresh
# kernel -- confirm and remove.
delimited[1]

'attacks Arg1:T18 Arg2:T16'

In [11]:
# (start, end) character offsets of the example essay's propositions, in sorted order.
arg_doc_ex.prop_offsets

((251, 338),
 (445, 511),
 (513, 667),
 (669, 788),
 (790, 844),
 (857, 915),
 (938, 984),
 (986, 1157),
 (1166, 1246),
 (1248, 1355),
 (1365, 1426),
 (1443, 1531),
 (1533, 1584),
 (1586, 1643),
 (1645, 1713),
 (1726, 1783))

In [45]:
# one "offsets<TAB>label" line per proposition of the example essay
[print("{}\t{}".format(offsets, label)) for offsets, label in zip(arg_doc_ex.prop_offsets, arg_doc_ex.prop_labels)];

(251, 338)	MajorClaim
(445, 511)	Premise
(513, 667)	Premise
(669, 788)	Premise
(790, 844)	Premise
(857, 915)	Claim
(938, 984)	Claim
(986, 1157)	Premise
(1166, 1246)	Premise
(1248, 1355)	Premise
(1365, 1426)	Premise
(1443, 1531)	MajorClaim
(1533, 1584)	Premise
(1586, 1643)	Premise
(1645, 1713)	Premise
(1726, 1783)	Claim


In [49]:
from nltk.parse import CoreNLPParser
from nltk.tokenize import sent_tokenize

In [64]:
# calculate new proposition offsets w/o whitespace
def calc_no_spaces_indices(arg_doc:ArgDoc)->[(int,int)]:
    """
    Translate proposition offsets into offsets over the text with whitespace removed.

    For every (beg, end) in arg_doc.prop_offsets the result holds (new_beg, new_end),
    where new_beg is the number of non-whitespace characters of arg_doc.text before
    `beg` and new_end - new_beg is the number of non-whitespace characters in
    text[beg:end].

    All whitespace is dropped, not only ' ': the token-based character counter these
    offsets are compared against (pre_process_ukp_essay) never counts newlines, so
    counting them here would shift every offset by the number of preceding newlines.
    """
    text = arg_doc.text
    new_offsets = []
    for (beg,end) in arg_doc.prop_offsets:
        # str.split() with no argument splits on any run of whitespace
        new_beg = len("".join(text[:beg].split()))
        new_end = new_beg + len("".join(text[beg:end].split()))
        new_offsets.append((new_beg, new_end)) # list.append takes a single item
    return new_offsets

In [53]:
# Module-level handles on the example essay.
arg_doc = arg_doc_ex
text = arg_doc_ex.text
# POS tagger client; the URL implies a CoreNLP server must be listening on localhost:9000.
pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
output_file = '/home/yochay/arg_mining_proj/data/095ex.preprocessed'

# Placeholder written to output columns that have no value for a token.
EMPTY_SIGN = "~"

def _find_proposition(char_index, prop_offsets):
    """Return the index of the proposition whose [start, end) range contains char_index, else None."""
    for i_prop, (start, end) in enumerate(prop_offsets):
        if start <= char_index < end:
            return i_prop
    return None


def _relation_tag(arg_doc, i_prop, ac_type):
    """
    Build the relation column for proposition i_prop.

    Premise -> "supports:<target>" or "attacks:<target>" (first outgoing link found,
               supports searched before attacks)
    Claim   -> "<stance>:" followed by EMPTY_SIGN (claims refer to the major claims)
    Anything else, or a premise / claim without a link / stance -> EMPTY_SIGN.
    """
    if ac_type == "Premise":
        for link_name, links in (("supports", arg_doc.supports), ("attacks", arg_doc.attacks)):
            for src, trg in links:
                if src == i_prop:
                    return "{}:{}".format(link_name, trg)
    elif ac_type == "Claim":
        if i_prop in arg_doc.prop_stances:
            return "{}:{}".format(arg_doc.prop_stances[i_prop], EMPTY_SIGN)
    return EMPTY_SIGN


def pre_process_ukp_essay(base_path, pos_tagger):
    """
    Write a token-per-line, tab-separated version of an essay to "<base_path>.tsv".

    base_path  -- essay path without extension (passed to ArgDoc)
    pos_tagger -- object whose tag(list_of_tokens) returns (token, POS) pairs

    Output layout: a "# paragraph <n>" line per non-empty paragraph, a "# sent" line per
    sentence, then one line per token with the columns
        token, POS, BIO tag ("B-"/"I-" + proposition label), proposition index, relation
    The last three columns hold EMPTY_SIGN for tokens outside every proposition.

    Tokens are matched to propositions through whitespace-free character offsets: each
    token's offset is the summed length of all tokens before it.
    NOTE(review): this assumes the tagger returns tokens whose characters match the
    essay text; a tagger that rewrites tokens would shift the offsets -- confirm.
    """
    arg_doc = ArgDoc(base_path)
    output_file = base_path + ".tsv"

    i_no_space = 0
    # paragraphs -> sentences -> (token, POS, whitespace-free offset) tuples
    paragraphs = []
    for paragraph in arg_doc.text.split('\n'):
        # skip empty lines (usually separating the essay title from the body)
        if len(paragraph) == 0:
            continue
        tagged_sentences = []
        # use nltk sentence tokenizer
        for sent in sent_tokenize(paragraph):
            # POS tagging is done sentence by sentence
            tagged_sent = []
            for tok, pos in pos_tagger.tag(sent.split()):
                tagged_sent.append((tok, pos, i_no_space))
                i_no_space += len(tok)
            tagged_sentences.append(tagged_sent)
        paragraphs.append(tagged_sentences)

    # proposition boundaries in the same whitespace-free coordinates as the tokens
    no_space_prop_offsets = calc_no_spaces_indices(arg_doc)

    with open(output_file,'wt',encoding='utf8') as out_f:
        previous_prop = None # proposition of the previous token, to tell B- from I-
        for i_paragraph, tagged_sentences in enumerate(paragraphs):
            out_f.write("# paragraph {}\n".format(i_paragraph))
            for tagged_sentence in tagged_sentences:
                out_f.write("# sent\n")
                for tok, pos, tok_offset in tagged_sentence:
                    i_prop = _find_proposition(tok_offset, no_space_prop_offsets)
                    if i_prop is None:
                        fields = (tok, pos, EMPTY_SIGN, EMPTY_SIGN, EMPTY_SIGN)
                    else:
                        ac_type = arg_doc.prop_labels[i_prop]
                        # the first token seen inside a proposition opens it
                        bio_prefix = "B-" if i_prop != previous_prop else "I-"
                        fields = (tok, pos, bio_prefix + ac_type, str(i_prop),
                                  _relation_tag(arg_doc, i_prop, ac_type))
                    previous_prop = i_prop
                    # exactly one output line per token
                    out_f.write("\t".join(fields) + "\n")
                                    

The popularity of news media
Original Sentence:
The popularity of news media
The	DT
popularity	NN
of	IN
news	NN
media	NNS

Nowadays news media have become more and more popular. Many people consider that the drawbacks of this phenomenon outweigh its merits. However, it is possible that this idea is not completely true. It is widely seen that news media not only brings people entertainment, but also polishes up people' knowledge. Therefore, it seems not unreasonable to suggest that this is a positive development.
Original Sentence:
Nowadays news media have become more and more popular.
Nowadays	RB
news	NN
media	NNS
have	VBP
become	VBN
more	RBR
and	CC
more	RBR
popular	JJ
.	.
Original Sentence:
Many people consider that the drawbacks of this phenomenon outweigh its merits.
Many	JJ
people	NNS
consider	VBP
that	IN
the	DT
drawbacks	NNS
of	IN
this	DT
phenomenon	NN
outweigh	VBP
its	PRP$
merits	NNS
.	.
Original Sentence:
However, it is possible that this idea is not completely true.
However	RB


In [111]:
# Stance values of the example essay, keyed by position-based proposition index.
arg_doc_ex.prop_stances

{15: 'For', 5: 'For', 6: 'For'}