In [1]:
class Document:
    """A single annotated text: its name, its token texts and its entities.

    Attributes:
        name: book name (used by ``write_docs`` to build the output file name).
        tokens: sequence of token texts in document order.
        entities: sequence of ``Entity`` objects found in the document.
    """

    def __init__(self, name, tokens, entities):
        self.name, self.tokens, self.entities = name, tokens, entities  # name is the book name

    def __repr__(self):
        # The name is intentionally not part of the representation.
        template = "Tokens:{}\nEntities:{}\n"
        return template.format(self.tokens, self.entities)

class Entity:
    """A named-entity mention located both by token range and by character interval.

    Attributes:
        id: id of the entity in the objects file.
        ent_type: entity tag (the notebook works with e.g. "loc" and "locorg").
        start_token: index of the first token belonging to the entity.
        end_token: token index bound of the entity; as populated by
            ``get_docs_from_std`` this is one past the last token index.
        interval: character interval of the entity; ``write_docs`` reads its
            ``start`` and ``end`` attributes.
    """

    def __init__(self, id, ent_type, start_token, end_token, interval):
        self.id = id  # id in objects file
        self.ent_type = ent_type
        self.start_token = start_token
        self.end_token = end_token
        self.interval = interval

    def __repr__(self):
        # str.format converts every field, so a non-string ent_type no longer
        # raises TypeError, and the "interval:" label is now separated from the
        # preceding value by a space like every other field.
        return " id: {} type: {} start: {} end: {} interval: {}".format(
            self.id, self.ent_type, self.start_token, self.end_token, self.interval)

In [2]:
# Corpus directories, both passed to get_docs_from_std further down:
# the dev set supplies the training samples, the test set is what gets
# relabelled and evaluated.
dev_path = "devset"
test_path = "testset"

In [3]:
from dialent.task1.util import loadAllStandard

def get_docs_from_std(path):
    """Load the annotated standard under ``path`` and convert it to ``Document`` objects.

    For every standard returned by ``loadAllStandard(path)`` the token texts
    are collected, and each entity yielded by ``makeTokenSets()`` is mapped
    from its character interval to a token index range: ``start_token`` is the
    first token lying completely inside the interval and ``end_token`` is one
    past the last such token.

    Args:
        path: directory handed to ``loadAllStandard``.

    Returns:
        list of ``Document``, one per loaded standard.

    Raises:
        ValueError: if no token lies completely inside an entity's interval.
            (Previously this surfaced as an opaque "min() arg is an empty
            sequence" ValueError.)
    """
    docs = []
    for st in loadAllStandard(path):
        tokens = [token.text for token in st.tokens]
        entities = []
        for entity in st.makeTokenSets():
            # Build the interval once and reuse it for the bounds and for the
            # Entity itself. NOTE(review): assumes toInterval() is a
            # side-effect-free accessor -- confirm against dialent.
            interval = entity.toInterval()
            symb_start, symb_end = interval.start, interval.end

            # Tokens are enumerated in order, so the matches are ascending.
            token_indexes = [
                idx for idx, token in enumerate(st.tokens)
                if symb_start <= token.start <= token.end <= symb_end
            ]
            if not token_indexes:
                raise ValueError(
                    "No token of document {!r} lies inside the interval "
                    "[{}, {}] of entity {}".format(st.name, symb_start, symb_end, entity.id))

            token_start = token_indexes[0]
            token_end = token_indexes[-1] + 1  # exclusive upper bound
            entities.append(Entity(entity.id, entity.tag, token_start, token_end, interval))
        docs.append(Document(st.name, tokens, entities))
    return docs

def get_samples_from_std(docs, word_window_size, embedding_model, word_emb_size):
    """Build fixed-length context features for every "loc"/"locorg" entity.

    Each sample is a flat list of ``2 * word_window_size * word_emb_size``
    numbers. Around the entity's central token index
    ``(start_token + end_token) // 2`` the window is laid out as:

    * slots ``0 .. word_window_size-1``: tokens at ``center-1, center-2, ...``
    * slots ``word_window_size .. 2*word_window_size-1``: tokens at
      ``center, center+1, ...``

    A slot stays zero-filled when the position is outside the document, when
    the word is not in ``embedding_model``, or when the stored vector does not
    have exactly ``word_emb_size`` components. The last check matters because
    list slice assignment with a vector of another length would silently
    resize the sample and produce ragged feature rows.

    Args:
        docs: iterable of objects with ``tokens`` and ``entities`` attributes.
        word_window_size: number of tokens taken on each side of the window.
        embedding_model: mapping word -> vector supporting ``in`` and ``[]``.
        word_emb_size: expected length of every embedding vector.

    Returns:
        ``(samples_x, samples_y)``: feature lists and labels
        (0 for "loc", 1 for "locorg"), in document/entity order.
    """
    target_types = {"loc", "locorg"}
    samples_x = []
    samples_y = []
    for doc in docs:
        n_tokens = len(doc.tokens)
        for ent in doc.entities:
            if ent.ent_type not in target_types:
                continue

            sample = [0] * (word_emb_size * word_window_size * 2)
            entity_center = (ent.start_token + ent.end_token) // 2

            for i in range(1, word_window_size + 1):
                # (slot index, token position) for the left and the right half.
                placements = (
                    (i - 1, entity_center - i),
                    (word_window_size + i - 1, entity_center + i - 1),
                )
                for slot, token_pos in placements:
                    # Check both bounds so a malformed entity range can neither
                    # wrap around via negative indexing nor raise IndexError.
                    if not 0 <= token_pos < n_tokens:
                        continue
                    word = doc.tokens[token_pos]
                    if word not in embedding_model:
                        continue
                    vector = embedding_model[word]
                    if len(vector) != word_emb_size:
                        # Treat malformed vectors like out-of-vocabulary words.
                        continue
                    sample[slot * word_emb_size:(slot + 1) * word_emb_size] = vector

            samples_x.append(sample)
            samples_y.append(0 if ent.ent_type == "loc" else 1)
    return samples_x, samples_y

In [4]:
import os

def write_docs(docs, path, pred):
    """Apply predicted loc/locorg labels to ``docs`` and write one ``.task1`` file per document.

    Entities whose type is "loc" or "locorg" are relabelled in place, in
    document/entity order, from consecutive items of ``pred`` (0 -> "loc",
    anything else -> "locorg"). Every entity of every document is then written
    to ``<path>/<doc.name>.task1`` as ``"<type> <start> <length>"``, where
    length is ``interval.end - interval.start + 1``.

    Args:
        docs: documents to relabel and write; their entities are mutated.
        path: output directory; created if it does not exist.
        pred: indexable sequence of predictions, one per loc/locorg entity.

    Raises:
        IndexError: if ``pred`` has fewer items than there are loc/locorg entities.
    """
    j = 0

    # Relabel the documents that were passed in (the original iterated the
    # notebook-global test_docs, ignoring the ``docs`` argument).
    for doc in docs:
        for ent in doc.entities:
            if ent.ent_type in {"loc", "locorg"}:
                ent.ent_type = "loc" if pred[j] == 0 else "locorg"
                j += 1

    # An empty path means "current directory", which makedirs would reject.
    if path:
        os.makedirs(path, exist_ok=True)

    for doc in docs:
        object_file_path = os.path.join(path, doc.name + ".task1")

        lines_to_write = [
            '{} {} {}\n'.format(
                entity.ent_type,
                entity.interval.start,
                entity.interval.end - entity.interval.start + 1)
            for entity in doc.entities
        ]

        with open(object_file_path, "w") as f:
            f.writelines(lines_to_write)

In [5]:
# Feature-window settings shared by training and prediction.
word_window_size = 3   # tokens taken on each side of the entity centre
word_emb_size = 300    # expected number of components per word vector

# word -> list of floats, read from a text file holding one
# "<word> <v1> <v2> ..." record per line, fields separated by single spaces.
embedding_model = {}

with open('word_vec.txt', "r") as f:
    for line in f:
        word, *values = line.strip().split(' ')
        embedding_model[word] = [float(value) for value in values]

In [6]:
# Load the dev corpus first, then the test corpus (same order as before, so
# any loader diagnostics appear in the same sequence).
dev_docs, test_docs = (
    get_docs_from_std(corpus_path) for corpus_path in (dev_path, test_path)
)

Failed to load the standard of book_3954:
Unknown mention tag: Facility


In [7]:
from sklearn.ensemble import GradientBoostingClassifier

# Training features/labels come from the dev corpus.
x, y = get_samples_from_std(dev_docs, word_window_size, embedding_model, word_emb_size)

# Boosting hyper-parameters kept together; random_state is fixed so repeated
# runs build the same ensemble.
gb_params = dict(n_estimators=250, learning_rate=0.3, random_state=777, verbose=True)
clf = GradientBoostingClassifier(**gb_params)
clf.fit(x, y)

  from numpy.core.umath_tests import inner1d


      Iter       Train Loss   Remaining Time 
         1           1.1076           53.28s
         2           0.9276           55.36s
         3           0.7958           55.47s
         4           0.7060           55.86s
         5           0.6405           55.70s
         6           0.5910           55.40s
         7           0.5480           55.30s
         8           0.5058           55.12s
         9           0.4760           54.80s
        10           0.4430           54.47s
        20           0.2570           52.09s
        30           0.1716           48.60s
        40           0.1199           45.51s
        50           0.0905           42.91s
        60           0.0646           40.58s
        70           0.0493           38.19s
        80           0.0390           35.83s
        90           0.0296           33.63s
       100           0.0230           31.53s
       200           0.0071           10.49s


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.3, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=250, presort='auto', random_state=777,
              subsample=1.0, verbose=True, warm_start=False)

In [8]:
# Only the feature vectors are needed for prediction. The labels are bound to
# a named throwaway instead of `_`, because assigning `_` at notebook top level
# overrides IPython's "last output" variable for the rest of the session.
test_samples, _test_labels = get_samples_from_std(test_docs, word_window_size, embedding_model, word_emb_size)
pred = clf.predict(test_samples)

In [10]:
import subprocess
# Write the relabelled test documents, then score them with the external
# evaluation script and show its report.
write_docs(test_docs, "out", pred)

eval_command = "python3 t1_eval.py -s testset -t out".split()
raw_output = subprocess.check_output(eval_command)
output = raw_output.decode("utf-8")

# Printing the whole text emits exactly what printing each '\n'-separated
# piece on its own line would.
print(output)

Failed to load the standard of book_3954:
Unknown mention tag: Facility
Type    P        R        F1       TP1      TP2      In Std.  In Test.
per        0.9993   0.9993   0.9993  1342.00  1342.00     1343     1343
loc        0.8150   0.7810   0.7977   467.83   467.83      599      574
org        0.9895   0.9895   0.9895  1557.55  1557.55     1574     1574
locorg     0.7411   0.8734   0.8018   552.83   552.83      633      746
overall    0.9251   0.9448   0.9348  3912.22  3912.22     4141     4229

