In [1]:
!git clone https://github.com/lang-uk/ner-uk.git ~/projects/ner-uk

fatal: destination path '/Users/serg/projects/ner-uk' already exists and is not an empty directory.


In [2]:
!git clone https://github.com/lang-uk/ua-gazetteers.git ~/projects/ua-gazetters

fatal: destination path '/Users/serg/projects/ua-gazetters' already exists and is not an empty directory.


In [3]:
import os

# Root of the local ner-uk checkout. "~" is expanded at run time so the path
# follows the current user's home directory (the same location the clone
# command at the top of the notebook writes to) instead of one machine's
# absolute path. The trailing slash is kept because callers concatenate onto it.
PATH = os.path.expanduser("~/projects/ner-uk/")

# Read tokens and positions of tokens from a file

def read_tokens(filename):
    """Read a space-tokenized text file and return tokens with character offsets.

    Each non-empty line is treated as one sentence: it is wrapped in
    zero-width "<S>" / "</S>" marker tokens and split on single spaces.

    Args:
        filename: path to the tokenized text file.

    Returns:
        A list of (token, start, end) tuples, where start/end are character
        offsets from the beginning of the file (end is exclusive). Marker
        tokens have start == end.
    """
    tokens = []
    pos = 0
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding (assumes the corpus files are UTF-8 - TODO confirm).
    with open(filename, "r", encoding="utf-8") as f:
        for line in f.read().split("\n"):
            if len(line) == 0:
                # An empty line still consumes its newline character.
                pos += 1
            else:
                tokens.append(("<S>", pos, pos))
                for token in line.split(" "):
                    tokens.append((token, pos, pos + len(token)))
                    # +1 accounts for the separating space (or the final newline).
                    pos += len(token) + 1
                tokens.append(("</S>", pos, pos))
    return tokens

# Read annotations and positions of annotations from a file

def read_annotations(filename):
    """Read entity annotations and their character offsets from a file.

    Every non-blank line is split on whitespace; field 1 is taken as the
    label and fields 2 and 3 as the integer start/end offsets.

    Args:
        filename: path to the annotation file.

    Returns:
        A list of (label, start, end) tuples in file order.

    Raises:
        IndexError: if a non-blank line has fewer than four fields.
        ValueError: if the offset fields are not integers.
    """
    anno = []
    # Explicit UTF-8 so decoding of the labels does not depend on the
    # platform's default encoding (assumes UTF-8 files - TODO confirm).
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            annotations = line.split()
            if not annotations:
                # Blank / whitespace-only lines carry no annotation.
                continue
            anno.append((annotations[1], int(annotations[2]), int(annotations[3])))
    return anno

# Using positions of tokens and annotations, extract a list of token labels

def extract_labels(anno, tokens):
    """Assign a B-/I-/"--" label to every token from character-offset annotations.

    Args:
        anno: iterable of (label, start, end) annotation tuples. They do not
            need to be sorted; a sorted copy is used and the input is not
            modified.
        tokens: list of (text, start, end) tuples in document order; the
            "<S>" / "</S>" sentence markers always receive "--".

    Returns:
        A list of labels, one per token: "B-<label>" for the first labelled
        token of an annotation, "I-<label>" for the following ones, and "--"
        for tokens outside any annotation.
    """
    # Process annotations in document order regardless of their file order.
    spans = sorted(anno, key=lambda item: (item[1], item[2]))
    labels = []
    ann_id = 0
    started = False  # True once the current annotation has emitted its B- tag
    for text, start, stop in tokens:
        if text in ("<S>", "</S>"):
            labels.append("--")
            continue
        # Drop annotations that finished before this token begins, so one span
        # whose end is not on a token boundary cannot swallow the rest of the text.
        while ann_id < len(spans) and spans[ann_id][2] <= start:
            ann_id += 1
            started = False
        if ann_id == len(spans) or start < spans[ann_id][1]:
            labels.append("--")
            continue
        label, _, end = spans[ann_id]
        labels.append(("I-" if started else "B-") + label)
        started = True
        if stop >= end:
            # The annotation is fully consumed; move on to the next one.
            ann_id += 1
            started = False
    return labels

# tokens = read_tokens(PATH + "data/A_alumni.krok.edu.ua_Prokopenko_Vidrodzhennia_velotreku(5).tok.txt")
# anno = read_annotations(PATH + "data/A_alumni.krok.edu.ua_Prokopenko_Vidrodzhennia_velotreku(5).tok.ann")
# labels = extract_labels(anno, tokens)

# for i, j in zip(tokens, labels):
#     print(i[0], j)

# Extract list of files for training and testing

# Maps split name -> list of document base names read from the split file.
dev_test = {"dev": [], "test": []}
category = ""
# Explicit UTF-8 so reading the file list does not depend on the platform's
# default encoding (assumes a UTF-8 file - TODO confirm).
with open(PATH + "doc/dev-test-split.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if len(line) == 0:
            continue
        if line in ["DEV", "TEST"]:
            # A header line switches the split that following names belong to.
            category = line.lower()
        elif not category:
            # Fail with a readable message instead of an opaque KeyError('').
            raise ValueError("File name %r appears before any DEV/TEST header" % line)
        else:
            dev_test[category].append(line)

print(len(dev_test["dev"]), len(dev_test["test"]))

# Get train and test data and labels

def _load_split(filenames):
    """Load tokens and their labels for every document in one split.

    A document is only added once BOTH its tokens and its labels were read
    successfully, so the two returned lists always stay aligned. Documents
    that cannot be read or parsed are reported and skipped.

    Args:
        filenames: document base names (without extension) under PATH + "data/".

    Returns:
        (tokens, labels): two flat, equally long lists.
    """
    split_tokens, split_labels = [], []
    for filename in filenames:
        base = PATH + "data/" + filename
        try:
            tokens = read_tokens(base + ".txt")
            labels = extract_labels(read_annotations(base + ".ann"), tokens)
        except (OSError, ValueError, IndexError) as err:
            # Missing/unreadable files and malformed annotation lines: skip the
            # document, but say so instead of silently dropping data.
            print("Skipping", filename, "-", repr(err))
            continue
        split_tokens += [token[0] for token in tokens]
        split_labels += labels
    return split_tokens, split_labels


train_tokens, train_labels = _load_split(dev_test["dev"])
test_tokens, test_labels = _load_split(dev_test["test"])

156 73


In [4]:
list(zip(train_tokens, train_labels))[50:100]

[('–', '--'),
 ('повідомляють', '--'),
 ('Першій', 'B-ОРГ'),
 ('електронній', 'I-ОРГ'),
 ('в', '--'),
 ('прес-службі', '--'),
 ('УДМС', 'B-ОРГ'),
 ('України', 'I-ОРГ'),
 ('в', 'I-ОРГ'),
 ('Кіровоградській', 'I-ОРГ'),
 ('області', 'I-ОРГ'),
 ('.', '--'),
 ('</S>', '--'),
 ('<S>', '--'),
 ('Розцінки', '--'),
 ('на', '--'),
 ('послуги', '--'),
 ('таких', '--'),
 ('посередників', '--'),
 ('починаються', '--'),
 ('від', '--'),
 ('1000', '--'),
 ('грн', '--'),
 ('.', '--'),
 ('</S>', '--'),
 ('<S>', '--'),
 ('Закінчується', '--'),
 ('«', '--'),
 ('біометрична', '--'),
 ('афера', '--'),
 ('»', '--'),
 ('в', '--'),
 ('кращому', '--'),
 ('випадку', '--'),
 ('звичайним', '--'),
 ('оформленням', '--'),
 ('документа', '--'),
 ('у', '--'),
 ('міграційній', 'B-ОРГ'),
 ('службі', 'I-ОРГ'),
 ('у', '--'),
 ('встановлений', '--'),
 ('законодавством', '--'),
 ('строк', '--'),
 (',', '--'),
 ('у', '--'),
 ('гіршому', '--'),
 ('–', '--'),
 ('втратою', '--'),
 ('коштів', '--')]

In [5]:
list(zip(test_tokens, test_labels))[:10]

[('<S>', '--'),
 ('В', '--'),
 ('Україні', 'B-ЛОК'),
 ('спостерігається', '--'),
 ('істотне', '--'),
 ('погіршення', '--'),
 ('погодних', '--'),
 ('умов', '--'),
 (':', '--'),
 ('сильний', '--')]

In [6]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer(lang='uk')

In [7]:
morph.parse('спостерігається')[0].tag.POS

'VERB'

In [8]:
from collections import namedtuple

Token = namedtuple('Token', 'text iob_tag pos lemma')

def extract_sent_features(sent):
    """Build one feature dict per token of a sentence.

    Each dict holds the token's capitalisation flag, its own POS and lemma,
    and the POS and lemma of its left and right neighbours. Neighbours that
    fall outside the sentence are represented by the string 'None'.

    Args:
        sent: sequence of objects exposing .text, .pos and .lemma.

    Returns:
        A list of feature dicts, in token order.
    """
    missing = str(None)
    last_index = len(sent) - 1
    rows = []
    for idx, tok in enumerate(sent):
        has_left = idx != 0
        has_right = idx != last_index
        rows.append({
            'case': tok.text[:1].isupper(),
            'pos': str(tok.pos),
            'pos_prev': str(sent[idx - 1].pos) if has_left else missing,
            'pos_next': str(sent[idx + 1].pos) if has_right else missing,
            'prev_lemma': sent[idx - 1].lemma if has_left else missing,
            'next_lemma': sent[idx + 1].lemma if has_right else missing,
            'lemma': tok.lemma,
        })
    return rows
        

In [9]:
def prepare(tokens, labels):
    """Group a flat token/label stream into sentences of Token records.

    Every token is analysed with the module-level `morph` analyzer and the
    first parse supplies its POS tag and normal form. A sentence is closed
    each time the '</S>' marker is seen; tokens after the last marker are
    not returned.

    Args:
        tokens: flat list of token strings.
        labels: labels paired with `tokens` via zip.

    Returns:
        A list of sentences, each a list of Token tuples.
    """
    grouped = []
    current = []
    for word, tag in zip(tokens, labels):
        analysis = morph.parse(word)[0]
        record = Token(text=word, iob_tag=tag, pos=analysis.tag.POS, lemma=analysis.normal_form)
        current.append(record)
        if word != '</S>':
            continue
        grouped.append(current)
        current = []
    return grouped

sents = prepare(train_tokens, train_labels)

In [10]:
sents[:10]

[[Token(text='<S>', iob_tag='--', pos=None, lemma='<s>'),
  Token(text='На', iob_tag='--', pos='INTJ', lemma='на'),
  Token(text='довірливих', iob_tag='--', pos='ADJF', lemma='довірливий'),
  Token(text='кіровоградців', iob_tag='--', pos='NOUN', lemma='кіровоградець'),
  Token(text='полюють', iob_tag='--', pos='VERB', lemma='полювати'),
  Token(text='шахраї', iob_tag='--', pos='NOUN', lemma='шахрай'),
  Token(text='та', iob_tag='--', pos='CONJ', lemma='та'),
  Token(text='фірми-посередники', iob_tag='--', pos='NOUN', lemma='фірма-посередник'),
  Token(text=',', iob_tag='--', pos=None, lemma=','),
  Token(text='які', iob_tag='--', pos='NPRO', lemma='який'),
  Token(text='за', iob_tag='--', pos='ADVB', lemma='за'),
  Token(text='1000', iob_tag='--', pos=None, lemma='1000'),
  Token(text='грн', iob_tag='--', pos='NOUN', lemma='грн'),
  Token(text='.', iob_tag='--', pos=None, lemma='.'),
  Token(text='готові', iob_tag='--', pos='ADJF', lemma='готовий'),
  Token(text='«', iob_tag='--', pos=

In [11]:
def extract_features(sents):
    """Flatten per-sentence feature dicts into a single list.

    Args:
        sents: iterable of sentences accepted by extract_sent_features.

    Returns:
        One list with the feature dicts of all sentences, in order.
    """
    return [row for sentence in sents for row in extract_sent_features(sentence)]

features = extract_features(sents)

In [12]:
features[0:20]

[{'case': False,
  'pos': 'None',
  'pos_prev': 'None',
  'pos_next': 'INTJ',
  'prev_lemma': 'None',
  'next_lemma': 'на',
  'lemma': '<s>'},
 {'case': True,
  'pos': 'INTJ',
  'pos_prev': 'None',
  'pos_next': 'ADJF',
  'prev_lemma': '<s>',
  'next_lemma': 'довірливий',
  'lemma': 'на'},
 {'case': False,
  'pos': 'ADJF',
  'pos_prev': 'INTJ',
  'pos_next': 'NOUN',
  'prev_lemma': 'на',
  'next_lemma': 'кіровоградець',
  'lemma': 'довірливий'},
 {'case': False,
  'pos': 'NOUN',
  'pos_prev': 'ADJF',
  'pos_next': 'VERB',
  'prev_lemma': 'довірливий',
  'next_lemma': 'полювати',
  'lemma': 'кіровоградець'},
 {'case': False,
  'pos': 'VERB',
  'pos_prev': 'NOUN',
  'pos_next': 'NOUN',
  'prev_lemma': 'кіровоградець',
  'next_lemma': 'шахрай',
  'lemma': 'полювати'},
 {'case': False,
  'pos': 'NOUN',
  'pos_prev': 'VERB',
  'pos_next': 'CONJ',
  'prev_lemma': 'полювати',
  'next_lemma': 'та',
  'lemma': 'шахрай'},
 {'case': False,
  'pos': 'CONJ',
  'pos_prev': 'NOUN',
  'pos_next': 'NOU

In [13]:
from sklearn.feature_extraction import DictVectorizer

# Learn the feature -> column mapping from the training features and encode
# them in one step; `vectorizer` stays fitted for later transform() calls.
vectorizer = DictVectorizer()
feature_vecs = vectorizer.fit_transform(features)

In [14]:
# Slice the sparse matrix first so only the 10 previewed rows are densified,
# instead of materialising the whole matrix as a dense array.
feature_vecs[:10].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
import re

def norm_labels(raw_labels):
    """Strip the leading "B-" / "I-" style prefix from each label.

    Only a single "<char>-" pair at the very start of the label is removed,
    so entity types that themselves contain a hyphen are preserved. The
    outside label "--" is returned unchanged.

    Args:
        raw_labels: iterable of label strings such as "B-X", "I-X" or "--".

    Returns:
        A list with the prefix-free labels, in the same order.
    """
    # Anchored with ^ so hyphens further inside the label are left alone.
    return [re.sub(r'^[^\-]\-', '', label) for label in raw_labels]

In [16]:
from sklearn.linear_model import LogisticRegression 

# Multinomial logistic regression with a fixed random_state for repeatable runs.
logreg = LogisticRegression(random_state=42,  solver='lbfgs', multi_class="multinomial", max_iter=2000)

# Train on the vectorized features; norm_labels is applied to the raw training
# labels first, so the model is fitted on the normalized label strings.
logreg.fit(feature_vecs, norm_labels(train_labels))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=2000, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
# Run the test tokens through the same sentence-grouping step as the training data.
test_sents = prepare(test_tokens, test_labels)

# Only transform() is called here: the vectorizer is reused as already fitted,
# not refitted on the test features.
test_features_vec = vectorizer.transform(extract_features(test_sents))

In [18]:
forecast = logreg.predict(test_features_vec)

In [19]:
from sklearn.metrics import classification_report

# Compare the normalized gold test labels against the model's predictions.
rep = classification_report(norm_labels(test_labels), forecast)

# print() is used because the report is a pre-formatted multi-line string.
print(rep)

              precision    recall  f1-score   support

          --       0.92      1.00      0.96     69817
         ЛОК       0.67      0.19      0.30      1485
         ОРГ       0.78      0.07      0.12      2188
        ПЕРС       0.74      0.34      0.47      3998
        РІЗН       0.36      0.11      0.17       555

   micro avg       0.92      0.92      0.92     78043
   macro avg       0.70      0.34      0.40     78043
weighted avg       0.90      0.92      0.89     78043

