In [1]:
import pickle
import string

from nltk.tag import tnt
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn

from sklearn_crfsuite import scorers, metrics, CRF


In [2]:
def extract_train_from_klc():
    """
        Extracts training data from KLC dataset. 
        Returns list of sentences. 
        Every sentence is list of tuples where fist element is word, second is tag
    """
    data = []
    with open('klc/data1.xml') as f:
        sen = []
        w, base = None, None
        for line in f:
            if line.startswith('<w>'):
                w = line[3:-6]
            elif line.startswith('<base>'):
                base = line[6:-9]
                sen.append((w, base))
            elif line.startswith('</tokens>'):
                data.append(sen)
                sen = []

    return data


def extract_test_from_klc():
    """
        Extracts testing data from KLC dataset. 
        Returns list of sentences. 
        Every sentence is list of tuples where fist element is word, second is tag
    """
    data = []
    with open('klc/data2.xml') as f:
        sen = []
        w, base = None, None
        for line in f:
            if line.startswith('<w>'):
                w = line[3:-6]
            elif line.startswith('<base>'):
                base = line[6:-9]
                sen.append((w, base))
            elif line.startswith('</tokens>'):
                data.append(sen)
                sen = []

    return data


def get_data(data_file="train"):
    """
        Extracts training or testing data from KLC dataset. 
        Returns list of sentences. 
        Every sentence is list of tuples where fist element is word, second is tag
    """
    data = []
    with open('UD_Kazakh/kk-{}.conllu'.format(data_file)) as f:
        sen = []
        for line in f:
            if not line.strip():
                data.append(sen)
                sen = []
            if line[0] in string.digits:
                splitted = line.split()
                if splitted[3] == "PUNCT":
                    sen.append((splitted[1], splitted[1]))
                else:
                    sen.append((splitted[1], splitted[3]))
    return data


calculate accuracy for TnT tagger on UD dataset

In [3]:
tagger = tnt.TnT(N=100000)

train_ud = get_data()
test_ud = get_data(data_file="test")

tagger.train(train_ud)
tagger.evaluate(test_ud)

0.4557124518613607

calculate accuracy for TnT on KLC dataset

In [4]:
train_klc = extract_train_from_klc()
test_klc = extract_test_from_klc()
tagger.train(train_klc)
tagger.evaluate(test_klc)

0.17421602787456447

calculating accuracy of CRF on UD dataset

In [5]:
UD_pos_tags = set(['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X'])

In [6]:
def word2features(sent, pos):
    """
        function takes sentence and extracts features for word on `pos` position
    """
    word = sent[pos]
    features = {
        'bias': 1.0,
        'word.lower': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper': word.isupper(),
        'word.istitle': word.istitle(),
        'word.isdigit': word.isdigit(),
    }
                
    if pos > 0:
        word1 = sent[pos-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if pos < len(sent)-1:
        word1 = sent[pos+1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True    
    return features

def sent2features(sent):
    """
        function takes sentence and converts each word to set of features
    """
    return [word2features(sent, i) for i in range(len(sent))]

In [8]:
%%time

import string

X_train, X_test, y_train, y_test = [], [], [], []
x, y = [], []

# building training set
with open('UD_Kazakh/kk-train.conllu') as f:
    for line in f:
        if not line.strip():
            X_train.append(sent2features(x))
            y_train.append(y)
            x, y = [], []
        
        if line[0] in string.digits:
            splitted = line.split()
            
            if splitted[3] == '_':
                continue
            
            if splitted[3] != "PUNCT":
                x.append(splitted[1])
                y.append(splitted[3])
            else:
                x.append(splitted[1])
                y.append(splitted[1])

# building test set
with open('UD_Kazakh/kk-test.conllu') as f:
    for line in f:
        if not line.strip():
            X_test.append(sent2features(x))
            y_test.append(y)
            x, y = [], []
        
        if line[0] in string.digits:
            splitted = line.split()
            
            if splitted[3] == '_':
                continue
            if splitted[3] != "PUNCT":
                x.append(splitted[1])
                y.append(splitted[3])
            else:
                x.append(splitted[1])
                y.append(splitted[1])


CPU times: user 80 ms, sys: 4 ms, total: 84 ms
Wall time: 81.3 ms


In [9]:
%%time
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 9.92 s, sys: 0 ns, total: 9.92 s
Wall time: 9.91 s


In [10]:
labels = [cls for cls in crf.classes_ if cls in string.punctuation or cls in UD_pos_tags]

In [11]:
y_pred = crf.predict(X_test)

metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.65851737034768043

In [12]:
print(metrics.flat_classification_report(y_test, y_pred, digits=3, labels=labels))

             precision    recall  f1-score   support

      PROPN      0.714     0.119     0.204        42
        NUM      0.739     0.773     0.756        22
       NOUN      0.603     0.850     0.705       200
        AUX      0.750     0.533     0.623        45
          .      0.961     1.000     0.980        49
       VERB      0.618     0.711     0.661       114
          ,      1.000     1.000     1.000        57
       PRON      0.833     0.484     0.612        31
          ?      1.000     1.000     1.000         3
        DET      0.429     0.545     0.480        11
        ADJ      0.528     0.418     0.467        67
          -      1.000     1.000     1.000         4
        ADP      0.818     0.692     0.750        13
        ADV      0.545     0.522     0.533        23
          X      0.200     0.111     0.143         9
      CCONJ      0.333     0.375     0.353         8
          (      1.000     1.000     1.000         5
          )      1.000     1.000     1.000   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


calculating accuracy of CRF tagger on KLC dataset

In [13]:
X_train_klc, X_test_klc, y_train_klc, y_test_klc = [], [], [], []

for sen in train_klc:
    y_train_klc.append([])
    x_train = []
    for word, tag in sen:
        x_train.append(word)
        y_train_klc[-1].append(tag)
    X_train_klc.append(sent2features(x_train))

for sen in test_klc:
    y_test_klc.append([])
    x_test = []
    for word, tag in sen:
        x_test.append(word)
        y_test_klc[-1].append(tag)
    X_test_klc.append(sent2features(x_test))
    
    

In [14]:
%%time
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train_klc, y_train_klc)

CPU times: user 460 ms, sys: 12 ms, total: 472 ms
Wall time: 467 ms


In [15]:
y_pred_klc = crf.predict(X_test_klc)

In [16]:
y_pred_klc[0], y_test_klc[0]

(['SIMU', 'ZEP', 'ZEP', 'ZEP', 'ZEP', 'ET', '.'],
 ['SIMU', 'ZEP', 'ZEP', 'ESM', 'ZEP', 'US', '.'])

In [17]:
metrics.flat_f1_score(y_test_klc, y_pred_klc, average='weighted')

0.55704292118257637