In [8]:
import os
from subprocess import check_call

# Numbers of clusters to try; one clustering run is made per value.
cluster_sizes = [150, 300, 600, 800, 1000, 1500, 2000, 2500, 3000]

# Word lists to cluster. The feature-building cell reads both
# `train_words-c{N}-p1.out/paths` and `test_words-c{N}-p1.out/paths`, so both
# lists need clustering output for the notebook to run from a fresh kernel.
# NOTE(review): `train_words.txt` is inferred from the output directory name
# read downstream -- confirm the actual file name.
cluster_inputs = ['train_words.txt', 'test_words.txt']

# Run Brown clustering for every (cluster size, word list) pair.
for cluster_size in cluster_sizes:
    for text_file in cluster_inputs:
        # wcluster is expected to write to `<input basename>-c<N>-p1.out/paths`;
        # skip runs whose output is already present so re-running the
        # notebook does not repeat the expensive clustering.
        paths_file = '{}-c{}-p1.out/paths'.format(
            os.path.splitext(text_file)[0], cluster_size)
        if os.path.exists(paths_file):
            continue

        check_call(['brown-cluster/wcluster', '--text', text_file, '--c', str(cluster_size), '--threads', '4'])

In [13]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn

from sklearn_crfsuite import scorers, metrics, CRF

In [19]:
# Part-of-speech tags that are kept as labels when scoring predictions.
UD_pos_tags = {
    'ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
    'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
}

def word2features(sent, pos, path_dict):
    """
    Build the feature dict for the token at index `pos` of `sent`.

    Parameters
    ----------
    sent : sequence of str
        Tokens of one sentence.
    pos : int
        Index of the token to describe.
    path_dict : dict
        Maps a token to its cluster path string (as read from a clustering
        `paths` file). Tokens without an entry get no path features.

    Returns
    -------
    dict
        Lexical features of the token and of its immediate neighbours
        (keys prefixed with '-1:' / '+1:'), cluster-path prefix features,
        and 'BOS' / 'EOS' flags at the sentence boundaries.
    """

    def path_prefix_features(token, key_prefix=''):
        # Prefixes of length 2, 4, ..., 16 of the token's cluster path.
        # A token missing from `path_dict` yields no features here instead
        # of raising KeyError.
        # NOTE(review): the upper bound is exclusive, so a prefix equal to
        # the whole path is never emitted -- confirm this is intended.
        path = path_dict.get(token, '')
        return {
            '{}path_pref_{}'.format(key_prefix, length): path[:length]
            for length in range(2, min(17, len(path)), 2)
        }

    word = sent[pos]
    features = {
        'bias': 1.0,
        'word.lower': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper': word.isupper(),
        'word.istitle': word.istitle(),
        'word.isdigit': word.isdigit(),
    }
    features.update(path_prefix_features(word))

    # Context from the previous token `sent[pos-1]`, or a sentence-start flag.
    if pos > 0:
        prev_word = sent[pos - 1]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper(),
        })
        features.update(path_prefix_features(prev_word, '-1:'))
    else:
        features['BOS'] = True

    # Context from the next token `sent[pos+1]`, or a sentence-end flag.
    if pos < len(sent) - 1:
        next_word = sent[pos + 1]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper(),
        })
        features.update(path_prefix_features(next_word, '+1:'))
    else:
        features['EOS'] = True

    return features

def sent2features(sent, path_dict):
    """
    Return one feature dict per token of `sent`, in sentence order.

    Each dict is produced by `word2features` for the corresponding position,
    using `path_dict` for the cluster-path lookups.
    """
    per_token_features = []
    for position in range(len(sent)):
        per_token_features.append(word2features(sent, position, path_dict))
    return per_token_features


import string

def _load_cluster_paths(paths_file):
    """Read a clustering `paths` file into a {token: path string} dict.

    Each usable line holds the path in its first whitespace-separated field
    and the token in its second; lines with fewer than two fields are skipped.
    """
    token_paths = {}
    with open(paths_file) as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                continue
            token_paths[fields[1]] = fields[0]
    return token_paths


def _read_conllu(conllu_path):
    """Read sentences and labels from a CoNLL-U file.

    Only lines starting with a digit are treated as token lines; the token is
    taken from the second whitespace-separated field and the tag from the
    fourth. Tokens tagged '_' are skipped. A token tagged "PUNCT" is labelled
    with its own surface form instead of the tag.

    Returns
    -------
    (sentences, labels) : two parallel lists of lists of str. Empty sentences
    are not emitted, and a final sentence is kept even if the file does not
    end with a blank line.
    """
    sentences, label_seqs = [], []
    tokens, labels = [], []

    with open(conllu_path) as f:
        for line in f:
            if not line.strip():
                # A blank line closes the current sentence.
                if tokens:
                    sentences.append(tokens)
                    label_seqs.append(labels)
                tokens, labels = [], []
                continue

            if line[0] not in string.digits:
                continue

            fields = line.split()
            form, tag = fields[1], fields[3]
            if tag == '_':
                continue

            tokens.append(form)
            labels.append(form if tag == "PUNCT" else tag)

    # Keep the last sentence when the file lacks a trailing blank line.
    if tokens:
        sentences.append(tokens)
        label_seqs.append(labels)

    return sentences, label_seqs


# The corpora do not depend on the cluster size, so parse them only once.
train_sentences, y_train = _read_conllu('UD_Kazakh/kk-train.conllu')
test_sentences, y_test = _read_conllu('UD_Kazakh/kk-test.conllu')

# Training-corpus statistics: number of kept tokens and the set of distinct forms.
corpus_size = sum(len(sentence) for sentence in train_sentences)
distinct = set(token for sentence in train_sentences for token in sentence)

for cluster_size in cluster_sizes:

    # NOTE(review): training and test features use paths from two separate
    # clustering runs; path strings from independent runs may not be
    # comparable -- confirm this is intended.
    path_dict = _load_cluster_paths('train_words-c{}-p1.out/paths'.format(cluster_size))
    path_test_dict = _load_cluster_paths('test_words-c{}-p1.out/paths'.format(cluster_size))

    X_train = [sent2features(sentence, path_dict) for sentence in train_sentences]
    X_test = [sent2features(sentence, path_test_dict) for sentence in test_sentences]

    crf = CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )

    crf.fit(X_train, y_train)
    y_pred = crf.predict(X_test)

    # Score only labels that are known POS tags or that occur (as a substring)
    # in `string.punctuation`.
    labels = [cls for cls in crf.classes_ if cls in string.punctuation or cls in UD_pos_tags]
    print(cluster_size, metrics.flat_precision_score(y_test, y_pred, average='weighted', labels=labels))


(150, 0.61556308140159333)
(300, 0.55565219534847932)
(600, 0.57907536073365085)
(800, 0.5843288523740503)
(1000, 0.53591663318130511)
(1500, 0.55070383168783266)
(2000, 0.52841687438742579)
(2500, 0.5620465969818873)
(3000, 0.57546510991673172)
