In [1]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

import io
from io import open
from conllu import parse

import warnings
warnings.filterwarnings("ignore")

In [2]:
train_data_file = open('hi-ud-train.conllu', mode = 'rt', encoding = 'utf-8')
train_data = train_data_file.read()
train_data = train_data.replace(',', '  ')
train_sents = parse(train_data, fields = ['ID', 'WORD', 'POS_TAG'])

test_data_file = open('hi-ud-test .conllu', mode = 'rt', encoding = 'utf-8')
test_data = test_data_file.read()
test_data = test_data.replace('TAG', 'POS_TAG')
test_sents = parse(test_data, fields = ['ID', 'WORD', 'POS_TAG'])

for i in range(len(train_sents[0])):
    if train_sents[0][i]['WORD']=='COMMA':
        train_sents[0][i]['WORD']=','
        train_sents[0][i]['POS_TAG']='PUNCT'
        
for i in range(len(test_sents[0])):
    if test_sents[0][i]['WORD']=='COMMA':
        test_sents[0][i]['WORD']=','
        test_sents[0][i]['POS_TAG']='PUNCT'
        
print('\n******* Training data **********\n')
for i in range(len(train_sents[0])):
    print(train_sents[0][i])

print('\n******* Test data *************\n')
for i in range(len(test_sents[0])):
    print(test_sents[0][i])


******* Training data **********

{'ID': 'ID', 'WORD': 'WORD', 'POS_TAG': 'POS_TAG'}
{'ID': '1', 'WORD': 'yaha', 'POS_TAG': 'DET'}
{'ID': '2', 'WORD': 'eSiyA', 'POS_TAG': 'PROPN'}
{'ID': '3', 'WORD': 'kI', 'POS_TAG': 'ADP'}
{'ID': '4', 'WORD': 'sabase', 'POS_TAG': 'ADV'}
{'ID': '5', 'WORD': 'badZI', 'POS_TAG': 'ADJ'}
{'ID': '6', 'WORD': 'masjixoM', 'POS_TAG': 'NOUN'}
{'ID': '7', 'WORD': 'meM', 'POS_TAG': 'ADP'}
{'ID': '8', 'WORD': 'se', 'POS_TAG': 'ADP'}
{'ID': '9', 'WORD': 'eka', 'POS_TAG': 'NUM'}
{'ID': '10', 'WORD': 'hE', 'POS_TAG': 'AUX'}
{'ID': '11', 'WORD': '.', 'POS_TAG': 'PUNCT'}
{'ID': '1', 'WORD': 'ise', 'POS_TAG': 'PRON'}
{'ID': '2', 'WORD': 'navAba', 'POS_TAG': 'NOUN'}
{'ID': '3', 'WORD': 'SAhajehana', 'POS_TAG': 'PROPN'}
{'ID': '4', 'WORD': 'ne', 'POS_TAG': 'ADP'}
{'ID': '5', 'WORD': 'banavAyA', 'POS_TAG': 'VERB'}
{'ID': '6', 'WORD': 'WA', 'POS_TAG': 'AUX'}
{'ID': '7', 'WORD': '.', 'POS_TAG': 'PUNCT'}
{'ID': '1', 'WORD': 'isakA', 'POS_TAG': 'PRON'}
{'ID': '2', 'WORD': 'pr

{'ID': '11', 'WORD': 'jIwA', 'POS_TAG': 'VERB'}
{'ID': '12', 'WORD': '.', 'POS_TAG': 'PUNCT'}
{'ID': '1', 'WORD': 'hamIra', 'POS_TAG': 'PROPN'}
{'ID': '2', 'WORD': 'kI', 'POS_TAG': 'ADP'}
{'ID': '3', 'WORD': 'mOwa', 'POS_TAG': 'NOUN'}
{'ID': '4', 'WORD': 'ke', 'POS_TAG': 'ADP'}
{'ID': '5', 'WORD': 'bAxa', 'POS_TAG': 'ADP'}
{'ID': '6', 'WORD': 'cOhAnoM', 'POS_TAG': 'NOUN'}
{'ID': '7', 'WORD': 'kA', 'POS_TAG': 'ADP'}
{'ID': '8', 'WORD': 'rAja', 'POS_TAG': 'NOUN'}
{'ID': '9', 'WORD': 'Kawma', 'POS_TAG': 'ADJ'}
{'ID': '10', 'WORD': 'ho', 'POS_TAG': 'VERB'}
{'ID': '11', 'WORD': 'gayA', 'POS_TAG': 'AUX'}
{'ID': '12', 'WORD': '.', 'POS_TAG': 'PUNCT'}
{'ID': '1', 'WORD': 'muslima', 'POS_TAG': 'ADJ'}
{'ID': '2', 'WORD': 'vijewAoM', 'POS_TAG': 'NOUN'}
{'ID': '3', 'WORD': 'ne', 'POS_TAG': 'ADP'}
{'ID': '4', 'WORD': 'kile', 'POS_TAG': 'NOUN'}
{'ID': '5', 'WORD': 'kI', 'POS_TAG': 'ADP'}
{'ID': '6', 'WORD': 'majabUwa', 'POS_TAG': 'ADJ'}
{'ID': '7', 'WORD': 'xIvAra', 'POS_TAG': 'NOUN'}
{'ID': '8', 'W

{'ID': '16', 'WORD': '.', 'POS_TAG': 'PUNCT'}
{'ID': '1', 'WORD': 'yahAz', 'POS_TAG': 'PRON'}
{'ID': '2', 'WORD': 'botiMga', 'POS_TAG': 'NOUN'}
{'ID': '3', 'WORD': 'karawe', 'POS_TAG': 'VERB'}
{'ID': '4', 'WORD': 'hue', 'POS_TAG': 'AUX'}
{'ID': '5', 'WORD': 'Apa', 'POS_TAG': 'PRON'}
{'ID': '6', 'WORD': 'prakqwi', 'POS_TAG': 'NOUN'}
{'ID': '7', 'WORD': 'ke', 'POS_TAG': 'ADP'}
{'ID': '8', 'WORD': 'suMxara', 'POS_TAG': 'ADJ'}
{'ID': '9', 'WORD': 'najAroM', 'POS_TAG': 'NOUN'}
{'ID': '10', 'WORD': 'ke', 'POS_TAG': 'ADP'}
{'ID': '11', 'WORD': 'sAWa', 'POS_TAG': 'ADP'}
{'ID': '12', 'WORD': '-', 'POS_TAG': 'PUNCT'}
{'ID': '13', 'WORD': 'sAWa', 'POS_TAG': 'NOUN'}
{'ID': '14', 'WORD': 'perisa', 'POS_TAG': 'PROPN'}
{'ID': '15', 'WORD': 'ke', 'POS_TAG': 'ADP'}
{'ID': '16', 'WORD': 'awIwa', 'POS_TAG': 'NOUN'}
{'ID': '17', 'WORD': 'se', 'POS_TAG': 'ADP'}
{'ID': '18', 'WORD': 'BI', 'POS_TAG': 'PART'}
{'ID': '19', 'WORD': 'sAkRAwkAra', 'POS_TAG': 'NOUN'}
{'ID': '20', 'WORD': 'kara', 'POS_TAG': 'VERB'}

{'ID': '14', 'WORD': 'hisse', 'POS_TAG': 'NOUN'}
{'ID': '15', 'WORD': 'se', 'POS_TAG': 'ADP'}
{'ID': '16', 'WORD': 'yahAz', 'POS_TAG': 'PRON'}
{'ID': '17', 'WORD': 'ke', 'POS_TAG': 'ADP'}
{'ID': '18', 'WORD': 'lie', 'POS_TAG': 'ADP'}
{'ID': '19', 'WORD': 'rela', 'POS_TAG': 'NOUN'}
{'ID': '20', 'WORD': 'yA', 'POS_TAG': 'CCONJ'}
{'ID': '21', 'WORD': 'basa', 'POS_TAG': 'NOUN'}
{'ID': '22', 'WORD': 'suviXA', 'POS_TAG': 'NOUN'}
{'ID': '23', 'WORD': 'le', 'POS_TAG': 'VERB'}
{'ID': '24', 'WORD': 'sakawe', 'POS_TAG': 'AUX'}
{'ID': '25', 'WORD': 'hEM', 'POS_TAG': 'AUX'}
{'ID': '26', 'WORD': '.', 'POS_TAG': 'PUNCT'}
{'ID': '1', 'WORD': 'kahAz', 'POS_TAG': 'PRON'}
{'ID': '2', 'WORD': 'Tahare', 'POS_TAG': 'VERB'}
{'ID': '3', 'WORD': '?', 'POS_TAG': 'PUNCT'}
{'ID': '1', 'WORD': 'yahAz', 'POS_TAG': 'PRON'}
{'ID': '2', 'WORD': 'POYresta', 'POS_TAG': 'NOUN'}
{'ID': '3', 'WORD': 'ke', 'POS_TAG': 'ADP'}
{'ID': '4', 'WORD': 'hotala', 'POS_TAG': 'NOUN'}
{'ID': '5', 'WORD': 'meM', 'POS_TAG': 'ADP'}
{'ID': 

{'ID': '3', 'WORD': 'se', 'POS_TAG': 'ADP'}
{'ID': '4', 'WORD': 'banI', 'POS_TAG': 'VERB'}
{'ID': '5', 'WORD': 'jAmA', 'POS_TAG': 'PROPN'}
{'ID': '6', 'WORD': 'masjixa', 'POS_TAG': 'PROPN'}
{'ID': '7', 'WORD': 'SAhajahAz', 'POS_TAG': 'PROPN'}
{'ID': '8', 'WORD': 'kI', 'POS_TAG': 'ADP'}
{'ID': '9', 'WORD': 'betI', 'POS_TAG': 'NOUN'}
{'ID': '10', 'WORD': 'jahAzArA', 'POS_TAG': 'PROPN'}
{'ID': '11', 'WORD': 'begama', 'POS_TAG': 'PROPN'}
{'ID': '12', 'WORD': 'kI', 'POS_TAG': 'ADP'}
{'ID': '13', 'WORD': 'yAxa', 'POS_TAG': 'NOUN'}
{'ID': '14', 'WORD': 'meM', 'POS_TAG': 'ADP'}
{'ID': '15', 'WORD': 'san', 'POS_TAG': 'NOUN'}
{'ID': '16', 'WORD': '1648', 'POS_TAG': 'PROPN'}
{'ID': '17', 'WORD': 'I.', 'POS_TAG': 'NOUN'}
{'ID': '18', 'WORD': 'meM', 'POS_TAG': 'ADP'}
{'ID': '19', 'WORD': 'banAI', 'POS_TAG': 'VERB'}
{'ID': '20', 'WORD': 'gaI', 'POS_TAG': 'AUX'}
{'ID': '21', 'WORD': 'WI', 'POS_TAG': 'AUX'}
{'ID': '22', 'WORD': '.', 'POS_TAG': 'PUNCT'}
{'ID': '1', 'WORD': 'wAjamahala', 'POS_TAG': 'PRO

{'ID': '10', 'WORD': ',', 'POS_TAG': 'PUNCT'}
{'ID': '11', 'WORD': 'vaha', 'POS_TAG': 'DET'}
{'ID': '12', 'WORD': 'Cavi', 'POS_TAG': 'NOUN'}
{'ID': '13', 'WORD': 'wAlAba', 'POS_TAG': 'NOUN'}
{'ID': '14', 'WORD': 'ke', 'POS_TAG': 'ADP'}
{'ID': '15', 'WORD': 'sAmane', 'POS_TAG': 'ADP'}
{'ID': '16', 'WORD': 'sWiwa', 'POS_TAG': 'ADJ'}
{'ID': '17', 'WORD': 'xumaMjilA', 'POS_TAG': 'ADJ'}
{'ID': '18', 'WORD': 'barAmaxe', 'POS_TAG': 'NOUN'}
{'ID': '19', 'WORD': 'meM', 'POS_TAG': 'ADP'}
{'ID': '20', 'WORD': 'lagAe', 'POS_TAG': 'VERB'}
{'ID': '21', 'WORD': 'gae', 'POS_TAG': 'AUX'}
{'ID': '22', 'WORD': 'AIne', 'POS_TAG': 'NOUN'}
{'ID': '23', 'WORD': 'meM', 'POS_TAG': 'ADP'}
{'ID': '24', 'WORD': 'alAuxxIna', 'POS_TAG': 'PROPN'}
{'ID': '25', 'WORD': 'ne', 'POS_TAG': 'ADP'}
{'ID': '26', 'WORD': 'xeKI', 'POS_TAG': 'VERB'}
{'ID': '27', 'WORD': 'Ora', 'POS_TAG': 'CCONJ'}
{'ID': '28', 'WORD': 'vaha', 'POS_TAG': 'PRON'}
{'ID': '29', 'WORD': 'apanI', 'POS_TAG': 'PRON'}
{'ID': '30', 'WORD': 'senAoM', 'POS_

{'ID': '6', 'WORD': 'parawa', 'POS_TAG': 'NOUN'}
{'ID': '7', 'WORD': 'caDZe', 'POS_TAG': 'VERB'}
{'ID': '8', 'WORD': 'cEwya', 'POS_TAG': 'NOUN'}
{'ID': '9', 'WORD': 'ke', 'POS_TAG': 'ADP'}
{'ID': '10', 'WORD': 'sAWa', 'POS_TAG': 'ADP'}
{'ID': '11', 'WORD': 'Poto', 'POS_TAG': 'NOUN'}
{'ID': '12', 'WORD': 'KIMcanA', 'POS_TAG': 'VERB'}
{'ID': '13', 'WORD': 'cAhawe', 'POS_TAG': 'VERB'}
{'ID': '14', 'WORD': 'hEM', 'POS_TAG': 'AUX'}
{'ID': '15', 'WORD': '.', 'POS_TAG': 'PUNCT'}
{'ID': '1', 'WORD': 'kuSInagara', 'POS_TAG': 'PROPN'}
{'ID': '2', 'WORD': 'ke', 'POS_TAG': 'ADP'}
{'ID': '3', 'WORD': 'viswAra', 'POS_TAG': 'NOUN'}
{'ID': '4', 'WORD': 'ke', 'POS_TAG': 'ADP'}
{'ID': '5', 'WORD': 'sAWa', 'POS_TAG': 'ADP'}
{'ID': '6', 'WORD': 'hI', 'POS_TAG': 'PART'}
{'ID': '7', 'WORD': 'yahAz', 'POS_TAG': 'PRON'}
{'ID': '8', 'WORD': 'para', 'POS_TAG': 'ADP'}
{'ID': '9', 'WORD': 'sabase', 'POS_TAG': 'ADV'}
{'ID': '10', 'WORD': 'aXika', 'POS_TAG': 'DET'}
{'ID': '11', 'WORD': 'banAe', 'POS_TAG': 'VERB'}
{

In [3]:
all_train_sent = []
all_train_tags = []
temp_sent_train = []
temp_tags_train = []
for i in range(len(train_sents[0])):
    if train_sents[0][i]['ID']=='1':
        all_train_sent.append(temp_sent_train)
        all_train_tags.append(temp_tags_train)
        temp_sent_train = []
        temp_tags_train = []
        temp_sent_train.append(train_sents[0][i]['WORD'])
        temp_tags_train.append(train_sents[0][i]['POS_TAG'])
    else:
        temp_sent_train.append(train_sents[0][i]['WORD'])
        temp_tags_train.append(train_sents[0][i]['POS_TAG'])
    
all_train_sent = [ele for ele in all_train_sent if ele != ['WORD']] 
all_train_tags = [ele for ele in all_train_tags if ele != ['POS_TAG']]

all_test_sent = []
all_test_tags = []
temp_sent_test = []
temp_tags_test = []
for i in range(len(test_sents[0])):
    if test_sents[0][i]['ID']=='1':
        all_test_sent.append(temp_sent_test)
        all_test_tags.append(temp_tags_test)
        temp_sent_test = []
        temp_tags_test = []
        temp_sent_test.append(test_sents[0][i]['WORD'])
        temp_tags_test.append(test_sents[0][i]['POS_TAG'])
    else:
        temp_sent_test.append(test_sents[0][i]['WORD'])
        temp_tags_test.append(test_sents[0][i]['POS_TAG'])
    
all_test_sent = [ele for ele in all_test_sent if ele != ['WORD']] 
all_test_tags = [ele for ele in all_test_tags if ele != ['POS_TAG']]

train = []
test = []
for i in range(len(all_train_sent)):
    temp_list_train = []
    for j in range(len(all_train_sent[i])):
        temp_list_train.append((all_train_sent[i][j], all_train_tags[i][j]))
    train.append(temp_list_train)
    
for i in range(len(all_test_sent)):
    temp_list_test = []
    for j in range(len(all_test_sent[i])):
        temp_list_test.append((all_test_sent[i][j], all_test_tags[i][j]))
    test.append(temp_list_test)

print('\nTrain\n')
print(train)
print('\nTest\n')
print(test)


Train

[[('yaha', 'DET'), ('eSiyA', 'PROPN'), ('kI', 'ADP'), ('sabase', 'ADV'), ('badZI', 'ADJ'), ('masjixoM', 'NOUN'), ('meM', 'ADP'), ('se', 'ADP'), ('eka', 'NUM'), ('hE', 'AUX'), ('.', 'PUNCT')], [('ise', 'PRON'), ('navAba', 'NOUN'), ('SAhajehana', 'PROPN'), ('ne', 'ADP'), ('banavAyA', 'VERB'), ('WA', 'AUX'), ('.', 'PUNCT')], [('isakA', 'PRON'), ('praveSa', 'NOUN'), ('xvAra', 'NOUN'), ('xo', 'NUM'), ('maMjilA', 'ADJ'), ('hE', 'AUX'), ('.', 'PUNCT')], [('jisameM', 'PRON'), ('cAra', 'NUM'), ('meharAbeM', 'NOUN'), ('hEM', 'AUX'), ('Ora', 'CCONJ'), ('muKya', 'ADJ'), ('prArWanA', 'NOUN'), ('hOYla', 'NOUN'), ('meM', 'ADP'), ('jAne', 'VERB'), ('ke', 'ADP'), ('lie', 'ADP'), ('9', 'NUM'), ('praveSa', 'NOUN'), ('xvAra', 'NOUN'), ('hEM', 'VERB'), ('.', 'PUNCT')], [('pUrI', 'ADJ'), ('imArawa', 'NOUN'), ('behaxa', 'ADV'), ('KUbasUrawa', 'ADJ'), ('hE', 'AUX'), ('.', 'PUNCT')], [('yahAz', 'PRON'), ('lagane', 'VERB'), ('vAlA', 'ADP'), ('wIna', 'NUM'), ('xina', 'NOUN'), ('kA', 'ADP'), ('ijwimA', 'N

In [4]:
def case_of_chars(s):
    l,u = 0,0
    for i in s: 
        if (i>='a'and i<='z'): 
            l=l+1                  #counting lower case 
        if (i>='A'and i<='Z'): 
            u=u+1                  #counting upper case 
    return l, u

In [5]:
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'length': len(word),
        'no. of lower case chars': case_of_chars(word)[0],
        'no. of upper case chars': case_of_chars(word)[1],
        'word[-1:]': word[-1:],
        'word[-2:]': word[-2:],
        'word[-3:]': word[-3:],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[:1]': word[:1],
        'word.isalnum': word.isalnum(),
        'word.islower()': word.islower(),
        'word.isupper()': word.isupper(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word': word1,
            '-1:length' : len(word1),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word': word1,
            '+1:length': len(word1),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [postag for token, postag in sent]

## Features used : 
1) current word's length : since longer words usually turn out to be proper nouns and shorter words are usually conjunctions or determinants
<br>
2) no. of upper and lower case chars : the letters that are capitalised are usually pronounced differently; for example, A is pronounced as aa and a is pronounced as a, so it brings the pronounciation into account
<br>
3) suffix : the suffix might decide if a word is a verb
<br>
4) prefix : the prefix might decide if a word is a noun
<br>
5) if word is alphanumeric : if it's not alpha numeric, then the word automatically becomes a punctuation
<br>
6) case of the word : upper case words are usually proper nouns or exclamations
<br>
7) numbers : will decide if the word gets a POS tag apart from X
<br>
8) previous word : previous word will contribute to subject verb agreement and influence the POS tag of current word
<br>
9) previous word length : length will help in determining it's POS which might direct current word's POS due to subject verb agreement
<br>
10) next word : will contribute to subject verb agreement and influence current word's POS
<br>
11) next word's length : will aid in determining current word's POS through it's own POS tag
<br>

In [6]:
X_train = [sent2features(s) for s in train]
y_train = [sent2labels(s) for s in train]

X_test = [sent2features(s) for s in test]
y_test = [sent2labels(s) for s in test]

In [7]:
labels = []
for i in range(len(y_train)):
    for j in range(len(y_train[i])):
        if y_train[i][j] not in labels:
            labels.append(y_train[i][j])
labels

['DET',
 'PROPN',
 'ADP',
 'ADV',
 'ADJ',
 'NOUN',
 'NUM',
 'AUX',
 'PUNCT',
 'PRON',
 'VERB',
 'CCONJ',
 'PART',
 'SCONJ',
 'X']

In [8]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   25.1s finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001C09ABFCEB0>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001C09ABF57C0>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['DET', 'PROPN', 'ADP', 'ADV', 'ADJ', 'NOUN', 'NUM', 'AUX', 'PUNCT', 'PRON', 'VERB', 'CCONJ', 'PART', 'SCONJ', 'X']),
                   verbose=1)

In [9]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.017577430869692427, 'c2': 0.026644033992842625}
best CV score: 0.8543221649215033
model size: 0.39M


In [10]:
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

crf = rs.best_estimator_

y_pred_test = crf.predict(X_test)
print('\n For test set :\n')
print(metrics.flat_classification_report(y_test, y_pred_test, labels=sorted_labels, digits=3))

y_pred_train = crf.predict(X_train)
print('\n For training set :\n')
print(metrics.flat_classification_report(y_train, y_pred_train, labels=sorted_labels, digits=3))


 For test set :

              precision    recall  f1-score   support

           X      0.000     0.000     0.000         0
        PART      0.969     0.939     0.954        33
       CCONJ      1.000     1.000     1.000        25
       SCONJ      0.667     0.667     0.667         3
         ADJ      0.650     0.691     0.670        94
         ADP      0.967     0.970     0.969       303
         ADV      0.692     0.429     0.529        21
        VERB      0.878     0.869     0.873        99
         DET      0.821     0.889     0.853        36
        NOUN      0.792     0.846     0.818       324
        PRON      0.848     0.862     0.855        65
       PROPN      0.598     0.549     0.572       144
         NUM      0.889     0.960     0.923        25
       PUNCT      1.000     0.910     0.953       134
         AUX      0.956     0.949     0.953       138

   micro avg      0.852     0.852     0.852      1444
   macro avg      0.782     0.769     0.773      1444
weighted

In [12]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(10))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-10:])

Top likely transitions:
VERB   -> AUX     4.340922
PROPN  -> PROPN   2.849597
PART   -> NUM     2.450484
AUX    -> AUX     2.326650
NUM    -> NOUN    2.230064
PRON   -> ADP     2.202250
ADJ    -> NOUN    2.148174
DET    -> NOUN    1.863635
DET    -> PART    1.685775
NOUN   -> ADP     1.536293

Top unlikely transitions:
ADV    -> AUX     -1.946852
PROPN  -> PART    -2.044627
PROPN  -> DET     -2.100459
NUM    -> PROPN   -2.140155
CCONJ  -> AUX     -2.151830
CCONJ  -> PART    -2.219615
PROPN  -> AUX     -2.664885
DET    -> ADP     -3.285730
ADJ    -> PRON    -3.711961
ADJ    -> ADP     -4.240564
