In [1]:
#!/usr/bin/env python3

This file illustrates how you might experiment with the HMM interface.
You can paste these commands in at the Python prompt, or execute `test_en.py` directly.
A notebook interface is nicer than the plain Python prompt, so we provide
a notebook version of this file as `test_en.ipynb`, which you can open with
`jupyter` or with Visual Studio `code` (run it with the `nlp-class` kernel).

In [2]:
import logging
import math
import os
from pathlib import Path

In [3]:
import torch

In [4]:
from corpus import TaggedCorpus
from eval import eval_tagging, model_cross_entropy, viterbi_error_rate
from hmm import HiddenMarkovModel, EnhancedHMM
from crf import ConditionalRandomField

Set up logging.

In [5]:
logging.root.setLevel(level=logging.INFO)
log = logging.getLogger("test_en")       # For usage, see findsim.py in earlier assignment.
logging.basicConfig(format="%(levelname)s : %(message)s", level=logging.INFO)  # could change INFO to DEBUG
# torch.autograd.set_detect_anomaly(True)    # uncomment to improve error messages from .backward(), but slows down

Switch working directory to the directory where the data live.  You may need to edit this line.

In [6]:
os.chdir("../data")

In [7]:
entrain = TaggedCorpus(Path("ensup"), Path("enraw"))                               # mixed training
ensup =   TaggedCorpus(Path("ensup"), tagset=entrain.tagset, vocab=entrain.vocab)  # supervised training
endev =   TaggedCorpus(Path("endev"), tagset=entrain.tagset, vocab=entrain.vocab)  # evaluation
print(f"{len(entrain)=}  {len(ensup)=}  {len(endev)=}")

INFO : Read 191873 tokens from ensup, enraw
INFO : Created 26 tag types
INFO : Created 18461 word types


len(entrain)=8064  len(ensup)=4051  len(endev)=996


In [8]:
known_vocab = TaggedCorpus(Path("ensup")).vocab    # words seen with supervised tags; used in evaluation
log.info(f"Tagset: f{list(entrain.tagset)}")

INFO : Read 95936 tokens from ensup
INFO : Created 26 tag types
INFO : Created 12466 word types
INFO : Tagset: f['W', 'J', 'N', 'C', 'V', 'I', 'D', ',', 'M', 'P', '.', 'E', 'R', '`', "'", 'T', '$', ':', '-', '#', 'S', 'F', 'U', 'L', '_EOS_TAG_', '_BOS_TAG_']


Make an HMM.  Let's do some pre-training to approximately maximize the
regularized log-likelihood on supervised training data.  In other words, the
probabilities at the M step will just be supervised count ratios.

On each epoch, you will see two progress bars: first it collects counts from
all the sentences (E step), and then after the M step, it evaluates the loss
function, which is the (unregularized) cross-entropy on the training set.

The parameters don't actually matter during the E step because there are no
hidden tags to impute.  The first M step will jump right to the optimal
solution.  The code will try a second epoch with the revised parameters, but
the result will be identical, so it will detect convergence and stop.

We arbitrarily choose λ=1 for our add-λ smoothing at the M step, but it would
be better to search for the best value of this hyperparameter.

In [9]:
log.info("*** Hidden Markov Model (HMM)")
hmm = HiddenMarkovModel(entrain.tagset, entrain.vocab)  # randomly initialized parameters  
loss_sup = lambda model: model_cross_entropy(model, eval_corpus=ensup)
hmm.train(corpus=ensup, loss=loss_sup, λ=1.0,
          save_path="en_hmm.pkl") 

INFO : *** Hidden Markov Model (HMM)
100%|██████████| 4051/4051 [00:14<00:00, 276.09it/s]
INFO : Cross-entropy: 9.6013 nats (= perplexity 14783.735)
100%|██████████| 4051/4051 [00:31<00:00, 127.99it/s]
INFO : Saving model to en_hmm.pkl
INFO : Saved model to en_hmm.pkl
100%|██████████| 4051/4051 [00:10<00:00, 400.84it/s]
INFO : Cross-entropy: 7.1759 nats (= perplexity 1307.532)
100%|██████████| 4051/4051 [00:33<00:00, 119.50it/s]
INFO : Saving model to en_hmm.pkl
INFO : Saved model to en_hmm.pkl
100%|██████████| 4051/4051 [00:19<00:00, 204.66it/s]
INFO : Cross-entropy: 7.1759 nats (= perplexity 1307.532)
INFO : Saving model to en_hmm.pkl
INFO : Saved model to en_hmm.pkl


log.info("*** Already Run, skip this cell***")

pass

log.info("*** Enhanced Hidden Markov Model (HMM) with hyperparameter optimization")

### grid for opti
param_grid = {
    'λ': [0.01, 0.1, 0.5, 1.0],
    'supervised_constraint': [True],  # Always use constraints with supervised data
    'better_smoothing': [True, False]
}

### hyper param search
def evaluate_params(params):
    model = EnhancedHMM(
        tagset=ensup.tagset, 
        vocab=ensup.vocab,
        unigram=False,
        supervised_constraint=params['supervised_constraint'],
        better_smoothing=params['better_smoothing']
    )
    
    loss_sup = lambda model: model_cross_entropy(model, eval_corpus=ensup)
    model.train(
        corpus=ensup,
        loss=loss_sup,
        λ=params['λ'],
        max_steps=50 
    )
    
    return model_cross_entropy(model, ensup), params

#### trying  all parameter combinations
from itertools import product
param_combinations = [dict(zip(param_grid.keys(), v)) 
                     for v in product(*param_grid.values())]

results = []
for params in param_combinations:
    try:
        loss, params = evaluate_params(params)
        results.append((loss, params))
        log.info(f"Parameters: {params}, Loss: {loss:.4f}")
    except Exception as e:
        log.warning(f"Failed for params {params}: {str(e)}")

#### Find best parameters
best_loss, best_params = min(results, key=lambda x: x[0])
log.info(f"Best parameters found: {best_params} with loss: {best_loss:.4f}")


In [10]:
log.info("*** Enhanced Hidden Markov Model (HMM)")
# initialize with optimal settings
enhancedhmm = EnhancedHMM(
    entrain.tagset, 
    entrain.vocab,
    unigram=False,
    supervised_constraint=True,
    better_smoothing=True
)

# setup loss and train
loss_sup = lambda model: model_cross_entropy(model, eval_corpus=ensup)
enhancedhmm.train(
    corpus=ensup, 
    loss=loss_sup, 
    λ=0.01,
    save_path="en_hmm_awesome.pkl"
)

# test different decoders
log.info("Testing decoders...")
for decoder in ['viterbi', 'posterior', 'hybrid']:
    log.info(f"\nTesting {decoder} decoder:")
    for sentence in ensup:  # or use a smaller test set
        tagged = enhancedhmm.decode(sentence, ensup, method=decoder)
        # print or evaluate results
        print(f"{decoder}: {tagged}")

INFO : *** Enhanced Hidden Markov Model (HMM)
100%|██████████| 4051/4051 [00:09<00:00, 416.91it/s]
INFO : Cross-entropy: 9.5974 nats (= perplexity 14725.800)
100%|██████████| 4051/4051 [00:25<00:00, 161.34it/s]
INFO : Saving model to en_hmm_awesome.pkl
INFO : Saved model to en_hmm_awesome.pkl
100%|██████████| 4051/4051 [00:08<00:00, 503.29it/s]
INFO : Cross-entropy: 6.3784 nats (= perplexity 588.983)
100%|██████████| 4051/4051 [00:25<00:00, 159.55it/s]
INFO : Saving model to en_hmm_awesome.pkl
INFO : Saved model to en_hmm_awesome.pkl
100%|██████████| 4051/4051 [00:07<00:00, 511.58it/s]
INFO : Cross-entropy: 6.3784 nats (= perplexity 588.983)
INFO : Saving model to en_hmm_awesome.pkl
INFO : Saved model to en_hmm_awesome.pkl
INFO : Testing decoders...
INFO : 
Testing viterbi decoder:


viterbi: When/W such/J claims/N and/C litigation/N extend/V beyond/I the/D period/N ,/, the/D syndicates/N can/M extend/V their/P accounting/V deadlines/N ./.
viterbi: Lloyd/N 's/P said/V there/E are/V currently/R 115/C open/J account/N years/N involving/V 68/C of/I the/D market/N 's/P roughly/R 360/C syndicates/N ./.
viterbi: The/D open-year/J accounting/N practice/N ``/` is/V widely/R recognized/V within/I Lloyd/N 's/P as/R of/I serious/J concern/N ''/' to/T the/D 31,329/C member/N investors/N ,/, who/W underwrite/V insurance/N at/I Lloyd/N 's/V in/I return/N for/I premium/N and/C investment/N income/N ,/, Lloyd/N 's/P said/V ./.
viterbi: The/D procedure/N causes/V ``/` great/J uncertainty/N ''/' because/I an/D investor/N ca/M n't/R be/V sure/J of/I his/P or/C her/P individual/J liability/N ,/, Lloyd/N 's/P said/V ./.
viterbi: As/I a/D result/N ,/, the/D insurance/N market/N plans/V new/J measures/N to/T restrict/V the/D ability/N of/I syndicate/N officials/N to/T leave/V years/N ope

INFO : 
Testing posterior decoder:


viterbi: So/R ,/, following/V both/C the/D style/N he/P pursued/V as/I President/N Ford/N 's/P national/J security/N adviser/N and/C the/D recommendations/N of/I the/D Tower/N Commission/N ,/, Gen./N Scowcroft/N has/V pruned/V the/D NSC/N staff/N and/C tried/V to/T ensure/V that/I it/P sticks/V to/T its/P assigned/J tasks/N --/: namely/R ,/, gathering/V the/D views/N of/I the/D State/N Department/N ,/, Pentagon/N and/C intelligence/N community/N ;/: serving/V as/I an/D honest/J broker/N in/I distilling/V that/I information/N for/I the/D president/N and/C then/R making/V sure/J presidential/J decisions/N are/V carried/V out/R ./.
viterbi: The/D Tower/N Commission/N specifically/R said/V that/I the/D NSC/N staff/N should/M be/V ``/` small/J ''/' and/C warned/V against/I letting/V ``/` energetic/J self-starters/N ''/' like/I Lt./N Col./N Oliver/N North/N strike/V out/R on/I their/P own/V rather/R than/I leaving/V the/D day-to-day/J execution/N of/I policies/N to/T the/D State/N Department

INFO : 
Testing hybrid decoder:


posterior: The/D National/N Security/N Council/N itself/P was/V established/V in/I 1947/C because/I policy/N makers/N sensed/V a/D need/N ,/, in/I an/D increasingly/R complex/J world/N ,/, for/I a/D formal/J system/N within/I the/D White/N House/N to/T make/V sure/J that/I communications/N flowed/V smoothly/R between/I the/D president/N and/C the/D State/N Department/N ,/, Pentagon/N and/C intelligence/N agencies/N ./.
posterior: By/I law/N ,/, the/D council/N includes/V the/D president/N ,/, vice/N president/N and/C secretaries/N of/I state/N and/C defense/N ./.
posterior: In/I practice/N ,/, the/D director/N of/I central/J intelligence/N and/C chairman/N of/I the/D Joint/N Chiefs/N of/I Staff/N also/R serve/V as/I unofficial/J members/N ./.
posterior: But/C the/D size/N ,/, shape/V and/C role/N of/I the/D NSC/N staff/N have/V been/V left/V for/I each/D president/N and/C his/P national/J security/N adviser/N to/T decide/V ./.
posterior: That/D task/N is/V one/C of/I Washington/N 's/P 

Now let's throw in the unsupervised training data as well, and continue
training as before, in order to increase the regularized log-likelihood on
this larger, semi-supervised training set.  It's now the *incomplete-data*
log-likelihood.

This time, we'll use a different evaluation loss function: we'll stop when the
*tagging error rate* on a held-out dev set stops getting better.  Also, the
implementation of this loss function (`viterbi_error_rate`) includes a helpful
side effect: it logs the *cross-entropy* on the held-out dataset as well, just
for your information.

We hope that held-out tagging accuracy will go up for a little bit before it
goes down again (see Merialdo 1994). (Log-likelihood on training data will
continue to improve, and that improvement may generalize to held-out
cross-entropy.  But getting accuracy to increase is harder.)

In [11]:
enhancedhmm = EnhancedHMM.load("en_hmm_awesome.pkl")  # reset to supervised model (in case you're re-executing this bit)
loss_dev = lambda model: viterbi_error_rate(model, eval_corpus=endev, 
                                            known_vocab=known_vocab)
enhancedhmm.train(
    corpus=entrain, 
    loss=loss_sup, 
    λ=0.01,
    save_path="en_hmm_awesome_raw.pkl"
)

INFO : Loaded model from en_hmm_awesome.pkl
100%|██████████| 4051/4051 [00:13<00:00, 301.50it/s]
INFO : Cross-entropy: 6.3784 nats (= perplexity 588.983)
100%|██████████| 8064/8064 [02:46<00:00, 48.44it/s] 
INFO : Saving model to en_hmm_awesome_raw.pkl
INFO : Saved model to en_hmm_awesome_raw.pkl
100%|██████████| 4051/4051 [00:11<00:00, 354.86it/s]
INFO : Cross-entropy: 6.4131 nats (= perplexity 609.796)
INFO : Saving model to en_hmm_awesome_raw.pkl
INFO : Saved model to en_hmm_awesome_raw.pkl


In [12]:
hmm = HiddenMarkovModel.load("en_hmm.pkl")  # reset to supervised model (in case you're re-executing this bit)
loss_dev = lambda model: viterbi_error_rate(model, eval_corpus=endev, 
                                            known_vocab=known_vocab)
hmm.train(corpus=entrain, loss=loss_dev, λ=1.0,
          save_path="en_hmm_raw.pkl")


INFO : Loaded model from en_hmm.pkl
100%|██████████| 996/996 [00:03<00:00, 299.89it/s]
INFO : Cross-entropy: 7.2436 nats (= perplexity 1399.125)
100%|██████████| 996/996 [00:04<00:00, 213.14it/s]
INFO : Tagging accuracy: all: 88.663%, known: 93.059%, seen: 44.108%, novel: 42.734%
100%|██████████| 8064/8064 [02:32<00:00, 52.78it/s] 
INFO : Saving model to en_hmm_raw.pkl
INFO : Saved model to en_hmm_raw.pkl
100%|██████████| 996/996 [00:02<00:00, 374.40it/s]
INFO : Cross-entropy: 6.9584 nats (= perplexity 1051.928)
100%|██████████| 996/996 [00:04<00:00, 205.28it/s]
INFO : Tagging accuracy: all: 87.031%, known: 91.397%, seen: 45.791%, novel: 40.225%
INFO : Saving model to en_hmm_raw.pkl
INFO : Saved model to en_hmm_raw.pkl


In [13]:
log.info("*** mixed model")
enhancedhmm.printAB()

INFO : *** mixed model


Transition matrix A:
	W	J	N	C	V	I	D	,	M	P	.	E	R	`	'	T	$	:	-	#	S	F	U	L	_EOS_TAG_	_BOS_TAG_
W	0.000	0.033	0.101	0.008	0.459	0.012	0.087	0.007	0.090	0.138	0.001	0.006	0.046	0.000	0.000	0.011	0.001	0.000	0.001	0.000	0.000	0.000	0.000	0.000	0.000	0.000
J	0.001	0.076	0.692	0.034	0.009	0.077	0.004	0.035	0.001	0.003	0.025	0.000	0.005	0.001	0.004	0.024	0.003	0.004	0.001	0.000	0.000	0.000	0.000	0.000	0.000	0.000
N	0.012	0.011	0.243	0.052	0.132	0.185	0.009	0.131	0.016	0.034	0.100	0.000	0.020	0.002	0.004	0.030	0.000	0.012	0.005	0.000	0.000	0.000	0.000	0.000	0.001	0.000
C	0.006	0.071	0.354	0.142	0.059	0.082	0.065	0.065	0.006	0.028	0.046	0.001	0.025	0.003	0.000	0.025	0.008	0.004	0.009	0.000	0.000	0.000	0.000	0.000	0.001	0.000
V	0.007	0.072	0.123	0.033	0.135	0.158	0.153	0.024	0.001	0.061	0.036	0.001	0.108	0.007	0.001	0.068	0.008	0.003	0.001	0.000	0.000	0.000	0.000	0.000	0.000	0.000
I	0.010	0.101	0.306	0.058	0.032	0.020	0.334	0.003	0.000	0.073	0.003	0.001	0.019	0.006	0.000	0.002	0.030	0.000	0.000	0.00

You can also retry the above workflow where you start with a worse supervised
model (like Merialdo).  Does EM help more in that case?  It's easiest to rerun
exactly the code above, but first make the `ensup` file smaller by copying
`ensup-tiny` over it.  `ensup-tiny` is only 25 sentences (that happen to cover
all tags in `endev`).  Back up your old `ensup` and your old `*.pkl` models
before you do this.

More detailed look at the first 10 sentences in the held-out corpus,
including Viterbi tagging.

In [14]:
def look_at_your_data_with_stats(model, dev, N):
    total_losses = {'viterbi': 0, 'posterior': 0, 'hybrid': 0}
    total_denominator = 0
    
    for m, sentence in enumerate(dev):
        if m >= N: break
        
        # Same as before...
        viterbi = model.decode(sentence.desupervise(), endev, method='viterbi')
        posterior = model.decode(sentence.desupervise(), endev, method='posterior')
        hybrid = model.decode(sentence.desupervise(), endev, method='hybrid')
        
        counts_vit = eval_tagging(predicted=viterbi, gold=sentence, known_vocab=known_vocab)
        counts_post = eval_tagging(predicted=posterior, gold=sentence, known_vocab=known_vocab)
        counts_hyb = eval_tagging(predicted=hybrid, gold=sentence, known_vocab=known_vocab)
        
        # Accumulate statistics
        total_losses['viterbi'] += counts_vit['NUM', 'ALL']
        total_losses['posterior'] += counts_post['NUM', 'ALL']
        total_losses['hybrid'] += counts_hyb['NUM', 'ALL']
        total_denominator += counts_vit['DENOM', 'ALL']
        
        # Print individual results as before...

    # Print summary statistics
    log.info("\nSummary Statistics:")
    for method in total_losses:
        accuracy = total_losses[method] / total_denominator
        log.info(f"{method.capitalize()} Accuracy: {accuracy:.4f}")

In [15]:
look_at_your_data_with_stats(enhancedhmm, endev, 10)

INFO : 
Summary Statistics:
INFO : Viterbi Accuracy: 0.8857
INFO : Posterior Accuracy: 0.8952
INFO : Hybrid Accuracy: 0.8952


In [16]:
def look_at_your_data(model, dev, N):
    # initialize summary stats
    total_num = 0
    total_denom = 0
    total_xent = 0
    total_words = 0
    
    for m, sentence in enumerate(dev):
        if m >= N: break
        viterbi = model.viterbi_tagging(sentence.desupervise(), endev)
        counts = eval_tagging(predicted=viterbi, gold=sentence, 
                              known_vocab=known_vocab)
        num = counts['NUM', 'ALL']
        denom = counts['DENOM', 'ALL']
        
        # accumulate stats
        total_num += num
        total_denom += denom
        
        # per sentence output
        log.info(f"Gold:    {sentence}")
        log.info(f"Viterbi: {viterbi}")
        log.info(f"Loss:    {denom - num}/{denom}")
        xent = -model.logprob(sentence, endev) / len(sentence)  # measured in nats
        
        # accumulate cross-entropy stats
        total_xent += -model.logprob(sentence, endev)
        total_words += len(sentence)
        
        log.info(f"Cross-entropy: {xent/math.log(2)} nats (= perplexity {math.exp(xent)})\n---")
    
    # print summary stats
    log.info("\nSummary Statistics:")
    accuracy = total_num / total_denom
    avg_xent = total_xent / total_words
    avg_perplexity = math.exp(avg_xent)
    
    log.info(f"Overall Accuracy: {accuracy:.4f}")
    log.info(f"Total Correct/Total Tags: {total_num}/{total_denom}")
    log.info(f"Average Cross-entropy: {(avg_xent/math.log(2)):.4f} nats")
    log.info(f"Average Perplexity: {avg_perplexity:.4f}")


In [17]:
look_at_your_data(enhancedhmm, endev, 10)

INFO : Gold:    ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/D means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Viterbi: ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/V ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/W means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Loss:    2/34
INFO : Cross-entropy: 10.578787803649902 nats (= perplexity 1529.4398868766723)
---
INFO : Gold:    I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/N ``/` _OOV_/N 's/P _OOV_/N _OOV_/N ./. ''/'
INFO : Viterbi: I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/J ``/` _OOV_/P 's/V _OOV_/D _OOV_/N ./. ''/'
INFO : Loss:    4/21
INFO : Cross-entropy: 13.16698932647705 nats (= perple

In [18]:
look_at_your_data(hmm, endev, 10)

INFO : Gold:    ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/D means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Viterbi: ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/T added/V ,/, ``/` and/C that/I means/V virtually/R everyone/, who/W works/V here/R ./.
INFO : Loss:    3/34
INFO : Cross-entropy: 9.973000526428223 nats (= perplexity 1005.0141699643956)
---
INFO : Gold:    I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/N ``/` _OOV_/N 's/P _OOV_/N _OOV_/N ./. ''/'
INFO : Viterbi: I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/, ``/` _OOV_/P 's/V _OOV_/D _OOV_/N ./. ''/'
INFO : Loss:    4/21
INFO : Cross-entropy: 10.119484901428223 nats (= perple

Now let's try supervised training of a CRF (this doesn't use the unsupervised
part of the data, so it is comparable to the supervised pre-training we did
for the HMM).  We will use SGD to approximately maximize the regularized
log-likelihood. 

As with the semi-supervised HMM training, we'll periodically evaluate the
tagging accuracy (and also print the cross-entropy) on a held-out dev set.
We use the default `eval_interval` and `tolerance`.  If you want to stop
sooner, then you could increase the `tolerance` so the training method decides
sooner that it has converged.

We arbitrarily choose reg = 1.0 for L2 regularization, learning rate = 0.05,
and a minibatch size of 10, but it would be better to search for the best
value of these hyperparameters.

Note that the logger reports the CRF's *conditional* cross-entropy, log p(tags
| words) / n.  This is much lower than the HMM's *joint* cross-entropy log
p(tags, words) / n, but that doesn't mean the CRF is worse at tagging.  The
CRF is just predicting less information.

In [19]:
log.info("*** Conditional Random Field (CRF)\n")
loss_dev = lambda model: viterbi_error_rate(model, eval_corpus=endev, 
                                            known_vocab=known_vocab)
crf = ConditionalRandomField(entrain.tagset, entrain.vocab)  # randomly initialized parameters  
crf.train(corpus=ensup, loss=loss_dev, reg=1.0, lr=0.05, minibatch_size=10,
          save_path="en_crf.pkl")

INFO : *** Conditional Random Field (CRF)

100%|██████████| 996/996 [00:07<00:00, 136.70it/s]
INFO : Cross-entropy: -0.0000 nats (= perplexity 1.000)
100%|██████████| 996/996 [00:04<00:00, 216.40it/s]
INFO : Tagging accuracy: all: 4.560%, known: 4.533%, seen: 4.882%, novel: 4.822%
100%|██████████| 500/500 [00:23<00:00, 21.20it/s]
100%|██████████| 996/996 [00:04<00:00, 243.16it/s]
INFO : Cross-entropy: -0.0000 nats (= perplexity 1.000)
100%|██████████| 996/996 [00:04<00:00, 247.87it/s]
INFO : Tagging accuracy: all: 71.886%, known: 72.987%, seen: 56.397%, novel: 62.087%
100%|██████████| 500/500 [00:17<00:00, 27.84it/s]
100%|██████████| 996/996 [00:04<00:00, 217.48it/s]
INFO : Cross-entropy: -0.0000 nats (= perplexity 1.000)
100%|██████████| 996/996 [00:04<00:00, 246.10it/s]
INFO : Tagging accuracy: all: 75.089%, known: 76.837%, seen: 56.566%, novel: 57.133%
100%|██████████| 500/500 [00:18<00:00, 27.13it/s]
100%|██████████| 996/996 [00:04<00:00, 244.79it/s]
INFO : Cross-entropy: -0.0000 n

In [20]:
look_at_your_data(crf, endev, 10)

INFO : Gold:    ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/D means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Viterbi: ``/` We/P 're/J strongly/N _OOV_/N that/I anyone/N who/W has/V eaten/N in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/N ,/, ``/` and/C that/I means/J virtually/N everyone/N who/W works/V here/R ./.
INFO : Loss:    8/34
INFO : Cross-entropy: -0.0 nats (= perplexity 1.0)
---
INFO : Gold:    I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/N ``/` _OOV_/N 's/P _OOV_/N _OOV_/N ./. ''/'
INFO : Viterbi: I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/J Oct./N 13/N editorial/V ``/` _OOV_/N 's/P _OOV_/J _OOV_/N ./. ''/'
INFO : Loss:    4/21
INFO : Cross-entropy: -0.0 nats (= perplexity 1.0)
---
INFO : Gold:    It/P is/V th

In [21]:
crf = ConditionalRandomField.load("en_crf.pkl")  # reset to supervised model (in case you're re-executing this bit)
loss_dev = lambda model: viterbi_error_rate(model, eval_corpus=endev, 
                                            known_vocab=known_vocab)
hmm.train(corpus=entrain, loss=loss_dev, λ=1.0,
          save_path="en_csf_raw.pkl")

INFO : Loaded model from en_crf.pkl
100%|██████████| 996/996 [00:04<00:00, 247.89it/s]
INFO : Cross-entropy: 6.9584 nats (= perplexity 1051.928)
100%|██████████| 996/996 [00:05<00:00, 190.81it/s]
INFO : Tagging accuracy: all: 87.031%, known: 91.397%, seen: 45.791%, novel: 40.225%
100%|██████████| 8064/8064 [02:31<00:00, 53.15it/s] 
INFO : Saving model to en_csf_raw.pkl
INFO : Saved model to en_csf_raw.pkl
100%|██████████| 996/996 [00:02<00:00, 353.21it/s]
INFO : Cross-entropy: 6.9446 nats (= perplexity 1037.506)
100%|██████████| 996/996 [00:04<00:00, 203.34it/s]
INFO : Tagging accuracy: all: 85.883%, known: 90.170%, seen: 45.960%, novel: 39.696%
INFO : Saving model to en_csf_raw.pkl
INFO : Saved model to en_csf_raw.pkl


In [22]:
look_at_your_data(crf,endev,10)

INFO : Gold:    ``/` We/P 're/V strongly/R _OOV_/V that/I anyone/N who/W has/V eaten/V in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/V ,/, ``/` and/C that/D means/V virtually/R everyone/N who/W works/V here/R ./.
INFO : Viterbi: ``/` We/P 're/J strongly/N _OOV_/N that/I anyone/N who/W has/V eaten/N in/I the/D cafeteria/N this/D month/N have/V the/D shot/N ,/, ''/' Mr./N Mattausch/N added/N ,/, ``/` and/C that/I means/J virtually/N everyone/N who/W works/V here/R ./.
INFO : Loss:    8/34
INFO : Cross-entropy: -0.0 nats (= perplexity 1.0)
---
INFO : Gold:    I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/P Oct./N 13/C editorial/N ``/` _OOV_/N 's/P _OOV_/N _OOV_/N ./. ''/'
INFO : Viterbi: I/P was/V _OOV_/V to/T read/V the/D _OOV_/N of/I facts/N in/I your/J Oct./N 13/N editorial/V ``/` _OOV_/N 's/P _OOV_/J _OOV_/N ./. ''/'
INFO : Loss:    4/21
INFO : Cross-entropy: -0.0 nats (= perplexity 1.0)
---
INFO : Gold:    It/P is/V th

Let's examine how the CRF does on individual sentences. 
(Do you see any error patterns here that would inspire additional CRF features?)