# Homework: Phrasal Chunking

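This notebook documents the chunker homework: it trains the chunker's feature weights on the training data, writes the model to disk, tags the development set, and scores the result.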

In [13]:
%load_ext autoreload
# %load_ext cython
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Build before you run

Run the following command in the project directory to build the extension modules in place:

    python setup.py build_ext --inplace

In [9]:
import perc
import default
import sys
import time
from collections import defaultdict
from tqdm import tnrange, tqdm_notebook

In [18]:
def perc_avg_train(train_data, tagset, numepochs):
    """Train chunker feature weights with perceptron-style updates.

    Args:
        train_data: Sized, re-iterable sequence of ``(labeled_list, feat_list)``
            pairs, one per sentence. Each entry of ``labeled_list`` is a
            whitespace-separated string whose third field is the gold tag.
        tagset: Sequence of tags; ``tagset[0]`` is passed to ``perc.perc_test``
            as the default tag.
        numepochs: Number of full passes over ``train_data``.

    Returns:
        A ``defaultdict(int)`` mapping ``(feature, tag)`` tuples to integer
        weights.

    NOTE(review): despite the name, no averaging is performed here -- the
    weights after the last update are returned as-is. Confirm whether
    averaging was intended before relying on the "avg" in the name.
    """
    feat_vec = defaultdict(int)
    default_tag = tagset[0]

    # Pre-compute, once, the gold tags and per-word feature lists of every
    # sentence so the epochs below do not redo this work.
    gold_tags = []
    word_feats = []
    for labeled_list, feat_list in train_data:
        sentence_gold = [x.split()[2] for x in labeled_list]
        sentence_feats = []
        feat_index = 0
        for _ in sentence_gold:
            (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
            sentence_feats.append(feats)
        gold_tags.append(sentence_gold)
        word_feats.append(sentence_feats)

    # Advance the per-epoch bar roughly once per 1% of the data. round() gives
    # 0 for 50 sentences or fewer, which made the modulo below raise
    # ZeroDivisionError, so clamp the step to at least 1.
    progress_step = max(1, round(len(train_data) / 100))

    for epoch in tnrange(numepochs, desc='Training'):
        count_mistake = 0
        tic = time.time()
        pbar = tqdm_notebook(total=100, desc=f'Epoch: {epoch}')
        try:
            for i, (labeled_list, feat_list) in enumerate(train_data):
                pred_output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
                if not i % progress_step:
                    pbar.update(1)

                gold = gold_tags[i]
                if pred_output == gold:
                    continue
                count_mistake += 1

                for w_index, pred_tag in enumerate(pred_output):
                    true_tag = gold[w_index]
                    if pred_tag == true_tag:
                        continue
                    # Reward the features under the gold tag and penalise them
                    # under the predicted tag.
                    for feat in word_feats[i][w_index]:
                        if feat == 'B' and w_index > 0:
                            # Bigram feature: expanded with the previous tag.
                            # NOTE(review): nothing is updated when the previous
                            # gold and predicted tags agree, although the current
                            # tags differ -- confirm this is intentional.
                            if gold[w_index - 1] != pred_output[w_index - 1]:
                                feat_vec['B:' + gold[w_index - 1], true_tag] += 1
                                feat_vec['B:' + pred_output[w_index - 1], pred_tag] -= 1
                        else:
                            feat_vec[feat, true_tag] += 1
                            feat_vec[feat, pred_tag] -= 1
        finally:
            # Close the bar even if training is interrupted or raises.
            pbar.close()
        toc = time.time()
        print(f'Epoch {epoch+1} finished. Time cost on this epoch: {toc-tic}. Number of mistakes: {count_mistake}.')

    return feat_vec

In [3]:
# Load the tagset and the labeled training sentences with their features.
# The former placeholder assignments (feat_vec = {}, tagset = [], train_data = [])
# were dropped: the last two were overwritten immediately, and resetting feat_vec
# here would silently discard a trained model whenever this cell is re-run.
tagset = perc.read_tagset("data/tagset.txt")
# Status messages go to stderr, keeping stdout free for model output.
print("reading data ...", file=sys.stderr)
train_data = perc.read_labeled_data("data/train.txt.gz", "data/train.feats.gz", verbose=False)
print("done.", file=sys.stderr)

reading data ...
done.


In [19]:
# Train for 10 epochs, then persist the learned weights so the evaluation
# cell can read them back from disk.
feat_vec = perc_avg_train(train_data=train_data, tagset=tagset, numepochs=10)
perc.perc_write_to_file(feat_vec, "baseline_avg.model")

print("wrote model to disk")

HBox(children=(IntProgress(value=0, description='Training', max=10), HTML(value='')))

HBox(children=(IntProgress(value=0), HTML(value='')))

KeyboardInterrupt: 

In [68]:
%%capture --no-stderr output
# The cell magic above must remain the first line of the cell. It stores this
# cell's stdout in `output`; --no-stderr leaves stderr uncaptured, so the status
# messages below are still shown.
print("reading test data ...", file=sys.stderr)
test_data = perc.read_labeled_data("data/dev.txt", "data/dev.feats", verbose=False)
print("done.", file=sys.stderr)
# Alternative weights file (baseline model), kept for reference:
# feat_vec = perc.perc_read_from_file("baseline.model")
# Load the weights written to "baseline_avg.model" by the training cell.
feat_vec = perc.perc_read_from_file("baseline_avg.model")
# Presumably writes the tagged sentences to stdout, which ends up in `output`
# for the scoring cell -- TODO confirm against perc.perc_testall.
perc.perc_testall(feat_vec, test_data, tagset)

reading test data ...
done.


In [69]:
import score_chunks

# Settings passed unchanged to score_chunks.readTestFile for both inputs.
boundary = "-X-"   # marker used as the boundary between sentences
outside = "O"      # tag that marks tokens outside of any chunk
conlleval = False  # conlleval mode; should be False for most use cases
numfeats = 2       # number of feature columns to consider, typically "word POStag"

# `output` is the stdout captured by the `%%capture ... output` tagging cell.
test, _ = score_chunks.readTestFile(str(output), boundary, outside, conlleval, numfeats)

# Read the reference annotations fully, then parse them with the same settings.
with open("data/reference500.txt") as ref_file:
    reference_text = ref_file.read()
reference, _ = score_chunks.readTestFile(reference_text, boundary, outside, conlleval, numfeats)

print("Score: %.2f" % score_chunks.corpus_fmeasure(reference, test, False))

processed 500 sentences with 10375 tokens and 5783 phrases; found phrases: 5795; correct phrases: 5372
             ADJP: precision:  69.39%; recall:  68.69%; F1:  69.04; found:     98; correct:     99
             ADVP: precision:  75.12%; recall:  76.24%; F1:  75.68; found:    205; correct:    202
            CONJP: precision:  75.00%; recall:  60.00%; F1:  66.67; found:      4; correct:      5
             INTJ: precision:   0.00%; recall:   0.00%; F1:   0.00; found:      0; correct:      1
               NP: precision:  93.43%; recall:  93.52%; F1:  93.48; found:   3029; correct:   3026
               PP: precision:  96.60%; recall:  97.87%; F1:  97.23; found:   1237; correct:   1221
              PRT: precision:  81.25%; recall:  59.09%; F1:  68.42; found:     16; correct:     22
             SBAR: precision:  82.08%; recall:  81.31%; F1:  81.69; found:    106; correct:    107
               VP: precision:  92.91%; recall:  92.91%; F1:  92.91; found:   1100; correct:   1100
accura