# Homework: Phrasal Chunking

This is your documentation for the chunker homework.

In [1]:
import perc
import default
import sys
import time
from collections import defaultdict

In [2]:
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    default_tag = tagset[0]
    
    for epoch in range(numepochs):
        
        count_mistake = 0
        
        tic = time.time()
        
        for _, (labeled_list,feat_list) in enumerate(train_data):
            pred_output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            true_output = [x.split()[2] for x in labeled_list]
            
            if pred_output != true_output:
                count_mistake += 1
                feat_index = 0
                
                for w_index in range(len(pred_output)):
                    pred_tag = pred_output[w_index]
                    true_tag = true_output[w_index]
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    if pred_tag != true_tag:
                        for feat in feats:
                            feat_vec[feat, true_tag] += 1
                            feat_vec[feat, pred_tag] -= 1
        toc = time.time()
        print(f'Epoch {epoch+1} finished. Time cost on this epoch: {toc-tic}. Number of mistakes: {count_mistake}.')
    
    return feat_vec

In [5]:
def perc_avg_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    default_tag = tagset[0]
    
    for epoch in range(numepochs):
        
        count_mistake = 0
        
        tic = time.time()
        
        for _, (labeled_list,feat_list) in enumerate(train_data):
            pred_output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            true_output = [x.split()[2] for x in labeled_list]
            
            if pred_output != true_output:
                count_mistake += 1
                feat_index = 0
                
                for w_index in range(len(pred_output)):
                    pred_tag = pred_output[w_index]
                    true_tag = true_output[w_index]
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    if pred_tag != true_tag:
                        for feat in feats:
                            if feat == 'B' and w_index > 0:
                                if true_output[w_index-1] != pred_output[w_index-1]:
                                    feat_vec['B:' + true_output[w_index-1], true_tag] += 1
                                    feat_vec['B:' + pred_output[w_index-1], pred_tag] -= 1
                            else:
                                feat_vec[feat, true_tag] += 1
                                feat_vec[feat, pred_tag] -= 1
        toc = time.time()
        print(f'Epoch {epoch+1} finished. Time cost on this epoch: {toc-tic}. Number of mistakes: {count_mistake}.')
    
    return feat_vec

In [6]:
feat_vec = {}
tagset = []
train_data = []
tagset = perc.read_tagset("data/tagset.txt")
print("reading data ...", file=sys.stderr)
train_data = perc.read_labeled_data("data/train.txt.gz", "data/train.feats.gz", verbose=False)
print("done.", file=sys.stderr)
# baseline
# feat_vec = perc_train(train_data, tagset, 10)
# perc.perc_write_to_file(feat_vec, "baseline.model")
# average
feat_vec = perc_avg_train(train_data, tagset, 10)
perc.perc_write_to_file(feat_vec, "baseline_avg.model")
print("wrote model to disk")

reading data ...
done.


Epoch 1 finished. Time cost on this epoch: 65.68992137908936. Number of mistakes: 5704.
Epoch 2 finished. Time cost on this epoch: 69.1945493221283. Number of mistakes: 4087.
Epoch 3 finished. Time cost on this epoch: 69.90012168884277. Number of mistakes: 3009.
Epoch 4 finished. Time cost on this epoch: 72.36271595954895. Number of mistakes: 2268.
Epoch 5 finished. Time cost on this epoch: 71.14806699752808. Number of mistakes: 1919.
Epoch 6 finished. Time cost on this epoch: 71.09302139282227. Number of mistakes: 1510.
Epoch 7 finished. Time cost on this epoch: 71.33235287666321. Number of mistakes: 1313.
Epoch 8 finished. Time cost on this epoch: 72.50382709503174. Number of mistakes: 1015.
Epoch 9 finished. Time cost on this epoch: 72.15780901908875. Number of mistakes: 910.
Epoch 10 finished. Time cost on this epoch: 72.75086092948914. Number of mistakes: 825.
wrote model to disk


In [7]:
%%capture --no-stderr output
print("reading test data ...", file=sys.stderr)
test_data = perc.read_labeled_data("data/dev.txt", "data/dev.feats", verbose=False)
print("done.", file=sys.stderr)
# baseline
# feat_vec = perc.perc_read_from_file("baseline.model")
# average
feat_vec = perc.perc_read_from_file("baseline_avg.model")
perc.perc_testall(feat_vec, test_data, tagset)

reading test data ...
done.


In [10]:
import score_chunks
boundary = "-X-" # something to use as boundary between sentences
outside = "O" # tag used to mark the outside of any chunk
conlleval = False # use conlleval (should be False for most use cases)
numfeats = 2 # number of columns to consider as features, typically "word POStag"
(test, _) = score_chunks.readTestFile(str(output), boundary, outside, conlleval, numfeats)
with open("data/reference500.txt") as f:
    (reference, _) = score_chunks.readTestFile(f.read(), boundary, outside, conlleval, numfeats)
print("Score: %.2f" % score_chunks.corpus_fmeasure(reference, test, False))

processed 500 sentences with 10375 tokens and 5783 phrases; found phrases: 5752; correct phrases: 5322
             ADJP: precision:  72.55%; recall:  74.75%; F1:  73.63; found:    102; correct:     99
             ADVP: precision:  74.88%; recall:  75.25%; F1:  75.06; found:    203; correct:    202
            CONJP: precision: 100.00%; recall:  60.00%; F1:  75.00; found:      3; correct:      5
             INTJ: precision: 100.00%; recall: 100.00%; F1: 100.00; found:      1; correct:      1
               NP: precision:  93.23%; recall:  92.43%; F1:  92.83; found:   3000; correct:   3026
               PP: precision:  96.58%; recall:  97.22%; F1:  96.90; found:   1229; correct:   1221
              PRT: precision:  78.57%; recall:  50.00%; F1:  61.11; found:     14; correct:     22
             SBAR: precision:  78.07%; recall:  83.18%; F1:  80.54; found:    114; correct:    107
               VP: precision:  92.82%; recall:  91.64%; F1:  92.22; found:   1086; correct:   1100
accura