# Homework: Phrasal Chunking

This is your documentation for the chunker homework.

In [1]:
import perc
import default
import sys
import time
from collections import defaultdict

## Baseline

For the baseline implementation, we use only the unigram features. The F1 score of this baseline is 91.37. Weights are updated only at positions where the predicted tag is incorrect.

In [2]:
def perc_train(train_data, tagset, numepochs):
    """Train the baseline perceptron and return its weight table.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs, one per
            sentence. The third whitespace-separated field of each entry in
            ``labeled_list`` is read as the gold tag of that word.
        tagset: sequence of tags; ``tagset[0]`` is handed to
            ``perc.perc_test`` as the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        A ``defaultdict(int)`` keyed by ``(feature, tag)``.
    """
    weights = defaultdict(int)
    fallback_tag = tagset[0]

    for epoch in range(numepochs):
        count_mistake = 0
        tic = time.time()

        for labeled_list, feat_list in train_data:
            guess = perc.perc_test(weights, labeled_list, feat_list, tagset, fallback_tag)
            gold = [line.split()[2] for line in labeled_list]

            # A fully correct sentence leaves the weights untouched.
            if guess == gold:
                continue

            count_mistake += 1
            cursor = 0
            for position, guessed_tag in enumerate(guess):
                gold_tag = gold[position]
                # The cursor has to advance for every word, even correctly
                # tagged ones, so later words read their own features.
                cursor, word_feats = perc.feats_for_word(cursor, feat_list)
                if guessed_tag == gold_tag:
                    continue
                # Reward the gold tag and penalise the predicted one.
                for feat in word_feats:
                    weights[feat, gold_tag] += 1
                    weights[feat, guessed_tag] -= 1

        toc = time.time()
        print(f'Epoch {epoch+1} finished. Time cost on this epoch: {toc-tic}. Number of mistakes: {count_mistake}.')

    return weights

In [3]:
# Train the baseline model for 10 epochs and save its weights to "baseline.model".
# (The former placeholder assignments to feat_vec/tagset/train_data were dead
# stores: each name is rebound below before it is ever read.)
tagset = perc.read_tagset("data/tagset.txt")
print("reading data ...", file=sys.stderr)
train_data = perc.read_labeled_data("data/train.txt.gz", "data/train.feats.gz", verbose=False)
print("done.", file=sys.stderr)
feat_vec = perc_train(train_data, tagset, 10)
perc.perc_write_to_file(feat_vec, "baseline.model")
print("wrote model to disk")

reading data ...
done.


Epoch 1 finished. Time cost on this epoch: 59.94477319717407. Number of mistakes: 6185.
Epoch 2 finished. Time cost on this epoch: 64.61289858818054. Number of mistakes: 4792.
Epoch 3 finished. Time cost on this epoch: 64.2726526260376. Number of mistakes: 3856.
Epoch 4 finished. Time cost on this epoch: 65.9210786819458. Number of mistakes: 3104.
Epoch 5 finished. Time cost on this epoch: 63.01786255836487. Number of mistakes: 2538.
Epoch 6 finished. Time cost on this epoch: 63.37905955314636. Number of mistakes: 2196.
Epoch 7 finished. Time cost on this epoch: 63.6257848739624. Number of mistakes: 1779.
Epoch 8 finished. Time cost on this epoch: 64.33813691139221. Number of mistakes: 1542.
Epoch 9 finished. Time cost on this epoch: 64.12444615364075. Number of mistakes: 1411.
Epoch 10 finished. Time cost on this epoch: 66.15416932106018. Number of mistakes: 1181.
wrote model to disk


In [4]:
%%capture --no-stderr output
# Tag the dev set with the baseline model. The %%capture magic at the top of
# this cell binds the cell's stdout to `output`; stderr is left visible.
print("reading test data ...", file=sys.stderr)
test_data = perc.read_labeled_data("data/dev.txt", "data/dev.feats", verbose=False)
print("done.", file=sys.stderr)
# Reload the weights from the file written by the training cell.
feat_vec = perc.perc_read_from_file("baseline.model")
# NOTE(review): presumably prints the tagged sentences to stdout, which is what
# ends up in `output` — confirm in perc.perc_testall.
perc.perc_testall(feat_vec, test_data, tagset)

reading test data ...
done.


In [5]:
import score_chunks
# Score the baseline tagger output against the reference file and print the result.
boundary = "-X-" # something to use as boundary between sentences
outside = "O" # tag used to mark the outside of any chunk
conlleval = False # use conlleval (should be False for most use cases)
numfeats = 2 # number of columns to consider as features, typically "word POStag"
# `output` is bound by the %%capture magic of the preceding cell; str() turns it into text.
(test, _) = score_chunks.readTestFile(str(output), boundary, outside, conlleval, numfeats)
with open("data/reference500.txt") as f:
    (reference, _) = score_chunks.readTestFile(f.read(), boundary, outside, conlleval, numfeats)
# NOTE(review): the meaning of the third argument (False) is not visible here —
# confirm in score_chunks.corpus_fmeasure.
print("Score: %.2f" % score_chunks.corpus_fmeasure(reference, test, False))

processed 500 sentences with 10375 tokens and 5783 phrases; found phrases: 5886; correct phrases: 5331
             ADJP: precision:  58.33%; recall:  70.71%; F1:  63.93; found:    120; correct:     99
             ADVP: precision:  70.14%; recall:  76.73%; F1:  73.29; found:    221; correct:    202
            CONJP: precision:  66.67%; recall:  40.00%; F1:  50.00; found:      3; correct:      5
             INTJ: precision:   0.00%; recall:   0.00%; F1:   0.00; found:      0; correct:      1
               NP: precision:  90.70%; recall:  92.80%; F1:  91.73; found:   3096; correct:   3026
               PP: precision:  96.69%; recall:  97.95%; F1:  97.31; found:   1237; correct:   1221
              PRT: precision:  76.92%; recall:  45.45%; F1:  57.14; found:     13; correct:     22
             SBAR: precision:  83.17%; recall:  78.50%; F1:  80.77; found:    101; correct:    107
               VP: precision:  91.87%; recall:  91.45%; F1:  91.66; found:   1095; correct:   1100
accura

## Bigram features

Building on the baseline implementation, we additionally train the bigram features. This improves the F1 score from 91.37 to 92.36.

In [29]:
def perc_bigram_train(train_data, tagset, numepochs):
    """Train a perceptron that also learns tag-bigram weights.

    Behaves like the baseline trainer, except that a feature equal to ``'B'``
    on any word after the first is expanded to ``'B:' + <previous tag>`` and
    is updated whenever either the current or the previous tag was wrong.

    Args:
        train_data: iterable of ``(labeled_list, feat_list)`` pairs, one per
            sentence. The third whitespace-separated field of each entry in
            ``labeled_list`` is read as the gold tag of that word.
        tagset: sequence of tags; ``tagset[0]`` is handed to
            ``perc.perc_test`` as the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        A ``defaultdict(int)`` keyed by ``(feature, tag)``.
    """
    weights = defaultdict(int)
    fallback_tag = tagset[0]

    for epoch in range(numepochs):
        count_mistake = 0
        tic = time.time()

        for labeled_list, feat_list in train_data:
            guess = perc.perc_test(weights, labeled_list, feat_list, tagset, fallback_tag)
            gold = [line.split()[2] for line in labeled_list]

            # A fully correct sentence leaves the weights untouched.
            if guess == gold:
                continue

            count_mistake += 1
            cursor = 0
            for position, guessed_tag in enumerate(guess):
                gold_tag = gold[position]
                cursor, word_feats = perc.feats_for_word(cursor, feat_list)
                tag_is_wrong = guessed_tag != gold_tag

                for feat in word_feats:
                    expands_to_bigram = position > 0 and feat == 'B'
                    if not expands_to_bigram:
                        # NOTE(review): on the first word a 'B' feature lands
                        # here and is stored under the raw key 'B' — confirm
                        # that this is the key perc.perc_test looks up.
                        if tag_is_wrong:
                            weights[feat, gold_tag] += 1
                            weights[feat, guessed_tag] -= 1
                        continue

                    gold_prev = gold[position - 1]
                    guessed_prev = guess[position - 1]
                    # The bigram differs if either end of the tag pair differs.
                    if tag_is_wrong or gold_prev != guessed_prev:
                        weights['B:' + gold_prev, gold_tag] += 1
                        weights['B:' + guessed_prev, guessed_tag] -= 1

        toc = time.time()
        print(f'Epoch {epoch+1} finished. Time cost on this epoch: {toc-tic}. Number of mistakes: {count_mistake}.')

    return weights

In [30]:
# Train the bigram model for 10 epochs and save its weights to "baseline_bigram.model".
# (The former placeholder assignments to feat_vec/tagset/train_data were dead
# stores: each name is rebound below before it is ever read.)
tagset = perc.read_tagset("data/tagset.txt")
print("reading data ...", file=sys.stderr)
train_data = perc.read_labeled_data("data/train.txt.gz", "data/train.feats.gz", verbose=False)
print("done.", file=sys.stderr)
feat_vec = perc_bigram_train(train_data, tagset, 10)
perc.perc_write_to_file(feat_vec, "baseline_bigram.model")
print("wrote model to disk")

reading data ...
done.


Epoch 1 finished. Time cost on this epoch: 68.16795706748962. Number of mistakes: 5594.
Epoch 2 finished. Time cost on this epoch: 72.09285426139832. Number of mistakes: 3990.
Epoch 3 finished. Time cost on this epoch: 74.29592108726501. Number of mistakes: 2928.
Epoch 4 finished. Time cost on this epoch: 74.93708634376526. Number of mistakes: 2269.
Epoch 5 finished. Time cost on this epoch: 76.31132340431213. Number of mistakes: 1849.
Epoch 6 finished. Time cost on this epoch: 75.99922156333923. Number of mistakes: 1450.
Epoch 7 finished. Time cost on this epoch: 81.31524085998535. Number of mistakes: 1134.
Epoch 8 finished. Time cost on this epoch: 79.69651818275452. Number of mistakes: 946.
Epoch 9 finished. Time cost on this epoch: 81.77506041526794. Number of mistakes: 804.
Epoch 10 finished. Time cost on this epoch: 81.91603827476501. Number of mistakes: 739.
wrote model to disk


In [31]:
%%capture --no-stderr output
# Tag the dev set with the bigram model. The %%capture magic at the top of
# this cell binds the cell's stdout to `output`; stderr is left visible.
print("reading test data ...", file=sys.stderr)
test_data = perc.read_labeled_data("data/dev.txt", "data/dev.feats", verbose=False)
print("done.", file=sys.stderr)
# Reload the weights from the file written by the bigram training cell.
feat_vec = perc.perc_read_from_file("baseline_bigram.model")
# NOTE(review): presumably prints the tagged sentences to stdout, which is what
# ends up in `output` — confirm in perc.perc_testall.
perc.perc_testall(feat_vec, test_data, tagset)

reading test data ...
done.


In [32]:
import score_chunks
# Score the bigram tagger output against the reference file and print the result.
boundary = "-X-" # something to use as boundary between sentences
outside = "O" # tag used to mark the outside of any chunk
conlleval = False # use conlleval (should be False for most use cases)
numfeats = 2 # number of columns to consider as features, typically "word POStag"
# `output` is bound by the %%capture magic of the preceding cell; str() turns it into text.
(test, _) = score_chunks.readTestFile(str(output), boundary, outside, conlleval, numfeats)
with open("data/reference500.txt") as f:
    (reference, _) = score_chunks.readTestFile(f.read(), boundary, outside, conlleval, numfeats)
# NOTE(review): the meaning of the third argument (False) is not visible here —
# confirm in score_chunks.corpus_fmeasure.
print("Score: %.2f" % score_chunks.corpus_fmeasure(reference, test, False))

processed 500 sentences with 10375 tokens and 5783 phrases; found phrases: 5755; correct phrases: 5328
             ADJP: precision:  68.69%; recall:  68.69%; F1:  68.69; found:     99; correct:     99
             ADVP: precision:  73.91%; recall:  75.74%; F1:  74.82; found:    207; correct:    202
            CONJP: precision:  50.00%; recall:  20.00%; F1:  28.57; found:      2; correct:      5
             INTJ: precision:   0.00%; recall:   0.00%; F1:   0.00; found:      0; correct:      1
               NP: precision:  92.98%; recall:  92.73%; F1:  92.85; found:   3018; correct:   3026
               PP: precision:  96.24%; recall:  98.44%; F1:  97.33; found:   1249; correct:   1221
              PRT: precision:  80.00%; recall:  36.36%; F1:  50.00; found:     10; correct:     22
             SBAR: precision:  87.13%; recall:  82.24%; F1:  84.62; found:    101; correct:    107
               VP: precision:  93.73%; recall:  91.09%; F1:  92.39; found:   1069; correct:   1100
accura

## Averaged weight + bigram features

Building on the baseline implementation with bigram features, we implemented weight averaging, following the pseudocode on page 38 of [Sarkar 2011](http://www.cs.sfu.ca/~anoop/papers/pdf/syntax-parsing-survey-2011.pdf) and the idea in Section 2.5 of [Collins 2002](http://www.aclweb.org/anthology/W/W02/W02-1001.pdf). With this modification, the F1 score improved from 92.36 to 93.51.

In [25]:
def perc_avg_train(train_data, tagset, numepochs):
    """Train an averaged perceptron with tag-bigram features.

    The returned weight of every ``(feature, tag)`` key is the mean of the
    values that key held after each processed sentence, taken over all
    ``T = numepochs * len(train_data)`` sentence steps.

    Instead of adding the whole weight table into a running sum after every
    sentence (cost proportional to the number of features per sentence), the
    sum is reconstructed at the end. An update ``delta`` applied while
    processing 0-based step ``s`` is present in the ``T - s`` snapshots taken
    from that step onward, so

        sum of snapshots = T * final_weight - sum(s * delta)

    Only ``sum(s * delta)`` has to be tracked, and only for keys that are
    actually touched. All bookkeeping is done on integers, so the averages are
    the same floats the dense accumulation yields.

    Args:
        train_data: sized iterable of ``(labeled_list, feat_list)`` pairs, one
            per sentence. The third whitespace-separated field of each entry
            in ``labeled_list`` is read as the gold tag of that word.
        tagset: sequence of tags; ``tagset[0]`` is handed to
            ``perc.perc_test`` as the default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        A ``defaultdict(float)`` keyed by ``(feature, tag)`` holding the
        averaged weights.
    """
    feat_vec = defaultdict(int)
    # lagged[key] == sum over updates of (sentences completed before the update) * delta
    lagged = defaultdict(int)
    default_tag = tagset[0]
    seen = 0  # number of sentences fully processed so far, across epochs

    def apply_update(key, delta):
        """Apply ``delta`` to the live weight and record when it happened."""
        feat_vec[key] += delta
        lagged[key] += seen * delta

    for epoch in range(numepochs):
        count_mistake = 0
        print(f"Running on epoch {epoch+1}......")
        tic = time.time()
        for labeled_list, feat_list in train_data:
            pred_output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            true_output = [x.split()[2] for x in labeled_list]

            if pred_output != true_output:
                count_mistake += 1
                feat_index = 0

                for w_index, pred_tag in enumerate(pred_output):
                    true_tag = true_output[w_index]
                    # Advance through the feature list for every word, even
                    # correctly tagged ones, to stay aligned with the sentence.
                    (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                    for feat in feats:
                        if feat == 'B' and w_index > 0:
                            prev_true = true_output[w_index-1]
                            prev_pred = pred_output[w_index-1]
                            # The bigram differs if either end of the tag pair differs.
                            if prev_true != prev_pred or pred_tag != true_tag:
                                apply_update(('B:' + prev_true, true_tag), 1)
                                apply_update(('B:' + prev_pred, pred_tag), -1)
                        elif pred_tag != true_tag:
                            # NOTE(review): on the first word a 'B' feature is
                            # stored under the raw key 'B' — confirm that this
                            # is the key perc.perc_test looks up.
                            apply_update((feat, true_tag), 1)
                            apply_update((feat, pred_tag), -1)

            # One more snapshot would have been taken here by a dense accumulation.
            seen += 1

        toc = time.time()
        print(f'Epoch {epoch+1} finished. Time cost on this epoch: {toc-tic}. Number of mistakes: {count_mistake}.')

    avg_feat_vec = defaultdict(float)
    if feat_vec:
        total_steps = numepochs * len(train_data)
        for key, weight in feat_vec.items():
            # Integer numerator keeps the result identical to summing every snapshot.
            avg_feat_vec[key] = (weight * total_steps - lagged.get(key, 0)) / total_steps
    return avg_feat_vec

In [26]:
# Train the averaged bigram model for 10 epochs and save its weights to
# "baseline_bigram_avg.model".
# (The former placeholder assignments to feat_vec/tagset/train_data were dead
# stores: each name is rebound below before it is ever read.)
tagset = perc.read_tagset("data/tagset.txt")
print("reading data ...", file=sys.stderr)
train_data = perc.read_labeled_data("data/train.txt.gz", "data/train.feats.gz", verbose=False)
print("done.", file=sys.stderr)
feat_vec = perc_avg_train(train_data, tagset, 10)
perc.perc_write_to_file(feat_vec, "baseline_bigram_avg.model")
print("wrote model to disk")

reading data ...
done.


Running on epoch 0......
Epoch 1 finished. Time cost on this epoch: 257.43480610847473. Number of mistakes: 5594.
Running on epoch 1......
Epoch 2 finished. Time cost on this epoch: 454.4710409641266. Number of mistakes: 3990.
Running on epoch 2......
Epoch 3 finished. Time cost on this epoch: 527.4040987491608. Number of mistakes: 2928.
Running on epoch 3......
Epoch 4 finished. Time cost on this epoch: 569.7692408561707. Number of mistakes: 2269.
Running on epoch 4......
Epoch 5 finished. Time cost on this epoch: 617.2872650623322. Number of mistakes: 1849.
Running on epoch 5......
Epoch 6 finished. Time cost on this epoch: 632.4525303840637. Number of mistakes: 1450.
Running on epoch 6......
Epoch 7 finished. Time cost on this epoch: 653.9197323322296. Number of mistakes: 1134.
Running on epoch 7......
Epoch 8 finished. Time cost on this epoch: 1148.7454252243042. Number of mistakes: 946.
Running on epoch 8......
Epoch 9 finished. Time cost on this epoch: 916.5279200077057. Number o

In [27]:
%%capture --no-stderr output
# Tag the dev set with the averaged bigram model. The %%capture magic at the
# top of this cell binds the cell's stdout to `output`; stderr is left visible.
print("reading test data ...", file=sys.stderr)
test_data = perc.read_labeled_data("data/dev.txt", "data/dev.feats", verbose=False)
print("done.", file=sys.stderr)
# Loads the averaged bigram weights. To score the unigram baseline instead,
# read "baseline.model" here.
feat_vec = perc.perc_read_from_file("baseline_bigram_avg.model")
# NOTE(review): presumably prints the tagged sentences to stdout, which is what
# ends up in `output` — confirm in perc.perc_testall.
perc.perc_testall(feat_vec, test_data, tagset)

reading test data ...
done.


In [28]:
import score_chunks
# Score the averaged-model tagger output against the reference file and print the result.
boundary = "-X-" # something to use as boundary between sentences
outside = "O" # tag used to mark the outside of any chunk
conlleval = False # use conlleval (should be False for most use cases)
numfeats = 2 # number of columns to consider as features, typically "word POStag"
# `output` is bound by the %%capture magic of the preceding cell; str() turns it into text.
(test, _) = score_chunks.readTestFile(str(output), boundary, outside, conlleval, numfeats)
with open("data/reference500.txt") as f:
    (reference, _) = score_chunks.readTestFile(f.read(), boundary, outside, conlleval, numfeats)
# NOTE(review): the meaning of the third argument (False) is not visible here —
# confirm in score_chunks.corpus_fmeasure.
print("Score: %.2f" % score_chunks.corpus_fmeasure(reference, test, False))

processed 500 sentences with 10375 tokens and 5783 phrases; found phrases: 5801; correct phrases: 5416
             ADJP: precision:  71.00%; recall:  71.72%; F1:  71.36; found:    100; correct:     99
             ADVP: precision:  77.03%; recall:  79.70%; F1:  78.35; found:    209; correct:    202
            CONJP: precision: 100.00%; recall:  60.00%; F1:  75.00; found:      3; correct:      5
             INTJ: precision:   0.00%; recall:   0.00%; F1:   0.00; found:      0; correct:      1
               NP: precision:  94.42%; recall:  94.42%; F1:  94.42; found:   3026; correct:   3026
               PP: precision:  96.77%; recall:  98.03%; F1:  97.40; found:   1237; correct:   1221
              PRT: precision:  80.00%; recall:  72.73%; F1:  76.19; found:     20; correct:     22
             SBAR: precision:  84.47%; recall:  81.31%; F1:  82.86; found:    103; correct:    107
               VP: precision:  92.84%; recall:  93.09%; F1:  92.96; found:   1103; correct:   1100
accura