Introduction to NLP course (2017-2018).

Homework 1: Tokenization and corpus statistics

Objectives:

1) Load and tokenize the treebank corpus from NLTK using regexp_tokenizer
- obtain the corpus using get_corpus_t1()
- obtain the gold standard using get_gold_tokens()
- extend the existing regexp grammar to improve its coverage
- modify the corpus prior to the tokenization (if needed)
- tokenize the corpus with regexp_tokenize()
- evaluate the tokenization using evaluate_t1()
- improve the regexp grammar until satisfied with the result

2) Print basic statistics for the corpus (after the tokenization)
- The number of tokens in the corpus
- The number of types in the corpus (case insensitive!)
- The number of hapaxes - tokens that appear in the corpus only once (case insensitive!) 
- The most frequent types with length >=5 
- The average token length
- The most frequent token length in the corpus
- The number of bi-, tri-, and five-grams in the corpus (you need to write your own function for extracting five-grams);
- The most frequent bi- and tri-grams that do NOT contain punctuation (for the task, assume punctuation to be , . ! ? )
- The most frequent five-grams
- The percentage of bi-,tri-, and five-grams that appear only once
- The 10 most frequent collocates of "man" and "woman" in the corpus, within a window of 4
- The 10 most frequent collocates of "man" and "woman", with a frequency of 5 or more, according to the PPMI score (within a window of 4)


In [1]:
# Import section

# Import nltk
import nltk
from nltk import word_tokenize
from nltk import regexp_tokenize
from nltk import FreqDist
from nltk import bigrams, trigrams
from nltk.collocations import *

# Import regular expressions
import re

# Import corpora
from nltk.corpus import treebank_raw

In [2]:
## Functions given in the task
## You should not change anything here
def get_corpus_t1(nr_files=199):
    """Returns the raw corpus as a long string.
    'nr_files' says how much of the corpus is returned;
    default is 199, which is the whole corpus.
    """
    fileids = nltk.corpus.treebank_raw.fileids()[:nr_files]
    corpus_text = nltk.corpus.treebank_raw.raw(fileids)
    # Get rid of the ".START" text in the beginning	of each file:
    corpus_text = corpus_text.replace(".START", "")
    return corpus_text

def fix_gold_tokens(tokens):
    """Replace tokens so that they are similar to the raw corpus text."""
    return [token.replace("''", '"').replace("``",'"').replace(r"\/", "/") for token in tokens]

def get_gold_tokens(nr_files=199):
    """Returns the gold corpus as a list of strings.
    'nr_files' says how much of the corpus is returned;
    default is 199, which is the whole corpus.
    """
    fileids = nltk.corpus.treebank_chunk.fileids()[:nr_files]
    gold_tokens = nltk.corpus.treebank_chunk.words(fileids)
    return fix_gold_tokens(gold_tokens)

def evaluate_t1(test_tokens, gold_tokens):
    """Finds the chunks where test_tokens differs from gold_tokens.
    Prints the errors and calculates similarity measures.
    """
    import difflib
    matcher = difflib.SequenceMatcher()
    matcher.set_seqs(test_tokens, gold_tokens)
    error_chunks = true_positives = false_positives = false_negatives = 0
    print(" Token%30s | %-30sToken" % ("Error", "Correct"))
    print("-" * 38 + "+" + "-" * 38)
    for difftype, test_from, test_to, gold_from, gold_to in matcher.get_opcodes():
        if difftype == "equal":
            true_positives += test_to - test_from
        else:
            false_positives += test_to - test_from
            false_negatives += gold_to - gold_from
            error_chunks += 1
            test_chunk = " ".join(test_tokens[test_from:test_to])
            gold_chunk = " ".join(gold_tokens[gold_from:gold_to])
            print("%6d%30s | %-30s%d" % (test_from,test_chunk, gold_chunk, gold_from))
    precision = 1.0 * true_positives / (true_positives + false_positives)
    recall = 1.0 * true_positives / (true_positives+ false_negatives)
    fscore = 2.0 * precision * recall / (precision+ recall)
    print()
    print("Test size: %5d tokens" % len(test_tokens))
    print("Gold size: %5d tokens" % len(gold_tokens))
    print("Nr errors: %5d chunks" % error_chunks)
    print("Precision: %5.2f %%" % (100 * precision))
    print("Recall: %5.2f %%" % (100 * recall))
    print("F-score: %5.2f %%" % (100 * fscore))
    print()

In [3]:
# HOMEWORK 1. PART 1.
# Dummy function
# Feel free to make it more verbose and include prints/status updates
def hw1_part1():
    # Get the corpus
    corpus = get_corpus_t1()
    
    # Get the gold standard
    gold_tokens = get_gold_tokens()

    # Initial regular expression grammar
    # You need to modify it so that you can improve the performance of the tokenizer
    re_grammar = r'''(?x) # set flag to allow verbose regexps
    (?:[A-Z]\.)+          # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*        # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
    | \.\.\.              # ellipsis
    | [][.,;"'?():-_`]    # these are separate tokens; includes ], [
    ''' 

    # Modify the corpus prior to tokenization here, if necessary
    
    # Tokenize the corpus
    test_tokens = regexp_tokenize(corpus, re_grammar)
    
    # Evaluate the results
    evaluate_t1(test_tokens,gold_tokens)
    
    return(test_tokens)

In [4]:
# HOMEWORK 1. PART 2.
# Dummy function
# Feel free to make it more verbose and include prints/status updates
def hw1_part2(tokens):
    print("\nThis function prints the corpus statistics")
    print("For help on the statistics, refer to the lectures and NLTK book")

In [5]:
# Main program
tokens = hw1_part1()
hw1_part2(tokens)

 Token                         Error | Correct                       Token
--------------------------------------+--------------------------------------
    15                         Nov . | Nov.                          15
    19                          Mr . | Mr.                           18
   128                         Inc . | Inc.                          126
   137                        Corp . | Corp.                         134
   172                           ' s | 's                            168
   204                          ' re | 're                           199
   271                           ' s | 's                            265
   277                          Dr . | Dr.                           270
   340                         9 . 8 | 9.8                           332
   369                               | --                            359
   436                          Dr . | Dr.                           427
   454                        Mass . | Mass.  