##Categorize Terms With WordNet##
Original code by Anna Swigart, MIMS 2015

Modified by Marti Hearst


In [1]:
import nltk
import re
from nltk.corpus import brown
from nltk.collocations import *
from string import punctuation
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from urllib.request import urlopen

###Preliminaries: code to train tagger.###

In [2]:
# Training on all brown sentences, excluding news corpus
brown_tagged_sents = brown.tagged_sents(categories=['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies',
'humor', 'learned', 'lore', 'mystery', 'religion', 'reviews', 'romance',
'science_fiction'])

cooking_action_sents = [[('Strain', 'VB'), ('it', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Mix', 'VB'), ('them', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Season', 'VB'), ('them', 'PPS'), ('with', 'IN'), ('pepper', 'NN'), ('.', '.')], 
                        [('Wash', 'VB'), ('it', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Chop', 'VB'), ('the', 'AT'), ('greens', 'NNS'), ('.', '.')],
                        [('Slice', 'VB'), ('it', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Bake', 'VB'), ('the', 'AT'), ('cake', 'NN'), ('.', '.')],
                        [('Pour', 'VB'), ('into', 'IN'), ('a', 'AT'), ('mold', 'NN'), ('.', '.')],
                        [('Stir', 'VB'), ('the', 'AT'), ('mixture', 'NN'), ('.', '.')],
                        [('Moisten', 'VB'), ('the', 'AT'), ('grains', 'NNS'), ('.', '.')],
                        [('Cook', 'VB'), ('the', 'AT'), ('duck', 'NN'), ('.', '.')],
                        [('Drain', 'VB'), ('for', 'IN'), ('one', 'CD'), ('day', 'NN'), ('.', '.')]]

all_tagged_sents = cooking_action_sents + brown_tagged_sents
all_tagged_sents

def create_data_sets():
    size = int(len(all_tagged_sents) * 0.9)
    train_sents = all_tagged_sents[:size]
    test_sents = all_tagged_sents[size:]
    return train_sents, test_sents
train_sents, test_sents = create_data_sets()

def build_backoff_tagger (train_sents):
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    return t2
ngram_tagger = build_backoff_tagger(train_sents)


###Preliminaries: code to tokenize and tag text.###

In [3]:
def tokenize_and_tag_text(corpus):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sents = sent_tokenizer.tokenize(corpus) # Split text into sentences 
    sents = [nltk.word_tokenize(word) for word in raw_sents] # tokenize sentences
    tagged_POS_sents = [ngram_tagger.tag(word) for word in sents] # tags sentences
    return tagged_POS_sents

    


###Find important terms: Determine which words occur most frequently that are lemmas within WordNet and that are tagged as nouns.###

In [4]:
# input is a list of (word, tag) pairs and 
def freq_normed_unigrams(tagged_sents, num_terms=50):
    wnl = WordNetLemmatizer() # to get word stems
        
    normed_tagged_words = [wnl.lemmatize(word[0].lower()) for sent in tagged_sents
                           for word in sent 
                           if word[0].lower() not in nltk.corpus.stopwords.words('english')
                           and word[0] not in punctuation # remove punctuation
                           and word[1].startswith('N')]  #retain only nouns 
   
    top_normed_tagpairs = nltk.FreqDist(normed_tagged_words).most_common(num_terms) #get the num_terms most frequent
    return [word for (word,count) in top_normed_tagpairs] #extract out the words from the pairs
    

# input is a list of (word, tag) pairs and 
def freq_normed_bigrams(tagged_sents, num_terms=50):
    wnl = WordNetLemmatizer() # to get word stems
        
    normed_tagged_words = [wnl.lemmatize(word[0].lower()) for sent in tagged_sents
                           for word in sent 
                           if word[0].lower() not in nltk.corpus.stopwords.words('english')
                           and word[0] not in punctuation # remove punctuation
                           and word[1].startswith('N') # retain only nouns 
                          ]
   
    top_normed_tagpairs = nltk.FreqDist(normed_tagged_words).most_common(num_terms) #get the num_terms most frequent
    return [word for (word,count) in top_normed_tagpairs] #extract out the words from the pairs

###Group the most frequently occuring nouns under their common hypernyms.  Then find those hypernyms that occur most frequently.  This creates a de facto categorization.###

In [8]:
def categories_from_hypernyms(termlist, num_cats=20):
    
    hypterms = []
    hypterms_dict = defaultdict(list)
    for term in termlist:                  # for each term
        s = wn.synsets(term.lower(), 'n')  # get its nominal synsets
        for syn in s:                      # for each lemma synset
            for hyp in syn.hypernyms():    # It has a list of hypernyms
                hypterms = hypterms + [hyp.name()]      # Extract the hypernym name and add to list
                hypterms_dict[hyp.name()].append(term)  # Extract examples and add them to dict
                
    hypfd = nltk.FreqDist(hypterms)
    for (name, count) in hypfd.most_common(num_cats):
        print (name, '({0})'.format(count))
        print ('\t', ', '.join(set(hypterms_dict[name])))
        print ('\n')

### Example of the code in action on the cookbook data: tokenize and tag it, and then compute the categories.  What works well?  What is problematic?###

In [12]:
with open('cookbooks.txt', 'r') as text_file:
    cookbooks_corpus = text_file.read()

top_terms = freq_normed_unigrams(tokenize_and_tag_text(cookbooks_corpus), 50)

categories_from_hypernyms(top_terms, 20)


time_period.n.01 (5)
	 day, time, hour


time_unit.n.01 (4)
	 day, minute, hour


meat.n.01 (3)
	 beef, mutton, veal


distance.n.01 (3)
	 piece, minute, hour


time.n.03 (3)
	 day, piece, minute


flavorer.n.01 (3)
	 pepper, herb, salt


british_capacity_unit.n.01 (2)
	 pint, quart


helping.n.01 (2)
	 piece, slice


united_states_liquid_unit.n.01 (2)
	 pint, quart


money.n.01 (2)
	 bread, sugar


avoirdupois_unit.n.01 (2)
	 pound, ounce


thing.n.12 (2)
	 water, piece


dish.n.02 (2)
	 stew, soup


food.n.02 (2)
	 butter, meat


containerful.n.01 (2)
	 jar, dish


herb.n.01 (2)
	 parsley, carrot


attendant.n.01 (2)
	 page


united_states_dry_unit.n.01 (2)
	 pint, quart


happening.n.01 (2)
	 gravy, fire


case.n.01 (2)
	 piece, time




Let's do it on another collection: state of the union.  First, let's look at what files are available.  Then see what output gets produced on some of these speeches.  Compare to each other and to the cooking collection.

In [13]:
from nltk.corpus import state_union
state_union.fileids()

['1945-Truman.txt',
 '1946-Truman.txt',
 '1947-Truman.txt',
 '1948-Truman.txt',
 '1949-Truman.txt',
 '1950-Truman.txt',
 '1951-Truman.txt',
 '1953-Eisenhower.txt',
 '1954-Eisenhower.txt',
 '1955-Eisenhower.txt',
 '1956-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1958-Eisenhower.txt',
 '1959-Eisenhower.txt',
 '1960-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1962-Kennedy.txt',
 '1963-Johnson.txt',
 '1963-Kennedy.txt',
 '1964-Johnson.txt',
 '1965-Johnson-1.txt',
 '1965-Johnson-2.txt',
 '1966-Johnson.txt',
 '1967-Johnson.txt',
 '1968-Johnson.txt',
 '1969-Johnson.txt',
 '1970-Nixon.txt',
 '1971-Nixon.txt',
 '1972-Nixon.txt',
 '1973-Nixon.txt',
 '1974-Nixon.txt',
 '1975-Ford.txt',
 '1976-Ford.txt',
 '1977-Ford.txt',
 '1978-Carter.txt',
 '1979-Carter.txt',
 '1980-Carter.txt',
 '1981-Reagan.txt',
 '1982-Reagan.txt',
 '1983-Reagan.txt',
 '1984-Reagan.txt',
 '1985-Reagan.txt',
 '1986-Reagan.txt',
 '1987-Reagan.txt',
 '1988-Reagan.txt',
 '1989-Bush.txt',
 '1990-Bush.txt',
 '1991-Bush-1.txt',
 '1991-B

Try a SOTU file here.

In [15]:
sotu_raw_text = state_union.raw("1993-Clinton.txt")
top_terms = freq_normed_unigrams(tokenize_and_tag_text(sotu_raw_text), 50)

categories_from_hypernyms(top_terms, 20) 

time_period.n.01 (5)
	 year, time


group.n.01 (5)
	 world, people, system


people.n.01 (4)
	 country, business, nation, world


political_unit.n.01 (3)
	 country, nation, state


action.n.01 (3)
	 thing, economy, change


attribute.n.02 (3)
	 thing, time, state


work.n.01 (3)
	 job, care


person.n.01 (3)
	 worker, child


administrative_district.n.01 (3)
	 country, state


activity.n.01 (3)
	 business, work, job


share.n.01 (3)
	 way, interest, cut


stroke.n.01 (2)
	 cut


cash.n.01 (2)
	 change


idea.n.01 (2)
	 program, plan


division.n.03 (2)
	 cut


happening.n.01 (2)
	 thing, change


system.n.04 (2)
	 program, government


aim.n.02 (2)
	 business, thing


concern.n.01 (2)
	 thing, world


proportion.n.01 (2)
	 percent, rate




Try another SOTU file here.

Try your collection here.

In [19]:
import re
tal = open("../tal_stories/tal_text_clean.txt", "r")
tal_text = tal.read()
tal.close()

top_terms = freq_normed_unigrams(tokenize_and_tag_text(tal_text), 50)

categories_from_hypernyms(top_terms, 20)

time_period.n.01 (14)
	 day, time, week, month, year, life, hour


person.n.01 (6)
	 intellectual, friend, case, life


time_unit.n.01 (4)
	 day, month, hour


being.n.01 (3)
	 life


aim.n.02 (3)
	 business, thing, point


group.n.01 (3)
	 people


family.n.04 (2)
	 name, people


category.n.02 (2)
	 way, kind


state.n.02 (2)
	 office, point


attribute.n.02 (2)
	 thing, time


work_time.n.01 (2)
	 day, week


artifact.n.01 (2)
	 way, thing


characteristic.n.02 (2)
	 point


language_unit.n.01 (2)
	 phone, name


happening.n.01 (2)
	 thing, case


tract.n.01 (2)
	 lot, oasis


grammatical_category.n.01 (2)
	 number, case


fact.n.01 (2)
	 case, point


collection.n.01 (2)
	 lot, hand


document.n.01 (2)
	 program, patent


