# Natural Language Processing

## Exercise Sheet 5

In [29]:
#imports for all exercises
import nltk
from nltk.corpus import brown
from nltk import FreqDist
from collections import defaultdict
from collections import Counter
from nltk.util import ngrams

### Exercise 1

Produce a sorted list of tags used in the Brown corpus, removing duplicates. Do the same for the universal part-of-speech tagset.

In [7]:
# Brown
brown_tagged = nltk.corpus.brown.tagged_words()
brown_no_duplicates = set(tag for w, tag in brown_tagged)
brown_sorted_list = sorted(brown_no_duplicates)
print(brown_sorted_list)

["'", "''", '(', '(-HL', ')', ')-HL', '*', '*-HL', '*-NC', '*-TL', ',', ',-HL', ',-NC', ',-TL', '--', '---HL', '.', '.-HL', '.-NC', '.-TL', ':', ':-HL', ':-TL', 'ABL', 'ABN', 'ABN-HL', 'ABN-NC', 'ABN-TL', 'ABX', 'AP', 'AP$', 'AP+AP-NC', 'AP-HL', 'AP-NC', 'AP-TL', 'AT', 'AT-HL', 'AT-NC', 'AT-TL', 'AT-TL-HL', 'BE', 'BE-HL', 'BE-TL', 'BED', 'BED*', 'BED-NC', 'BEDZ', 'BEDZ*', 'BEDZ-HL', 'BEDZ-NC', 'BEG', 'BEM', 'BEM*', 'BEM-NC', 'BEN', 'BEN-TL', 'BER', 'BER*', 'BER*-NC', 'BER-HL', 'BER-NC', 'BER-TL', 'BEZ', 'BEZ*', 'BEZ-HL', 'BEZ-NC', 'BEZ-TL', 'CC', 'CC-HL', 'CC-NC', 'CC-TL', 'CC-TL-HL', 'CD', 'CD$', 'CD-HL', 'CD-NC', 'CD-TL', 'CD-TL-HL', 'CS', 'CS-HL', 'CS-NC', 'CS-TL', 'DO', 'DO*', 'DO*-HL', 'DO+PPSS', 'DO-HL', 'DO-NC', 'DO-TL', 'DOD', 'DOD*', 'DOD*-TL', 'DOD-NC', 'DOZ', 'DOZ*', 'DOZ*-TL', 'DOZ-HL', 'DOZ-TL', 'DT', 'DT$', 'DT+BEZ', 'DT+BEZ-NC', 'DT+MD', 'DT-HL', 'DT-NC', 'DT-TL', 'DTI', 'DTI-HL', 'DTI-TL', 'DTS', 'DTS+BEZ', 'DTS-HL', 'DTX', 'EX', 'EX+BEZ', 'EX+HVD', 'EX+HVZ', 'EX+MD', '

In [8]:
# Universal part-of-speech
universal_tagged = nltk.corpus.brown.tagged_words(tagset='universal')
universal_no_duplicates = set(tag for w, tag in universal_tagged)
universal_sorted_list = sorted(universal_no_duplicates)
print(universal_sorted_list)


['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']


### Exercise 2

Write a program to process the Brown Corpus using the universal part-of-speech tagset to find out which nouns are more common in their plural form than in their singular form. Only consider regular plurals formed with the "-s" suffix. Print an alphabetically sorted list of the nouns together with the frequencies for the singular and plural forms, one per line. 


In [17]:
# Frequency of all nouns
noun_fd = FreqDist(w.lower() for w, tag in brown.tagged_words(tagset='universal') if tag == 'NOUN')

# Going through all nouns and checking if they have their plural form
# If yes -> comparing the frequecies
# If plural > singular -> add (noun, sing freq, plur freq) to list
# sort and print
more_common_plural = []

for noun in noun_fd:
    plural_noun = noun + "s"
    
    if plural_noun in noun_fd and noun_fd[plural_noun]>noun_fd[noun]:
        more_common_info = (noun, noun_fd[noun], noun_fd[plural_noun])
        more_common_plural.append(more_common_info)
    
    #if plural_noun not in noun_fd:
        #more_common_info = (noun, noun_fd[noun], 0)
        #more_common_plural.append(more_common_info)


sorted_result = sorted(more_common_plural)

for noun, s_freq, p_freq in sorted_result:
    print(f"Noun: {noun}, Singular count: {s_freq}, Plural count: {p_freq}")

# NOTE: Only singular words that HAVE their plural form are considered.

Noun: 2-year-old, Singular count: 2, Plural count: 3
Noun: aberration, Singular count: 3, Plural count: 5
Noun: abolitionist, Singular count: 1, Plural count: 4
Noun: aborigine, Singular count: 7, Plural count: 8
Noun: absolute, Singular count: 1, Plural count: 3
Noun: abstract, Singular count: 1, Plural count: 4
Noun: accommodation, Singular count: 1, Plural count: 8
Noun: accomplishment, Singular count: 7, Plural count: 10
Noun: acre, Singular count: 10, Plural count: 44
Noun: active, Singular count: 6, Plural count: 8
Noun: ad, Singular count: 5, Plural count: 10
Noun: adapter, Singular count: 1, Plural count: 2
Noun: addict, Singular count: 1, Plural count: 4
Noun: additive, Singular count: 3, Plural count: 4
Noun: adherent, Singular count: 1, Plural count: 5
Noun: adjective, Singular count: 2, Plural count: 4
Noun: admonition, Singular count: 1, Plural count: 3
Noun: adventure, Singular count: 13, Plural count: 14
Noun: adverb, Singular count: 1, Plural count: 2
Noun: advertisemen

### Exercise 3

Find out which word has the greatest number of distinct tags in the Brown corpus using the original tagset. Without using the `most_common` function, print a list of the tags together with the frequencies for the word, sorted by frequency from highest to lowest, one per line.



In [8]:
tagged = brown.tagged_words()
tagged_dict = defaultdict(set)

for word, tag in tagged:
    tagged_dict[word.lower()].add(tag)

# Word with greatest number of distinct tags:

max_tags = 0
most_tagged = None
for word, tags in tagged_dict.items():
    tags_count = len(tags)
    if tags_count > max_tags:
        max_tags = tags_count
        most_tagged = word

#print (most_tagged, max_tags)

# Going through corpus to now count the tag frequencies
tag_freqs = Counter()

for word, tag in tagged:
    if word.lower() == most_tagged:
        tag_freqs[tag] +=1

# Sort by frequency (reverse)
sorted_result = sorted(tag_freqs.items(), key = lambda x: x[1], reverse = True)

print(f"Word with greatest # of distinct tags: {most_tagged}, Number of tags: {max_tags}")
    
for tag, freq in sorted_result:
    print(f"Tag: {tag}, Frequency: {freq}")


Word with greatest # of distinct tags: that, Number of tags: 15
Tag: CS, Frequency: 6464
Tag: DT, Frequency: 2260
Tag: WPS, Frequency: 1654
Tag: WPO, Frequency: 135
Tag: QL, Frequency: 56
Tag: DT-NC, Frequency: 6
Tag: DT-TL, Frequency: 5
Tag: WPS-TL, Frequency: 3
Tag: WPS-NC, Frequency: 3
Tag: CS-NC, Frequency: 2
Tag: WPS-HL, Frequency: 2
Tag: CS-HL, Frequency: 1
Tag: DT-HL, Frequency: 1
Tag: NIL, Frequency: 1
Tag: WPO-NC, Frequency: 1


### Exercise 4

Tabulate the frequencies of the universal tags that precede nouns in the Brown Corpus. 

In [12]:
tagged = brown.tagged_words(tagset='universal')

tags_preceding_nouns = []

for i in range (1, len(tagged)): # starting from 2nd word (1st has no preceding tags)
    curr_word, curr_tag = tagged[i] 
    if curr_tag == "NOUN": # if noun then
        prev_word, prev_tag = tagged[i-1] # get the preceding
        tags_preceding_nouns.append(prev_tag) # add to the list
        
fd = FreqDist(tags_preceding_nouns)
fd.tabulate()



  DET   ADJ  NOUN   ADP     .  VERB  CONJ   NUM   ADV   PRT  PRON     X 
85845 54653 41309 37418 20084 17851  9294  5668  1851  1068   440    77 


### Exercise 5

Write a function `ambiguous(tagged_text)` that returns the number of ambiguous word types as well as the number of all word types in a tagged text. A word type is ambiguous if it is tagged with at least two different tags. Use the function to print both values as well as the percentage of ambiguous word types for the Brown Corpus both for the original and the universal tagset.

In [21]:
def ambiguous(tagged_text):
    
    tagged_dict = defaultdict(set)

    for word, tag in tagged_text:
        tagged_dict[word.lower()].add(tag)
        
    total_n_word_types = len(tagged_dict)
    ambiguous_n_word_types = 0
        
    for word, tags in tagged_dict.items():
        if len(tags) > 1: # at least two different tags
            ambiguous_n_word_types +=1
    
    percentage_ambiguous = (ambiguous_n_word_types/total_n_word_types)*100 # calculate percentage
    
    print(f"Total # of word types: {total_n_word_types}")
    print(f"# of ambiguous word types: {ambiguous_n_word_types}")
    print(f"Percentage of ambiguous word types: {percentage_ambiguous:.2f}%")
    
    return total_n_word_types, ambiguous_n_word_types, percentage_ambiguous

print("Original:")
ambiguous(brown.tagged_words())
print("\n")
print("Universal:")
ambiguous(brown.tagged_words(tagset='universal'))

Original:
Total # of word types: 49815
# of ambiguous word types: 9580
Percentage of ambiguous word types: 19.23%


Universal:
Total # of word types: 49815
# of ambiguous word types: 3408
Percentage of ambiguous word types: 6.84%


(49815, 3408, 6.84131285757302)

### Exercise 6

Write code to search the Brown Corpus to answer the following questions:

a) produce an alphabetically sorted list of the distinct words tagged as `MD`  
b) identify words that can be plural nouns or third person singular verbs  
c) print an alphabetically sorted list of distinct three-word prepositional phrases of the form `IN+AT+NN`, separated by semicolons


In [23]:
# a). produce an alphabetically sorted list of the distinct words tagged as MD

distinct_md = sorted(set(word.lower() for word, tag in brown.tagged_words() if tag == 'MD'))
print(distinct_md)

["c'n", 'can', 'colde', 'could', 'dare', 'kin', 'maht', 'mai', 'may', 'maye', 'mayst', 'might', 'must', 'need', 'ought', 'shall', 'should', 'shuld', 'shulde', 'wil', 'will', 'wilt', 'wod', 'wold', 'wolde', 'would']


In [26]:
# b). identify words that can be plural nouns or third person singular verbs

result_words = set()

for word,tag in brown.tagged_words():
    if tag in ['NNS', 'VBZ'] and word.isalpha(): # additionally only consists of valid letters
        result_words.add((word.lower(), tag))

sorted_result = sorted(result_words) #sorting the result
print(sorted_result)



In [31]:
# c) print an alphabetically sorted list of distinct three-word prepositional phrases of the form IN+AT+NN,
# separated by semicolons

result_phrases = set()

for sentence in brown.tagged_sents():
    for w1,w2,w3 in ngrams(sentence,3): # generating trigrams and going through them directly
        if(w1[1] == 'IN' and w2[1] == 'AT' and w3[1] == 'NN'):
            result_phrases.add(f"{w1[0]} {w2[0]} {w3[0]}")

print(';'.join(sorted(result_phrases)))



### Exercise 7

Write a function `prec_adv(word, text)` that returns an alphabetically sorted list of distinct adverbs that precede `word` in `text`. Use this function to find out which adverbs precede the words "love", "like", and "prefer" in the Brown corpus. 

In [39]:
def prec_adv(word, text):
    
    adverbs=set()
    
    for i in range (1, len(text)): # starting from 2nd word (1st has no preceding tags)
        curr_word, curr_tag = text[i] 
        
        if curr_word == word: 
            prev_word, prev_tag = text[i-1] # get the preceding
            
            if prev_tag in ['RB']:
                adverbs.add(prev_word) # add to the set

    return sorted(list(adverbs))


# TESTING:

# love
print("Adverbs preceding 'love':")
love_result = prec_adv("love", brown.tagged_words())
print(love_result)

# like
print("Adverbs preceding 'like':")
like_result = prec_adv("like", brown.tagged_words())
print(like_result)

# prefer
print("Adverbs preceding 'prefer':")
prefer_result = prec_adv("prefer", brown.tagged_words())
print(prefer_result)

Adverbs preceding 'love':
['always', 'dearly', 'just']
Adverbs preceding 'like':
['Jist', 'Just', 'Kinda', 'abreast', 'almost', 'alone', 'always', 'around', 'by', 'close', 'deceptively', 'even', 'exactly', 'gloriously', 'here', 'increasingly', 'jist', 'just', 'much', 'often', 'particularly', 'quick', 'rather', 'remarkably', 'roughly', 'simply', 'so', 'somewhat', 'sure', 'there', 'together', 'wildly', 'yet']
Adverbs preceding 'prefer':
['generally', 'much', 'spontaneously']
