In [1]:
import nltk
import datetime
from nltk.corpus import brown

#### Task-1:

Consider the following "spoof newspaper headlines", such as:  

British Left Waffles on Falkland Islands, and Juvenile Court to Try Shooting Defendant.  

Manually tag these headlines to see if knowledge of the [part-of-speech tags](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) removes the ambiguity.

In [2]:
headline1 = "British/NNPS Left/VB Waffles/NNPS on/IN Falkland/NNP Islands/NNPS"  #one potential interpretation.
[nltk.tag.str2tuple(t) for t in headline1.split()]

[('British', 'NNPS'),
 ('Left', 'VB'),
 ('Waffles', 'NNPS'),
 ('on', 'IN'),
 ('Falkland', 'NNP'),
 ('Islands', 'NNPS')]

In [3]:
# here TRY means to examine evidence in court and decide whether sb is innocent or guilty 
headline2 = 'Juvenile/NOUN Court/NOUN to/PRT Try/VERB Shooting/ADJ Defendant/NOUN'       #one potential interpretation.
[nltk.tag.str2tuple(t) for t in headline2.split()]

[('Juvenile', 'NOUN'),
 ('Court', 'NOUN'),
 ('to', 'PRT'),
 ('Try', 'VERB'),
 ('Shooting', 'ADJ'),
 ('Defendant', 'NOUN')]

#### Task-2:

Tokenize and tag the below given sentence by using ([using NLTK POS Tagger](https://www.nltk.org/api/nltk.tag.pos_tag.html)) and [spaCy](https://spacy.io/usage/linguistic-features)  

"They wind back the clock, while we chase after the wind".  

What different pronunciations and parts of speech are involved?

In [5]:
sent = 'They wind back the clock, while we chase after the wind.'
nltk.pos_tag(nltk.word_tokenize(sent))

[('They', 'PRP'),
 ('wind', 'VBP'),
 ('back', 'RB'),
 ('the', 'DT'),
 ('clock', 'NN'),
 (',', ','),
 ('while', 'IN'),
 ('we', 'PRP'),
 ('chase', 'VBP'),
 ('after', 'IN'),
 ('the', 'DT'),
 ('wind', 'NN'),
 ('.', '.')]

In [6]:
import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")
sent = nlp("They wind back the clock, while we chase after the wind.")
cols = ["text", "pos", "tag", "explain pos", "explain tag"]

rows = []
for token in sent:
  row = token.text, token.pos_, token.tag_, spacy.explain(token.pos_), spacy.explain(token.tag_)
  rows.append(row)
df = pd.DataFrame(rows, columns=cols)
df

Unnamed: 0,text,pos,tag,explain pos,explain tag
0,They,PRON,PRP,pronoun,"pronoun, personal"
1,wind,VERB,VBP,verb,"verb, non-3rd person singular present"
2,back,ADP,RP,adposition,"adverb, particle"
3,the,DET,DT,determiner,determiner
4,clock,NOUN,NN,noun,"noun, singular or mass"
5,",",PUNCT,",",punctuation,"punctuation mark, comma"
6,while,SCONJ,IN,subordinating conjunction,"conjunction, subordinating or preposition"
7,we,PRON,PRP,pronoun,"pronoun, personal"
8,chase,VERB,VBP,verb,"verb, non-3rd person singular present"
9,after,ADP,IN,adposition,"conjunction, subordinating or preposition"


### Task-3:

Write programs to read and process the [NLTK Brown Corpus](https://www.nltk.org/howto/corpus.html) and perform the followig tasks:

- Get and display a list of all categories of the brown corpus.
- Get and display the tokenized and tagged version of the "news" category.
- Get and display the sentence segmented, tokenized, and tagged version of the "news" category.
- Print the first 5 words, and the first 2 sentences.
- Get and display the set of all the tags in the brown corpus.
- Get and display the set of all the tags in the universal tagset.

In [5]:
# Get a list of all categories of the brown corpus
brown.categories()

# Get the tokenized and tagged version of the "news" category
brown_twords = brown.tagged_words(categories='news')

# Get the sentence segmented, tokenized, and tagged version of the "news" category
brown_tsents = brown.tagged_sents(categories='news')

# Print the first 5 words
print("\nThe first 5 words in the tokenized and tagged version are: {}".format(brown_twords[:5]))
print("\nThe first 2 sentences in the sentence segmented, tokenized and tagged version are {}".format(brown_tsents[:2]))

# Get the set of all the tags in the brown corpus
brown_tags = set([tag for (token,tag) in brown_twords])
print("\nThe set of all original tags in the brown orpus is: {}".format(brown_tags))

# Get the set of all the tags in the universal tagset
brown_utwords = brown.tagged_words(categories='news',tagset='universal')
universal_tags = set([tag for (token,tag) in brown_utwords])
print("\nThe set of universal tags is: {}".format(universal_tags))


The first 5 words in the tokenized and tagged version are: [('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL')]

The first 2 sentences in the sentence segmented, tokenized and tagged version are [[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN

#### Task-4:

Write programs to process the Brown Corpus and find answers to the following questions:

Which nouns are more common in their plural form, rather than their singular form? (Only consider regular plurals, formed with the -s suffix.)
Which word has the greatest number of distinct tags. What are they, and what do they represent?
List tags in order of decreasing frequency. What do the 20 most frequent tags represent?
Which tags are nouns most commonly found after? What do these tags represent?

In [11]:
brown_tagged = brown.tagged_words()
cfd = nltk.ConditionalFreqDist(brown_tagged)

AttributeError: 'ConcatenatedCorpusView' object has no attribute 'lower'

In [9]:
# Which nouns are more common in their plural form, rather than their singular form? 
# (Only consider regular plurals, formed with the -s suffix.)

common_plural = set()
for word in set(brown.words()):
    if cfd[word+'s']['NNS'] > cfd[word]['NN']:
        common_plural.add(word)
print(common_plural)

{'Aid', 'outburst', 'spectator', 'Crowd', 'endearment', 'spoil', 'Proprietorship', 'interface', 'tactic', 'outbreak', 'fuck', 'nothing', 'Bird', 'polyphosphate', 'tavern', 'Reporter', 'tree', 'serpent', 'Area', 'Senator', 'supporter', 'organism', 'Thank', 'makeshift', 'spoke', 'vital', 'determinant', 'indicator', 'limit', 'filbert', 'Institution', 'offensive', 'silo', 'wandering', 'ruler', 'carrier', 'happening', 'active', 'cell', 'reservation', 'periodical', 'manifestation', 'Conference', 'nun', 'one', 'youngster', 'postulate', 'parasite', 'prolusion', 'turnpike', 'compulsive', 'Northerner', 'Gain', 'milligram', 'epithet', 'follower', 'saving', 'burglar', 'highlight', 'technique', 'Avocado', 'publisher', 'Fund', 'trait', 'plate', 'Idea', 'sock', 'ray', 'parameter', 'diver', 'sailor', 'banker', 'Thing', 'Record', 'synthetic', 'adverb', 'dolphin', 'spacer', 'leak', 'speculation', 'Hazard', 'loyalist', 'wrestle', 'realtor', 'Vineyard', '1890', 'doing', 'modification', 'silicate', 'Set', 

In [10]:
# Which word has the greatest number of distinct tags. What are they, and what do they represent?

tag_dict = {k:len(cfd[k]) for k in cfd}
greatest = max(tag_dict, key=lambda key: tag_dict[key])

greatest

'that'

In [9]:
# List tags in order of decreasing frequency. What do the 20 most frequent tags represent?

helper_list = [t for (_, t) in brown_tagged]    # extract the tags to a list 
fd = nltk.FreqDist(helper_list)
fd.most_common(20)

[('NN', 152470),
 ('IN', 120557),
 ('AT', 97959),
 ('JJ', 64028),
 ('.', 60638),
 (',', 58156),
 ('NNS', 55110),
 ('CC', 37718),
 ('RB', 36464),
 ('NP', 34476),
 ('VB', 33693),
 ('VBN', 29186),
 ('VBD', 26167),
 ('CS', 22143),
 ('PPS', 18253),
 ('VBG', 17893),
 ('PP$', 16872),
 ('TO', 14918),
 ('PPSS', 13802),
 ('CD', 13510)]

In [10]:
# Which tags are nouns most commonly found after? What do these tags represent?

word_tag_pairs = nltk.bigrams(brown_tagged)
noun_after = [b[1] for (a, b) in word_tag_pairs if a[1].startswith('NN')]
fdist = nltk.FreqDist(noun_after)
[tag for (tag, _) in fdist.most_common(10)]

['IN', '.', ',', 'CC', 'NN', 'NNS', 'VBD', 'CS', 'MD', 'BEZ']

#### Task-5:

Train a unigram tagger ([on Brown Corpus news categories](https://www.nltk.org/howto/corpus.html)) and run it on the below given test text:  

test_text = ['hello', 'world', 'natural', 'language', 'processing']  

Observe that some words are not assigned a tag. Why not?

In [14]:
brown_tagged_sents = brown.tagged_sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)

test_text = ['hello', 'world', 'natural', 'language', 'processing']
unigram_tagger.tag(test_text)

[('hello', None),
 ('world', 'NN'),
 ('natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN')]

The words doesn't appear in the training text, and therefore the tagger can't speculate the word's tag.

#### Task-6:

By using the [NLTK default tagger](https://www.nltk.org/book/ch05.html#:~:text=4.1-,The%20Default%20Tagger,-The%20simplest%20possible), tag the following sentence with most frequent tag in the corpus:  

"the quick brown fox jumped over the lazy dog"

In [11]:
# Default tagger

# Get the sentence segmented and tokenized version of "news"
# This is the non-part of speech tagged version that we will be tagging
brown_sents = brown.sents(categories='news')

# Get a list of all tags in the corpus
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]

# Get the most frequent tag in the corpus
most_frequent_tag = nltk.FreqDist(tags).max()

# Configure a default tagger
# The default tagger assigns the same "default" tag to every token in the corpus
# We configure it to annotate with the most frequent tag
default_tagger = nltk.DefaultTagger(most_frequent_tag)

my_sent = "the quick brown fox jumped over the lazy dog".split()
print("The sentence tagged with default tagger: {}".format(default_tagger.tag(my_sent)))

The sentence tagged with default tagger: [('the', 'NN'), ('quick', 'NN'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'NN'), ('over', 'NN'), ('the', 'NN'), ('lazy', 'NN'), ('dog', 'NN')]


#### Task-7:

By using the([Brown Corpus news categories](https://www.nltk.org/howto/corpus.html)), train and evaluate the following built-in taggers:
- [Unigram Tagger](https://www.nltk.org/book/ch05.html)
- [Bigram Tagger](https://www.nltk.org/book/ch05.html)
- [Trigram Tagger](https://www.nltk.org/book/ch05.html)

In [12]:
# Import taggers
from nltk import DefaultTagger, AffixTagger, UnigramTagger, BigramTagger, TrigramTagger

# We will split the corpus to train and test
test_corpus = brown_tsents[:1000]
train_corpus = brown_tsents[1000:]

# Train the affix tagger
affix_tagger = AffixTagger(train_corpus)

# Tag the corpus with the affix tagger
affix_sents = affix_tagger.tag_sents(brown_sents)

# Print the first sentence and accuracy
print("\nThe first sentence, tagged with affix tagger: {}".format(affix_sents[0]))
print("\nThe accuracy of the affix tagger on the corpus is: {}".format(round(affix_tagger.evaluate(test_corpus),2)))


# Train the unigram tagger
unigram_tagger = UnigramTagger(train_corpus)

# Tag the corpus with the unigram tagger
uni_sents = unigram_tagger.tag_sents(brown_sents)

# Print the first sentence and accuracy
print("\nThe first sentence, tagged with unigram tagger: {}".format(uni_sents[0]))
print("\nThe accuracy of the unigram tagger on the corpus is: {}".format(round(unigram_tagger.evaluate(test_corpus),2)))


# Train the bigram tagger
bigram_tagger = BigramTagger(train_corpus)

# Tag the corpus with the bigram tagger
bi_sents = bigram_tagger.tag_sents(brown_sents)

# Print the first sentence and accuracy
print("\nThe first sentence, tagged with bigram tagger: {}".format(bi_sents[0]))
print("\nThe accuracy of the bigram tagger on the corpus is: {}".format(round(bigram_tagger.evaluate(test_corpus),2)))


# Train the trigram tagger
trigram_tagger = TrigramTagger(train_corpus)

# Tag the corpus with the trigram tagger
tri_sents = trigram_tagger.tag_sents(brown_sents)

tri_sents = trigram_tagger.tag_sents(brown_sents)

# Print the first sentence and accuracy
print("\nThe first sentence, tagged with trigram tagger: {}".format(tri_sents[0]))
print("\nThe accuracy of the trigram tagger on the corpus is: {}".format(round(trigram_tagger.evaluate(test_corpus),2)))


The first sentence, tagged with affix tagger: [('The', None), ('Fulton', 'NP'), ('County', 'NN'), ('Grand', 'NP'), ('Jury', None), ('said', None), ('Friday', 'NR'), ('an', None), ('investigation', 'NN'), ('of', None), ("Atlanta's", 'NP$'), ('recent', 'NN'), ('primary', 'JJ'), ('election', 'NN'), ('produced', 'VBN'), ('``', None), ('no', None), ('evidence', 'NN'), ("''", None), ('that', None), ('any', None), ('irregularities', 'NNS'), ('took', None), ('place', 'NN'), ('.', None)]

The accuracy of the affix tagger on the corpus is: 0.26

The first sentence, tagged with unigram tagger: [('The', 'AT'), ('Fulton', None), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'JJ'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', None), ('took'