## Text Syntax and Structure

### Parts of Speech Tagging
Parts of speech (POS) are lexical categories to which words are assigned based on their syntactic context and role in a sentence. The process of classifying words and labeling each one with its POS tag is called part-of-speech tagging (POS tagging).

For further details about the POS tags, see the <a href="http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/Penn-Treebank-Tagset.pdf">Penn Treebank tag set</a>.


In [1]:
# POS-Tagging with Spacy

import spacy
import pandas as pd

sample_sentence = 'The brown fox is quick and he is jumping over the lazy dog'

# Load the 'en_core_web_lg' pipeline and run it over the sentence.
nlp = spacy.load('en_core_web_lg')
doc = nlp(sample_sentence)

# Keep the plain token text (str) rather than the spaCy Token object, so the
# DataFrame holds ordinary strings that can be compared, sorted and serialized.
# Each row is (text, token.pos_, token.tag_), matching the three column names.
word_tags = [(token.text, token.pos_, token.tag_) for token in doc]
pd.DataFrame(word_tags, columns=['Word', 'Tag Type', 'POS Tag'])

Unnamed: 0,Word,Tag Type,POS Tag
0,The,DET,DT
1,brown,ADJ,JJ
2,fox,NOUN,NN
3,is,AUX,VBZ
4,quick,ADJ,JJ
5,and,CCONJ,CC
6,he,PRON,PRP
7,is,AUX,VBZ
8,jumping,VERB,VBG
9,over,ADP,IN


In [2]:
# POS Tagging with NLTK
import nltk

sample_sentence = 'The brown fox is quick and he is jumping over the lazy dog'

# Split the sentence into tokens first, then tag them with nltk.pos_tag;
# the result is a list of (word, tag) pairs shown as a two-column table.
tokens = nltk.word_tokenize(sample_sentence)
pos_tags = nltk.pos_tag(tokens)
pd.DataFrame(pos_tags, columns=['Word', 'POS tag'])

Unnamed: 0,Word,POS tag
0,The,DT
1,brown,JJ
2,fox,NN
3,is,VBZ
4,quick,JJ
5,and,CC
6,he,PRP
7,is,VBZ
8,jumping,VBG
9,over,IN


In [3]:
from nltk.corpus import treebank

# Tagged sentences from the NLTK treebank corpus: the first 3500 are used
# for training and everything after that is held out for testing.
data = treebank.tagged_sents()
split_at = 3500
train_data, test_data = data[:split_at], data[split_at:]

# Show the first held-out sentence as a list of (word, tag) pairs.
test_data[0]

[('About', 'IN'),
 ('30', 'CD'),
 ('%', 'NN'),
 ('of', 'IN'),
 ('Ratners', 'NNP'),
 ("'s", 'POS'),
 ('profit', 'NN'),
 ('already', 'RB'),
 ('is', 'VBZ'),
 ('derived', 'VBN'),
 ('*-1', '-NONE-'),
 ('from', 'IN'),
 ('the', 'DT'),
 ('U.S.', 'NNP'),
 ('.', '.')]

In [4]:
from nltk.tag import DefaultTagger
import nltk

sample_sentence = 'The brown fox is quick and he is jumping over the lazy dog'

# Baseline tagger constructed with the single tag 'NN'.
dt = DefaultTagger('NN')

tokens = nltk.word_tokenize(sample_sentence)
dt.tag(tokens)

[('The', 'NN'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('is', 'NN'),
 ('quick', 'NN'),
 ('and', 'NN'),
 ('he', 'NN'),
 ('is', 'NN'),
 ('jumping', 'NN'),
 ('over', 'NN'),
 ('the', 'NN'),
 ('lazy', 'NN'),
 ('dog', 'NN')]

In [5]:
from nltk.tag import RegexpTagger
import nltk

sample_sentence = 'The brown fox is quick and he is jumping over the lazy dog'

# Ordered (regex, tag) rules for the regular-expression tagger. Order matters:
# the specific suffix rules come before the catch-all '.*' rule at the end.
# NOTE(review): 'NN$' does not appear in the treebank sample shown in this
# notebook, which tags "'s" as 'POS' -- confirm the intended tagset before
# evaluating these rules against treebank data.
patterns = [
    (r'.*ing$', 'VBG'),                  # gerunds
    (r'.*ed$', 'VBD'),                   # simple past
    (r'.*es$', 'VBZ'),                   # 3rd singular present
    (r'.*ould$', 'MD'),                  # modals
    (r'.*\'s$', 'NN$'),                  # possessive nouns
    (r'.*s$', 'NNS'),                    # plural nouns
    # The decimal point must be escaped: an unescaped '.' matches any
    # character, which would tag tokens such as '3x14' as cardinal numbers.
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),    # cardinal numbers
    (r'.*', 'NN'),                       # nouns (default)
]

# Build the tagger from the (regex, tag) rules, then tag the tokenized sentence.
rt = RegexpTagger(patterns)
tokens = nltk.word_tokenize(sample_sentence)
rt.tag(tokens)

[('The', 'NN'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('is', 'NNS'),
 ('quick', 'NN'),
 ('and', 'NN'),
 ('he', 'NN'),
 ('is', 'NNS'),
 ('jumping', 'VBG'),
 ('over', 'NN'),
 ('the', 'NN'),
 ('lazy', 'NN'),
 ('dog', 'NN')]

In [6]:
import nltk
from nltk.corpus import treebank
# N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

sample_sentence = 'The brown fox is quick and he is jumping over the lazy dog'

# First 3500 tagged treebank sentences for training, the rest for testing.
data = treebank.tagged_sents()
train_data, test_data = data[:3500], data[3500:]

# Train the unigram, bigram and trigram taggers on the same training
# sentences. No backoff tagger is passed to any of them.
ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Tokenize the sample sentence once and reuse it for every tagger.
tokens = nltk.word_tokenize(sample_sentence)

# For each tagger in turn (unigram, bigram, trigram): print its evaluation
# score on the held-out sentences, then its tags for the sample sentence.
for tagger in (ut, bt, tt):
    print(tagger.evaluate(test_data))
    print(tagger.tag(tokens))

0.8607803272340013
[('The', 'DT'), ('brown', None), ('fox', None), ('is', 'VBZ'), ('quick', 'JJ'), ('and', 'CC'), ('he', 'PRP'), ('is', 'VBZ'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('lazy', None), ('dog', None)]
0.13466937748087907
[('The', 'DT'), ('brown', None), ('fox', None), ('is', None), ('quick', None), ('and', None), ('he', None), ('is', None), ('jumping', None), ('over', None), ('the', None), ('lazy', None), ('dog', None)]
0.08064672281924679
[('The', 'DT'), ('brown', None), ('fox', None), ('is', None), ('quick', None), ('and', None), ('he', None), ('is', None), ('jumping', None), ('over', None), ('the', None), ('lazy', None), ('dog', None)]
