# Morphosyntax

In [2]:
import nltk

Download corpora:
- Macmorpho
- Floresta

In [6]:
nltk.download('floresta')
nltk.download('mac_morpho')

[nltk_data] Downloading package floresta to /home/aluno/nltk_data...
[nltk_data]   Package floresta is already up-to-date!


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [7]:
from nltk.corpus import mac_morpho, floresta

# Mac_morpho

In [8]:
tagged_sentences = []

for sentence in mac_morpho.tagged_sents():
    tagged_sentences.append(sentence)
print(len(tagged_sentences))

51397


# Floresta

In [9]:
tagged_sentences = []
def simplify_tag_floresta(t):
    if "+" in t:
        return t[t.index("+")+1:].upper()
    else:
        return t.upper()

for sentence in floresta.tagged_sents():
    sentence = [(text, simplify_tag_floresta(tag)) for text, tag in sentence]
    tagged_sentences.append(sentence)
print(len(tagged_sentences))

9266


In [10]:
from collections import Counter

all_tags = []
for sent in tagged_sentences:
    for w, t in sent:
        all_tags.append(t)

tagset_freq = Counter(all_tags)
tagset_freq

Counter({'!': 109,
         '"': 9,
         "'": 99,
         '*': 7,
         ',': 13444,
         '-': 59,
         '.': 7725,
         '/': 23,
         '2]+ADV': 1,
         ';': 633,
         '?': 221,
         'ADJ': 10725,
         'ADV': 9096,
         'ART': 29360,
         'CONJ-C': 5119,
         'CONJ-S': 2284,
         'EC': 5,
         'IN': 40,
         'N': 40081,
         "N<{'185/60_R_14'}": 1,
         'NUM': 4157,
         'PP': 397,
         'PRON-DET': 4972,
         'PRON-INDP': 3278,
         'PRON-PERS': 2748,
         'PROP': 11652,
         'PRP': 32442,
         'PRP-': 3,
         'V-FIN': 15802,
         'V-GER': 854,
         'V-INF': 5015,
         'V-PCP': 4661,
         'VP': 1,
         '[': 30,
         ']': 29,
         '{': 1044,
         '}': 1047,
         '«': 2369,
         '»': 2310})

# Training a tagger

In [11]:
cutoff = int(.80 * len(tagged_sentences))
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]
 
print(len(training_sentences))
print(len(test_sentences))

7412
1854


# Always a noun

In [12]:
#classifica com a classe base(N - substantivo)
tagger_default = nltk.DefaultTagger('N')
tagger_default.evaluate(test_sentences)

0.20040125462714403

# Most frequent tag

In [13]:
unigram_tagger = nltk.UnigramTagger(training_sentences, verbose=True, backoff=tagger_default)
unigram_tagger.evaluate(test_sentences)

[Trained Unigram tagger: size=18484, backoff=21.36%, pruning=29.25%]


0.8842860775947329

# Bigram tagger

In [14]:
bigram_tagger = nltk.BigramTagger(training_sentences, verbose=True, backoff=unigram_tagger)
bigram_tagger.evaluate(test_sentences)

[Trained Unigram tagger: size=1539, backoff=73.94%, pruning=96.48%]


0.8962389443047275

# Trigram tagger

In [15]:
trigram_tagger = nltk.TrigramTagger(training_sentences, verbose=True, backoff=bigram_tagger)
trigram_tagger.evaluate(test_sentences)

[Trained Unigram tagger: size=1363, backoff=80.65%, pruning=97.91%]


0.8959846279917488

# Tagging

In [21]:
sentence = "Mirella, vem cá".split()
print("Default")
tagged_sentence = tagger_default.tag(sentence)
print(tagged_sentence)
print("Unigram")
tagged_sentence = unigram_tagger.tag(sentence)
print(tagged_sentence)
print("Bigram")
tagged_sentence = bigram_tagger.tag(sentence)
print(tagged_sentence)
print("Trigram")
tagged_sentence = trigram_tagger.tag(sentence)
print(tagged_sentence)

Default
[('Mirella,', 'N'), ('vem', 'N'), ('cá', 'N')]
Unigram
[('Mirella,', 'N'), ('vem', 'V-FIN'), ('cá', 'ADV')]
Bigram
[('Mirella,', 'N'), ('vem', 'V-FIN'), ('cá', 'ADV')]
Trigram
[('Mirella,', 'N'), ('vem', 'V-FIN'), ('cá', 'ADV')]


In [22]:
import pickle

In [23]:
with open('bigram_tagger.pickle', 'wb') as p_file:
    p_file.write(pickle.dumps(bigram_tagger))

# Other corpus
http://www.nilc.icmc.usp.br/nilc/download/corpus100.txt

# Other taggers

In [1]:
import spacy

In [2]:
import subprocess
command = "python -m spacy download pt_core_news_sm".split()
subprocess.call(command)

0

In [3]:
nlp = spacy.load('pt_core_news_sm')
sentence = "O rato comeu a roupa do rei de roma"
doc = nlp(sentence)
for token in doc:
    print(token.text, token.pos_, token.tag_)

O DET <artd>|ART|M|S|@>N
rato NOUN <np-def>|N|M|S|@SUBJ>
comeu VERB <mv>|V|PS|3S|IND|@FS-STA
a DET <artd>|ART|F|S|@>N
roupa NOUN <np-def>|N|F|S|@<ACC
do ADP PRP|@N<
rei NOUN <np-idf>|N|M|S|@<ACC
de ADP PRP|@N<
roma NOUN <np-idf>|N|M|S|@P<


In [4]:
import nlpnet

ImportError: No module named 'nlpnet'

Download model from http://nilc.icmc.usp.br/nlpnet/data/pos-pt.tgz

In [None]:
nlpnet.set_data_dir('pos-pt/')
sentence = "O rato comeu a roupa do rei de roma"
tagger = nlpnet.POSTagger()
tagger.tag(sentence)