# IIC-3800 Tópicos en CC - NLP UC

- Versiones de librerías, python 3.8.10

- numpy 1.20.3
- nltk 3.7
- spacy 3.5.1
- sklearn 
- keras 2.9.0
- tensorflow 2.9.1


In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

!python3 -m spacy download es_core_news_lg

In [2]:
import spacy

sp = spacy.load('es_core_news_lg')

In [3]:
doc = sp("Paredes bate el record de Chamaco Valdes y deja a la U en zona de descenso")

In [4]:
for word in doc:
    print(str(word.text).ljust(15) + str(word.pos_).ljust(15))

Paredes        PROPN          
bate           VERB           
el             DET            
record         NOUN           
de             ADP            
Chamaco        PROPN          
Valdes         PROPN          
y              CCONJ          
deja           VERB           
a              ADP            
la             DET            
U              PROPN          
en             ADP            
zona           NOUN           
de             ADP            
descenso       NOUN           


In [5]:
for entity in doc.ents:
    print(entity.text + ' - ' + entity.label_ + ' - ' + str(spacy.explain(entity.label_)))

Chamaco Valdes - PER - Named person or family.


In [6]:
from spacy import displacy

displacy.render(doc, style='ent', jupyter=True)

Building a POS tagger from scratch

In [7]:
import nltk
 
tagged_sentences = nltk.corpus.treebank.tagged_sents()
 
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.treebank.tagged_words()))

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged sentences:  3914
Tagged words: 100676


In [8]:
import numpy as np
 
sentences, sentence_tags =[], [] 
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))
 
print(sentences[1])
print(sentence_tags[1])

['Mr.' 'Vinken' 'is' 'chairman' 'of' 'Elsevier' 'N.V.' ',' 'the' 'Dutch'
 'publishing' 'group' '.']
['NNP' 'NNP' 'VBZ' 'NN' 'IN' 'NNP' 'NNP' ',' 'DT' 'NNP' 'VBG' 'NN' '.']


In [9]:
from sklearn.model_selection import train_test_split
 
(train_sentences, test_sentences,  train_tags,  test_tags) = train_test_split(sentences, sentence_tags, test_size=0.15)

In [10]:
words, tags = set([]), set([])
 
for s in train_sentences:
    for w in s:
        words.add(w.lower())
 
for ts in train_tags:
    for t in ts:
        tags.add(t)
 
word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0   # The special value used to padding


In [11]:
print(tags)

{'-RRB-', 'WDT', 'EX', 'WP', 'RBS', 'DT', '-NONE-', 'VBP', 'JJ', 'CC', 'IN', ',', '-LRB-', 'PRP$', 'SYM', 'VBN', 'PRP', 'NNP', 'CD', '``', 'FW', 'WRB', 'PDT', 'POS', '#', 'LS', 'NNPS', 'MD', 'JJR', 'RP', 'VBG', 'WP$', 'VB', "''", '.', '$', 'JJS', 'VBD', 'UH', 'VBZ', 'RBR', 'NN', 'TO', 'NNS', ':', 'RB'}


In [12]:
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []
 
for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    train_sentences_X.append(s_int)
 
for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    test_sentences_X.append(s_int)
 
for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])
 
for s in test_tags:
    test_tags_y.append([tag2index[t] for t in s])
 
print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[981, 158, 8849, 9279, 7953, 3291, 2944, 9499, 5911, 7748, 8125, 4947, 5957, 10198, 8069, 1985, 5250, 348, 8125, 5462, 9909, 7589, 1784, 1340, 1660, 10032, 6467, 3061, 9909, 2163, 97, 5590, 10198, 1603, 2850, 5250]
[1722, 7514, 4197, 1607, 9773, 208, 6927, 8226, 9389, 8986, 1100, 1066, 1888, 1867, 10042, 4449, 7953, 3291, 8770, 1985, 3303, 3026, 8770, 9831, 2579, 8440, 1985, 981, 158, 8849, 9279, 2171, 2850]
[18, 18, 40, 7, 18, 18, 40, 18, 10, 18, 20, 46, 9, 9, 44, 12, 34, 11, 20, 11, 7, 31, 44, 11, 17, 17, 8, 16, 7, 43, 33, 14, 9, 42, 35, 34]
[9, 42, 44, 46, 8, 42, 44, 33, 42, 10, 6, 11, 14, 42, 11, 6, 18, 18, 42, 12, 10, 9, 42, 44, 8, 6, 12, 18, 18, 40, 7, 7, 35]


In [13]:
MAX_LENGTH = len(max(train_sentences_X, key=len))
print(MAX_LENGTH)

271


In [14]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
 
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')
 
print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[  981   158  8849  9279  7953  3291  2944  9499  5911  7748  8125  4947
  5957 10198  8069  1985  5250   348  8125  5462  9909  7589  1784  1340
  1660 10032  6467  3061  9909  2163    97  5590 10198  1603  2850  5250
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0   

In [15]:
from keras import backend as K

def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [16]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
 
 
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy', ignore_class_accuracy(0)])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 271, 128)          1337984   
                                                                 
 bidirectional (Bidirectiona  (None, 271, 512)         788480    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 271, 47)          24111     
 ibuted)                                                         
                                                                 
 activation (Activation)     (None, 271, 47)           0         
                                                                 
Total params: 2,150,575
Trainable params: 2,150,575
Non-trainable params: 0
_________________________________________________________________


In [17]:
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)


In [18]:
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))
print(cat_train_tags_y[0])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [19]:
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=40, validation_split=0.15)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f26b87f3850>

In [20]:
scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")   

accuracy: 99.18856620788574


In [21]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
 
        token_sequences.append(token_sequence)
 
    return token_sequences

In [28]:
test_samples = ["running is very important for me".split(), "I was running every day for a month".split()]
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)
 
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
predictions = model.predict(test_samples_X)



In [29]:
logits_to_tokens(predictions, {i: t for t, i in tag2index.items()})[0][:len(test_samples[0])]

['NNS', 'VBZ', 'RB', 'JJ', 'IN', 'PRP']

In [30]:
logits_to_tokens(predictions, {i: t for t, i in tag2index.items()})[1][:len(test_samples[1])]

['PRP', 'VBD', 'VBG', 'DT', 'NN', 'IN', 'DT', 'NN']