In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import treebank
import tensorflow as tf
import os
import io
import re
import unicodedata
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, Bidirectional, SimpleRNN, TimeDistributed, LSTM, GRU, Activation, Dropout

In [2]:
# Make sure the NLTK resources this notebook reads are installed, so the cell
# also works on a fresh environment instead of failing with LookupError.
# Only resources that are actually missing are downloaded.
for resource_path, package_name in (
    ('corpora/treebank', 'treebank'),
    ('taggers/universal_tagset', 'universal_tagset'),
):
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name, quiet=True)

# Tagged Penn Treebank sample, with tags mapped to the universal tagset.
sentences = treebank.tagged_sents(tagset='universal')

In [3]:
# Split the tagged corpus into parallel word / tag sequences and collect the
# distinct lowercased words and distinct tags seen along the way.
examples, labels = [], []
vocab, pos = set(), set()

# Length of the longest sentence seen so far and that sentence itself
# (argmaxLength stays 0 when no non-empty sentence is encountered).
maxLength, argmaxLength = 0, 0

for tagged_sentence in sentences:
    sentence_length = len(tagged_sentence)
    # Strict comparison: on ties the first longest sentence is kept.
    if sentence_length > maxLength:
        maxLength, argmaxLength = sentence_length, tagged_sentence

    # Each unit is indexed as (word, tag); words are lowercased.
    words = [unit[0].lower() for unit in tagged_sentence]
    tags = [unit[1] for unit in tagged_sentence]

    vocab.update(words)
    pos.update(tags)
    examples.append(words)
    labels.append(tags)

In [4]:
# Build the word -> integer id mapping from the token lists.
# filters='' is passed so that no characters are filtered out of the tokens.
input_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
input_tokenizer.fit_on_texts(examples)

# Encode every sentence as ids, then pad at the end (padding='post') so all
# rows share one length.
input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
    input_tokenizer.texts_to_sequences(examples),
    padding='post',
)

In [5]:
# Build the tag -> integer id mapping from the per-sentence tag lists.
# NOTE(review): `lower` is left at its default, so tag strings are presumably
# stored lowercased in word_index -- confirm before decoding predictions.
pos_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
pos_tokenizer.fit_on_texts(labels)

# Encode every tag sequence as ids, then pad at the end (padding='post') so
# all rows share one length.
pos_tensor = tf.keras.preprocessing.sequence.pad_sequences(
    pos_tokenizer.texts_to_sequences(labels),
    padding='post',
)

In [6]:
# Hold out 20% of the sentences for validation; the fixed random_state keeps
# the split identical across runs.
(input_train, input_val,
 output_train, output_val) = train_test_split(
    input_tensor,
    pos_tensor,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Training hyperparameters and model dimensions.
BUFFER_SIZE = len(input_train)                    # one slot per training example
BATCH_SIZE = 32
steps_per_epoch = len(input_train) // BATCH_SIZE  # full batches per epoch
embedding_dim = 20

# Size the embedding and output layers from the tokenizers that actually
# assigned the integer ids, rather than from separately built sets that could
# drift out of sync with them. Ids start at 1, and 0 is the value used for
# padding, hence the +1.
vocab_inp_size = len(input_tokenizer.word_index) + 1
vocab_tar_size = len(pos_tokenizer.word_index) + 1

In [9]:
# Training pipeline: slice into (sequence, tags) pairs, shuffle with a buffer
# of BUFFER_SIZE elements, then group into fixed-size batches. The final
# partial batch is dropped (drop_remainder=True).
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((input_train, output_train))
    .shuffle(BUFFER_SIZE)
)
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

# Validation pipeline: sliced into (sequence, tags) pairs only.
# NOTE(review): no .batch() is applied here, so each element is a single
# un-batched pair; a consumer that expects (batch, time) inputs must batch it.
val_dataset = tf.data.Dataset.from_tensor_slices((input_val, output_val))

In [10]:
# Sequence tagger: token embedding -> recurrent layer that emits one state per
# time step -> per-step softmax over the tag ids.
#
# mask_zero=True marks id 0 (the value the sequences are padded with) as
# padding. The mask is propagated through the recurrent and time-distributed
# layers so that padded steps are excluded from the loss and from the accuracy
# metric; without it, the padding that fills most of each padded row dominates
# both numbers.
model = Sequential([
    Embedding(vocab_inp_size, embedding_dim,
              input_length=input_train.shape[1],
              mask_zero=True,
              name="embedding"),
    SimpleRNN(64, return_sequences=True),
    TimeDistributed(Dense(vocab_tar_size, activation='softmax')),
])

In [11]:
# Configure training: integer tag ids as targets (sparse categorical
# cross-entropy), Adam with a learning rate of 1e-4, accuracy as the metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [12]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 271, 20)           227760    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 271, 64)           5440      
_________________________________________________________________
time_distributed (TimeDistri (None, 271, 13)           845       
Total params: 234,045
Trainable params: 234,045
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Train for 10 epochs and validate on the whole held-out split after each one.
# The validation arrays are passed directly so Keras batches them itself:
# val_dataset is created without .batch(), so feeding it would hand the model
# one un-batched sequence per step, and validation_steps=30 would stop after
# 30 of those elements instead of covering the full split.
model.fit(train_dataset, epochs=10, validation_data=(input_val, output_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x146669810>