## Check Data Set

In [1]:
with open('data/train.txt', encoding='utf-8') as f:
    for _ in range(10):
        line = f.readline()
        print(repr(line))

'RT O\n'
'@TheValarium O\n'
': O\n'
'Online O\n'
'ticket O\n'
'sales O\n'
'for O\n'
'Ghostland B-musicartist\n'
'Observatory I-musicartist\n'
'extended O\n'


In [2]:
with open('data/validation.txt', encoding='utf-8') as f:
    for _ in range(10):
        line = f.readline()
        print(line)

Dylan B-person

refuses O

to O

take O

a O

nap O

and O

he O

was O

up O



In [3]:
with open('data/test.txt', encoding='utf-8') as f:
    for _ in range(10):
        line = f.readline()
        print(line)

Man O

i O

hate O

when O

people O

carry O

ragedy O

luggage O

.. O

ima O



## Read Dataset

In [4]:
def read_data(file_path):
    tokens = []
    tags = []
    
    tweet_tokens = []
    tweet_tags = []
    for line in open(file_path, encoding='utf-8'):
        line = line.strip()
        if not line:
            if tweet_tokens:
                tokens.append(tweet_tokens)
                tags.append(tweet_tags)
            tweet_tokens = []
            tweet_tags = []
        else:
            token, tag = line.split()
            # Replace all urls with <URL> token
            if token.lower().startswith('http://') or token.lower().startswith('https://'):
                token = '<URL>'
            
            # Replace all users with <USR> token
            if token.startswith('@'):
                token = '<USR>'
            
            tweet_tokens.append(token)
            tweet_tags.append(tag)
            
    return tokens, tags

In [5]:
train_tokens, train_tags = read_data('data/train.txt')
validation_tokens, validation_tags = read_data('data/validation.txt')
test_tokens, test_tags = read_data('data/test.txt')

In [6]:
for i in range(3):
    for token, tag in zip(train_tokens[i], train_tags[i]):
        print('%s\t%s' % (token, tag))
    print()

RT	O
<USR>	O
:	O
Online	O
ticket	O
sales	O
for	O
Ghostland	B-musicartist
Observatory	I-musicartist
extended	O
until	O
6	O
PM	O
EST	O
due	O
to	O
high	O
demand	O
.	O
Get	O
them	O
before	O
they	O
sell	O
out	O
...	O

Apple	B-product
MacBook	I-product
Pro	I-product
A1278	I-product
13.3	I-product
"	I-product
Laptop	I-product
-	I-product
MD101LL/A	I-product
(	O
June	O
,	O
2012	O
)	O
-	O
Full	O
read	O
by	O
eBay	B-company
<URL>	O
<URL>	O

Happy	O
Birthday	O
<USR>	O
!	O
May	O
Allah	B-person
s.w.t	O
bless	O
you	O
with	O
goodness	O
and	O
happiness	O
.	O



In [7]:
from collections import defaultdict
from tqdm import tqdm

In [8]:
def build_dict(tokens_or_tags, special_tokens):
    """
        tokens_or_tags: a list of lists of tokens or tags
        special_tokens: some special tokens
    """
    # Create a dictionary with default value 0
    tok2idx = defaultdict(lambda: 0)
    idx2tok = []
    
    # Create mappings from tokens (or tags) to indices and vice versa.
    # At first, add special tokens (or tags) to the dictionaries.
    # The first special token must have index 0.

    # idx = 0
    # for s_token in special_tokens:
    #   tok2idx[s_token] = idx + 1
    #   idx += 1
    
    # Mapping tok2idx should contain each token or tag only once. 
    # To do so, you should:
    # 1. extract unique tokens/tags from the tokens_or_tags variable, which is not
    #    occur in special_tokens (because they could have non-empty intersection)
    # 2. index them (for example, you can add them into the list idx2tok
    # 3. for each token/tag save the index into tok2idx).
    
    ######################################
    ######### YOUR CODE HERE #############
    ######################################
    uni_toks = list(set([tok for tweet in tokens_or_tags for tok in tweet] + special_tokens))
    vocab = uni_toks
  
    for i,tok in enumerate(vocab):
        tok2idx[tok] = i
        idx2tok.append(tok)
    
    return tok2idx, idx2tok

In [9]:
special_tokens = ['<UNK>', '<PAD>']
special_tags = ['O']

# Create dictionaries 
token2idx, idx2token = build_dict(train_tokens + validation_tokens, special_tokens)
tag2idx, idx2tag = build_dict(train_tags, special_tags)

### Check Tag

In [10]:
idx2tag

['O',
 'B-other',
 'I-other',
 'I-company',
 'I-movie',
 'I-tvshow',
 'B-person',
 'I-geo-loc',
 'B-company',
 'I-person',
 'B-facility',
 'B-product',
 'B-musicartist',
 'B-tvshow',
 'B-movie',
 'B-sportsteam',
 'I-sportsteam',
 'B-geo-loc',
 'I-musicartist',
 'I-product',
 'I-facility']

In [11]:
def words2idxs(tokens_list):
    return [token2idx[word] for word in tokens_list]

def tags2idxs(tags_list):
    return [tag2idx[tag] for tag in tags_list]

def idxs2words(idxs):
    return [idx2token[idx] for idx in idxs]

def idxs2tags(idxs):
    return [idx2tag[idx] for idx in idxs]

In [12]:
train_tokens_vec = [words2idxs(words) for words in train_tokens]
train_tokens_len = [len(words) for words in train_tokens]
train_tags_vec = [tags2idxs(tags) for tags in train_tags]

validation_tokens_vec = [words2idxs(words) for words in validation_tokens]
validation_tokens_len = [len(words) for words in validation_tokens]
validation_tags_vec = [tags2idxs(tags) for tags in validation_tags]

test_tokens_vec = [words2idxs(words) for words in test_tokens]
test_tags_vec = [tags2idxs(tags) for tags in test_tags]

In [13]:
train_tags_vec[0]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 12,
 18,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [14]:
import numpy as np
np.amax(train_tokens_len), np.amax(validation_tokens_len)

(41, 37)

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [16]:
vocab_size = len(idx2token)
embedding_dim = 100 
max_length = 50
padding_type = 'post'
trunc_type = 'post'


train_tokens_seq = pad_sequences(sequences = train_tokens_vec,
             maxlen = max_length,
             padding = padding_type,
             truncating = trunc_type,
             value = token2idx['<PAD>'])

train_tags_seq = pad_sequences(sequences = train_tags_vec,
             maxlen = max_length,
             padding = padding_type,
             truncating = trunc_type,
             value = tag2idx['O'])

validation_tokens_seq = pad_sequences(sequences = validation_tokens_vec,
             maxlen = max_length,
             padding = padding_type,
             truncating = trunc_type,
             value = token2idx['<PAD>'])

validation_tags_seq = pad_sequences(sequences = validation_tags_vec,
             maxlen = max_length,
             padding = padding_type,
             truncating = trunc_type,
             value = tag2idx['O'])

test_tokens_seq = pad_sequences(sequences = test_tokens_vec,
             maxlen = max_length,
             padding = padding_type,
             truncating = trunc_type,
             value = token2idx['<PAD>'])

test_tags_seq = pad_sequences(sequences = test_tags_vec,
             maxlen = max_length,
             padding = padding_type,
             truncating = trunc_type,
             value = tag2idx['O'])


In [17]:
test_tags_seq.shape

(724, 50)

In [18]:
np.unique(test_tags_seq, return_counts = True)[1]

array([35173,   103,    93,    40,    10,     5,   104,    52,    84,
          66,    47,    28,    27,     7,     8,    31,    12,   165,
          24,    60,    61], dtype=int64)

In [19]:
35173 / np.unique(test_tags_seq, return_counts = True)[1].sum()

0.9716298342541436

## embedding matrix

In [20]:
# glove.840B.300d.txt
# glove.6B.100d.txt

glove_file = 'data/glove.6B.100d.txt'

def read_glove_vecs(glove_file):
    with open(glove_file, 'r', encoding="utf-8") as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
       
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map    
    

In [21]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(glove_file)

In [22]:
'.\xa0.\xa0.'.replace('\xa0', ' ') == '. . .'  # \xa0 is used in GloVe dataset.

True

In [23]:
test_word = idx2token[0]
print(test_word)

# word_to_vec_map[test_word]

68%


In [24]:
vocab_size = len(idx2token)
embedding_dim = 100 

def build_matrix(index_word):
    embedding_matrix = np.zeros((vocab_size, 100))
    for i, word in enumerate(index_word):
        try:
            embedding_matrix[i] = word_to_vec_map[word]
        except KeyError:
            try:
                embedding_matrix[i] = word_to_vec_map[word.lower()]
            except:
                try:
                    embedding_matrix[i] = word_to_vec_map[word.title()]
                except:
                    pass
    return embedding_matrix
                
                
        
embedding_matrix = build_matrix(idx2token)


In [25]:
embedding_matrix[5]

array([-0.35876  ,  0.58159  , -0.027631 , -0.25278  , -0.62805  ,
       -0.42516  ,  0.24538  ,  0.60198  ,  0.16226  , -0.043216 ,
        0.12896  , -0.045218 ,  0.43728  ,  0.093024 , -0.44829  ,
       -0.36199  ,  0.1906   ,  0.48196  , -0.59453  , -0.36116  ,
       -0.12156  , -0.16411  ,  0.057408 , -0.45872  , -0.096812 ,
        0.46578  , -0.15785  , -0.097296 ,  0.75225  ,  0.072599 ,
       -0.71558  ,  0.29924  , -0.15847  ,  0.21901  ,  0.98759  ,
        0.43268  ,  0.026921 , -0.28457  ,  0.092205 , -0.30228  ,
       -0.111    , -0.055727 ,  0.56083  , -0.85266  , -0.07291  ,
       -0.18132  , -0.25805  , -0.25662  ,  0.084227 , -1.1677   ,
        0.0035248, -0.018172 ,  0.31162  ,  0.73281  , -0.14573  ,
       -2.0167   ,  0.043788 , -0.21965  ,  1.6191   ,  0.22962  ,
        0.061044 ,  0.72638  , -0.97811  , -0.07765  ,  0.78095  ,
        0.086923 ,  0.10981  ,  0.72691  , -0.88268  , -0.44801  ,
        0.85117  , -0.84212  , -0.61164  , -0.67277  ,  0.2199

## Model

In [26]:
import tensorflow as tf
import numpy as np

In [27]:
tf.__version__

'2.0.0'

In [28]:
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Flatten #Dense, LSTM, Input ...

In [29]:
vocab_size = len(idx2token)
n_tags = len(idx2tag)
embedding_dim = 100 
max_length = 50



embedding_layer = Embedding(input_dim = vocab_size,
                            output_dim = embedding_dim,
                            input_length = max_length,
                            weights = [embedding_matrix],
                            trainable = False
                           )

model_input = Input(shape=(max_length, ))

x = embedding_layer(model_input)

x_LSTM1 = Bidirectional(LSTM(units=128, 
                           return_sequences=True, 
                           recurrent_dropout=0.2,
                           dropout=0.2))(x)

x_LSTM2 = Bidirectional(LSTM(units=128,
                             return_sequences=True,
                             recurrent_dropout=0.2,
                             dropout=0.2))(x_LSTM1)



x = Dense(128, activation='relu')(x_LSTM2)

# x = layers.concatenate([x_LSTM1, x_LSTM2])

model_output = Dense(n_tags, activation='softmax')(x)



model = Model(inputs = model_input, outputs = model_output)

In [30]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 50, 100)           2050500   
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 256)           234496    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 256)           394240    
_________________________________________________________________
dense (Dense)                (None, 50, 128)           32896     
_________________________________________________________________
dense_1 (Dense)              (None, 50, 21)            2709      
Total params: 2,714,841
Trainable params: 664,341
Non-trainable params: 2,050,500
_____________________________________________

In [31]:
model.compile(optimizer="adam",
             loss="sparse_categorical_crossentropy",
             metrics=["accuracy"])

In [32]:
model.fit(
          x = train_tokens_seq,
          y = train_tags_seq,
#          batch_size = 64,
         epochs = 5,
         verbose = 1,
         #validation_split=0.1
         validation_data = (validation_tokens_seq, validation_tags_seq)
         )

Train on 5795 samples, validate on 724 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x193b4ca94a8>

In [33]:
model.evaluate(x = test_tokens_seq, 
               y = test_tags_seq,
              batch_size = None,
              verbose = 0)

[0.1051552983114074, 0.9772652]

In [34]:
pred_one = model.predict(x = test_tokens_seq[0:1],
             batch_size = None,
             verbose = 0)

In [35]:
pred_one.shape

(1, 50, 21)

In [36]:
pred_one[0]

array([[9.9236101e-01, 1.0988724e-03, 6.6390930e-04, ..., 4.5722470e-04,
        3.9113776e-04, 1.5672176e-04],
       [9.9878329e-01, 1.0728989e-04, 2.6566739e-04, ..., 1.0351578e-04,
        7.4374824e-05, 4.1915428e-05],
       [9.9859470e-01, 3.3697265e-04, 1.3910639e-04, ..., 4.5106139e-05,
        3.2994820e-05, 2.0195446e-05],
       ...,
       [9.9996245e-01, 2.6246739e-06, 6.5932488e-07, ..., 8.3189803e-07,
        4.9715044e-07, 1.5572590e-07],
       [9.9993718e-01, 3.9095648e-06, 1.0917832e-06, ..., 1.5497109e-06,
        8.9663178e-07, 3.4193113e-07],
       [9.9979728e-01, 1.0017093e-05, 3.1354207e-06, ..., 5.3247836e-06,
        2.9472085e-06, 1.5884386e-06]], dtype=float32)

In [37]:
pred_one[0].shape

(50, 21)

In [38]:
test_pred = model.predict(x = test_tokens_seq,
             batch_size = None,
             verbose = 0)

In [39]:
np.argmax(test_pred[0], axis=1)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0], dtype=int64)

In [40]:
results_matrix = np.zeros((724, 50), dtype=int)

In [41]:
results_matrix.shape

(724, 50)

In [42]:
for row_id in range(724):
    results_matrix[row_id] = np.argmax(test_pred[row_id], axis=1)

In [46]:
results_matrix[5:15]

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 0,  0,  0,  0, 17, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 17,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 

In [48]:
test_tags_seq[5:15]

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 0,  0,  0,  0, 10, 20, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 0,  0,  0,  8,  0,  0,  0,  0,  0,  0,  0, 17,  0, 17,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 

In [58]:
[(i, tag) for i,tag in enumerate(idx2tag)]

[(0, 'O'),
 (1, 'B-other'),
 (2, 'I-other'),
 (3, 'I-company'),
 (4, 'I-movie'),
 (5, 'I-tvshow'),
 (6, 'B-person'),
 (7, 'I-geo-loc'),
 (8, 'B-company'),
 (9, 'I-person'),
 (10, 'B-facility'),
 (11, 'B-product'),
 (12, 'B-musicartist'),
 (13, 'B-tvshow'),
 (14, 'B-movie'),
 (15, 'B-sportsteam'),
 (16, 'I-sportsteam'),
 (17, 'B-geo-loc'),
 (18, 'I-musicartist'),
 (19, 'I-product'),
 (20, 'I-facility')]