In [12]:
import numpy as np
import pandas as pd

In [13]:
data = pd.read_csv("train.tsv",delimiter="\t",on_bad_lines='skip')

In [14]:
data.columns=['words','seg']

In [15]:
#convert to lowercase
data['seg'] = data['seg'].str.lower()
data['words'] = data['words'].str.lower()
data


Unnamed: 0,words,seg
0,depotwa,d-e-p-o-t-w-a
1,sosyopwofesyonèl,s-o-s-y-o-p-w-o-f-e-s-y-o-n-è-l
2,vejetal,v-e-j-e-t-a-l
3,repibliye,r-e-p-i-b-l-i-y-e
4,mason,m-a-s-on
...,...,...
12806,remi,r-e-m-i
12807,diskriminatwa,d-i-s-k-r-i-m-i-n-a-t-w-a
12808,rejè,r-e-j-è
12809,manzè,m-an-z-è


In [16]:
k = data['seg'].tolist()

In [17]:
#creating sets of words with tags
wrds = ['an', 'ch','en','ng','on','ou','oun','ui']
a = []
for seg in k:
  word_letters = seg.split('-')
  sett = []
  for letter in word_letters:
    sett.append((letter[0], 'B'))
    if len(letter) > 1:
      for stri in letter[1:]:
        sett.append((stri, 'I'))
  a.append(sett)

In [18]:
vocab = list(set([w for sent in a for (w,t) in sent]))
vocab.append('<PAD>')
print(len(vocab))
tags = list(set([t for sent in a for (w,t) in sent]))
tags.append('<PAD>')
print(tags)
print(a[0]) 

28
['I', 'B', '<PAD>']
[('d', 'B'), ('e', 'B'), ('p', 'B'), ('o', 'B'), ('t', 'B'), ('w', 'B'), ('a', 'B')]


In [19]:
# check length of longest sentence
max_word = [len(seq) for seq in a]
print("Length of longest Word: {}".format(max(max_word)))

Length of longest Word: 18


In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_len = max(max_word)
word2index = {w: i for i, w in enumerate(vocab)}
tag2index = {t: i for i, t in enumerate(tags)}
onehot = [[word2index[w[0]] for w in s] for s in a]
X = pad_sequences(maxlen=max_len, sequences=onehot, padding="post", value=len(vocab)-1)

In [21]:
from tensorflow.keras.utils import to_categorical
onehot_y = [[tag2index[w[1]] for w in s] for s in a]
y = pad_sequences(maxlen=max_len, sequences=onehot_y, padding="post", value=tag2index["<PAD>"])
y = np.asarray([to_categorical(i, num_classes=len(tags)) for i in y])

In [22]:
# length of all words
print(len(X))
# train test split
X_train = X[:10000]
y_train = y[:10000]
X_test = X[10000:]
y_test = y[10000:]

12811


In [23]:
# basic bilstm referred from dr. kevin scanell
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional 
from tensorflow.keras.metrics import CategoricalAccuracy
model = Sequential()
model.add(Embedding(input_dim=len(vocab), output_dim=50, input_length=max_len))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
model.add(TimeDistributed(Dense(len(tags), activation="softmax")))
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 18, 50)            1400      
                                                                 
 bidirectional (Bidirectiona  (None, 18, 200)          120800    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 18, 3)            603       
 ibuted)                                                         
                                                                 
Total params: 122,803
Trainable params: 122,803
Non-trainable params: 0
_________________________________________________________________
None


In [24]:
history = model.fit(X_train, y_train, batch_size=5, epochs=5, validation_split=0.1, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
test_sent = 3
pred = model.predict(np.array([X_test[test_sent]]))
p = np.argmax(pred, axis=-1)
p_actual = np.argmax(y_test[test_sent],axis=-1)
print(p[0])
print(p_actual[0])

[1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2]
1


In [26]:
y_actual = []
y_pred = []
for test_sent in range(len(X_test)):
  pred = model.predict(np.array([X_test[test_sent]]))
  p = np.argmax(pred, axis=-1)
  p_actual = np.argmax(y_test[test_sent],axis=-1)
  for i in range(len(p_actual)):
    if p_actual[i] == 2: 
      break
    y_actual.append(p_actual[i])
    y_pred.append(p[0][i])

In [27]:
# f1 score for all labels
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_actual, y_pred, average=None, labels=[0, 1, 2])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(array([0.98721072, 0.99994343, 0.        ]),
 array([0.99938348, 0.99881336, 0.        ]),
 array([0.9932598 , 0.99937807, 0.        ]),
 array([ 1622, 17697,     0]))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(array([0.98721072, 0.99994343, 0.        ]),
 array([0.99938348, 0.99881336, 0.        ]),
 array([0.9932598 , 0.99937807, 0.        ]),
 array([ 1622, 17697,     0]))