In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,Embedding,Bidirectional,LSTM,TimeDistributed
from keras.losses import sparse_categorical_crossentropy
from keras.optimizers import Adam

In [None]:
# Load the NER corpus CSV into a DataFrame; the 'Sentence' and 'Tag' columns are read from it below.
# NOTE(review): absolute path, presumably a Colab-mounted Google Drive — the notebook
# only runs where this exact path exists.
df=pd.read_csv('/content/drive/MyDrive/ner.csv')

In [None]:
# Separate model inputs from targets: the 'Sentence' column feeds the
# tokenizer, the 'Tag' column holds the labels of each sentence.
X,Y=df['Sentence'],df['Tag']


In [None]:
# Vocabulary cap; the same value is the input dimension of the Embedding layer.
max_words=36000
# Keras' Tokenizer strips punctuation by default (its `filters` argument), which
# drops tokens such as "," or "." and breaks hyphenated words into pieces.
# The labels in df['Tag'] are parsed into one tag per token further down, so
# every dropped or split token shifts the following words against their tags.
# Disabling the filters and splitting on single spaces keeps exactly one id
# per whitespace-separated token (lower-casing is left at its default).
# NOTE(review): assumes 'Sentence' is already whitespace-tokenized with one tag
# per token — confirm against ner.csv.
tokenizer=Tokenizer(num_words=max_words,filters='',split=' ')
tokenizer.fit_on_texts(X)
# One list of integer word ids per sentence.
sequences=tokenizer.texts_to_sequences(X)

In [None]:
# Fixed sequence length fed to the model. The literal 110 is repeated in
# preprocess_y, in the Embedding layer and in prediction(), so change them together.
max_len=110
# Pad every id sequence at the end ('post') up to max_len positions.
# NOTE(review): `truncating` and the pad value are left at their defaults; per the
# Keras docs that is truncating='pre' (over-long sentences lose their FIRST
# tokens) and value 0 — confirm.
X_preprocessed=pad_sequences(sequences,maxlen=max_len,padding='post')

In [None]:
from ast import literal_eval
# Every entry of Y is a string holding a Python literal; parse each one so the
# cells below can iterate over the individual tags of a sentence.
Y_ready=[literal_eval(tag_string) for tag_string in Y]

In [None]:
# Distinct tags in first-seen order. dict keys keep insertion order, so this
# yields the same list as appending each tag the first time it is met.
tags=list(dict.fromkeys(tag for tag_list in Y_ready for tag in tag_list))

In [None]:
# Number of distinct tag classes; used as the width of the softmax output layer.
num_tags=len(tags)

In [None]:
# Tag string -> integer class id (the tag's position in `tags`).
tags2id={tag:index for index,tag in enumerate(tags)}

In [None]:
# Reverse lookup, class id -> tag string, used to decode model predictions.
id2tags={tag_id:tag for tag,tag_id in tags2id.items()}

In [None]:
def preprocess_y(tags2id,Y_ready,max_len=110):
  """Encode tag sequences as class ids and bring each to exactly max_len entries.

  Args:
    tags2id: mapping from tag string to integer class id; must contain 'O',
      whose id is used as the padding label.
    Y_ready: iterable of tag sequences, one sequence per sentence.
    max_len: target length of every encoded sequence (default 110, the
      value the original implementation hard-coded).

  Returns:
    A list with one list of ints per sentence, each of length max_len.

  Raises:
    KeyError: if a tag (or 'O') is missing from tags2id.
  """
  Y_preprocessed=[]
  for y in Y_ready:
    tag_ids=[tags2id[tag] for tag in y]
    # Over-long sequences used to produce a negative pad count and therefore a
    # row longer than max_len, which makes the later np.array(...) ragged.
    # Keep the LAST max_len tags: the inputs are cut by pad_sequences with its
    # default truncating='pre', which also drops leading tokens.
    if len(tag_ids)>max_len:
      tag_ids=tag_ids[len(tag_ids)-max_len:]
    num_O_to_append=max_len-len(tag_ids)
    # Pad at the end with the 'O' id, mirroring the 'post' padding of the inputs.
    Y_preprocessed.append(tag_ids+[tags2id['O']]*num_O_to_append)
  return Y_preprocessed


In [None]:
# Encode every sentence's tags as class ids, padded with the 'O' id up to 110 entries.
Y_preprocessed=preprocess_y(tags2id,Y_ready)

In [None]:
# Positional split of the padded inputs: rows [0, 30000) train,
# [30000, 37000) validation, everything after that test. No shuffling happens here.
X_train,X_val,X_test=(
  X_preprocessed[:30000],
  X_preprocessed[30000:37000],
  X_preprocessed[37000:],
)

In [None]:
# Turn the list of padded id lists into an array, then cut it at the same row
# boundaries as the inputs so sentences and labels stay paired.
Y_preprocessed=np.array(Y_preprocessed)
Y_train,Y_val,Y_test=(
  Y_preprocessed[:30000],
  Y_preprocessed[30000:37000],
  Y_preprocessed[37000:],
)

In [None]:
# Wrap each (inputs, labels) pair in a tf.data pipeline, one per split.
train_dataset,val_dataset,test_dataset=(
  tf.data.Dataset.from_tensor_slices(pair)
  for pair in ((X_train,Y_train),(X_val,Y_val),(X_test,Y_test))
)

In [None]:
batch_size=128
# Group every split into batches of `batch_size`. The dataset names are rebound
# to their batched versions, so running this cell a second time would batch
# the already-batched datasets again.
train_dataset,val_dataset,test_dataset=(
  dataset.batch(batch_size)
  for dataset in (train_dataset,val_dataset,test_dataset)
)

In [None]:
# Token-classification network: 300-d embeddings, two stacked bidirectional
# LSTMs that emit an output at every position, and a softmax over the tag
# classes applied independently to each position.
model=Sequential([
  Embedding(max_words,300,input_length=110),
  Bidirectional(LSTM(100,return_sequences=True)),
  Bidirectional(LSTM(100,return_sequences=True)),
  TimeDistributed(Dense(num_tags,activation='softmax')),
])

In [None]:
# Labels are integer class ids (not one-hot vectors), hence the sparse loss.
# `metrics` is passed as a list: that is the documented form of the argument,
# and Keras 3 raises a ValueError when it receives a bare string.
model.compile(loss='sparse_categorical_crossentropy',
              metrics=['accuracy'],
              optimizer='adam')

In [None]:
# Train for 10 epochs on the training batches, evaluating on the validation batches alongside.
model.fit(train_dataset,validation_data=val_dataset,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d9b2d62e500>

In [None]:
# Loss and accuracy on the held-out test batches.
# The accuracy covers all 110 positions of every sentence, including padding:
# preprocess_y labels padded positions 'O' and the Embedding layer applies no
# masking, so this figure overstates per-word tagging accuracy.
model.evaluate(test_dataset)



[0.07755501568317413, 0.979855477809906]

In [None]:
# word -> id table learned by the tokenizer, plus its inverse for turning
# id sequences back into words.
words2id=tokenizer.word_index
id2words={word_id:word for word,word_id in words2id.items()}

In [None]:
def prediction(model,test_sentence,id2tags,id2words):
  """Decode one padded sentence and predict a tag for each of its words.

  Args:
    model: object with a Keras-style ``predict`` method that returns
      per-position class scores for a batch of id sequences.
    test_sentence: array of word ids for a single sentence, where 0 marks
      padding.
    id2tags: mapping from class id to tag string.
    id2words: mapping from word id to word string.

  Returns:
    (original_sentence, predicted_tags): the decoded words joined by single
    spaces, and a list holding one tag per decoded word.

  Raises:
    KeyError: if a word id or a predicted class id is missing from its mapping.
  """
  # Add the batch axis predict() expects; -1 avoids hard-coding the padded length.
  test_sentence=test_sentence.reshape(1,-1)
  # Positions that hold a real word (id 0 is padding).
  is_word=test_sentence[0]>0

  word_list=[id2words[word_id] for word_id in test_sentence[0][is_word]]
  original_sentence=' '.join(word_list)

  scores=model.predict(test_sentence)
  tag_ids=np.argmax(scores[0],axis=1)
  # Keep predictions at real-word positions only. The earlier code sliced by
  # len(original_sentence) -- a character count, not a word count -- so tags
  # for padding positions leaked into the result.
  tag_ids=tag_ids[is_word]

  predicted_tags=[id2tags[tag_id] for tag_id in tag_ids]
  return original_sentence,predicted_tags

In [None]:
# Run the decoder on the first test sentence.
original_sentence,predicted_tags=prediction(model,X_test[0],id2tags,id2words)



In [None]:
# Show the words recovered from the token ids of that sentence.
print(original_sentence)

kosovo 's delegation for possible talks on the future of the united nations run province has met for the first time and stressed it would seek independence for the largely ethnic albanian region


In [None]:
# Show the tags returned by prediction() for that sentence.
print(predicted_tags)

['B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'I-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
