In [None]:
# !git clone https://github.com/taslimamindia/NERC.git

# Importation

In [27]:
import pandas as pd

import numpy as np

from nltk import word_tokenize, sent_tokenize, download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec

import tensorflow as tf

from keras.utils import to_categorical, pad_sequences

# import string

In [None]:
# Fetch the NLTK resources used by this notebook (needed on a fresh runtime
# such as Google Colab; already-present resources are simply re-checked):
#   wordnet   -> WordNetLemmatizer
#   stopwords -> stopwords.words(...)
#   punkt     -> word_tokenize / sent_tokenize
for nltk_resource in ("wordnet", "stopwords", "punkt"):
    download(nltk_resource)

# Class definition for the data.

In [28]:
class Data(object):
    """Container for one dataset split plus the vocabularies shared by all splits.

    Class attributes (shared by every instance):
        unique_words: token -> integer index; index 0 is reserved for "<PAD>".
        unique_ner_tags: NER tag -> integer index; index 0 is reserved for "O".
        MAX_LENGTH: sentence length used when padding.
        VOCAB_SIZE: dimension of the word vectors.

    Instance attributes:
        sentences / ner_tags / chunk_tags / pos_tags: one list per sentence.
        sentences_num / ner_tags_num: processed versions of sentences / ner_tags.
        x, y: final model inputs and targets.
    """
    unique_words = {"<PAD>": 0}
    unique_ner_tags = {"O": 0}
    MAX_LENGTH = 50
    VOCAB_SIZE = 100

    def __init__(self):
        self.sentences = []
        self.sentences_num = None
        self.ner_tags = []
        self.ner_tags_num = None
        self.chunk_tags = []
        self.pos_tags = []
        self.x, self.y = None, None

    def word2vec(self, vector_size=100):
        """Train and return a Word2Vec model on ``self.sentences``.

        The requested ``vector_size`` is also stored in ``Data.VOCAB_SIZE`` so the
        class-wide embedding dimension stays in sync with the trained model
        (previously it was assigned to a local variable and silently discarded).
        """
        Data.VOCAB_SIZE = vector_size
        return Word2Vec(self.sentences, vector_size=vector_size, window=5, min_count=1, workers=4)

    def word2idx(self, word: str):
        """Return the index registered for ``word``, or None if it is unknown."""
        return Data.unique_words.get(word, None)

    def idx2word(self, index: int):
        """Return the word registered under ``index``, or None if there is none."""
        for word, value in Data.unique_words.items():
            # Value equality, not identity: `is` only happens to work for the
            # small integers that CPython caches.
            if index == value:
                return word
        return None

    def tag2idx(self, tag):
        """Return the index registered for ``tag``, or None if it is unknown."""
        return Data.unique_ner_tags.get(tag, None)

    def idx2tag(self, index):
        """Return the tag registered under ``index``, or None if there is none."""
        for tag, value in Data.unique_ner_tags.items():
            if index == value:
                return tag
        return None

    def unicity(self):
        """Register every new word and tag of this split in the shared vocabularies.

        New entries receive consecutive indices in order of first appearance, so
        the numbering is the same on every run (iterating over a set of strings
        is not, because string hashing is randomised per interpreter process).
        """
        for sentence in self.sentences_num:
            for word in sentence:
                if word not in Data.unique_words:
                    Data.unique_words[word] = len(Data.unique_words)
        for tags in self.ner_tags_num:
            for tag in tags:
                if tag not in Data.unique_ner_tags:
                    Data.unique_ner_tags[tag] = len(Data.unique_ner_tags)


# Loading data

In [29]:
class Loading():
    """Read a CoNLL-style file into a ``Data`` instance.

    Each non-blank line holds four whitespace-separated columns, stored in this
    order: token, POS tag, chunk tag, NER tag. Blank lines and ``-DOCSTART-``
    marker lines end the current sentence.
    """

    def __init__(self, data: Data, file):
        """Attach ``data`` and immediately load the sentences found in ``file``."""
        self.data = data
        self.load_sentences(file)

    def _store_sentence(self, tokens, pos_tags, chunk_tags, ner_tags):
        """Append one completed sentence (four parallel lists) to ``self.data``."""
        self.data.sentences.append(tokens)
        self.data.pos_tags.append(pos_tags)
        self.data.chunk_tags.append(chunk_tags)
        self.data.ner_tags.append(ner_tags)

    def load_sentences(self, filepath):
        """Parse ``filepath`` and append its sentences to ``self.data``.

        Raises:
            IndexError: if a data line has fewer than four columns.
        """
        tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
        with open(filepath, 'r') as f:
            # Iterate lazily instead of materialising the whole file.
            for line in f:
                # Any "-DOCSTART-" line counts as a separator, whatever its
                # remaining columns or line ending are; so does a blank line.
                if line.startswith('-DOCSTART-') or not line.strip():
                    if tokens:
                        self._store_sentence(tokens, pos_tags, chunk_tags, ner_tags)
                        tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
                    continue
                fields = line.split()
                tokens.append(fields[0])
                pos_tags.append(fields[1])
                chunk_tags.append(fields[2])
                ner_tags.append(fields[3])
        # A file that does not end with a blank line still has a pending sentence.
        if tokens:
            self._store_sentence(tokens, pos_tags, chunk_tags, ner_tags)

# Preprocessing

In [30]:
class Preprocessing():
    """Text clean-up steps operating on a ``Data`` instance.

    Two modes:
      * ``text`` is None: work on the sentences already loaded in ``data``
        (``sentences_num`` / ``ner_tags_num`` start as the raw sentences / tags).
      * ``text`` is given: call ``tokenize()`` first to fill ``data.sentences``.
    """

    # Single-character punctuation tokens removed together with the stop words.
    PUNCTUATION = frozenset('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

    def __init__(self, data:Data, text=None, lang="english"):
        self.data = data
        self.text = text
        self.lang = lang
        if text is None:
            self.data.sentences_num = self.data.sentences
            self.data.ner_tags_num = self.data.ner_tags

    def tokenize(self):
        """Split ``self.text`` into sentences of tokens, dropping stop words.

        Does nothing when no text was supplied.
        """
        if self.text is None:
            return
        # Load the stop-word list once; it used to be re-read for every token.
        stop_words = set(stopwords.words(self.lang))
        tokenized = [
            word_tokenize(sentence, language=self.lang)
            for sentence in sent_tokenize(self.text, language=self.lang)
        ]
        self.data.sentences = [
            [token for token in sentence if token not in stop_words]
            for sentence in tokenized
        ]
        self.data.sentences_num = self.data.sentences

    def lowercasing(self):
        """Lower-case every token of ``data.sentences_num``."""
        self.data.sentences_num = [[word.lower() for word in sentence] for sentence in self.data.sentences_num]

    def lemmatize(self):
        """Replace every token of ``data.sentences_num`` by its WordNet lemma."""
        lemmatizer = WordNetLemmatizer()
        self.data.sentences_num = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in self.data.sentences_num]

    def remove_stopword(self):
        """Drop stop words and punctuation tokens together with their NER tags.

        Tokens come from ``data.sentences_num`` and tags from ``data.ner_tags``;
        both filtered results are written to ``sentences_num`` / ``ner_tags_num``
        so they stay aligned position by position.

        Raises:
            IndexError: if a sentence has more tokens than tags, or there are
                fewer tag lists than sentences.
        """
        # One set built once: O(1) membership instead of rebuilding a list per token.
        excluded = set(stopwords.words(self.lang)) | self.PUNCTUATION
        kept_tokens, kept_tags = [], []
        for i, sentence in enumerate(self.data.sentences_num):
            tags = self.data.ner_tags[i]
            pairs = [(token, tags[j]) for j, token in enumerate(sentence)]
            pairs = [(token, tag) for token, tag in pairs if token not in excluded]
            kept_tokens.append([token for token, _ in pairs])
            kept_tags.append([tag for _, tag in pairs])
        self.data.sentences_num = kept_tokens
        self.data.ner_tags_num = kept_tags

# Vectorization

In [32]:
class Vectorization():
    """Turn the tokens and tags of a ``Data`` instance into numeric arrays."""

    def __init__(self, data:Data):
        self.data = data

    def word2vec(self, min_count=1, window=5):
        """Replace every token in ``data.sentences_num`` by its Word2Vec vector.

        The model is trained on ``data.sentences_num`` itself with
        ``Data.VOCAB_SIZE`` dimensions.
        NOTE(review): every call trains a fresh model, so separate Data splits
        get independent embedding spaces - confirm this is intended.
        """
        word2vec_model = Word2Vec(self.data.sentences_num, min_count=min_count, vector_size=Data.VOCAB_SIZE, window=window)
        self.data.sentences_num = [[word2vec_model.wv[word] for word in sentence] for sentence in self.data.sentences_num]

    def padding_x(self, value=None, dtype="float32"):
        """Pad/truncate ``data.sentences_num`` to ``MAX_LENGTH`` and store it in ``data.x``.

        Args:
            value: padding element; defaults to a float32 zero vector of
                ``Data.VOCAB_SIZE`` entries, built at call time so that it
                follows later changes of ``Data.VOCAB_SIZE``.
            dtype: dtype passed to ``pad_sequences``.
        """
        if value is None:
            value = np.zeros((Data.VOCAB_SIZE,), dtype="float32")
        self.data.x = pad_sequences(
            sequences=self.data.sentences_num,
            maxlen=self.data.MAX_LENGTH,
            dtype=dtype,
            padding="post",
            value=value
        )

    def vectorized_x(self):
        """Embed the tokens, then pad them into ``data.x``."""
        self.word2vec()
        self.padding_x()

    def tag2num(self):
        """Replace every tag in ``data.ner_tags_num`` by its one-hot vector."""
        NUM_CLASSES = len(Data.unique_ner_tags)
        self.data.ner_tags_num = [[to_categorical(Data.unique_ner_tags.get(tag), num_classes=NUM_CLASSES) for tag in tags] for tags in self.data.ner_tags_num]

    def padding_y(self, value=None):
        """Pad/truncate ``data.ner_tags_num`` to ``MAX_LENGTH`` and store it in ``data.y``.

        Args:
            value: padding element; defaults to the one-hot vector of the "O"
                tag. It is built at call time: when it was a default argument
                it was computed while only "O" was registered, giving a
                one-element vector that no longer matched the number of
                classes once more tags had been added.
        """
        if value is None:
            value = to_categorical(Data.unique_ner_tags.get("O"), num_classes=len(Data.unique_ner_tags))
        self.data.y = pad_sequences(
            sequences=self.data.ner_tags_num,
            maxlen=self.data.MAX_LENGTH,
            padding="post",
            dtype="float32",
            value=value
        )

    def vectorized_y(self):
        """One-hot encode the tags, then pad them into ``data.y``."""
        self.tag2num()
        self.padding_y()

def load_dataset(path: str, base_dir: str = "../Data/conll2003_english/"):
    """Load one dataset file into a new ``Data`` instance.

    Args:
        path: file name inside ``base_dir`` (e.g. "train.txt").
        base_dir: directory holding the dataset files. Override it when the
            data lives elsewhere, e.g. "/content/NERC/Data/conll2003_english/"
            on Google Colab.

    Returns:
        The populated ``Data`` instance.
    """
    import os

    data = Data()
    Loading(data=data, file=os.path.join(base_dir, path))
    return data

# Main

### New input text

In [None]:
# test_text = Data()

# preprocessing = Preprocessing(data = test_text, text = "Obama is the president of the United States. I am from Guinea, nice to meet you.")
# preprocessing.tokenize()
# preprocessing.lowercasing()
# preprocessing.lemmatize()
# print(test_text.sentences)

# vector = Vectorization(test_text)
# vector.vectorized_x()
# print(test_text.x.shape)

### Parameters

In [33]:
# Hyperparameters read by the model cells below.
# NUM_WORDS and NUM_CLASSES are not defined here; their former definitions,
# len(Data.unique_words) and len(Data.unique_ner_tags), are disabled.

# Architecture: input feature size, filter count, convolution kernel width.
EMBEDDING_DIM, NUM_FILTERS, KERNEL_SIZE = 100, 256, 3
DROPOUT_RATE = 0.5  # rate given to every Dropout layer

# Optimisation: arguments passed to fit() / predict().
BATCH_SIZE, EPOCHS = 32, 10

## Evaluation

In [34]:
def evaluation(test:Data, y_predict):
    """Print and return how many entity positions were tagged correctly.

    Class index 0 is treated as the non-entity tag; only positions whose true
    class is not 0 count towards the totals.

    Args:
        test: object whose ``y`` holds one-hot targets with the classes on the
            last axis (any number of leading axes).
        y_predict: class scores covering the same positions as ``test.y``.

    Returns:
        dict with ``true``, ``false``, ``total``, ``true_rate``, ``false_rate``,
        ``predicted_outside`` (positions predicted as class 0) and ``positions``.

    Raises:
        ValueError: if ``y_predict`` does not cover the same number of
            positions as ``test.y``.
    """
    y_true = np.asarray(test.y)
    # argmax over the class axis, then flatten so 2-D (tokens, classes) and
    # 3-D (sentences, steps, classes) layouts are handled alike.
    real_tags = np.argmax(y_true, axis=-1).ravel()
    predicted_tags = np.argmax(np.asarray(y_predict), axis=-1).ravel()
    if real_tags.shape != predicted_tags.shape:
        raise ValueError(
            "y_predict covers %d positions but test.y covers %d"
            % (predicted_tags.size, real_tags.size)
        )

    positions = int(real_tags.size)
    predicted_outside = int(np.count_nonzero(predicted_tags == 0))
    is_entity = real_tags != 0
    total = int(np.count_nonzero(is_entity))
    true = int(np.count_nonzero(is_entity & (real_tags == predicted_tags)))
    false = total - true
    # With no entity position at all the rates are reported as 0.0 rather than
    # raising ZeroDivisionError.
    true_rate = round(true / total, 3) if total else 0.0
    false_rate = round(false / total, 3) if total else 0.0

    print("----------------------- Evaluation -------------------------")
    print(y_true.shape)
    print(predicted_outside, positions)
    print(true, false, total, true_rate, false_rate, end="\n\n")
    return {
        "true": true,
        "false": false,
        "total": total,
        "true_rate": true_rate,
        "false_rate": false_rate,
        "predicted_outside": predicted_outside,
        "positions": positions,
    }

In [35]:
def checkDataset(train, test, valid):
    """Print the shapes of ``x`` and ``y`` for the train, test and valid splits.

    A blank line separates the splits; nothing is returned.
    """
    report = (
        ("X_train", "y_train", train, True),
        ("X_test", "y_test", test, True),
        ("X_valid", "y_valid", valid, False),
    )
    for x_label, y_label, split, blank_after in report:
        print(x_label, split.x.shape)
        if blank_after:
            print(y_label, split.y.shape, "\n")
        else:
            print(y_label, split.y.shape)

def main():
    """Load, preprocess and vectorize the train, test and valid splits.

    All three splits are preprocessed before any of them is vectorized, so the
    shared tag vocabulary is complete when the tags are one-hot encoded.

    Returns:
        (train, test, valid) as ``Data`` instances.
    """
    splits = [load_dataset(file_name) for file_name in ("train.txt", "test.txt", "valid.txt")]
    for split in splits:
        preprocess_lstm(split)
    for split in splits:
        vectorize(split)
    train, test, valid = splits
    checkDataset(train, test, valid)
    return train, test, valid

## CNN model

In [53]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, MaxPooling1D

# Stand-alone shape experiment: two Conv1D layers, each followed by Dropout, on a
# (100, 1) input, ending in a 9-way softmax. No padding is requested, and the
# recorded summary shows the length shrinking 100 -> 98 -> 96.
model = Sequential([
    Conv1D(64, KERNEL_SIZE, activation='relu', input_shape=(100,1)),
    Dropout(DROPOUT_RATE),
    Conv1D(32, KERNEL_SIZE, activation='relu'),
    Dropout(DROPOUT_RATE),
    Dense(9, activation='softmax'),
])
model.summary()

Model: "sequential_31"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_17 (Conv1D)          (None, 98, 64)            256       
                                                                 
 dropout_14 (Dropout)        (None, 98, 64)            0         
                                                                 
 conv1d_18 (Conv1D)          (None, 96, 32)            6176      
                                                                 
 dropout_15 (Dropout)        (None, 96, 32)            0         
                                                                 
 dense_16 (Dense)            (None, 96, 9)             297       
                                                                 
Total params: 6,729
Trainable params: 6,729
Non-trainable params: 0
_________________________________________________________________


In [36]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, MaxPooling1D

class Model_CNN:
    """Two-layer 1-D CNN tagger with one softmax output per time step.

    The number of output classes is read from ``Data.unique_ner_tags`` when the
    model is constructed, so the tag vocabulary must be populated first.
    """

    def __init__(self):
        # The module-level NUM_CLASSES constant is commented out, so derive the
        # class count from the shared tag vocabulary (as Model_LSTM does).
        num_classes = len(Data.unique_ner_tags)
        self.model = Sequential()
        # padding='same' is requested on both convolutions.
        self.model.add(Conv1D(64, KERNEL_SIZE, activation='relu', input_shape=(Data.MAX_LENGTH, EMBEDDING_DIM), padding='same'))
        self.model.add(Dropout(DROPOUT_RATE))
        self.model.add(Conv1D(32, KERNEL_SIZE, activation='relu', padding='same'))
        self.model.add(Dropout(DROPOUT_RATE))
        self.model.add(Dense(num_classes, activation='softmax'))

    def summary(self):
        """Print the Keras layer summary."""
        self.model.summary()

    def trainning(self, train:Data, valid:Data=None):
        """Compile the model and fit it on ``train``.

        Args:
            train: split providing ``x`` and ``y``.
            valid: optional split used as validation data.
        """
        cat_accuracy = tf.keras.metrics.CategoricalAccuracy()
        recall = tf.keras.metrics.Recall()
        self.model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.CategoricalCrossentropy(), metrics=[cat_accuracy, recall])
        validation_data = None if valid is None else (valid.x, valid.y)
        self.model.fit(train.x, train.y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=validation_data)

    def testing(self, test:Data):
        """Return the loss and metric values of the model on ``test``."""
        return self.model.evaluate(test.x, test.y)

    def predicting(self, test:Data):
        """Return the predicted class scores for ``test.x``."""
        return self.model.predict(test.x, batch_size=BATCH_SIZE)

In [None]:
def main_cnn(param:dict):
    """Train and evaluate a CNN once per requested maximum sentence length.

    Args:
        param: settings dict; ``param["max_length"]`` is an iterable of values
            assigned in turn to ``Data.MAX_LENGTH``. A missing, empty or zero
            entry runs nothing.

    Returns:
        dict with two parallel lists: ``"params"`` (one ``{"max_length": n}``
        per run) and ``"metrics"`` (the value returned by ``testing`` per run).
    """
    dico = {"params": [], "metrics": []}
    for max_length in param.get("max_length") or []:
        Data.MAX_LENGTH = max_length
        train, test, valid = main()
        model_cnn = Model_CNN()
        model_cnn.trainning(train, valid)
        test_metrics = model_cnn.testing(test)
        y_predict_cnn = model_cnn.predicting(test)
        evaluation(test, y_predict_cnn)
        # Keep what each run produced instead of discarding it.
        dico["params"].append({"max_length": max_length})
        dico["metrics"].append(test_metrics)
    return dico

In [None]:
# Run the CNN experiment once, with Data.MAX_LENGTH set to 50.
# NOTE(review): main() builds per-token 2-D arrays through vectorize(), while
# Model_CNN declares a (MAX_LENGTH, EMBEDDING_DIM) input - confirm this cell runs.
main_cnn({"max_length":[50]})

In [None]:
# # from sklearn.feature_extraction.text import TfidfVectorizer
# from keras.models import Model
# from keras.layers import Dense, Conv1D
# from tf2crf import CRF, ModelWithCRFLoss
# from keras import Input

# # Build CNN model
# # model = Sequential()
# inputs = Input(shape=(MAX_LENGTH, EMBEDDING_DIM))
# outputs = Conv1D(64, KERNEL_SIZE, activation='relu', padding='same')(inputs)
# # model.add(MaxPooling1D(2, padding='same'))
# # outputs = Dropout(DROPOUT_RATE)(inputs)
# outputs = Conv1D(32, KERNEL_SIZE, activation='relu', padding='same')(inputs)
# # model.add(MaxPooling1D(2))
# # model.add(Dropout(DROPOUT_RATE))
# # model.add(Dense(HIDDEN_DIM, activation='relu'))
# # outputs = Dropout(DROPOUT_RATE)(outputs)
# outputs = Dense(NUM_CLASSES, activation='relu')(outputs)
# # outputs.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# # outputs.summary()
# crf = CRF(units=9)
# # cnn_model.add(crf)
# output = crf(outputs)
# cnn_crf_model = Model(inputs, output)
# cnn_crf_model.summary()
# # cnn_crf_model = ModelWithCRFLoss(base_model, sparse_target=True)
# # cnn_crf_model.summary()

In [None]:
# cnn_crf_model.compile(optimizer='adam')
# cnn_crf_model.fit(train.x, train.y, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(valid.x, valid.y))

In [None]:
# # Evaluation
# loss, accuracy = cnn_crf_model.evaluate(test.x, test.y, batch_size=BATCH_SIZE)

# print('Test Loss:', loss)
# print('Test Accuracy:', accuracy)

## Model LSTM

In [37]:
def preprocess_lstm(data:Data):
    """Clean the sentences of ``data`` in place and register its vocabulary.

    Steps, in order: lower-casing, lemmatization, removal of stop words and
    punctuation, then ``data.unicity()``.
    """
    cleaner = Preprocessing(data=data)
    for step in (cleaner.lowercasing, cleaner.lemmatize, cleaner.remove_stopword):
        step()
    data.unicity()

def vectorize(data:Data):
    """Build word-level arrays: one row per token in ``data.x`` / ``data.y``.

    Tokens are embedded and tags one-hot encoded, then the sentence structure
    is flattened away. The sentence-level alternative (``vectorized_x`` /
    ``vectorized_y``, which pad whole sentences) is not used here.
    """
    vectorizer = Vectorization(data=data)
    vectorizer.word2vec()
    vectorizer.tag2num()
    # Flatten sentences -> tokens, keeping the original order.
    flat_vectors = [vector for sentence in data.sentences_num for vector in sentence]
    flat_targets = [one_hot for tags in data.ner_tags_num for one_hot in tags]
    data.x, data.y = np.array(flat_vectors, dtype="float32"), np.array(flat_targets, dtype="float32")
    print(data.x.shape, data.y.shape)

In [38]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

class Model_LSTM:
    """Stack of three LSTM layers followed by a softmax over the NER tags.

    The network consumes sequences of length 1 with ``Data.VOCAB_SIZE`` features.
    An earlier variant (disabled) consumed whole sentences with
    ``input_shape=(Data.MAX_LENGTH, Data.VOCAB_SIZE)`` and an extra 256-unit
    first layer.
    """

    def __init__(self):
        self.model_LSTM = Sequential([
            LSTM(128, input_shape=(1,Data.VOCAB_SIZE), return_sequences=True, dropout=0.5),
            LSTM(64, return_sequences=True, dropout=0.5),
            LSTM(32, return_sequences=True, dropout=0.5),
            Dense(len(Data.unique_ner_tags), activation='softmax'),
        ])

    def summary(self):
        """Print the Keras layer summary."""
        self.model_LSTM.summary()

    def trainning(self, train:Data, valid:Data=None):
        """Compile the model and fit it on ``train``, validating on ``valid`` if given."""
        self.model_LSTM.compile(
            optimizer=tf.keras.optimizers.Adam(),
            loss=tf.keras.losses.CategoricalCrossentropy(),
            metrics=[tf.keras.metrics.CategoricalAccuracy(), tf.keras.metrics.Recall()],
        )
        fit_options = {"batch_size": BATCH_SIZE, "epochs": EPOCHS}
        if valid is not None:
            fit_options["validation_data"] = (valid.x, valid.y)
        self.model_LSTM.fit(train.x, train.y, **fit_options)

    def testing(self, test:Data):
        """Return the loss and metric values of the model on ``test``."""
        return self.model_LSTM.evaluate(test.x, test.y)

    def predicting(self, test:Data):
        """Return the predicted class scores for ``test.x``."""
        return self.model_LSTM.predict(test.x, batch_size=BATCH_SIZE)

In [39]:
def main_lstm(param:dict):
    """Prepare the data and print the LSTM summary once per maximum length.

    ``param["max_length"]`` lists the values assigned in turn to
    ``Data.MAX_LENGTH``; a missing or zero entry runs nothing.
    """
    if param.get("max_length", 0) == 0:
        return
    for max_length in param["max_length"]:
        Data.MAX_LENGTH = max_length
        train, test, valid = main()
        model_lstm = Model_LSTM()
        model_lstm.summary()
        # Currently disabled steps:
        #   model_lstm.trainning(train, valid)
        #   model_lstm.testing(test)
        #   y_predict_lstm = model_lstm.predicting(test)
        #   evaluation(test, y_predict_lstm)

In [46]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Stand-alone shape experiment: one LSTM over 50 time steps of 1 feature. Without
# return_sequences the recorded summary shows a single (None, 128) output, which
# the Dense layer maps to 9 sigmoid units. A second LSTM(64, activation="relu")
# layer was tried and is disabled.
# NOTE(review): the other models in this notebook end in softmax - confirm that
# sigmoid is intended here.
model = Sequential([
    LSTM(128, input_shape=(50, 1), dropout=0.5),
    Dense(9, activation='sigmoid'),
])
model.summary()


Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_39 (LSTM)              (None, 128)               66560     
                                                                 
 dense_12 (Dense)            (None, 9)                 1161      
                                                                 
Total params: 67,721
Trainable params: 67,721
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Prepare the three splits with Data.MAX_LENGTH set to 50 and print the LSTM summary.
main_lstm({"max_length":[50]})

(125907, 100) (125907, 9)
(29480, 100) (29480, 9)
(31205, 100) (31205, 9)
X_train (125907, 100)
y_train (125907, 9) 

X_test (29480, 100)
y_test (29480, 9) 

X_valid (31205, 100)
y_valid (31205, 9)
Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_29 (LSTM)              (None, 1, 128)            117248    
                                                                 
 lstm_30 (LSTM)              (None, 1, 64)             49408     
                                                                 
 lstm_31 (LSTM)              (None, 1, 32)             12416     
                                                                 
 dense_8 (Dense)             (None, 1, 9)              297       
                                                                 
Total params: 179,369
Trainable params: 179,369
Non-trainable params: 0
_______________________________________________

## Tools

In [None]:
# !pip install tensorflow-addons

In [None]:
# model_LSTM.save("../Data/model_lstm.keras")

In [None]:
# model_LSTM = tf.keras.models.load_model("../Data/model_lstm.keras")

In [None]:
# # np.quantile(sort([1, 2, 3, 8, 7]), 0.50)
# dico = {}
# for tags in test.sentences + train.sentences + valid.sentences:
#   if dico.get(len(tags), None) == None:
#     dico[len(tags)] = 1
#   dico[len(tags)] += 1
# sorted(list(dico.items()), key= lambda x: x[1])

In [None]:
# entities = dict(zip(Data.unique_ner_tags.keys(), [0 for i in range(len(Data.unique_ner_tags))]))
# for tags in test.ner_tags:
#     for tag in tags:
#         entities[tag] += 1
# is_entities = 0
# is_not_entities = 0
# for tag, nbr in entities.items():
#     if tag != 'O': is_entities += nbr
#     else: is_not_entities += nbr
# print(entities)
# print(is_entities, is_not_entities)

## TF-IDF CNN-Softmax

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, MaxPooling1D
# from keras_contrib.layers import CRF
# from keras_contrib.utils import save_load_utils

In [None]:
def load_tf_idf():
    """Load the data for the TF-IDF experiment.

    Only "train.txt" is loaded for now; the test and valid slots are returned
    as None.

    Returns:
        (train, None, None)
    """
    return load_dataset("train.txt"), None, None

def preprocess_tfidf(data:Data):
    """Clean ``data`` for the TF-IDF experiment and register its vocabulary.

    Applies lower-casing, lemmatization and stop-word/punctuation removal, then
    calls ``data.unicity()``. No vectorisation is performed here.
    """
    text_cleaner = Preprocessing(data=data)
    pipeline = (
        text_cleaner.lowercasing,
        text_cleaner.lemmatize,
        text_cleaner.remove_stopword,
    )
    for apply_step in pipeline:
        apply_step()
    data.unicity()

def vectorize_tf_idf(data:Data):
    """Pad the token and tag sequences of ``data`` with string placeholders.

    Prints ``data.x`` before and after padding the tokens with "<PAD>", then
    pads the tags with "O".
    NOTE(review): padding_y pads with dtype "float32" while "O" is a string -
    confirm that this combination is accepted.
    """
    padder = Vectorization(data=data)
    print(data.x)
    padder.padding_x(value="<PAD>", dtype="str")
    print(data.x)
    padder.padding_y(value="O")

def formalize_tfidf(data:Data):
    """Render every sentence as one string of ``token__tag`` items.

    Side effect: ``data.x`` and ``data.y`` are set to ``data.sentences_num`` and
    ``data.ner_tags_num``.

    Returns:
        list of str, one per sentence, items separated by single spaces.
    """
    data.x = data.sentences_num
    data.y = data.ner_tags_num
    documents = []
    for i, tokens in enumerate(data.x):
        items = ("__".join((token, data.y[i][j])) for j, token in enumerate(tokens))
        documents.append(" ".join(items))
    return documents

def tf(train:Data, test:Data, valid:Data):
    """Fit a TF-IDF vectorizer on the training sentences and store the result.

    ``train.x`` is replaced by the dense TF-IDF matrix; the sparse matrix and
    the dense shape are printed. ``test`` and ``valid`` are accepted but not
    used yet.

    NOTE(review): defining this function rebinds the name ``tf``, which the
    notebook also uses as the ``tensorflow`` alias (``tf.keras...`` in the model
    classes); running this cell before them breaks those calls.
    NOTE(review): the fitted documents contain ``token__tag`` items, so the tag
    text is part of the features - confirm this is intended.
    """
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(formalize_tfidf(train))
    train.x = tfidf_matrix.toarray()
    print(tfidf_matrix)
    print(train.x.shape)

In [None]:
class TF_IDF:
    """Driver of the TF-IDF experiment (work in progress).

    Construction loads the data, preprocesses the training split and computes
    its TF-IDF matrix. Preprocessing of the test/valid splits and the padding
    step are not enabled; training, testing and evaluation are placeholders.
    """

    def __init__(self):
        loaded_splits = load_tf_idf()
        self.train, self.test, self.valid = loaded_splits
        preprocess_tfidf(self.train)
        tf(self.train, self.test, self.valid)

    def training(self):
        """Placeholder: does nothing yet."""
        return None

    def testing(self):
        """Placeholder: does nothing yet."""
        return None

    def evaluation(self):
        """Placeholder: does nothing yet."""
        return None


# Building the object runs the whole pipeline described in the class docstring.
tfidf = TF_IDF()

In [None]:
# import numpy as np
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Activation, Embedding, Conv1D, GlobalMaxPooling1D

# # Load data
# df = pd.read_csv("ner_data.csv", encoding="ISO-8859-1", error_bad_lines=False)
# df = df.fillna(method="ffill")
# sentences = df.groupby("Sentence #")["Word"].apply(list).values
# tags = df.groupby("Sentence #")["Tag"].apply(list).values

# # Perform TF-IDF
# vectorizer = TfidfVectorizer(ngram_range=(1,2))
# X_tfidf = vectorizer.fit_transform([" ".join(sent) for sent in sentences])
# tfidf_vocab = vectorizer.vocabulary_
# tfidf_vocab_inv = {v:k for k,v in tfidf_vocab.items()}
# tfidf_weights = np.asarray(X_tfidf.mean(axis=0)).ravel()

# # Tokenize words
# MAX_NB_WORDS = 20000
# MAX_SEQ_LENGTH = 100
# tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
# tokenizer.fit_on_texts(sentences)
# word_index = tokenizer.word_index

# # Convert words to sequences
# X = tokenizer.texts_to_sequences(sentences)
# X = pad_sequences(X, maxlen=MAX_SEQ_LENGTH)

# # Convert tags to sequences
# tags_index = {"O": 0, "B-LOC": 1, "I-LOC": 2, "B-PER": 3, "I-PER": 4, "B-ORG": 5, "I-ORG": 6}
# y = [[tags_index[tag] for tag in sent] for sent in tags]
# y = pad_sequences(y, maxlen=MAX_SEQ_LENGTH)

# # Split data into train and test sets
# VALIDATION_SPLIT = 0.2
# nb_validation_samples = int(VALIDATION_SPLIT * len(X))
# X_train = X[:-nb_validation_samples]
# y_train = y[:-nb_validation_samples]
# X_test = X[-nb_validation_samples:]
# y_test = y[-nb_validation_samples:]

# # Define CNN model
# EMBEDDING_DIM = 100
# model = Sequential()
# model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQ_LENGTH))
# model.add(Conv1D(128, 5, activation="relu"))
# model.add(GlobalMaxPooling1D())
# model.add(Dense(7, activation="softmax"))
# model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# # Train model
# model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128)

# # Predict tags for new sentences
# def predict_tags(sentences):
#     X = tokenizer.texts_to_sequences(sentences)
#     X = pad_sequences(X, maxlen=MAX_SEQ_LENGTH)
#     y_pred = model.predict(X)
#     return [[tfidf_vocab_inv[np.argmax(tfidf_weights * y)] if np.max(tfidf_weights * y) > 0.2 else "O" for y in sent] for sent in y_pred]

# # Test predictions
# sentences_test = ["John lives in New York City.", "Steve Jobs was the founder of Apple."]
# tags_pred = predict_tags(sentences_test)
# print(tags_pred)