In [1]:
# Importing all the needed libraries for the project

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Preparing the Dataset

# Location of the raw corpus; kept in one place so it is easy to repoint.
DATA_PATH = '/content/drive/MyDrive/NER Project/ner_dataset.csv'

# Read the CSV, discard the POS column (not used by this notebook) and
# forward-fill empty cells from the row above.
# `DataFrame.ffill()` replaces the deprecated `fillna(method="ffill")`, and
# `drop(columns=...)` returns a new frame instead of mutating in place.
df = pd.read_csv(DATA_PATH, encoding='latin1')
df = df.drop(columns='POS').ffill()

In [6]:
# Hold out 20% of the (word, tag) rows for testing; the fixed random_state
# makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df["Word"],
    df["Tag"],
    test_size=0.2,
    random_state=42,
)

In [8]:
### Preparing Training Data

# Plain Python lists of the training words and their tags.
words = x_train.tolist()
ner_tags = y_train.tolist()

# Step 1: Encoding
# Indices start at 1 so that 0 stays free for padding.
# The unique values are sorted before numbering: iterating a set of strings
# yields a different order from one interpreter run to the next, which would
# make the index assigned to each word/tag irreproducible.
word_to_index = {word: i + 1 for i, word in enumerate(sorted(set(words), key=str))}
ner_to_index = {ner: i + 1 for i, ner in enumerate(sorted(set(ner_tags), key=str))}

# Zero-based class ids used as the training targets.
label_encoder = LabelEncoder()
ner_labels_encoded = label_encoder.fit_transform(ner_tags)

word_indices = [word_to_index[word] for word in words]
ner_indices = [ner_to_index[ner] for ner in ner_tags]

# Step 2: Padding
# Both lists come from the same split and therefore have the same length, so
# the pad widths below are 0 and np.pad only converts the lists to arrays.
# max_length is kept because later cells read it.
max_length = max(len(word_indices), len(ner_indices))
word_indices_padded = np.pad(word_indices, (0, max_length - len(word_indices)), mode='constant')
ner_indices_padded = np.pad(ner_indices, (0, max_length - len(ner_indices)), mode='constant')

# Step 3: (no second split)
# x_train / x_test / y_train / y_test are deliberately NOT reassigned here.
# Splitting the encoded training arrays into those names would replace the
# held-out words with samples that are part of the data passed to model.fit,
# and would make this cell behave differently when run a second time.

# Step 4: Preparing the inputs for the Model.
# Each word becomes one row with a single column.
word_indices_padded_reshaped = np.expand_dims(word_indices_padded, axis=-1)
ner_indices_padded_reshaped = np.expand_dims(ner_indices_padded, axis=-1)
vocab_size = len(word_to_index) + 1  # +1 for the reserved index 0
num_classes = len(set(df["Tag"]))

In [9]:
### Building a CNN Model for prediction:

# Hyper-parameters used by the model definition.
# The number of output classes is the number of distinct encoded training labels.
num_classes = len(np.unique(ner_labels_encoded))
embedding_dim, num_filters, kernel_size = 100, 128, 3

def build_model(input_dim, output_dim, max_seq_length, embedding_dim=embedding_dim, num_filters=num_filters, kernel_size=kernel_size):
    """Assemble an Embedding -> Conv1D -> GlobalMaxPooling1D -> Dense(softmax) model.

    Args:
        input_dim: Size of the embedding vocabulary (largest word index + 1).
        output_dim: Number of units in the softmax output layer.
        max_seq_length: Number of integer tokens in each input row.
        embedding_dim: Width of each embedding vector.
        num_filters: Number of Conv1D filters.
        kernel_size: Conv1D window size.

    Returns:
        The uncompiled Keras ``Model``.
    """
    inputs = Input(shape=(max_seq_length,))
    # Use the function's own max_seq_length (not the notebook-level max_length)
    # so the embedding always agrees with the declared input shape.
    embedding_layer = Embedding(input_dim=input_dim, output_dim=embedding_dim, input_length=max_seq_length)(inputs)
    conv_layer = Conv1D(filters=num_filters, kernel_size=kernel_size, activation='relu', padding='same')(embedding_layer)
    # Collapse the sequence axis: one feature vector per input row.
    pooling_layer = GlobalMaxPooling1D()(conv_layer)
    outputs = Dense(output_dim, activation='softmax')(pooling_layer)

    model = Model(inputs=inputs, outputs=outputs)
    return model

# Build and compile the model
# Seed Python, NumPy and TensorFlow before the weights are created so that
# initialisation and batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

model = build_model(input_dim=vocab_size, output_dim=num_classes, max_seq_length=max_length,
                    embedding_dim=embedding_dim, num_filters=num_filters, kernel_size=kernel_size)
# Targets are integer class ids, hence the sparse variant of the loss.
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the model is declared with max_length tokens per row, while the
# array passed here has a single column per row — confirm the intended
# sequence length.
# The History object is kept so the per-epoch metrics can be inspected later.
history = model.fit(word_indices_padded_reshaped, ner_labels_encoded, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7b191ce5b700>

In [12]:
###Preparing Testing Data

# Re-derive the held-out 20% directly from df with the same seed as the
# initial split, so this cell does not depend on whatever x_test / y_test
# currently hold and can be re-run safely.
held_out_split = train_test_split(df.Word, df.Tag, test_size=0.2, random_state=42)
words_test = held_out_split[1].tolist()
ner_tags_test = held_out_split[3].tolist()

# Step 1: Encoding
# The test data must be encoded with the TRAINING vocabulary and label
# encoder; building fresh ones from the test set would assign different
# integers to the same word/tag than the model was trained with.
# Words or tags that were never seen in training map to the reserved index 0.
word_indices_test = [word_to_index.get(word, 0) for word in words_test]
ner_indices_test = [ner_to_index.get(ner, 0) for ner in ner_tags_test]

# Raises ValueError if the test set contains a tag absent from training.
ner_labels_encoded_test = label_encoder.transform(ner_tags_test)

# Step 2: Padding
# No padding is applied: every word is its own sample, and appending zeros
# would create extra rows that have no corresponding label.
word_indices_padded_test = np.asarray(word_indices_test)
ner_indices_padded_test = np.asarray(ner_indices_test)

# One row per word, single column — same layout as the training inputs.
word_indices_padded_reshaped_test = np.expand_dims(word_indices_padded_test, axis=-1)

In [15]:
# Run the trained model over the encoded test inputs.
# The network ends in a softmax layer, so y_pred holds one score per class for
# every input row; it has not been reduced to class ids yet.
y_pred = model.predict(x=word_indices_padded_reshaped_test)



In [None]:
from sklearn.metrics import accuracy_score

# accuracy_score compares class ids, but y_pred holds per-class scores for each
# row, so take the highest-scoring class first.
y_pred_labels = np.argmax(y_pred, axis=1)
# Ignore any trailing prediction rows beyond the labelled samples (for example
# rows that only exist because the inputs were zero-padded).
y_pred_labels = y_pred_labels[:len(ner_labels_encoded_test)]

acc = accuracy_score(ner_labels_encoded_test, y_pred_labels)
print('Accuracy:', acc)

In [None]:
# Function to predict NER Tags for the words in a sentence given by the User.

def predict_ner_tags(model, tokenizer, sentence):
    """Predict one NER class id for every whitespace-separated word of `sentence`.

    Args:
        model: Object with a Keras-style ``predict`` method that maps an
            integer array of shape (n_words, 1) to per-class scores of shape
            (n_words, n_classes).
        tokenizer: Mapping from word to integer index. It must be the same
            vocabulary the model was trained with (e.g. ``word_to_index``).
            Words missing from it are encoded as 0.
        sentence: Raw text to tag.

    Returns:
        list[int]: Predicted class id for each word, in sentence order. Empty
        when the sentence contains no words.
    """
    sentence_words = sentence.split()
    if not sentence_words:
        # Nothing to classify; avoid calling the model with an empty batch.
        return []

    # The lookup has to use the training vocabulary supplied by the caller: a
    # freshly constructed tokenizer knows no words and would encode every
    # sentence as an empty sequence.
    # One row per word, single column — the layout used for training.
    word_ids = np.array([[tokenizer.get(word, 0)] for word in sentence_words])

    predictions = model.predict(word_ids)

    # Highest-scoring class for each word.
    predicted_ner_tags = np.argmax(predictions, axis=1).tolist()
    return predicted_ner_tags

# input() already returns a str, so no conversion is needed.
user_input = input('Enter a Sentence: ')
# Pass the training vocabulary: no object named `tokenizer` is defined in this
# notebook, and the words must be encoded exactly as they were for training.
predicted_tags = predict_ner_tags(model, word_to_index, user_input)
# Translate the class ids back to tag names for display.
print("Predicted NER tags:", list(label_encoder.inverse_transform(predicted_tags)))

In [31]:
!python --version

Python 3.10.12
