In [14]:
# -----------------------------
# Step 1: Install and Upgrade Necessary Packages
# -----------------------------
!pip install --upgrade torchcrf seqeval datasets
%pip install spacy
%pip install spacy_conll
import random
import spacy
from spacy_conll import ConllFormatter





In [15]:
# -----------------------------
# Step 1 continued: Install and Upgrade Necessary Packages
# -----------------------------

!pip install --upgrade torchcrf seqeval datasets



In [16]:
# -----------------------------
# Step 2: Import Libraries and Set Device
# -----------------------------
import torch
import torch.nn as nn
import torch.optim as optim
from TorchCRF import CRF  # Corrected import statement
from datasets import load_dataset
from sklearn.metrics import classification_report
from typing import List
import numpy as np
from seqeval.metrics import classification_report as seq_classification_report
from torch.utils.data import TensorDataset, DataLoader
from collections import defaultdict
import pickle
from google.colab import files

# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# -----------------------------
# Step 3: Define the Custom LSTM and LSTM-CRF Model
# -----------------------------
class CustomLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(CustomLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Input gate parameters
        self.W_ii = nn.Parameter(torch.Tensor(hidden_size, input_size))
        self.W_hi = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_i = nn.Parameter(torch.Tensor(hidden_size))

        # Forget gate parameters
        self.W_if = nn.Parameter(torch.Tensor(hidden_size, input_size))
        self.W_hf = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_f = nn.Parameter(torch.Tensor(hidden_size))

        # Cell gate parameters
        self.W_ig = nn.Parameter(torch.Tensor(hidden_size, input_size))
        self.W_hg = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_g = nn.Parameter(torch.Tensor(hidden_size))

        # Output gate parameters
        self.W_io = nn.Parameter(torch.Tensor(hidden_size, input_size))
        self.W_ho = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_o = nn.Parameter(torch.Tensor(hidden_size))

        self.init_weights()

    def init_weights(self):
        # Initialize weight matrices with Xavier Uniform
        nn.init.xavier_uniform_(self.W_ii)
        nn.init.xavier_uniform_(self.W_hi)
        nn.init.xavier_uniform_(self.W_if)
        nn.init.xavier_uniform_(self.W_hf)
        nn.init.xavier_uniform_(self.W_ig)
        nn.init.xavier_uniform_(self.W_hg)
        nn.init.xavier_uniform_(self.W_io)
        nn.init.xavier_uniform_(self.W_ho)

        # Initialize biases with zeros
        nn.init.zeros_(self.b_i)
        nn.init.zeros_(self.b_f)
        nn.init.zeros_(self.b_g)
        nn.init.zeros_(self.b_o)

    def forward(self, input_seq, h_0=None, c_0=None):
        """
        input_seq: Tensor of shape (batch_size, seq_length, input_size)
        h_0: Initial hidden state (batch_size, hidden_size)
        c_0: Initial cell state (batch_size, hidden_size)
        Returns:
            h_seq: Tensor containing hidden states for all time steps (batch_size, seq_length, hidden_size)
            (h_n, c_n): Final hidden and cell states
        """
        batch_size, seq_length, _ = input_seq.size()
        if h_0 is None:
            h_t = torch.zeros(batch_size, self.hidden_size, device=input_seq.device)
        else:
            h_t = h_0
        if c_0 is None:
            c_t = torch.zeros(batch_size, self.hidden_size, device=input_seq.device)
        else:
            c_t = c_0

        h_seq = []

        for t in range(seq_length):
            x_t = input_seq[:, t, :]  # (batch_size, input_size)

            i_t = torch.sigmoid(x_t @ self.W_ii.T + h_t @ self.W_hi.T + self.b_i)
            f_t = torch.sigmoid(x_t @ self.W_if.T + h_t @ self.W_hf.T + self.b_f)
            g_t = torch.tanh(x_t @ self.W_ig.T + h_t @ self.W_hg.T + self.b_g)
            o_t = torch.sigmoid(x_t @ self.W_io.T + h_t @ self.W_ho.T + self.b_o)

            c_t = f_t * c_t + i_t * g_t
            h_t = o_t * torch.tanh(c_t)

            h_seq.append(h_t.unsqueeze(1))  # (batch_size, 1, hidden_size)

        h_seq = torch.cat(h_seq, dim=1)  # (batch_size, seq_length, hidden_size)
        return h_seq, (h_t, c_t)

class BidirectionalCustomLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(BidirectionalCustomLSTM, self).__init__()
        self.forward_lstm = CustomLSTM(input_size, hidden_size)
        self.backward_lstm = CustomLSTM(input_size, hidden_size)
        self.hidden_size = hidden_size

    def forward(self, input_seq, h_0=None, c_0=None):
        # Forward direction
        forward_out, (h_f, c_f) = self.forward_lstm(input_seq, h_0, c_0)

        # Backward direction
        reversed_input = torch.flip(input_seq, [1])  # Reverse the sequence
        backward_out, (h_b, c_b) = self.backward_lstm(reversed_input, h_0, c_0)
        backward_out = torch.flip(backward_out, [1])  # Re-reverse to original order

        # Concatenate forward and backward outputs
        h_seq = torch.cat([forward_out, backward_out], dim=2)  # (batch_size, seq_length, 2*hidden_size)

        # Final hidden and cell states
        h_n = torch.cat([h_f, h_b], dim=1)  # (batch_size, 2*hidden_size)
        c_n = torch.cat([c_f, c_b], dim=1)  # (batch_size, 2*hidden_size)

        return h_seq, (h_n, c_n)

# 2. Modify the LSTM_CRF class to properly handle padding in the forward pass
class LSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, padding_idx):
        super(LSTM_CRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = BidirectionalCustomLSTM(embedding_dim, hidden_dim // 2)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size)
        self.padding_idx = padding_idx

    def forward(self, sentences, tags, mask):
        embeds = self.embedding(sentences)  # (batch_size, seq_length, embedding_dim)
        lstm_out, _ = self.lstm(embeds)    # (batch_size, seq_length, hidden_dim)

        # Apply the mask to zero out padding token representations
        lstm_out = lstm_out * mask.unsqueeze(-1)

        emissions = self.hidden2tag(lstm_out)

        # CRF loss with mask
        loss = -self.crf(emissions, tags, mask=mask)
        return loss.mean()

    def predict(self, sentences, mask):
        self.eval()
        with torch.no_grad():
            embeds = self.embedding(sentences)
            lstm_out, _ = self.lstm(embeds)

            # Apply the mask to zero out padding token representations
            lstm_out = lstm_out * mask.unsqueeze(-1)

            emissions = self.hidden2tag(lstm_out)
            predictions = self.crf.viterbi_decode(emissions, mask=mask)
        return predictions

Using device: cuda


In [29]:
# -----------------------------
# Step 4: Load and Preprocess the CoNLL-2003 Dataset
# -----------------------------
# Load the CoNLL-2003 dataset
dataset = load_dataset('conll2003',trust_remote_code=True)

# Inspect the dataset
print(dataset)

# Example of the dataset
print(dataset['train'][0])

# Extract all unique words and tags
words = set()
tags = set()

for split in ['train', 'validation', 'test']:
    for sentence in dataset[split]:
        for word in sentence['tokens']:
            words.add(word.lower())  # Lowercasing for normalization
        for tag in sentence['ner_tags']:
            tags.add(tag)

# Create word2idx dictionary
word2idx = {"<PAD>": 0, "<UNK>": 1}
for word in sorted(words):
    word2idx[word] = len(word2idx)

# Mapping from tag indices to tag names
tag_names = dataset['train'].features['ner_tags'].feature.names
tag2idx = {"<PAD>": 0}
for tag in tag_names:
    tag2idx[tag] = len(tag2idx)

idx2tag = {v: k for k, v in tag2idx.items()}

print(f"Number of unique words: {len(word2idx)}")
print(f"Number of unique tags: {len(tag2idx)}")

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
{'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}
Number of unique words: 26871
Number of unique tags: 10


In [18]:
# -----------------------------
# Step 5: Encode the Dataset
# -----------------------------
# Parameters
MAX_LEN = 50  # Maximum sentence length
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
BATCH_SIZE = 32
EPOCHS = 10  # Increased from 5

# Encoding functions

def encode_sentences(sentences: List[List[str]], word2idx: dict, max_len: int) -> torch.Tensor:
    encoded = []
    for sentence in sentences:
        encoded_sentence = [word2idx.get(word.lower(), word2idx["<UNK>"]) for word in sentence]
        # Padding
        if len(encoded_sentence) < max_len:
            encoded_sentence += [word2idx["<PAD>"]] * (max_len - len(encoded_sentence))
        else:
            encoded_sentence = encoded_sentence[:max_len]
        encoded.append(encoded_sentence)
    return torch.tensor(encoded, dtype=torch.long)

def encode_labels(labels: List[List[str]], tag2idx: dict, max_len: int) -> torch.Tensor:
    """
    Encode NER labels using the tag2idx dictionary without shifting.

    Args:
        labels (List[List[str]]): List of label sequences.
        tag2idx (dict): Dictionary mapping tag names to indices.
        max_len (int): Maximum sequence length.

    Returns:
        torch.Tensor: Tensor of encoded labels.
    """
    encoded = []
    for label_seq in labels:
        # Directly map labels using tag2idx, defaulting to 'O' if tag not found
        encoded_label = [tag2idx.get(tag, tag2idx["O"]) for tag in label_seq]
        # Padding
        if len(encoded_label) < max_len:
            encoded_label += [tag2idx["<PAD>"]] * (max_len - len(encoded_label))
        else:
            encoded_label = encoded_label[:max_len]
        encoded.append(encoded_label)
    return torch.tensor(encoded, dtype=torch.long)

# Prepare training data
train_sentences = [example['tokens'] for example in dataset['train']]
# Convert numeric NER tags back to string labels using tag_names
train_labels = [[tag_names[tag] for tag in example['ner_tags']] for example in dataset['train']]

X_train = encode_sentences(train_sentences, word2idx, MAX_LEN)
y_train = encode_labels(train_labels, tag2idx, MAX_LEN)

# Prepare validation data
val_sentences = [example['tokens'] for example in dataset['validation']]
val_labels = [[tag_names[tag] for tag in example['ner_tags']] for example in dataset['validation']]

X_val = encode_sentences(val_sentences, word2idx, MAX_LEN)
y_val = encode_labels(val_labels, tag2idx, MAX_LEN)

# Prepare test data (optional, for later evaluation)
test_sentences = [example['tokens'] for example in dataset['test']]
test_labels = [[tag_names[tag] for tag in example['ner_tags']] for example in dataset['test']]

X_test = encode_sentences(test_sentences, word2idx, MAX_LEN)
y_test = encode_labels(test_labels, tag2idx, MAX_LEN)

print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_val.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")


Training samples: 14041
Validation samples: 3250
Test samples: 3453


In [19]:
# -----------------------------
# Step 6: Create DataLoaders
# -----------------------------

PAD_IDX = word2idx["<PAD>"]  # This should be 0 based on your word2idx creation

# First add the DataCollator class
# First, modify the DataCollatorWithPadding class to properly mask padding tokens
class DataCollatorWithPadding:
    def __init__(self, pad_idx, device):
        self.pad_idx = pad_idx
        self.device = device

    def __call__(self, batch):
        # Get max length of batch
        max_len = max(len(x[0]) for x in batch)

        # Initialize tensors
        padded_X = torch.ones(len(batch), max_len).long() * self.pad_idx
        padded_y = torch.ones(len(batch), max_len).long() * self.pad_idx
        # Create attention mask (1 for real tokens, 0 for padding)
        attention_mask = torch.zeros(len(batch), max_len).bool()

        for i, (x, y) in enumerate(batch):
            seq_len = len(x)
            padded_X[i, :seq_len] = x[:seq_len]
            padded_y[i, :seq_len] = y[:seq_len]
            attention_mask[i, :seq_len] = 1

        return padded_X.to(self.device), padded_y.to(self.device), attention_mask.to(self.device)


# Initialize the collator
data_collator = DataCollatorWithPadding(pad_idx=PAD_IDX, device=device)

# Create TensorDatasets
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

# Create DataLoaders with the custom collator
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator
)

In [20]:
# -----------------------------
# Step 7: Train the LSTM-CRF Model
# -----------------------------

# Initialize Model
VOCAB_SIZE = len(word2idx)
TAGSET_SIZE = len(tag2idx)
PAD_IDX = word2idx["<PAD>"]

model = LSTM_CRF(VOCAB_SIZE, TAGSET_SIZE, EMBEDDING_DIM, HIDDEN_DIM, padding_idx=PAD_IDX)
model.to(device)

# Initialize Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model(model, train_loader, val_loader, optimizer, device, clip_value=1.0):
    model.train()
    total_loss = 0

    # Training
    for batch_X, batch_y, mask in train_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        mask = mask.to(device)

        optimizer.zero_grad()
        loss = model(batch_X, batch_y, mask)
        loss.backward()

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

        optimizer.step()
        total_loss += loss.item()

    # Validation
    model.eval()
    val_predictions = []
    val_labels = []

    with torch.no_grad():
        for batch_X, batch_y, mask in val_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            mask = mask.to(device)

            predictions = model.predict(batch_X, mask)

            # Process predictions and labels
            for pred_seq, true_seq, seq_mask in zip(predictions, batch_y, mask):
                seq_len = seq_mask.sum().item()
                pred_seq = pred_seq[:seq_len]
                true_seq = true_seq[:seq_len].tolist()

                # Convert indices to tags
                pred_tags = [idx2tag.get(idx, 'O') for idx in pred_seq]
                true_tags = [idx2tag.get(idx, 'O') for idx in true_seq]

                val_predictions.append(pred_tags)
                val_labels.append(true_tags)

    # Print validation metrics
    print("\nValidation Metrics:")
    print(seq_classification_report(val_labels, val_predictions, zero_division=0))

    return total_loss / len(train_loader)

# Then in your training loop:
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss = train_model(model, train_loader, val_loader, optimizer, device)
    print(f"Training loss: {train_loss:.4f}")

Epoch 1/10

Validation Metrics:




              precision    recall  f1-score   support

         LOC       0.73      0.61      0.67      1834
        MISC       0.58      0.26      0.36       919
         ORG       0.65      0.29      0.41      1339
        PAD>       0.98      0.98      0.98      3221
         PER       0.65      0.41      0.50      1796

   micro avg       0.82      0.62      0.70      9109
   macro avg       0.72      0.51      0.58      9109
weighted avg       0.77      0.62      0.68      9109

Training loss: 14.3980
Epoch 2/10

Validation Metrics:
              precision    recall  f1-score   support

         LOC       0.83      0.72      0.77      1834
        MISC       0.77      0.55      0.64       919
         ORG       0.72      0.48      0.58      1339
        PAD>       0.99      0.99      0.99      3221
         PER       0.73      0.59      0.65      1796

   micro avg       0.86      0.74      0.79      9109
   macro avg       0.81      0.67      0.73      9109
weighted avg       0.8

In [21]:
# -----------------------------
# Step 7 (continued): Model Evaluation
# -----------------------------

def evaluate_predictions(model, data_loader, idx2tag, device):
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch_X, batch_y, mask in data_loader:
            batch_X = batch_X.to(device)
            predictions = model.predict(batch_X, mask)

            # Process each sequence in the batch
            for pred_seq, true_seq, seq_mask in zip(predictions, batch_y, mask):
                # Get actual sequence length from mask
                seq_len = seq_mask.sum().item()

                # Convert predictions and true labels using actual sequence length
                pred_tags = [idx2tag[p] for p in pred_seq[:seq_len]]
                true_tags = [idx2tag[t.item()] for t in true_seq[:seq_len]]

                all_predictions.append(pred_tags)
                all_labels.append(true_tags)

    return all_predictions, all_labels

# Example usage with validation data
predictions, labels = evaluate_predictions(model, val_loader, idx2tag, device)
print("\nValidation Results:")
print(seq_classification_report(labels, predictions))


Validation Results:
              precision    recall  f1-score   support

         LOC       0.85      0.84      0.85      1834
        MISC       0.77      0.71      0.74       919
         ORG       0.74      0.67      0.70      1339
        PAD>       1.00      1.00      1.00      3221
         PER       0.80      0.70      0.74      1796

   micro avg       0.87      0.83      0.85      9109
   macro avg       0.83      0.78      0.81      9109
weighted avg       0.87      0.83      0.85      9109



In [22]:
def evaluate_and_show_results_with_metrics(model, X_test, y_test, test_sentences, idx2tag, device):
    model.eval()
    all_preds = []
    all_true = []

    # Move data to the specified device
    X_test, y_test = X_test.to(device), y_test.to(device)
    mask = (X_test != PAD_IDX)

    with torch.no_grad():
        # Get model predictions
        predictions = model.predict(X_test, mask=mask)

        for i, (pred_indices, true_labels) in enumerate(zip(predictions, y_test)):
            tokens = test_sentences[i]
            true_labels = true_labels.cpu().numpy()
            pred_indices = np.array(pred_indices)

            # Ensure pred_indices and true_labels are the same length before masking
            if len(pred_indices) != len(true_labels[true_labels != PAD_IDX]):
                print(f"Sequence {i} has mismatched lengths before masking: pred_indices={len(pred_indices)}, true_labels={len(true_labels[true_labels != PAD_IDX])}")

            # Use the mask to filter out padding tokens
            valid_positions = (true_labels != PAD_IDX)
            true_labels_filtered = true_labels[valid_positions]
            pred_indices_filtered = pred_indices

            # Map indices to tags
            true_labels_list = [idx2tag[label] for label in true_labels_filtered]
            pred_tags_list = [idx2tag[idx] for idx in pred_indices_filtered]

            # Handle the "O-O" special case
            true_labels_list = ["O" if tag == "O-O" else tag for tag in true_labels_list]

            # Filter out PAD> tags
            if len(true_labels_list) == len(pred_tags_list):
                filtered_true = []
                filtered_pred = []
                for true_tag, pred_tag in zip(true_labels_list, pred_tags_list):
                    if true_tag != "PAD>" and pred_tag != "PAD>":
                        filtered_true.append(true_tag)
                        filtered_pred.append(pred_tag)
                all_true.append(filtered_true)
                all_preds.append(filtered_pred)

            # Print tokens with true labels and predicted labels
            print(f"\nSequence {i}:")
            print(f"{'Token':15} {'True Label':15} {'Predicted Label'}")
            print('-' * 45)
            for token, true_label, pred_label in zip(tokens, true_labels_list, pred_tags_list):
                if true_label != "PAD>" and pred_label != "PAD>":
                    print(f"{token:15} {true_label:15} {pred_label}")

    # Print classification report without PAD>
    print("\nClassification Report:")
    print(seq_classification_report(all_true, all_preds, zero_division=0))

evaluate_and_show_results_with_metrics(model, X_test, y_test, test_sentences, idx2tag, device)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Token           True Label      Predicted Label
---------------------------------------------
Reading         B-ORG           B-ORG
0               O               O
Port            B-ORG           B-ORG
Vale            I-ORG           I-ORG
1               O               O

Sequence 3099:
Token           True Label      Predicted Label
---------------------------------------------
Sheffield       B-ORG           B-ORG
United          I-ORG           I-ORG
1               O               O
Portsmouth      B-ORG           B-ORG
0               O               O

Sequence 3100:
Token           True Label      Predicted Label
---------------------------------------------
Stoke           B-ORG           B-ORG
2               O               O
Tranmere        B-ORG           B-ORG
0               O               O

Sequence 3101:
Token           True Label      Predicted Label
---------------------------------------------
Pla

In [23]:
# -----------------------------
# Step 8: Save the Model and Dictionaries
# -----------------------------
# Save the retrained model state dictionary
torch.save(model.state_dict(), 'custom_lstm_crf_model_retrained.pth')
print("Retrained model saved successfully!")

# Save word2idx and tag2idx dictionaries using pickle (if not already saved)
with open('word2idx.pkl', 'wb') as f:
    pickle.dump(word2idx, f)
print("word2idx saved successfully!")

with open('tag2idx.pkl', 'wb') as f:
    pickle.dump(tag2idx, f)
print("tag2idx saved successfully!")


Retrained model saved successfully!
word2idx saved successfully!
tag2idx saved successfully!


In [30]:

# # -----------------------------
# # Step 9: Upload Your Text File to Colab
# # -----------------------------
# from google.colab import files
# import nltk
# from nltk.tokenize import sent_tokenize

# # Upload the file
# # uploaded = files.upload()

In [25]:
# -----------------------------
# Step 10: Load and Preprocess Your Text File
# -----------------------------
import random
import spacy
from spacy_conll import ConllFormatter

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load your uploaded text file
file_path = "combined_summaries.txt"
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# Process the text using spaCy
doc = nlp(text)

# Prepare data in a structured format
conll_data = []
ner_tag_set = set()  # Collect all unique NER tags for feature mapping

for i, sent in enumerate(doc.sents):
    tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
    for token in sent:
        tokens.append(token.text)
        pos_tags.append(token.pos_)
        chunk_tags.append(token.dep_)
        ner_tag = f"{token.ent_iob_}-{token.ent_type_ if token.ent_iob_ != 'O' else 'O'}"
        if ner_tag == "O-O":  # Handle invalid tag
            ner_tag = "O"
        ner_tags.append(ner_tag)
        ner_tag_set.add(ner_tag)  # Add to the set of unique tags
    conll_data.append({
        "id": i,
        "tokens": tokens,
        "pos_tags": pos_tags,
        "chunk_tags": chunk_tags,
        "ner_tags": ner_tags
    })

# Since it's custom data, treat it as test data
test_data = conll_data

# Define the Dataset class (reuse existing class)
class Dataset:
    def __init__(self, split_data, ner_tag_names):
        self.data = split_data
        self.features = {
            "ner_tags": {
                "feature": {
                    "names": ner_tag_names
                }
            }
        }
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]
    def __repr__(self):
        return f"Dataset({{\n    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n    num_rows: {len(self.data)}\n}})"

# Create datasets
# **Important**: Use the loaded ner_tag_names from training
ner_tag_names = sorted(ner_tag_set)  # Sorted list of unique NER tag names
dataset = {
    "test": Dataset(test_data, ner_tag_names)
}

# **Do NOT recreate word2idx and tag2idx here**
# Instead, load the saved dictionaries from training

# Load word2idx and tag2idx dictionaries
with open('word2idx.pkl', 'rb') as f:
    word2idx_loaded = pickle.load(f)

with open('tag2idx.pkl', 'rb') as f:
    tag2idx_loaded = pickle.load(f)

idx2tag_loaded = {v: k for k, v in tag2idx_loaded.items()}

# Extract sentences and labels from the custom dataset
test_sentences = [example['tokens'] for example in dataset['test']]
test_labels = [example['ner_tags'] for example in dataset['test']]

# Encode sentences and labels using the **loaded** dictionaries
X_test = encode_sentences(test_sentences, word2idx_loaded, MAX_LEN)
y_test = encode_labels(test_labels, tag2idx_loaded, MAX_LEN)

print(f"Custom Test samples: {X_test.shape[0]}")


Custom Test samples: 307


In [26]:
# -----------------------------
# Step 11: Perform NER on Your Text File
# -----------------------------

import torch
from torch.utils.data import DataLoader, TensorDataset
import pickle

# Ensure that 'data_collator' is defined (from Step 6)
# If not already defined, make sure to include the DataCollatorWithPadding class and initialize 'data_collator'
# from Step 6:
# class DataCollatorWithPadding:
#     def __init__(self, pad_idx, device):
#         self.pad_idx = pad_idx
#         self.device = device
#     def __call__(self, batch):
#         # Padding logic
#         ...

# Load word2idx and tag2idx dictionaries
with open('word2idx.pkl', 'rb') as f:
    word2idx_loaded = pickle.load(f)

with open('tag2idx.pkl', 'rb') as f:
    tag2idx_loaded = pickle.load(f)

# Create idx2tag mapping
idx2tag_loaded = {v: k for k, v in tag2idx_loaded.items()}

# Assign loaded dictionaries to variables used in preprocessing
word2idx = word2idx_loaded
tag2idx = tag2idx_loaded

# Define model parameters based on loaded dictionaries
VOCAB_SIZE = len(word2idx)
TAGSET_SIZE = len(tag2idx)
PAD_IDX = word2idx["<PAD>"]
EMBEDDING_DIM = 100  # Ensure this matches Step 5
HIDDEN_DIM = 128      # Ensure this matches Step 5

# Initialize the model
model = LSTM_CRF(VOCAB_SIZE, TAGSET_SIZE, EMBEDDING_DIM, HIDDEN_DIM, padding_idx=PAD_IDX)
model.to(device)

# Load the saved model state
model_path = 'custom_lstm_crf_model_retrained.pth'  # Path to the retrained model
try:
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print("Model loaded successfully!")
except FileNotFoundError:
    print(f"Model file '{model_path}' not found. Please ensure the path is correct.")

# Define the DataLoader creation function (reuse the one from Step 6 or Step 11's earlier definition)
def create_dataloader_custom(encoded_sentences: torch.Tensor, encoded_labels: torch.Tensor, batch_size: int=32) -> DataLoader:
    """
    Create a DataLoader for the encoded sentences and labels.

    Args:
        encoded_sentences (torch.Tensor): Tensor of encoded sentences.
        encoded_labels (torch.Tensor): Tensor of encoded labels.
        batch_size (int, optional): Batch size. Defaults to 32.

    Returns:
        DataLoader: DataLoader object.
    """
    test_dataset = TensorDataset(encoded_sentences, encoded_labels)
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=data_collator  # Ensure 'data_collator' is defined and handles padding
    )
    return test_loader

# Create DataLoader for custom test data
test_loader_custom = create_dataloader_custom(X_test, y_test, batch_size=BATCH_SIZE)

print(f"Custom Test DataLoader created with {len(test_loader_custom)} batches.")


Model loaded successfully!
Custom Test DataLoader created with 10 batches.


  model.load_state_dict(torch.load(model_path, map_location=device))


In [27]:
# -----------------------------
# Step 12: Display the NER Results
# -----------------------------
from typing import List, Tuple

def perform_ner_with_labels(model: nn.Module, dataloader: DataLoader, idx2tag: dict, device: torch.device) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Perform NER and retrieve true labels.

    Args:
        model (nn.Module): Trained LSTM_CRF model.
        dataloader (DataLoader): DataLoader with sentences and true labels.
        idx2tag (dict): Mapping from tag indices to tag names.
        device (torch.device): Device to perform computation on.

    Returns:
        Tuple[List[List[str]], List[List[str]]]: Predicted tags and true tags.
    """
    all_preds = []
    all_true_labels = []
    with torch.no_grad():
        for batch_X, batch_y, mask in dataloader:
            batch_X, batch_y, mask = batch_X.to(device), batch_y.to(device), mask.to(device)
            # Get predictions
            preds = model.predict(batch_X, mask)
            for pred, true_label, seq_mask in zip(preds, batch_y, mask):
                seq_len = seq_mask.sum().item()
                pred = pred[:seq_len]
                true_label = true_label[:seq_len]
                # Convert indices to tag names
                pred_tags = [idx2tag.get(idx, 'O') for idx in pred]
                true_tags = [idx2tag.get(idx.item(), 'O') for idx in true_label]
                all_preds.append(pred_tags)
                all_true_labels.append(true_tags)
    return all_preds, all_true_labels

def display_ner_results_with_labels(sentences: List[List[str]], predicted_tags: List[List[str]], true_tags: List[List[str]]):
    """
    Display tokens with their predicted and true NER tags.

    Args:
        sentences (List[List[str]]): Original tokenized sentences.
        predicted_tags (List[List[str]]): Predicted NER tags for each token.
        true_tags (List[List[str]]): True NER tags for each token.
    """
    for i, (sentence, pred_tags, true_tags_seq) in enumerate(zip(sentences, predicted_tags, true_tags)):
        print(f"\nSentence {i+1}:")
        for token, pred_tag, true_tag in zip(sentence[:len(pred_tags)], pred_tags, true_tags_seq):
            print(f"{token:15}\tPredicted: {pred_tag:10}\tTrue: {true_tag}")
    # Prepare sequences for classification report by removing padding
    cleaned_true_tags = []
    cleaned_pred_tags = []
    for true_seq, pred_seq in zip(true_tags, predicted_tags):
        # Remove padding from true tags
        true_len = 0
        for j, tag in enumerate(true_seq):
            if tag == '<PAD>':
                break
            true_len = j + 1
        # Get the actual tags without padding
        true_without_pad = true_seq[:true_len]
        pred_without_pad = pred_seq[:true_len]  # Use same length for predictions
        if true_without_pad and pred_without_pad:  # Only add if both sequences have content
            cleaned_true_tags.append(true_without_pad)
            cleaned_pred_tags.append(pred_without_pad[:len(true_without_pad)])  # Ensure same length
    # Generate classification report
    print("\nClassification Report:")
    print(seq_classification_report(cleaned_true_tags, cleaned_pred_tags, zero_division=0))

# Perform NER and retrieve true labels with corrected function
predicted_tags_custom, true_tags_custom = perform_ner_with_labels(model, test_loader_custom, idx2tag_loaded, device)
print("NER prediction and true label extraction completed!")

# Display NER results with true labels
display_ner_results_with_labels(test_sentences, predicted_tags_custom, true_tags_custom)

# Now, print the classification report
print("\nClassification Report:")
print(seq_classification_report(true_tags_custom, predicted_tags_custom, zero_division=0))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
.              	Predicted: O         	True: O

Sentence 72:
Sutter         	Predicted: B-PER     	True: B-ORG
says           	Predicted: O         	True: O
instead        	Predicted: O         	True: O
of             	Predicted: O         	True: O
just           	Predicted: O         	True: O
extensively    	Predicted: O         	True: O
intensively    	Predicted: O         	True: O
farming        	Predicted: O         	True: O
,              	Predicted: O         	True: O
we             	Predicted: O         	True: O
need           	Predicted: O         	True: O
to             	Predicted: O         	True: O
use            	Predicted: O         	True: O
land           	Predicted: O         	True: O
and            	Predicted: O         	True: O
water          	Predicted: O         	True: O
more           	Predicted: O         	True: O
extensively    	Predicted: O         	True: O
instead        	Predicted: O         	True: