In [3]:
%pip install torch TorchCRF
%pip install torch torchaudio
%pip install datasets
%pip install transformers
%pip install --upgrade pip
%pip install --upgrade transformers accelerate datasets[audio]
%pip install soundfile
%pip install tensorflow

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
   ---------------------------------------- 0.0/10.0 MB ? eta -:--:--
   -------------------------------- ------- 8.1/10.0 MB 50.4 MB/s eta 0:00:01
   ---------------------------------------- 10.0/10.0 MB 44.6 MB/s eta 0:00:00
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.2
    Uninstalling transformers-4.46.2:
      Successfully uninstalled transformers-4.46.2
Successfully installed transformers-4.46.3
Note: you may need to rest

In [3]:
# Import necessary libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import BertTokenizer
from TorchCRF import CRF

# Define the path to your local file
local_file_path = "transcription_test_AimeeMullins_1249s_summarized.txt"

# Verify the file exists
import os
if not os.path.exists(local_file_path):
    raise FileNotFoundError(f"File not found: {local_file_path}")
else:
    print(f"File '{local_file_path}' found.")

# Load the dataset using the appropriate loader based on your file type
tedlium = load_dataset("text", data_files=local_file_path)

# Verify the dataset is loaded correctly
print(tedlium)

# Create a custom dataset class
class TEDLIUMDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        transcription = self.texts[idx]
        return transcription

# Prepare the dataset
train_texts = tedlium['train']['text']

# Creating the dataset
train_dataset = TEDLIUMDataset(train_texts)

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)  # Adjust batch size as needed

# Tokenize the texts
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

vocab_size = len(tokenizer)
embed_dim = 100  # Dimension of the embedding layer
hidden_dim = 128  # Dimension of LSTM hidden states
output_dim = 2  # Number of classes for classification (adjust as necessary)

# Define the LSTM-CRF model
class LSTMCRF(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(LSTMCRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=tokenizer.pad_token_id)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Multiply by 2 for bidirectional

        # CRF layer for sequence tagging (without batch_first argument)
        self.crf = CRF(output_dim)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        logits = self.fc(lstm_out)
        return logits

    def decode(self, logits, mask):
        return self.crf.decode(logits, mask=mask)

# Instantiate the model
model = LSTMCRF(vocab_size, embed_dim, hidden_dim, output_dim)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model is running on {device}")

# Training hyperparameters
num_epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Tokenize the batch of texts
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # Placeholder for labels (replace this with actual labels)
        labels = torch.randint(0, output_dim, (input_ids.size(0), input_ids.size(1))).to(device)  # Random labels for testing

        optimizer.zero_grad()
        logits = model(input_ids)  # Forward pass

        # Compute CRF loss
        try:
            loss = -model.crf(logits, labels, mask=attention_mask.bool())  # Negative log likelihood
            print(f"Loss shape: {loss.shape}")  # Print shape of loss to check

            if loss.ndimension() > 0:
                loss = loss.mean()  # Reduce to scalar if not already scalar

            print(f"Reduced Loss: {loss}")  # Print reduced loss value

            total_loss += loss.item()  # Convert to scalar and accumulate

            loss.backward()
            optimizer.step()

            # After training, make predictions using the CRF layer (using viterbi_decode)
            predicted_labels = model.crf.viterbi_decode(logits, mask=attention_mask.bool())  # Decode using CRF
            print("Predicted Labels:", predicted_labels)
            break  # Remove to evaluate on the entire dataset
        except IndexError as e:
            print(f"Skipping batch with shape mismatch: logits shape {logits.shape}, labels shape {labels.shape}")

    avg_loss = total_loss / len(train_loader)  # Average loss for the epoch
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

# Sample prediction
model.eval()
with torch.no_grad():
    for batch in train_loader:
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        logits = model(input_ids)
        predicted_label= model.crf.viterbi_decode(logits, mask=attention_mask.bool())  # Decode using CRF
        print("Predicted Labels:", predicted_labels)

        break  # Remove to evaluate on the entire dataset


File 'transcription_test_AimeeMullins_1249s_summarized.txt' found.
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1
    })
})
Model is running on cpu
Loss shape: torch.Size([1])
Reduced Loss: 310.81146240234375
Predicted Labels: [[1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 

In [6]:
# Import necessary libraries

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import BertTokenizer
from TorchCRF import CRF

# Define the path to your local file
local_file_path = "transcription_test_AimeeMullins_1249s_summarized.txt"

# Verify the file exists
import os
if not os.path.exists(local_file_path):
    raise FileNotFoundError(f"File not found: {local_file_path}")
else:
    print(f"File '{local_file_path}' found.")

# Load the dataset using the appropriate loader based on your file type
tedlium = load_dataset("text", data_files=local_file_path)

# Verify the dataset is loaded correctly
print(tedlium)

# Create a custom dataset class
class TEDLIUMDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        transcription = self.texts[idx]
        return transcription

# Prepare the dataset
train_texts = tedlium['train']['text']

# Creating the dataset
train_dataset = TEDLIUMDataset(train_texts)

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)  # Adjust batch size as needed

# Tokenize the texts
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

vocab_size = len(tokenizer)
embed_dim = 100  # Dimension of the embedding layer
hidden_dim = 128  # Dimension of LSTM hidden states
output_dim = 2  # Number of classes for classification (adjust as necessary)

# Define the LSTM-CRF model
class LSTMCRF(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(LSTMCRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=tokenizer.pad_token_id)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Multiply by 2 for bidirectional

        # CRF layer for sequence tagging (without batch_first argument)
        self.crf = CRF(output_dim)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        logits = self.fc(lstm_out)
        return logits

    def decode(self, logits, mask):
        return self.crf.decode(logits, mask=mask)

# Instantiate the model
model = LSTMCRF(vocab_size, embed_dim, hidden_dim, output_dim)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model is running on {device}")

# Training hyperparameters
num_epochs = 10
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Tokenize the batch of texts
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # Placeholder for labels (replace this with actual labels)
        labels = torch.randint(0, output_dim, (input_ids.size(0), input_ids.size(1))).to(device)  # Random labels for testing

        optimizer.zero_grad()
        logits = model(input_ids)  # Forward pass

        # Compute CRF loss
        try:
            loss = -model.crf(logits, labels, mask=attention_mask.bool())  # Negative log likelihood
            print(f"Loss shape: {loss.shape}")  # Print shape of loss to check

            if loss.ndimension() > 0:
                loss = loss.mean()  # Reduce to scalar if not already scalar

            print(f"Reduced Loss: {loss}")  # Print reduced loss value

            total_loss += loss.item()  # Convert to scalar and accumulate

            loss.backward()
            optimizer.step()

            # After training, make predictions using the CRF layer (using viterbi_decode)
            predicted_labels= model.crf.viterbi_decode(logits, mask=attention_mask.bool())  # Decode using CRF
            print("Predicted Labels:", predicted_labels)
            break  # Remove to evaluate on the entire dataset
        except IndexError as e:
            print(f"Skipping batch with shape mismatch: logits shape {logits.shape}, labels shape {labels.shape}")

    avg_loss = total_loss / len(train_loader)  # Average loss for the epoch
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

# Define the label map for NER (BIO tagging)
# label_map = {
#     0: "O",          # Outside any named entity
#     1: "B-PERSON",   # Beginning of a person entity
#     2: "I-PERSON",   # Inside a person entity
#     3: "B-ORG",      # Beginning of an organization entity
#     4: "I-ORG",      # Inside an organization entity
#     # Add other entities as needed
# }

# Sample prediction after training
# model.eval()
# with torch.no_grad():
#     for batch in train_loader:
#         # Tokenize the batch of texts
#         inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
#         input_ids = inputs['input_ids'].to(device)
#         attention_mask = inputs['attention_mask'].to(device)

#         # Forward pass through the model
#         logits = model(input_ids)

#         # Decode predictions using the CRF layer
#         predicted_labels = model.crf.viterbi_decode(logits, mask=attention_mask.bool())  # CRF decoding
        
#         # Debugging step: Print the predicted_labels to understand its structure
#         print("Predicted labels:", predicted_labels)  # Inspect the actual content of predicted_labels
        
        # # Check if predicted_labels is a list of lists (batch of sequences) or just a single sequence
        # if isinstance(predicted_labels, list) and isinstance(predicted_labels[0], list):
        #     print("Predicted labels shape:", len(predicted_labels[0]))  # First sequence in the batch
        # else:
        #     print("Predicted labels is not a list of sequences. It might be a single sequence or a different format.")
        
        # # Flatten the list of predicted labels (if it's a list of sequences)
        # if isinstance(predicted_labels, list) and isinstance(predicted_labels[0], list):
        #     # Flattening the list of lists (for a batch of sequences)
        #     flattened_predicted_labels = [label for sublist in predicted_labels for label in sublist]
        # else:
        #     flattened_predicted_labels = predicted_labels  # If it's a single sequence
        
        # # Now, map the flattened predicted labels to their BIO tag names
        # predicted_labels_mapped = [label_map[label] for label in flattened_predicted_labels]
        
        # # Display tokens and their predicted labels (BIO tags)
        # tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu().numpy())  # Convert the first batch to tokens
        # print("Tokens:", tokens)
        # print("Predicted Labels:", predicted_labels_mapped)


        # break  # Remove to evaluate on the entire dataset


#updated the last prediction section...

label_map = {0: "O", 1: "B-PERSON", 2: "I-PERSON", 3: "B-ORG", 4: "I-ORG"}

# Predict and map labels
model.eval()
with torch.no_grad():
    for batch in train_loader:
        # Tokenize and move inputs to the device
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        
        # Forward pass
        logits = model(input_ids)
        
        # Decode predictions
        predicted_labels = model.crf.viterbi_decode(logits, mask=attention_mask.bool())
        
        # Convert label indices to names
        decoded_labels = [
            [label_map[label] for label in seq] for seq in predicted_labels
        ]
        
        # Print results for debugging
        for i, (text, labels) in enumerate(zip(batch, decoded_labels)):
            print(f"Text: {text}")
            print(f"Labels: {labels}")
        
        break  # Remove to evaluate the entire dataset



File 'transcription_test_AimeeMullins_1249s_summarized.txt' found.
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1
    })
})
Model is running on cpu
Loss shape: torch.Size([1])
Reduced Loss: 312.4609375
Predicted Labels: [[1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0

In [15]:
#sample text

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed
import re

# Sample data: input text with entities
texts = [
    "Amy Mullins says she never looked up the word disabled to see what she'd find.",
    "Her childhood doctor at the AI DuPont Institute in Wilmington, Delaware, opened doors for her, she says."
]

# Sample entity labels for the text
labels = [
    ["B-PERSON", "I-PERSON", "O", "O", "O", "O", "O", "O", "B-ORG", "I-ORG", "O", "O"],
    ["B-PERSON", "I-PERSON", "O", "O", "O", "O", "B-ORG", "I-ORG", "O"]
]

# Tokenizer (simple word-based tokenization)
tokenizer = re.compile(r"\w+")
word_to_idx = {}
tag_to_idx = {}
max_seq_len = 0

# Build the tokenizer and map words and tags to indices
for text, label in zip(texts, labels):
    words = tokenizer.findall(text)
    for word in words:
        if word not in word_to_idx:
            word_to_idx[word] = len(word_to_idx)
    for tag in label:
        if tag not in tag_to_idx:
            tag_to_idx[tag] = len(tag_to_idx)

# Convert the texts and labels to numerical indices
X = [[word_to_idx[word] for word in tokenizer.findall(text)] for text in texts]
y = [[tag_to_idx[tag] for tag in label] for label in labels]
max_seq_len = max(max(len(seq) for seq in X), max_seq_len)

# Padding sequences to ensure equal length
X_pad = np.array([seq + [0] * (max_seq_len - len(seq)) for seq in X])
y_pad = np.array([seq + [0] * (max_seq_len - len(seq)) for seq in y])

# Hyperparameters
vocab_size = len(word_to_idx) + 1  # Add 1 for padding index
tag_size = len(tag_to_idx)
embedding_dim = 100
lstm_units = 128
dropout_rate = 0.5

# Initialize the transition matrix globally (before the loss function)
trans = tf.Variable(np.random.randn(tag_size, tag_size), dtype=tf.float32)

# Model architecture: LSTM + CRF (Manual CRF implementation)
inputs = Input(shape=(max_seq_len,))
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_len)(inputs)
lstm = LSTM(units=lstm_units, return_sequences=True, dropout=dropout_rate)(embedding)
output = TimeDistributed(Dense(tag_size, activation='softmax'))(lstm)

# Manually define CRF loss function (negative log-likelihood for CRF)
def crf_loss(y_true, y_pred):
    global trans  # Use the global transition matrix

    # Sequence lengths for each example in the batch
    sequence_lengths = tf.reduce_sum(tf.cast(tf.not_equal(y_true, 0), tf.int32), axis=1)

    # Placeholder CRF implementation (this is an approximation)
    loss = tf.reduce_mean(tf.losses.sparse_categorical_crossentropy(y_true, y_pred))  # Placeholder for simplicity
    return loss

# Compile the model
model = Model(inputs, output)
model.summary()

# Compile the model with Adam optimizer and CRF loss function
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=crf_loss,
              metrics=["accuracy"])

# Train the model
history = model.fit(X_pad, y_pad, batch_size=16, epochs=1000, validation_split=0.1)

# Evaluating predictions manually
y_pred = model.predict(X_pad)

# Decoding predictions back to tags (index to tag)
idx_to_tag = {v: k for k, v in tag_to_idx.items()}

# Function to decode predictions into human-readable tags
def decode_predictions(predictions, idx_to_tag):
    decoded_preds = []
    for seq in predictions:
        decoded_seq = [idx_to_tag[np.argmax(p)] for p in seq]
        decoded_preds.append(decoded_seq)
    return decoded_preds

# Decoding the predictions
predictions = decode_predictions(y_pred, idx_to_tag)

# Print decoded predictions
for i, pred in enumerate(predictions):
    print(f"Text: {texts[i]}")
    print(f"Predictions: {pred}")
    print("-" * 80)
 

Epoch 1/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step - accuracy: 0.1176 - loss: 1.6092 - val_accuracy: 0.4118 - val_loss: 1.6039
Epoch 2/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 0.4706 - loss: 1.5996 - val_accuracy: 0.3529 - val_loss: 1.5991
Epoch 3/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 0.7059 - loss: 1.5850 - val_accuracy: 0.3529 - val_loss: 1.5938
Epoch 4/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step - accuracy: 0.7059 - loss: 1.5723 - val_accuracy: 0.3529 - val_loss: 1.5880
Epoch 5/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - accuracy: 0.7647 - loss: 1.5680 - val_accuracy: 0.3529 - val_loss: 1.5815
Epoch 6/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step - accuracy: 0.8235 - loss: 1.5500 - val_accuracy: 0.3529 - val_loss: 1.5739
Epoch 7/1000
[1m1/1[0m [32m━

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed
import re

# Sample data: input text with entities
texts = [
    "Amy Mullins says she never looked up the word disabled to see what she'd find.",
    "She says she was born into a world that perceived someone like me to have nothing positive going for them.",
    "Her childhood doctor at the AI DuPont Institute in Wilmington, Delaware, opened doors for her, she says.",
    "I have to wonder to what extent his vision and his declaration of me as a strong and powerful little girl shaped my own view of myself as an inherently strong, powerful and athletic person.",
    "The human ability to adapt is our greatest asset.",
    "Adversity isn't an obstacle that we need to get around in order to resume living our life.",
    "The only real and consistent disability I've had to confront is the world ever thinking that I could be described by those definitions.",
    "We have to make sure that we don't put the first brick in a wall.",
    "That will disable someone.",
    "By not treating the wholeness of a person, by not acknowledging their potency, we are creating another ill on top of whatever natural struggle they might have.",
    "Amy Mullins was born without the fibula bones and had feet turned in and a few toes in this foot and few toes on that.",
    "Anthropologists tell us that the one thing we, as humans, have always required of our community members is to be of use.",
    "If we can change this paradigm from one of achieving normalcy to one of possibility or potency, to be even a little bit more dangerous, we can release the power of many more children and invite them to engage their rare and valuable abilities with the community.",
    "The only true disability is a crushed spirit.",
    "A spirit that's been crushed doesn't have hope, it doesn't see beauty, it no longer fosters natural childlike curiosity and our innate ability to imagine.",
    "If, instead, we can bolster a human spirit to keep hope, to see beauty in themselves and others, to be curious and imaginative, then we are truly using our power.",
    "When a spirit has those qualities, we are able to create new realities and new ways of being.",
    "I'd like to leave you with a poem by a 14th century Persian poet named Hafiz."
]

# Adjusted labels for the text
labels = [
    ["B-PERSON", "I-PERSON", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "B-ORG", "I-ORG", "O", "B-LOCATION", "I-LOCATION", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
    ["B-PERSON", "I-PERSON", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"],
    ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]
]

# Tokenizer (simple word-based tokenization)
tokenizer = re.compile(r"\w+")
word_to_idx = {}
tag_to_idx = {}
max_seq_len = 0

# Build the tokenizer and map words and tags to indices
for text, label in zip(texts, labels):
    words = tokenizer.findall(text)
    for word in words:
        word = word.lower()
        if word not in word_to_idx:
            word_to_idx[word] = len(word_to_idx)
    for tag in label:
        if tag not in tag_to_idx:
            tag_to_idx[tag] = len(tag_to_idx)

# Add an "unknown" token to handle unknown words
unknown_token = "<UNK>"
word_to_idx[unknown_token] = len(word_to_idx)

# Convert the texts and labels to numerical indices
X = [[word_to_idx.get(word.lower(), word_to_idx[unknown_token]) for word in tokenizer.findall(text)] for text in texts]
y = [[tag_to_idx.get(tag, 0) for tag in label] for label in labels]

max_seq_len = max(max(len(seq) for seq in X), max_seq_len)

# Padding sequences to ensure equal length
X_pad = np.array([seq + [0] * (max_seq_len - len(seq)) for seq in X])
y_pad = np.array([seq + [0] * (max_seq_len - len(seq)) for seq in y])

# Hyperparameters
vocab_size = len(word_to_idx) + 1
tag_size = len(tag_to_idx)
embedding_dim = 100
lstm_units = 128
dropout_rate = 0.5

# Initialize the transition matrix globally
trans = tf.Variable(np.random.randn(tag_size, tag_size), dtype=tf.float32)

# Model architecture: LSTM + CRF
inputs = Input(shape=(max_seq_len,))
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_len)(inputs)
lstm = LSTM(units=lstm_units, return_sequences=True, dropout=dropout_rate)(embedding)
output = TimeDistributed(Dense(tag_size, activation='softmax'))(lstm)

# Manually define CRF loss function
def crf_loss(y_true, y_pred):
    global trans
    sequence_lengths = tf.reduce_sum(tf.cast(tf.not_equal(y_true, 0), tf.int32), axis=1)
    loss = tf.reduce_mean(tf.losses.sparse_categorical_crossentropy(y_true, y_pred))
    return loss

# Compile the model
model = Model(inputs, output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=crf_loss,
              metrics=["accuracy"])

# Train the model
history = model.fit(X_pad, y_pad, batch_size=16, epochs=1000, validation_split=0.1)

# Evaluate predictions
y_pred = model.predict(X_pad)
idx_to_tag = {v: k for k, v in tag_to_idx.items()}

# Function to decode predictions into tags
def decode_predictions(predictions, idx_to_tag, original_lengths):
    decoded_preds = []
    for seq, original_len in zip(predictions, original_lengths):
        # Decode predictions only up to the original sequence length
        decoded_seq = [idx_to_tag.get(np.argmax(p), "O") for p in seq[:original_len]]
        decoded_preds.append(decoded_seq)
    return decoded_preds

# Calculate original lengths of the sequences (non-padded lengths)
original_lengths = [len(seq) for seq in X]

# Decoding predictions
predictions = decode_predictions(y_pred, idx_to_tag, original_lengths)

# Print predictions
for i, pred in enumerate(predictions):
    print(f"Text: {texts[i]}")
    print(f"Predictions: {pred}")
    print("-" * 80)


Epoch 1/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.0638 - loss: 1.9614 - val_accuracy: 0.6809 - val_loss: 1.8851
Epoch 2/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step - accuracy: 0.6888 - loss: 1.8925 - val_accuracy: 0.7766 - val_loss: 1.8061
Epoch 3/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step - accuracy: 0.7367 - loss: 1.8185 - val_accuracy: 0.7553 - val_loss: 1.7125
Epoch 4/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step - accuracy: 0.7513 - loss: 1.7458 - val_accuracy: 0.7447 - val_loss: 1.5932
Epoch 5/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step - accuracy: 0.7247 - loss: 1.6398 - val_accuracy: 0.7340 - val_loss: 1.4405
Epoch 6/1000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step - accuracy: 0.7168 - loss: 1.4996 - val_accuracy: 0.7340 - val_loss: 1.2546
Epoch 7/1000
[1m1/1[0m [32m━