In [1]:
%pip install torch TorchCRF
%pip install torch torchaudio
%pip install datasets
%pip install transformers
%pip install --upgrade pip
%pip install --upgrade transformers accelerate datasets[audio]
%pip install soundfile

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [16]:
# Import necessary libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import BertTokenizer
from TorchCRF import CRF

# Define the path to your local file
local_file_path = "transcription_test_AimeeMullins_1249s_summarized.txt"

# Verify the file exists
import os
if not os.path.exists(local_file_path):
    raise FileNotFoundError(f"File not found: {local_file_path}")
else:
    print(f"File '{local_file_path}' found.")

# Load the dataset using the appropriate loader based on your file type
tedlium = load_dataset("text", data_files=local_file_path)

# Verify the dataset is loaded correctly
print(tedlium)

# Create a custom dataset class
class TEDLIUMDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        transcription = self.texts[idx]
        return transcription

# Prepare the dataset
train_texts = tedlium['train']['text']

# Creating the dataset
train_dataset = TEDLIUMDataset(train_texts)

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)  # Adjust batch size as needed

# Tokenize the texts
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

vocab_size = len(tokenizer)
embed_dim = 100  # Dimension of the embedding layer
hidden_dim = 128  # Dimension of LSTM hidden states
output_dim = 2  # Number of classes for classification (adjust as necessary)

# Define the LSTM-CRF model
class LSTMCRF(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(LSTMCRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=tokenizer.pad_token_id)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Multiply by 2 for bidirectional

        # CRF layer for sequence tagging (without batch_first argument)
        self.crf = CRF(output_dim)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        logits = self.fc(lstm_out)
        return logits

    def decode(self, logits, mask):
        return self.crf.decode(logits, mask=mask)

# Instantiate the model
model = LSTMCRF(vocab_size, embed_dim, hidden_dim, output_dim)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model is running on {device}")

# Training hyperparameters
num_epochs = 100
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Tokenize the batch of texts
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # Placeholder for labels (replace this with actual labels)
        labels = torch.randint(0, output_dim, (input_ids.size(0), input_ids.size(1))).to(device)  # Random labels for testing

        optimizer.zero_grad()
        logits = model(input_ids)  # Forward pass

        # Compute CRF loss
        try:
            loss = -model.crf(logits, labels, mask=attention_mask.bool())  # Negative log likelihood
            print(f"Loss shape: {loss.shape}")  # Print shape of loss to check

            if loss.ndimension() > 0:
                loss = loss.mean()  # Reduce to scalar if not already scalar

            print(f"Reduced Loss: {loss}")  # Print reduced loss value

            total_loss += loss.item()  # Convert to scalar and accumulate

            loss.backward()
            optimizer.step()

            # After training, make predictions using the CRF layer (using viterbi_decode)
            predicted_labels = model.crf.viterbi_decode(logits, mask=attention_mask.bool())  # Decode using CRF
            print("Predicted Labels:", predicted_labels)
            break  # Remove to evaluate on the entire dataset
        except IndexError as e:
            print(f"Skipping batch with shape mismatch: logits shape {logits.shape}, labels shape {labels.shape}")

    avg_loss = total_loss / len(train_loader)  # Average loss for the epoch
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

# Sample prediction
model.eval()
with torch.no_grad():
    for batch in train_loader:
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        logits = model(input_ids)
        predicted_label= model.crf.viterbi_decode(logits, mask=attention_mask.bool())  # Decode using CRF
        print("Predicted Labels:", predicted_labels)

        break  # Remove to evaluate on the entire dataset


File 'transcription_test_AimeeMullins_1249s_summarized.txt' found.
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1
    })
})
Model is running on cpu
Loss shape: torch.Size([1])
Reduced Loss: 356.3069152832031
Predicted Labels: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [19]:
# Import necessary libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import BertTokenizer
from TorchCRF import CRF

# Define the path to your local file
local_file_path = "transcription_test_AimeeMullins_1249s_summarized.txt"

# Verify the file exists
import os
if not os.path.exists(local_file_path):
    raise FileNotFoundError(f"File not found: {local_file_path}")
else:
    print(f"File '{local_file_path}' found.")

# Load the dataset using the appropriate loader based on your file type
tedlium = load_dataset("text", data_files=local_file_path)

# Verify the dataset is loaded correctly
print(tedlium)

# Create a custom dataset class
class TEDLIUMDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        transcription = self.texts[idx]
        return transcription

# Prepare the dataset
train_texts = tedlium['train']['text']

# Creating the dataset
train_dataset = TEDLIUMDataset(train_texts)

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)  # Adjust batch size as needed

# Tokenize the texts
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

vocab_size = len(tokenizer)
embed_dim = 100  # Dimension of the embedding layer
hidden_dim = 128  # Dimension of LSTM hidden states
output_dim = 2  # Number of classes for classification (adjust as necessary)

# Define the LSTM-CRF model
class LSTMCRF(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(LSTMCRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=tokenizer.pad_token_id)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Multiply by 2 for bidirectional

        # CRF layer for sequence tagging (without batch_first argument)
        self.crf = CRF(output_dim)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        logits = self.fc(lstm_out)
        return logits

    def decode(self, logits, mask):
        return self.crf.decode(logits, mask=mask)

# Instantiate the model
model = LSTMCRF(vocab_size, embed_dim, hidden_dim, output_dim)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model is running on {device}")

# Training hyperparameters
num_epochs = 100
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Tokenize the batch of texts
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # Placeholder for labels (replace this with actual labels)
        labels = torch.randint(0, output_dim, (input_ids.size(0), input_ids.size(1))).to(device)  # Random labels for testing

        optimizer.zero_grad()
        logits = model(input_ids)  # Forward pass

        # Compute CRF loss
        try:
            loss = -model.crf(logits, labels, mask=attention_mask.bool())  # Negative log likelihood
            print(f"Loss shape: {loss.shape}")  # Print shape of loss to check

            if loss.ndimension() > 0:
                loss = loss.mean()  # Reduce to scalar if not already scalar

            print(f"Reduced Loss: {loss}")  # Print reduced loss value

            total_loss += loss.item()  # Convert to scalar and accumulate

            loss.backward()
            optimizer.step()

            # After training, make predictions using the CRF layer (using viterbi_decode)
            predicted_labels= model.crf.viterbi_decode(logits, mask=attention_mask.bool())  # Decode using CRF
            print("Predicted Labels:", predicted_labels)
            break  # Remove to evaluate on the entire dataset
        except IndexError as e:
            print(f"Skipping batch with shape mismatch: logits shape {logits.shape}, labels shape {labels.shape}")

    avg_loss = total_loss / len(train_loader)  # Average loss for the epoch
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

# Define the label map for NER (BIO tagging)
label_map = {
    0: "O",          # Outside any named entity
    1: "B-PERSON",   # Beginning of a person entity
    2: "I-PERSON",   # Inside a person entity
    3: "B-ORG",      # Beginning of an organization entity
    4: "I-ORG",      # Inside an organization entity
    # Add other entities as needed
}

# Sample prediction after training
model.eval()
with torch.no_grad():
    for batch in train_loader:
        # Tokenize the batch of texts
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # Forward pass through the model
        logits = model(input_ids)

        # Decode predictions using the CRF layer
        predicted_labels = model.crf.viterbi_decode(logits, mask=attention_mask.bool())  # CRF decoding
        
        # Debugging step: Print the predicted_labels to understand its structure
        print("Predicted labels:", predicted_labels)  # Inspect the actual content of predicted_labels
        
        # Check if predicted_labels is a list of lists (batch of sequences) or just a single sequence
        if isinstance(predicted_labels, list) and isinstance(predicted_labels[0], list):
            print("Predicted labels shape:", len(predicted_labels[0]))  # First sequence in the batch
        else:
            print("Predicted labels is not a list of sequences. It might be a single sequence or a different format.")
        
        # Flatten the list of predicted labels (if it's a list of sequences)
        if isinstance(predicted_labels, list) and isinstance(predicted_labels[0], list):
            # Flattening the list of lists (for a batch of sequences)
            flattened_predicted_labels = [label for sublist in predicted_labels for label in sublist]
        else:
            flattened_predicted_labels = predicted_labels  # If it's a single sequence
        
        # Now, map the flattened predicted labels to their BIO tag names
        predicted_labels_mapped = [label_map[label] for label in flattened_predicted_labels]
        
        # Display tokens and their predicted labels (BIO tags)
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu().numpy())  # Convert the first batch to tokens
        print("Tokens:", tokens)
        print("Predicted Labels:", predicted_labels_mapped)


        break  # Remove to evaluate on the entire dataset



File 'transcription_test_AimeeMullins_1249s_summarized.txt' found.
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1
    })
})
Model is running on cpu
Loss shape: torch.Size([1])
Reduced Loss: 356.193603515625
Predicted Labels: [[1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,