In [None]:
# Check if CUDA is available
import torch
torch.cuda.is_available()
%pip install datasets
%pip install torch==2.0.1 torchtext==0.15.2
%pip install torch TorchCRF
%pip install torch torchaudio
%pip install datasets
%pip install transformers
%pip install --upgrade pip
%pip install --upgrade transformers accelerate datasets[audio]
%pip install soundfile
%pip install evaluate
%pip install seqeval
%pip install spacy
%pip install spacy_conll

Collecting torch
  Using cached torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Using cached nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1

Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.3.1
Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl.metadata (44 kB)
Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.46.2
    Uninstalling transformers-4.46.2:
      Successfully uninstalled transformers-4.46.2
Successfully installed transformers-4.46.3

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from TorchCRF import CRF
from datasets import load_dataset
from typing import List
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
from seqeval.metrics import classification_report as seq_classification_report

# Load the CoNLL-2003 dataset
dataset = load_dataset('conll2003')

# Extract all unique words and tags
words = set()
tags = set()

for split in ['train', 'validation', 'test']:
    for sentence in dataset[split]:
        for word in sentence['tokens']:
            words.add(word.lower())  # Lowercasing for normalization
        for tag in sentence['ner_tags']:
            tags.add(tag)

# Create word2idx and tag2idx dictionaries
word2idx = {"<PAD>": 0, "<UNK>": 1}
for word in sorted(words):
    word2idx[word] = len(word2idx)

# Mapping from tag indices to tag names
tag_names = dataset['train'].features['ner_tags'].feature.names
tag2idx = {"<PAD>": 0}
for idx, tag in enumerate(tag_names, start=1):
    tag2idx[tag] = len(tag2idx)

idx2tag = {v: k for k, v in tag2idx.items()}

# Parameters
MAX_LEN = 50
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
BATCH_SIZE = 32
EPOCHS = 5
PAD_IDX = word2idx["<PAD>"]

# Encoding functions
def encode_sentences(sentences: List[List[str]], word2idx: dict, max_len: int) -> torch.Tensor:
    encoded = []
    for sentence in sentences:
        encoded_sentence = [word2idx.get(word.lower(), word2idx["<UNK>"]) for word in sentence]
        if len(encoded_sentence) < max_len:
            encoded_sentence += [word2idx["<PAD>"]] * (max_len - len(encoded_sentence))
        else:
            encoded_sentence = encoded_sentence[:max_len]
        encoded.append(encoded_sentence)
    return torch.tensor(encoded, dtype=torch.long)

def encode_labels(labels: List[List[int]], tag2idx: dict, max_len: int) -> torch.Tensor:
    encoded = []
    for label_seq in labels:
        encoded_label = [label + 1 for label in label_seq]  # +1 to account for <PAD> tag
        if len(encoded_label) < max_len:
            encoded_label += [tag2idx["<PAD>"]] * (max_len - len(encoded_label))
        else:
            encoded_label = encoded_label[:max_len]
        encoded.append(encoded_label)
    return torch.tensor(encoded, dtype=torch.long)

# Prepare data
train_sentences = [example['tokens'] for example in dataset['train']]
train_labels = [example['ner_tags'] for example in dataset['train']]
val_sentences = [example['tokens'] for example in dataset['validation']]
val_labels = [example['ner_tags'] for example in dataset['validation']]
test_sentences = [example['tokens'] for example in dataset['test']]
test_labels = [example['ner_tags'] for example in dataset['test']]

X_train = encode_sentences(train_sentences, word2idx, MAX_LEN)
y_train = encode_labels(train_labels, tag2idx, MAX_LEN)
X_val = encode_sentences(val_sentences, word2idx, MAX_LEN)
y_val = encode_labels(val_labels, tag2idx, MAX_LEN)
X_test = encode_sentences(test_sentences, word2idx, MAX_LEN)
y_test = encode_labels(test_labels, tag2idx, MAX_LEN)

# Create DataLoaders
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

# Load GloVe embeddings
def load_glove_embeddings(file_path, word2idx, embedding_dim):
    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), embedding_dim))
    embeddings[word2idx["<PAD>"]] = np.zeros(embedding_dim)

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            split = line.strip().split()
            word = split[0]
            if word in word2idx:
                vector = np.array(split[1:], dtype='float32')
                embeddings[word2idx[word]] = vector
    return torch.tensor(embeddings, dtype=torch.float32)

# Assuming using GloVe 100d
glove_path = 'glove.6B.100d.txt'  # Replace with your path to GloVe embeddings
embeddings = load_glove_embeddings(glove_path, word2idx, EMBEDDING_DIM)

class LSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, padding_idx, dropout=0.5, pretrained_embeddings=None):
        super(LSTM_CRF, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, sentences, tags, mask):
        embeds = self.embedding(sentences)
        embeds = self.dropout(embeds)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.dropout(lstm_out)
        emissions = self.hidden2tag(lstm_out)
        loss = -self.crf(emissions, tags, mask=mask)
        return loss

    def decode(self, sentences, mask):
        embeds = self.embedding(sentences)
        embeds = self.dropout(embeds)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.dropout(lstm_out)
        emissions = self.hidden2tag(lstm_out)
        # Use the Viterbi algorithm to decode the emissions and obtain the most likely tag sequence
        # Use viterbi_decode instead of decode
        return self.crf.viterbi_decode(emissions, mask=mask)

# Initialize the model
VOCAB_SIZE = len(word2idx)
TAGSET_SIZE = len(tag2idx)

model = LSTM_CRF(VOCAB_SIZE, TAGSET_SIZE, EMBEDDING_DIM, HIDDEN_DIM, PAD_IDX, pretrained_embeddings=embeddings)
optimizer = optim.Adam(model.parameters(), lr=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        mask = (batch_X != PAD_IDX).to(device)
        optimizer.zero_grad()
        loss = model(batch_X, batch_y, mask)
        loss.mean().backward()
        optimizer.step()
        epoch_loss += loss.mean().item()
    print(f"Epoch {epoch + 1}/{EPOCHS}, Loss: {epoch_loss / len(train_loader):.4f}")

def evaluate_and_show_results_with_metrics(model, X_test, y_test, test_sentences, idx2tag, device):
    model.eval()
    all_preds = []
    all_true = []
    X_test, y_test = X_test.to(device), y_test.to(device)
    mask = (X_test != PAD_IDX).to(device)

    with torch.no_grad():
        predictions = model.decode(X_test, mask=mask)

    for i, pred_indices in enumerate(predictions):
        tokens = test_sentences[i]
        true_labels = y_test[i].cpu().numpy()
        pred_tags = [idx2tag[idx] for idx in pred_indices]

        # Collect true and predicted labels for metrics
        true_labels_list = [idx2tag[label] for label in true_labels if label != 0]  # Exclude padding
        pred_tags_list = [tag for tag in pred_tags if tag != "<PAD>"]

        all_true.append(true_labels_list)
        all_preds.append(pred_tags_list)

        # Print sentence with true and predicted labels
        print("\nSentence:")
        for token, true_label, pred_tag in zip(tokens, true_labels, pred_tags):
            true_label_name = idx2tag[true_label] if true_label != 0 else "<PAD>"
            print(f"{token:15} True: {true_label_name:10} Predicted: {pred_tag}")

    # Print classification report
    print("\nClassification Report:")
    print(seq_classification_report(all_true, all_preds))

# Call the updated evaluation function
evaluate_and_show_results_with_metrics(model, X_test, y_test, test_sentences, idx2tag, device)


In [None]:
!pip install spacy_conll

import spacy
from spacy_conll import ConllFormatter

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Add the ConllFormatter to the pipeline
nlp.add_pipe("conll_formatter", last=True)

# File path to input text
file_path = "transcription_test_AimeeMullins_1249s.txt"

# Read in the file
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# Process the text using spaCy
doc = nlp(text)

# Initialize the CoNLL output string
conll_str = ""

# Start processing sentence by sentence
for sent in doc.sents:
    for token in sent:
        # Token, POS, Chunk, and NER (format: word POS B-chunk B-NER)
        conll_str += f"{token.text}\t{token.pos_}\t{token.dep_}\t{token.ent_iob_}-{token.ent_type_ if token.ent_iob_ != 'O' else 'O'}\n"

    # Add a blank line after each sentence
    conll_str += "\n"

# Output file path
output_path = "output.conll"

# Write the CoNLL formatted output to a file
with open(output_path, "w", encoding="utf-8") as output_file:
    output_file.write(conll_str)

print("CoNLL output file saved at:", output_path)


CoNLL output file saved at: output.conll


In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from TorchCRF import CRF
from datasets import load_dataset
from typing import List
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
from seqeval.metrics import classification_report as seq_classification_report
import random
import spacy
from spacy_conll import ConllFormatter

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Read the file
file_path = "transcription_test_AimeeMullins_1249s.txt"
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# Process the text using spaCy
doc = nlp(text)

# Prepare data in a structured format
conll_data = []
ner_tag_set = set()  # Collect all unique NER tags for feature mapping

for i, sent in enumerate(doc.sents):
    tokens, pos_tags, chunk_tags, ner_tags = [], [], [], []
    for token in sent:
        tokens.append(token.text)
        pos_tags.append(token.pos_)
        chunk_tags.append(token.dep_)
        ner_tag = f"{token.ent_iob_}-{token.ent_type_ if token.ent_iob_ != 'O' else 'O'}"
        ner_tags.append(ner_tag)
        ner_tag_set.add(ner_tag)  # Add to the set of unique tags
    conll_data.append({
        "id": i,
        "tokens": tokens,
        "pos_tags": pos_tags,
        "chunk_tags": chunk_tags,
        "ner_tags": ner_tags
    })


# Shuffle and split the data into 80% train, 10% validation, 10% test
random.shuffle(conll_data)

num_train = int(0.8 * len(conll_data))
num_valid = int(0.1 * len(conll_data))

train_data = conll_data[:num_train]
valid_data = conll_data[num_train:num_train + num_valid]
test_data = conll_data[num_train + num_valid:]

# Define the Dataset class
class Dataset:
    def __init__(self, split_data, ner_tag_names):
        self.data = split_data
        self.features = {
            "ner_tags": {
                "feature": {
                    "names": ner_tag_names
                }
            }
        }

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

    def __repr__(self):
        return f"Dataset({{\n    features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n    num_rows: {len(self.data)}\n}})"

# Create datasets
ner_tag_names = sorted(ner_tag_set)  # Sorted list of unique NER tag names
dataset = {
    "train": Dataset(train_data, ner_tag_names),
    "validation": Dataset(valid_data, ner_tag_names),
    "test": Dataset(test_data, ner_tag_names)
}
# Extract unique words and tags
words = set()
tags = set()

for split in ['train', 'validation', 'test']:
    for sentence in dataset[split]:
        for word in sentence['tokens']:
            words.add(word.lower())  # Lowercasing for normalization
        for tag in sentence['ner_tags']:
            tags.add(tag)

# Create word2idx and tag2idx dictionaries
word2idx = {"<PAD>": 0, "<UNK>": 1}
for word in sorted(words):
    word2idx[word] = len(word2idx)

# Mapping from tag indices to tag names
tag_names = dataset['train'].features['ner_tags']['feature']['names']
tag2idx = {"<PAD>": 0}
for idx, tag in enumerate(tag_names, start=1):
    tag2idx[tag] = len(tag2idx)

idx2tag = {v: k for k, v in tag2idx.items()}

# Parameters
MAX_LEN = 50
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
BATCH_SIZE = 32
EPOCHS = 5
PAD_IDX = word2idx["<PAD>"]

# Encoding functions
def encode_sentences(sentences: List[List[str]], word2idx: dict, max_len: int) -> torch.Tensor:
    encoded = []
    for sentence in sentences:
        encoded_sentence = [word2idx.get(word.lower(), word2idx["<UNK>"]) for word in sentence]
        if len(encoded_sentence) < max_len:
            encoded_sentence += [word2idx["<PAD>"]] * (max_len - len(encoded_sentence))
        else:
            encoded_sentence = encoded_sentence[:max_len]
        encoded.append(encoded_sentence)
    return torch.tensor(encoded, dtype=torch.long)

def encode_labels(labels: List[List[str]], tag2idx: dict, max_len: int) -> torch.Tensor:
    encoded = []
    for label_seq in labels:
        encoded_label = [tag2idx.get(label, tag2idx["<PAD>"]) for label in label_seq]  # Map NER tags to indices
        if len(encoded_label) < max_len:
            encoded_label += [tag2idx["<PAD>"]] * (max_len - len(encoded_label))  # Padding
        else:
            encoded_label = encoded_label[:max_len]
        encoded.append(encoded_label)
    return torch.tensor(encoded, dtype=torch.long)


# Prepare data
train_sentences = [example['tokens'] for example in dataset['train']]
train_labels = [example['ner_tags'] for example in dataset['train']]
val_sentences = [example['tokens'] for example in dataset['validation']]
val_labels = [example['ner_tags'] for example in dataset['validation']]
test_sentences = [example['tokens'] for example in dataset['test']]
test_labels = [example['ner_tags'] for example in dataset['test']]

X_train = encode_sentences(train_sentences, word2idx, MAX_LEN)
y_train = encode_labels(train_labels, tag2idx, MAX_LEN)
X_val = encode_sentences(val_sentences, word2idx, MAX_LEN)
y_val = encode_labels(val_labels, tag2idx, MAX_LEN)
X_test = encode_sentences(test_sentences, word2idx, MAX_LEN)
y_test = encode_labels(test_labels, tag2idx, MAX_LEN)

# Create DataLoaders
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip glove.6B.zip

# Load GloVe embeddings
def load_glove_embeddings(file_path, word2idx, embedding_dim):
    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), embedding_dim))
    embeddings[word2idx["<PAD>"]] = np.zeros(embedding_dim)

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            split = line.strip().split()
            word = split[0]
            if word in word2idx:
                vector = np.array(split[1:], dtype='float32')
                embeddings[word2idx[word]] = vector
    return torch.tensor(embeddings, dtype=torch.float32)

# Assuming using GloVe 100d
glove_path = 'glove.6B.100d.txt'  # Replace with your path to GloVe embeddings
embeddings = load_glove_embeddings(glove_path, word2idx, EMBEDDING_DIM)

class LSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, padding_idx, dropout=0.5, pretrained_embeddings=None):
        super(LSTM_CRF, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, sentences, tags, mask):
        embeds = self.embedding(sentences)
        embeds = self.dropout(embeds)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.dropout(lstm_out)
        emissions = self.hidden2tag(lstm_out)
        loss = -self.crf(emissions, tags, mask=mask)
        return loss

    def decode(self, sentences, mask):
        embeds = self.embedding(sentences)
        embeds = self.dropout(embeds)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.dropout(lstm_out)
        emissions = self.hidden2tag(lstm_out)
        # Use the Viterbi algorithm to decode the emissions and obtain the most likely tag sequence
        # Use viterbi_decode instead of decode
        return self.crf.viterbi_decode(emissions, mask=mask)

# Initialize the model
VOCAB_SIZE = len(word2idx)
TAGSET_SIZE = len(tag2idx)

model = LSTM_CRF(VOCAB_SIZE, TAGSET_SIZE, EMBEDDING_DIM, HIDDEN_DIM, PAD_IDX, pretrained_embeddings=embeddings)
optimizer = optim.Adam(model.parameters(), lr=0.01)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        mask = (batch_X != PAD_IDX).to(device)
        optimizer.zero_grad()
        loss = model(batch_X, batch_y, mask)
        loss.mean().backward()
        optimizer.step()
        epoch_loss += loss.mean().item()
    print(f"Epoch {epoch + 1}/{EPOCHS}, Loss: {epoch_loss / len(train_loader):.4f}")

def evaluate_and_show_results_with_metrics(model, X_test, y_test, test_sentences, idx2tag, device):
    model.eval()
    all_preds = []
    all_true = []
    X_test, y_test = X_test.to(device), y_test.to(device)
    mask = (X_test != PAD_IDX).to(device)

    with torch.no_grad():
        predictions = model.decode(X_test, mask=mask)

    for i, pred_indices in enumerate(predictions):
        tokens = test_sentences[i]
        true_labels = y_test[i].cpu().numpy()
        pred_tags = [idx2tag[idx] for idx in pred_indices]

        # Collect true and predicted labels for metrics
        true_labels_list = [idx2tag[label] for label in true_labels if label != 0]  # Exclude padding
        pred_tags_list = [tag for tag in pred_tags if tag != "<PAD>"]

        all_true.append(true_labels_list)
        all_preds.append(pred_tags_list)

        # Print sentence with true and predicted labels
        for token, true_label, pred_tag in zip(tokens, true_labels, pred_tags):
            true_label_name = idx2tag[true_label] if true_label != 0 else "<PAD>"
            print(f"{token:15} True: {true_label_name:10} Predicted: {pred_tag}")

    # Print classification report
    print("\nClassification Report:")
    print(seq_classification_report(all_true, all_preds))

# Call the updated evaluation function
evaluate_and_show_results_with_metrics(model, X_test, y_test, test_sentences, idx2tag, device)


Epoch 1/5, Loss: 11.2716
Epoch 2/5, Loss: 4.7373
Epoch 3/5, Loss: 4.2603
Epoch 4/5, Loss: 3.7218
Epoch 5/5, Loss: 3.2947
This            True: O-O        Predicted: O-O
is              True: O-O        Predicted: O-O
a               True: O-O        Predicted: O-O
game            True: O-O        Predicted: O-O
done            True: O-O        Predicted: O-O
with            True: O-O        Predicted: O-O
the             True: B-ORG      Predicted: O-O
World           True: I-ORG      Predicted: O-O
Bank            True: I-ORG      Predicted: O-O
Institute       True: I-ORG      Predicted: O-O
.               True: O-O        Predicted: O-O
The             True: O-O        Predicted: O-O
thing           True: O-O        Predicted: O-O
you             True: O-O        Predicted: O-O
get             True: O-O        Predicted: O-O
into            True: O-O        Predicted: O-O
big             True: O-O        Predicted: O-O
money           True: O-O        Predicted: O-O
on             