<a href="https://colab.research.google.com/github/vvikasreddy/lexically_constrained_beam_search_/blob/main/beam_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

References:

Marian MT model : https://huggingface.co/docs/transformers/model_doc/marian

Code to get the logits : https://huggingface.co/docs/transformers/main_classes/output

to get the BOS and EOS tokens: https://huggingface.co/docs/transformers/main_classes/configuration#transformers.PretrainedConfig.decoder_start_token_id

get topk values : https://pytorch.org/docs/stable/generated/torch.topk.html

ideas and core implementation drawn from this paper: https://arxiv.org/pdf/1704.07138

reference to link google colab with .py file from git : https://colab.research.google.com/github/jckantor/cbe61622/blob/master/docs/A.02-Downloading_Python_source_files_from_github.ipynb


## downloading essential modules

In [76]:
!pip install datasets

NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

## Importing necessary libraries

In [77]:
import torch, random
from datasets import load_dataset
from transformers import MarianMTModel, MarianTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

## loading the dataset, considering the wmt turkish - english translation

In [78]:
ds = load_dataset("wmt/wmt16", "tr-en")

## Glancing the organization of the dataset

In [79]:
ds

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 205756
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1001
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
})

In [80]:
ds['train'][0]

{'translation': {'en': "Kosovo's privatisation process is under scrutiny",
  'tr': "Kosova'nın özelleştirme süreci büyüteç altında"}}

## Loading the tokenizer and model, based of Marian-NMT

In [81]:
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tr-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-tr-en")



In [82]:
model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62389, 512, padding_idx=62388)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62389, 512, padding_idx=62388)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [83]:
model.config.num_beams

6

In [84]:
# print(generate_translation(ds['validation'][1]['translation']['tr']))

In [85]:
ds["validation"][1]["translation"]["tr"]

"Norveç'in beş milyon insanı en yüksek yaşam standartlarının tadını çıkarıyor, sadece Avrupa'da değil, dünyada."

In [86]:
ds["validation"][1]["translation"]["en"]

"Norway's five million people enjoy one of the highest standards of living, not just in Europe, but in the world."

## Extracting the constraints

In [89]:
# code to import constraints and store in a local directory, from my git

user = "vvikasreddy"
repo = "lexically_constrained_beam_search_"
pyfile = "constraints.py"

# i.e url is "https://github.com/vvikasreddy/lexically_constrained_beam_search_/blob/main/constraints.py"

url = f"https://raw.githubusercontent.com/{user}/{repo}/main/{pyfile}"
# !wget --no-cache --backups=1 {url}

import constraints

In [90]:
# takes almost 4 minutes to get the constraints, you will see 3 progress bars
c = constraints.get_constraints()

100%|██████████| 205756/205756 [00:44<00:00, 4673.58it/s]
100%|██████████| 205756/205756 [01:04<00:00, 3177.54it/s]
100%|██████████| 26221852/26221852 [00:42<00:00, 623351.19it/s]


In [None]:
print("some of the constraints are :")

# Extract 5 random keys
random_keys = random.sample(list(c.keys()), 5)

for key in random_keys:
  print(key, c[key])

print("The length of the constraints is", len(c))

In [None]:
def generate_translation(src_text, decoder_input = [], probabilites = [], get_constrained_token_probability = -1, k = 5):

  """
    returns decoder_input_tokens, probs, vis_data

    decoder_input_tokens : next top k tokens, or constraints probability if get_constrained_token_probabliity != -1
    probs : corresponding probablities of decoder_input_tokens
    vis_data : top k beams

    generate the decoder input ids and corresponding probabilities
    src_text : It is the source text
    decoder_input : Represents the decoder tokens
    probabilities : Represents corresponding decoder token probablities
    get_constrained_token_probabliity : holds the constraint, -1 indicates no constraint,
    k : number of beams to be generated, default is 5
  """


  decoder_input_tokens = []
  probs = []
  vis_data = []

  # Tokenize input
  encoder_inputs = tokenizer(src_text, return_tensors="pt")

  # if decoder_input is empty, then include the decoder start token
  if decoder_input == []:
    # intial decoder start token has probability 1
    probabilites = torch.tensor([[1]])
    decoder_input = torch.tensor([[model.config.decoder_start_token_id]])

  # change the model to eval mode and stop the computation of gradients.
  model.eval()
  with torch.no_grad():

    generated_tokens = []

    outputs = model(
        input_ids=encoder_inputs.input_ids,
        attention_mask=encoder_inputs.attention_mask,
        decoder_input_ids=decoder_input
    )

    # gets the most frequenlty generated token.
    next_token_logits = outputs.logits[:, -1, :]

    # constraint, if provided, returns the probability.
    if get_constrained_token_probability != -1:
      softmax_  = torch.softmax(next_token_logits, dim=-1)
      return softmax_[0][get_constrained_token_probability]

    # get the top k tokens with maximum logits value
    top_probs, top_indices = torch.topk(torch.softmax(next_token_logits, dim=-1), k = k)

  for indx, id in enumerate(top_indices[0]):
    decoder_input_tokens.append(torch.cat([decoder_input, id.unsqueeze(0).unsqueeze(0)], dim=1))
    probs.append(torch.cat([probabilites, top_probs[0][indx].unsqueeze(0).unsqueeze(0)], dim=1))
    vis_data.append((vis_data, tokenizer.decode(decoder_input_tokens[indx].squeeze(), skip_special_tokens = True)))

  return decoder_input_tokens, probs, vis_data

x,y,z = generate_translation(ds['validation'][1]['translation']['tr'], decoder_input = torch.tensor([[62388,  1969]]), probabilites = torch.tensor([[0.0000, 0.0000]]))
print( x)
print(y)
print(z)
# c




    """
    Returns decoder_input_tokens, probs, vis_data

    Args:
        src_text (str): The source text to translate
        decoder_input (torch.Tensor, optional): Decoder tokens. Defaults to [].
        probabilities (torch.Tensor, optional): Corresponding decoder token probabilities. Defaults to [].
        get_constrained_token_probability (int, optional): Constraint token index. Defaults to -1.
        k (int, optional): Number of beams to generate. Defaults to 5.
        device (torch.device, optional): Device to run the model on. Defaults to None (auto-detect).

    Returns:
        tuple: decoder_input_tokens, probs, vis_data
    """
  

In [None]:
import torch

def generate_translation(src_text, decoder_input=[], probabilities=[], get_constrained_token_probability=-1, k=5, device=None):
  # print("he")
  # Auto-detect device if not specified
  if device is None:
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  # Move model to the specified device
  model.to(device)

    # Tokenize input
  encoder_inputs = tokenizer(src_text, return_tensors="pt").to(device)

  # If decoder_input is empty, include the decoder start token
  if len(decoder_input) == 0:

    # Initial decoder start token has probability 1
    probabilities = torch.tensor([[1.0]]).to(device)
    decoder_input = torch.tensor([[model.config.decoder_start_token_id]]).to(device)
  else:
    # Ensure decoder_input and probabilities are on the correct device
    # print(probabilities, "am in else")
    # print(decoder_input, "am in else")
    decoder_input = decoder_input.to(device)
    if probabilities != [] :
      probabilities = probabilities.to(device)

  # Change the model to eval mode and stop the computation of gradients
  model.eval()
  with torch.no_grad():
      # Generate tokens
    outputs = model(
        input_ids=encoder_inputs.input_ids,
        attention_mask=encoder_inputs.attention_mask,
        decoder_input_ids=decoder_input
    )

  # Get the most frequently generated token
    next_token_logits = outputs.logits[:, -1, :]

    # Constraint handling
    if get_constrained_token_probability != -1:
        softmax_ = torch.softmax(next_token_logits, dim=-1)
        return softmax_[0][get_constrained_token_probability]

    # Get the top k tokens with maximum logits value
    top_probs, top_indices = torch.topk(torch.softmax(next_token_logits, dim=-1), k=k)

  # Prepare output containers
  decoder_input_tokens = []
  probs = []
  vis_data = []

  for indx, id in enumerate(top_indices[0]):
      # Concatenate new tokens and probabilities
      new_decoder_input = torch.cat([decoder_input, id.unsqueeze(0).unsqueeze(0)], dim=1)
      new_probs = torch.cat([probabilities, top_probs[0][indx].unsqueeze(0).unsqueeze(0)], dim=1)

      decoder_input_tokens.append(new_decoder_input)
      probs.append(new_probs)

      # Generate visualization data
      vis_data.append((vis_data, tokenizer.decode(new_decoder_input.squeeze(), skip_special_tokens=True)))

  return decoder_input_tokens, probs, vis_data

# Example usage
# Assuming 'ds', 'model', and 'tokenizer' are defined
x, y, z = generate_translation(
    ds['validation'][1]['translation']['tr'],
    decoder_input=torch.tensor([[62388, 1969]]),
    probabilities=torch.tensor([[0.0000, 0.0000]])
)
print(x)
print(y)
print(z)

In [None]:
def get_ngrams(src, n = 2, ):

  src = src.split(" ")
  src = [tuple(src[i:i+n]) for i in range(len(src) - n + 1)]

  return src

def constraints_tokens(src, c):
  ngrams = get_ngrams(src)
  constraints_src = []
  for ngram in ngrams:
    # print(ngram)
    if ngram in c:
      f = c[ngram][0]
      for gram in f:

        if  gram in constraints_src: continue
        out = tokenizer(gram, return_tensors="pt")
        constraints_src.append(out["input_ids"])
        # print(constraints_src)
  return constraints_src

constraints_tokens("Southeast European Times için Priştine'den Muhamet Brayşori'nin haberi -- 21/03/12", c)

[tensor([[3113,   56,   47, 1517,    0]]),
 tensor([[5827, 1786,  373,    0]]),
 tensor([[5827, 1786,  373,    0]]),
 tensor([[3762,    0]]),
 tensor([[27,  0]]),
 tensor([[3113,   56,   47, 1517,    0]]),
 tensor([[21,  0]]),
 tensor([[ 4388, 10158,   204,     0]]),
 tensor([[1417,    0]]),
 tensor([[6041,   47, 2628,    0]]),
 tensor([[27,  0]]),
 tensor([[3113,   56,   47, 1517,    0]])]

In [None]:
# x = constraints_tokens(ds['validation'][1]['translation']['tr'] + ' Times' +  ' için', c)
# # c
# print(x)
# print(get_input_ids(x))

In [None]:
def visualize_data(decoder_input):
  return tokenizer.decode(decoder_input.squeeze(), skip_special_tokens = True)

In [None]:
import torch

def get_top_k_prob(A, B, k=2):

  d = {}
  # cummulative sum
  for indx, val in enumerate(B):

    cum_sum = torch.prod(val)
    d[cum_sum] = indx

  sorted_keys = sorted(d.keys(), reverse = True)

  top_k_indices = []
  top_k_sequences = []

  for key in sorted_keys[:k]:
    top_k_indices.append(A[d[key]])
    top_k_sequences.append(B[d[key]])

  return top_k_sequences, top_k_indices

# sanity
k = 2

# A = [torch.tensor([[62388,   626,    13]]), torch.tensor([[62388,   626,     9]]), torch.tensor([[62388,   626,  1341]]), torch.tensor([[62388,   626,    27]])]
# B = [torch.tensor([[1.0000, 0.0038, 0.2500]]), torch.tensor([[1.0000, 0.0038, 0.0619]]), torch.tensor([[1.0000, 0.0038, 0.0474]]), torch.tensor([[1.0000, 0.0038, 0.0425]])]


A = [torch.tensor([[62388,  1969]]), torch.tensor([[62388,   323]]), torch.tensor([[62388,    67]]), torch.tensor([[62388,  1132]]), torch.tensor([[62388,   626]])]
B = [torch.tensor([[1.0000, 0.8746]]), torch.tensor([[1.0000, 0.0156]]), torch.tensor([[1.0000, 0.0114]]), torch.tensor([[1.0000, 0.0039]]), torch.tensor([[1.0000, 0.0038]])]

top_sequences, indices = get_top_k_prob(A, B, k)
print(f"Top {k} sequences:", top_sequences)
print("Their indices:", indices)

Top 2 sequences: [tensor([[1.0000, 0.8746]]), tensor([[1.0000, 0.0156]])]
Their indices: [tensor([[62388,  1969]]), tensor([[62388,   323]])]


In [None]:
# Assuming x[1] is a list of tensors
def get_the_text(x):
  token_id_list = x  # Extract the list of tensors

  # Move all tensors to CPU and convert to a list of lists
  token_ids_batch = [tensor_item.cpu().squeeze().tolist() for tensor_item in token_id_list]

  # Batch decode token IDs into sentences
  decoded_sentences = tokenizer.batch_decode(token_ids_batch, skip_special_tokens=True)

  return decoded_sentences

get_the_text(x[1])

['Kost']

In [124]:
# since wmt dataset is not domain speicific, I am trying to only take the sentences, which have constraints defined.

count = 0
indices = []
count = 3000
i = 0

while count:
  x = ds["train"][i]["translation"]["tr"]
  if constraints_tokens(x, c):
    count -=1
    indices.append(i)
  i+=1

print("The last index in all of the 3000 is,", indices[-1])

The last index in all of the 3000 is, 6304


In [160]:
len(ds["train"])
import torch
from torch.utils.data import Dataset

class Results(Dataset):

  def __init__(self, ds, c, indices):
      """
      Args:
          data : represents the turkish sentences to be passed
      """
      self.data = ds
      self.c = c
      self.indices = indices

  def generate_translation(self, src_text, decoder_input=[], probabilities=[], get_constrained_token_probability=-1, k=5, device=None):
    # print("he")
    # Auto-detect device if not specified
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move model to the specified device
    model.to(device)

      # Tokenize input
    encoder_inputs = tokenizer(src_text, return_tensors="pt").to(device)

    # If decoder_input is empty, include the decoder start token
    if len(decoder_input) == 0:

      # Initial decoder start token has probability 1
      probabilities = torch.tensor([[1.0]]).to(device)
      decoder_input = torch.tensor([[model.config.decoder_start_token_id]]).to(device)
    else:
      # Ensure decoder_input and probabilities are on the correct device
      # print(probabilities, "am in else")
      # print(decoder_input, "am in else")
      decoder_input = decoder_input.to(device)
      if probabilities != [] :
        probabilities = probabilities.to(device)

    # Change the model to eval mode and stop the computation of gradients
    model.eval()
    with torch.no_grad():
        # Generate tokens
      outputs = model(
          input_ids=encoder_inputs.input_ids,
          attention_mask=encoder_inputs.attention_mask,
          decoder_input_ids=decoder_input
      )

    # Get the most frequently generated token
      next_token_logits = outputs.logits[:, -1, :]

      # Constraint handling
      if get_constrained_token_probability != -1:
          softmax_ = torch.softmax(next_token_logits, dim=-1)
          return softmax_[0][get_constrained_token_probability]

      # Get the top k tokens with maximum logits value
      top_probs, top_indices = torch.topk(torch.softmax(next_token_logits, dim=-1), k=k)

    # Prepare output containers
    decoder_input_tokens = []
    probs = []
    vis_data = []

    for indx, id in enumerate(top_indices[0]):
        # Concatenate new tokens and probabilities
        new_decoder_input = torch.cat([decoder_input, id.unsqueeze(0).unsqueeze(0)], dim=1)
        new_probs = torch.cat([probabilities, top_probs[0][indx].unsqueeze(0).unsqueeze(0)], dim=1)

        decoder_input_tokens.append(new_decoder_input)
        probs.append(new_probs)

        # Generate visualization data
        vis_data.append((vis_data, tokenizer.decode(new_decoder_input.squeeze(), skip_special_tokens=True)))

    return decoder_input_tokens, probs, vis_data

  def get_the_text(self, x):
    token_id_list = x  # Extract the list of tensors

    # Move all tensors to CPU and convert to a list of lists
    token_ids_batch = [tensor_item.cpu().squeeze().tolist() for tensor_item in token_id_list]

    # Batch decode token IDs into sentences
    decoded_sentences = tokenizer.batch_decode(token_ids_batch, skip_special_tokens=True)

    return decoded_sentences
  def get_top_k_prob(self, A, B, k=2):
    d = {}
    # cummulative sum
    for indx, val in enumerate(B):

      cum_sum = torch.prod(val)
      d[cum_sum] = indx

    sorted_keys = sorted(d.keys(), reverse = True)

    top_k_indices = []
    top_k_sequences = []

    for key in sorted_keys[:k]:
      top_k_indices.append(A[d[key]])
      top_k_sequences.append(B[d[key]])

    return top_k_sequences, top_k_indices

  def constraints_tokens(self, src):
    ngrams = get_ngrams(src)
    constraints_src = []
    for ngram in ngrams:
      # print(ngram)
      if ngram in self.c:
        f = self.c[ngram][0]
        for gram in f:

          if  gram in constraints_src: continue
          out = tokenizer(gram, return_tensors="pt")
          constraints_src.append(out["input_ids"])
          # print(constraints_src)
    return constraints_src

  def beam_search(self, maxlen, numC, k, src, constrained_tokens):

    decoder_start_token = model.config.decoder_start_token_id

    # initialize the grids
    grids = [[[] for _ in range(numC + 1)] for _ in range(maxlen + 1)]
    probs_grid  = [[[] for _ in range(numC + 1)] for _ in range(maxlen + 1)]

    # intialize the first grid to start hyp
    grids[0][0] = [1]

    # remove during testsrc
    # constrained_tokens = get_input_ids(constraints_tokens(, constraints))
    # temporary
    # constrained_tokens = constraints_tokens("Southeast European Times için Priştine'den Muhamet Brayşori'nin haberi -- 21/03/12", constraints)

    generated_constraint_index = 0

    for t in range(1, maxlen):

        index_c = max(0, (numC - t) - maxlen)

        for c in range(index_c, min(t, numC) + 1):

            # Prepare batched generation to reduce individual calls
            s = []
            g = []

            # storing decoder inputs
            decoder_inputs = []
            probs = []
            vis_data = []

            # Batch generation of translations for current hypotheses
            for indx, element in enumerate(grids[t-1][c]):

              # guess there is no need for conditioning, just generate.
              if type(element) == int:
                decoder_input = []
                prev_probs =[]
              else:
                decoder_input = element.cuda()  # Move to GPU
                prev_probs = probs_grid[t-1][c][indx].cuda()  # Move to GPU

              # Batch collection of translations
              t_g, t_probs, t_vis_data = self.generate_translation(src_text=src, decoder_input=decoder_input, probabilities=prev_probs)

              # Extend lists more efficiently
              g.extend(t_g)
              probs.extend(t_probs)
              vis_data.extend(t_vis_data)

            # retrieve the probability of the constraint and add that to the decoder_input.
            if c > 0 and constrained_tokens:

              for indx, element in enumerate(grids[t-1][c-1]):

                if c == 1 and t == 1:
                  decoder_inputs = torch.tensor([[model.config.decoder_start_token_id]]).cuda()  # Move to GPU
                  prob = torch.tensor([[1]]).cuda()  # Move to GPU
                else:
                  decoder_inputs = element.cuda()  # Move to GPU
                  prob = probs_grid[t-1][c-1][indx].cuda()  # Move to GPU

                # print(constrained_tokens[c - 1])

                # iterating, because a constraint can be made up of many token ids
                partial_constraints = constrained_tokens[c - 1].tolist()
                # print(partial_constraints, "gandu")
                for partial_constraint in partial_constraints[0]:
                  if partial_constraint == 0: continue
                  # print(partial_constraint, "tryint to dissolve")
                  cons = self.generate_translation(src, decoder_input=decoder_input, get_constrained_token_probability=partial_constraint)
                  # print(cons, "cons", decoder_inputs, torch.tensor(partial_constraint).unsqueeze(0).unsqueeze(0))
                  # Concatenate constraints
                  decoder_inputs = torch.cat([decoder_inputs, torch.tensor(partial_constraint).unsqueeze(0).unsqueeze(0).cuda()], dim=1)
                  prob = torch.cat([prob, torch.tensor(cons).unsqueeze(0).unsqueeze(0).cuda()], dim=1)

                  g.append(decoder_inputs)
                  probs.append(prob)

            # Select top-k hypotheses
            probs_grid[t][c], grids[t][c] = get_top_k_prob(g, probs, k)

    return get_top_k_prob(grids[maxlen - 1][numC], probs_grid[maxlen - 1][numC], k = 1), get_top_k_prob(grids[maxlen -1][0], probs_grid[maxlen - 1][0], k = 1)

  def __len__(self):
      """Returns the total number of samples in the dataset."""
      return len(self.indices)

  def __getitem__(self, idx):
      """
      Retrieve a single sample and its label.

      Args:
          idx (int): Index of the sample to retrieve.

      Returns:
          tuple: (sample, label) where sample is the data and label is the corresponding label.
      """
      sample = self.data["test"][self.indices[idx]]["translation"]["tr"]
      constraints = self.constraints_tokens(sample)

      prediction, prediction_without_constraints = self.beam_search(maxlen= 30, numC=len(constraints), k = 4, src = sample, constrained_tokens = constraints)
      prediction = self.get_the_text(prediction[1])
      prediction_without_constraints = self.get_the_text(prediction_without_constraints[1])
      result = self.data["test"][self.indices[idx]]["translation"]["en"]
      return sample, prediction[0], prediction_without_constraints[0], result

In [161]:
results = Results(ds,c, indices = indices)
results[1]

  prob = torch.cat([prob, torch.tensor(cons).unsqueeze(0).unsqueeze(0).cuda()], dim=1)


('Polisler evde Amy Prentiss\'in cansız bedenini ve beyaz bir not defterine el yazısıyla yazılmış bir not buldu: "Çok üzgünüm, keşke her şeyi geri döndürebilsem, Amy\'yi çok sevmiştim ve o da bugüne kadar beni seven tek kadındı" okunan notu Lamb\'in imzaladığını belirtti memurlar.',
 'so far, the police found Amy Prentiss\' lifeless body and a note handwritten in a white notebook: "I\'m so',
 'Police found Amy Prentiss\' lifeless body in the house and a note handwritten in a white notebook: "I\'m so sorry',
 'Inside the home, officers found Amy Prentiss\' body and a hand-written note scribbled on a white legal pad: "I am so very sorry I wish I could take it back I loved Amy and she is the only woman who ever loved me," read the letter authorities say was signed by Lamb.')

In [None]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu

# Initialize lists to store predictions and actual texts
predictions = []
predictions_without_constraints = []
references = []


# Assuming 'results' is your dataset
dataloader = DataLoader(results, batch_size=32, shuffle=True)

# Iterate through the entire dataloader
for tr_text, pred_text, pred_cons_text, actual_text in tqdm(dataloader):
    # Extend the lists with the current batch
    predictions.extend(pred_text)
    predictions_without_constraints.extend(pred_cons_text)
    # For BLEU score, we need references to be a list of lists of tokenized references
    # Assuming actual_text is a list of strings
    references.extend([[ref.split()] for ref in actual_text])


  prob = torch.cat([prob, torch.tensor(cons).unsqueeze(0).unsqueeze(0).cuda()], dim=1)


In [None]:
import torch
from torch.utils.data import Dataset
import torch.nn.functional as F

class OptimizedResults(Dataset):
    def __init__(self, ds, c):
        """
        Args:
            ds: Dataset containing translations
            c: Constraints dictionary
        """
        self.data = ds
        self.c = c
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def constraints_tokens(self, src):
        """Efficiently extract constraint tokens"""
        ngrams = get_ngrams(src)
        constraints_src = []
        seen_grams = set()

        for ngram in ngrams:
            if ngram in self.c:
                for gram in self.c[ngram][0]:
                    if gram not in seen_grams:
                        seen_grams.add(gram)
                        out = tokenizer(gram, return_tensors="pt")
                        constraints_src.append(out["input_ids"])

        return constraints_src

    def optimized_beam_search(self, src, maxlen=25, k=6, constrained_tokens=None):
        """
        Optimized beam search with batched processing and reduced overhead

        Args:
            src (str): Source text to translate
            maxlen (int): Maximum length of generated sequence
            k (int): Number of beams
            constrained_tokens (list, optional): List of constraint tokens

        Returns:
            Best translation candidates
        """
        # Move model to device once
        model.to(self.device)
        model.eval()

        # Prepare initial inputs
        start_token = model.config.decoder_start_token_id
        encoder_inputs = tokenizer(src, return_tensors="pt").to(self.device)

        # Initialize beam search state
        batch_size = 1
        num_constraints = len(constrained_tokens) if constrained_tokens else 0

        # Initialize beam candidates
        sequences = torch.full((batch_size, k, 1), start_token, dtype=torch.long, device=self.device)
        sequence_scores = torch.zeros(batch_size, k, device=self.device)

        # Track completed sequences
        completed_sequences = []
        completed_scores = []

        with torch.no_grad():
            for step in range(maxlen):
                # Prepare current decoder inputs for all beams
                current_sequences = sequences.view(batch_size * k, -1)

                # Generate next tokens for all beams simultaneously
                outputs = model(
                    input_ids=encoder_inputs.input_ids.repeat(k, 1),
                    attention_mask=encoder_inputs.attention_mask.repeat(k, 1),
                    decoder_input_ids=current_sequences
                )

                # Process logits
                logits = outputs.logits[:, -1, :]
                log_probs = F.log_softmax(logits, dim=-1)

                # Combine previous sequence scores with new log probabilities
                next_sequence_scores = (
                    sequence_scores.unsqueeze(-1) +
                    log_probs.view(batch_size, k, -1)
                )

                # Flatten and find top-k candidates
                next_sequence_scores = next_sequence_scores.view(batch_size, -1)
                topk_scores, topk_indices = torch.topk(next_sequence_scores, k, dim=-1)

                # Reconstruct beam states
                next_sequences = []
                next_scores = []

                for i in range(batch_size):
                    # Compute beam and token indices
                    beam_indices = topk_indices[i] // logits.shape[-1]
                    token_indices = topk_indices[i] % logits.shape[-1]

                    # Gather top-k sequences and their scores
                    batch_sequences = torch.stack([
                        torch.cat([sequences[i, beam_idx], token_indices[j].unsqueeze(0)])
                        for j, beam_idx in enumerate(beam_indices)
                    ])
                    batch_scores = topk_scores[i]

                    next_sequences.append(batch_sequences)
                    next_scores.append(batch_scores)

                # Update sequences and scores
                sequences = torch.stack(next_sequences)
                sequence_scores = torch.stack(next_scores)

                # Optional: Apply constraints if available
                # (This is a placeholder and would need specific implementation)

                # Check for end of sequence
                eos_mask = (sequences[0, :, -1] == tokenizer.eos_token_id)
                if eos_mask.any():
                    completed_sequences.extend(sequences[0, eos_mask])
                    completed_scores.extend(sequence_scores[0, eos_mask])

                    # Remove completed sequences from active beams
                    active_mask = ~eos_mask
                    sequences = sequences[:, active_mask]
                    sequence_scores = sequence_scores[:, active_mask]

                # Early stopping if no active beams
                if len(sequences[0]) == 0:
                    break

            # Return best translation
            if completed_sequences:
                best_idx = torch.argmax(torch.tensor(completed_scores))
                return completed_sequences[best_idx]
            else:
                return sequences[0, 0]  # Best current sequence

    def __len__(self):
        """Returns the total number of samples in the dataset."""
        return len(self.data["train"])

    def __getitem__(self, idx):
        """
        Retrieve a single sample and process it with optimized beam search.

        Returns:
            tuple: (source text, predicted translation, reference translation)
        """
        sample = self.data["train"][idx]["translation"]["tr"]
        constraints = self.constraints_tokens(sample)

        # Optimized beam search
        prediction = self.optimized_beam_search(
            src=sample,
            maxlen=25,
            k=6,
            constrained_tokens=constraints
        )

        # Decode the prediction
        decoded_prediction = tokenizer.decode(prediction, skip_special_tokens=True)
        result = self.data["train"][idx]["translation"]["en"]

        return sample, decoded_prediction, result

In [None]:

# Calculate BLEU score
# Note: corpus_bleu expects references to be a list of list of lists
# (multiple references per prediction) and predictions to be list of lists
bleu_score = corpus_bleu(references, [[pred.split()] for pred in predictions])

print(f"BLEU Score: {bleu_score}")

# Additional analysis
print(f"Total Predictions: {len(predictions)}")
print(f"Total References: {len(references)}")

# Optional: Print a few samples if you want to inspect the data
print("\nSample Predictions:")
for i in range(min(5, len(predictions))):
    print(f"Prediction {i+1}: {predictions[i]}")
    print(f"Reference {i+1}: {references[i][0]}")
    print()

In [None]:
print(len(tr_text))
print(len(ds["train"]))

In [None]:
cs = constraints_tokens(ds["train"][1]["translation"]["tr"], c)
print(cs)
beam_search(maxlen= 50, numC=0, k =6, src = ds["train"][1]["translation"]["tr"], constraints = cs)

In [None]:
print(ds["train"][1]["translation"]["tr"])
ds["train"][1]["translation"]["en"]

In [None]:

text = "Kosova, tekrar eden şikayetler ışığında özelleştirme sürecini incelemeye alıyor."

# Tokenize input text
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Generate translation
translated_tokens = model.generate(**inputs)

# Decode and print the translation
translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
print("Translated text:", translated_text)