<a href="https://colab.research.google.com/github/vvikasreddy/lexically_constrained_beam_search_/blob/main/beam_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## downloading essential modules

In [1]:
!pip install datasets



In [2]:
# Using the cs-en translation

In [3]:
from datasets import load_dataset

import torch
from transformers import MarianMTModel, MarianTokenizer



In [4]:
ds = load_dataset("wmt/wmt16", "tr-en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
ds

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 205756
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1001
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
})

In [6]:
ds['train'][0]

{'translation': {'en': "Kosovo's privatisation process is under scrutiny",
  'tr': "Kosova'nın özelleştirme süreci büyüteç altında"}}

In [7]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tr-en")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-tr-en")



In [8]:
model

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62389, 512, padding_idx=62388)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62389, 512, padding_idx=62388)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [9]:
# https://huggingface.co/docs/transformers/model_doc/marian

In [10]:
def generate_translation(src_text, max_length = 50):

  # Tokenize input
  encoder_inputs = tokenizer(src_text, return_tensors="pt")

  # intializes the decoder input with decoder start token
  decoder_input = torch.tensor([[model.config.decoder_start_token_id]])

  # change the model to eval mode.
  model.eval()
  with torch.no_grad():

    generated_tokens = []

    while len(generated_tokens) < max_length:

      outputs = model(
          input_ids=encoder_inputs.input_ids,
          attention_mask=encoder_inputs.attention_mask,
          decoder_input_ids=decoder_input
      )


      next_token_logits = outputs.logits[:, -1, :]


      # get the token with maximum logits value
      next_token = torch.argmax(next_token_logits, dim=-1)

      generated_tokens.append(next_token.item())

      if next_token.item() == tokenizer.eos_token_id:
          break
      print(decoder_input)
      decoder_input = torch.cat([decoder_input, next_token.unsqueeze(0)], dim=1)
      print(decoder_input)
    translated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
  return translated_text


In [11]:

### print("\nStep by step generation:")
# for i in range(len(generated_tokens)):
#     partial_text = tokenizer.decode(generated_tokens[:i+1], skip_special_tokens=True)
#     print(f"Step {i+1}: {partial_text}")

In [12]:
model.config.num_beams

6

In [13]:
print(generate_translation(ds['validation'][1]['translation']['tr']))

tensor([[62388]])
tensor([[62388, 31058]])
tensor([[62388, 31058]])
tensor([[62388, 31058,     4]])
tensor([[62388, 31058,     4]])
tensor([[62388, 31058,     4,     8]])
tensor([[62388, 31058,     4,     8]])
tensor([[62388, 31058,     4,     8,  1129]])
tensor([[62388, 31058,     4,     8,  1129]])
tensor([[62388, 31058,     4,     8,  1129,  1782]])
tensor([[62388, 31058,     4,     8,  1129,  1782]])
tensor([[62388, 31058,     4,     8,  1129,  1782,   206]])
tensor([[62388, 31058,     4,     8,  1129,  1782,   206]])
tensor([[62388, 31058,     4,     8,  1129,  1782,   206,  3094]])
tensor([[62388, 31058,     4,     8,  1129,  1782,   206,  3094]])
tensor([[62388, 31058,     4,     8,  1129,  1782,   206,  3094,     9]])
tensor([[62388, 31058,     4,     8,  1129,  1782,   206,  3094,     9]])
tensor([[62388, 31058,     4,     8,  1129,  1782,   206,  3094,     9, 11363]])
tensor([[62388, 31058,     4,     8,  1129,  1782,   206,  3094,     9, 11363]])
tensor([[62388, 31058,     4

In [14]:
ds["validation"][1]["translation"]["tr"]

"Norveç'in beş milyon insanı en yüksek yaşam standartlarının tadını çıkarıyor, sadece Avrupa'da değil, dünyada."

In [15]:
ds["validation"][1]["translation"]["en"]

"Norway's five million people enjoy one of the highest standards of living, not just in Europe, but in the world."

In [16]:
# reference to get the python file from github



https://colab.research.google.com/github/jckantor/cbe61622/blob/master/docs/A.02-Downloading_Python_source_files_from_github.ipynb

In [17]:
# get constraints

user = "vvikasreddy"
repo = "lexically_constrained_beam_search_"
pyfile = "constraints.py"

# url = "https://github.com/vvikasreddy/lexically_constrained_beam_search_/blob/main/constraints.py"
url = f"https://raw.githubusercontent.com/{user}/{repo}/main/{pyfile}"
!wget --no-cache --backups=1 {url}

import constraints

--2024-11-04 22:08:59--  https://raw.githubusercontent.com/vvikasreddy/lexically_constrained_beam_search_/main/constraints.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4406 (4.3K) [text/plain]
Saving to: ‘constraints.py’


2024-11-04 22:08:59 (34.1 MB/s) - ‘constraints.py’ saved [4406/4406]



In [18]:
# takes nice 5 minutes to get all the constraints of length 2
c = constraints.get_constraints()

In [19]:
def generate_translation(src_text, decoder_input = [], probabilites = [],  max_length = 1, get_constrained_token_probability = -1):

  """generate the decoder input ids and corresponding probabilities"""


  # Tokenize input
  encoder_inputs = tokenizer(src_text, return_tensors="pt")

  # initialize if not initialized.
  if decoder_input == []:
    # because the intial decoder start token has probabilitty 1
    probabilites = torch.tensor([[1]])
    decoder_input = torch.tensor([[model.config.decoder_start_token_id]])

  # change the model to eval mode.
  model.eval()
  with torch.no_grad():

    generated_tokens = []

    outputs = model(
        input_ids=encoder_inputs.input_ids,
        attention_mask=encoder_inputs.attention_mask,
        decoder_input_ids=decoder_input
    )

    next_token_logits = outputs.logits[:, -1, :]

    if get_constrained_token_probability != -1:
      # print(next_token_logits.shape)
      softmax_  = torch.softmax(next_token_logits, dim=-1)
      return softmax_[0][get_constrained_token_probability]

    # get the token with maximum logits value

    next_token = torch.argmax(next_token_logits, dim=-1)


    top_probs, top_indices = torch.topk(torch.softmax(next_token_logits, dim=-1), k = 5)


  decoder_inputs = []
  probs = []
  # to visualize data
  vis_data = []
  for indx, id in enumerate(top_indices[0]):
    # print(decoder_input, id.unsqueeze(0).unsqueeze(0), id, "jhere")
    # print(probabilities, top_probs[0][indx].unsqueeze(0).unsqueeze(0))
    decoder_inputs.append(torch.cat([decoder_input, id.unsqueeze(0).unsqueeze(0)], dim=1))
    probs.append(torch.cat([probabilites, top_probs[0][indx].unsqueeze(0).unsqueeze(0)], dim=1))
    # print(decoder_inputs[indx], indx)
    vis_data.append((vis_data, tokenizer.decode(decoder_inputs[indx].squeeze(), skip_special_tokens = True)))
  # translated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
  # print(decoder_inputs, probs)
  # print(vis_data)9
  return decoder_inputs, probs, vis_data
  # return decoder_input, probabilities


In [20]:
print(generate_translation(ds['validation'][1]['translation']['tr'], decoder_input = torch.tensor([[62388,  1969]]), probabilites = torch.tensor([[0.0000, 0.0000]])))
# c

([tensor([[62388,  1969,   261]]), tensor([[62388,  1969,    15]]), tensor([[62388,  1969,   510]]), tensor([[62388,  1969,     8]]), tensor([[62388,  1969,    47]])], [tensor([[0.0000, 0.0000, 0.3950]]), tensor([[0.0000, 0.0000, 0.1113]]), tensor([[0.0000, 0.0000, 0.0247]]), tensor([[0.0000, 0.0000, 0.0110]]), tensor([[0.0000, 0.0000, 0.0105]])], [([...], 'Koso'), ([...], 'Kost'), ([...], 'Kosum'), ([...], 'Koss'), ([...], 'Kosa')])


In [21]:
def get_ngrams(src, n = 2, ):
  src = src.split(" ")

  src = [tuple(src[i:i+n]) for i in range(len(src) - n + 1)]

  return src

def constraints_tokens(src, c):
  ngrams = get_ngrams(src)
  constraints_src = []
  for ngram in ngrams:
    # print(ngram)
    if ngram in c:
      for gram in ngram:

        constraints_src.append(tokenizer(gram, return_tensors="pt"))
  return constraints_src

def get_input_ids(data):

  input_ids = []
  for example in data:
    input_ids.append((example['input_ids'][0].tolist())[0])
  return input_ids

In [22]:
x = constraints_tokens(ds['validation'][1]['translation']['tr'] + ' Times' +  ' için', c)
# c
print(x)
print(get_input_ids(x))

[{'input_ids': tensor([[3762,    0]]), 'attention_mask': tensor([[1, 1]])}, {'input_ids': tensor([[37,  0]]), 'attention_mask': tensor([[1, 1]])}]
[3762, 37]


In [23]:
def visualize_data(decoder_input):
  return tokenizer.decode(decoder_input.squeeze(), skip_special_tokens = True)

In [24]:
import torch

def get_top_k_prob(A, B, k=2):

  d = {}
  # cummulative sum
  for indx, val in enumerate(B):

    cum_sum = torch.prod(val)
    d[cum_sum] = indx

  sorted_keys = sorted(d.keys(), reverse = True)

  top_k_indices = []
  top_k_sequences = []

  for key in sorted_keys[:k]:
    top_k_indices.append(A[d[key]])
    top_k_sequences.append(B[d[key]])

  return top_k_sequences, top_k_indices

# sanity
k = 2

A = [torch.tensor([[62388,   626,    13]]), torch.tensor([[62388,   626,     9]]), torch.tensor([[62388,   626,  1341]]), torch.tensor([[62388,   626,    27]])]
B = [torch.tensor([[1.0000, 0.0038, 0.2500]]), torch.tensor([[1.0000, 0.0038, 0.0619]]), torch.tensor([[1.0000, 0.0038, 0.0474]]), torch.tensor([[1.0000, 0.0038, 0.0425]])]

top_sequences, indices = get_top_k_prob(A, B, k)
print(f"Top {k} sequences:", top_sequences)
print("Their indices:", indices)

Top 2 sequences: [tensor([[1.0000, 0.0038, 0.2500]]), tensor([[1.0000, 0.0038, 0.0619]])]
Their indices: [tensor([[62388,   626,    13]]), tensor([[62388,   626,     9]])]


In [25]:
def beam_search(maxlen, numC, k, src, constraints):

    decoder_start_token = model.config.decoder_start_token_id

    # initialize the grids
    grids = [[[] for _ in range(numC + 1)] for _ in range(maxlen + 1)]
    probs_grid  = [[[] for _ in range(numC + 1)] for _ in range(maxlen + 1)]

    # intialize the first grid to start hyp
    grids[0][0] = [1]


    # remove during testsrc
    # constrained_tokens = get_input_ids(constraints_tokens(, constraints))
    # temporary
    constrained_tokens = [3762, 37]

    generated_constraint_index = 0

    for t in range(1, maxlen):

      index_c = max(0, (numC - t) - maxlen)

      for c in range(index_c, min(t, numC) + 1):


          print("cur iteration ", t, c )
          s = []
          g = []

          # storing decoder inputs
          decoder_inputs = []
          probs = []

          # print(grids)
          # print(probs_grid[t-1][c], "hey yo ", t - 1, c)
          for indx, element in enumerate(grids[t-1][c]):

            # guess there is no need for conditioning, just generate.
            # print(element, "this is the element")
            if type(element) == int:
              decoder_input = []
              probs =[]
            else:
              decoder_input = element
              probs = probs_grid[t-1][c][indx]

            # print(element)
            # print(decoder_input)
            # print(probs)
            # print("----------------------------")
            g, probs, vis_data = generate_translation(src_text= src, decoder_input = decoder_input, probabilites = probs)


          # retrieve the  probability of the constraint and add that to the decoder_input.
          if c > 0 and constrained_tokens:

            for indx, element in enumerate(grids[t-1][c-1]):

              if c == 1 and t == 1:
                decoder_inputs = torch.tensor([[model.config.decoder_start_token_id]])
                prob = torch.tensor([[1]])

              else:
                decoder_inputs = element
                prob = probs_grid[t-1][c-1][indx]

              # Gets the constraints probability and stores them
              cons = generate_translation(src, decoder_input = decoder_input, get_constrained_token_probability = constrained_tokens[c - 1])
              # print(cons, decoder_inputs, element)
              decoder_inputs = torch.cat([decoder_inputs, torch.tensor(constrained_tokens[c-1]).unsqueeze(0).unsqueeze(0)], dim=1)
              prob = torch.cat([prob, torch.tensor(cons).unsqueeze(0).unsqueeze(0)], dim=1)


              g.append(decoder_inputs)

              probs.append(prob)

          probs_grid[t][c], grids[t][c] = get_top_k_prob(g, probs, k)

          # print(grids[t][c], t, c )
          # print(probs_grid[t][c])
          for i in  grids[t][c]:
            print(visualize_data(i), i )


          print('asd-------------------------------------------')

  # sanity : print grids
    print(grids)


beam_search(maxlen= 10, numC=2, k = 2, src = ds["train"][1]["translation"]["tr"], constraints = c)

cur iteration  1 0
Kos tensor([[62388,  1969]])
In tensor([[62388,   323]])
asd-------------------------------------------
cur iteration  1 1
Times tensor([[62388,  3762]])
asd-------------------------------------------
cur iteration  2 0


  prob = torch.cat([prob, torch.tensor(cons).unsqueeze(0).unsqueeze(0)], dim=1)


In light tensor([[62388,   323,  1341]])
In view tensor([[62388,   323,  4316]])
asd-------------------------------------------
cur iteration  2 1
Kos Times tensor([[62388,  1969,  3762]])
In Times tensor([[62388,   323,  3762]])
asd-------------------------------------------
cur iteration  2 2
Times için tensor([[62388,  3762,    37]])
asd-------------------------------------------
cur iteration  3 0
In view of tensor([[62388,   323,  4316,    16]])
In view to tensor([[62388,   323,  4316,    11]])
asd-------------------------------------------
cur iteration  3 1
In light Times tensor([[62388,   323,  1341,  3762]])
In view Times tensor([[62388,   323,  4316,  3762]])
asd-------------------------------------------
cur iteration  3 2
Kos Times için tensor([[62388,  1969,  3762,    37]])
In Times için tensor([[62388,   323,  3762,    37]])
asd-------------------------------------------
cur iteration  4 0
In view to repeated tensor([[62388,   323,  4316,    11, 31187]])
In view to the te