In [59]:
from transformer import Transformer # this is the transformer.py file?
import torch
import numpy as np

In [60]:
english_file = './train.en'
telugu_file = './train.te'

START_TOKEN = ''
PADDING_TOKEN = ''
END_TOKEN = ''

telugu_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '?', 'ˌ',
                      '౧', '౨' '౩', '౪', '౫', '౬', '౭', '౮', '౯', '౦',
                      'అ', 'ఆ', 'ఇ', 'ఈ', 'ఉ', 'ఊ', 'ఋ', 'ౠ', 'ఎ', 'ఏ', 'ఐ', 'ఒ', 'ఓ', 'ఔ', 'అం', 'అః',
                      'క', 'ఖ', 'గ', 'ఘ', 'ఙ',
                      'చ', 'ఛ', 'జ', 'ఝ', 'ఞ',
                      'ట', 'ఠ', 'డ', 'ఢ', 'ణ',
                      'త', 'థ', 'ద', 'ధ', 'న',
                      'ప', 'ఫ', 'బ', 'భ', 'మ',
                      'య', 'ర', 'ల', 'వ', 'శ',
                      'ష', 'స', 'హ', 'ళ', 'క్ష', 'ఱ',
                    '  ా','ి','ీ','ు','ూ','ృ',  'ె','ే','్','ొ','ో' ,'ౌ','ం' ,  PADDING_TOKEN, END_TOKEN]

english_vocabulary = [START_TOKEN, ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
                        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                        ':', '<', '=', '>', '?', '@',
                        '[', ']', '^', '_', '`',  '/',
                        'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                        'y', 'z',
                        '{', '|', '}', '~', PADDING_TOKEN, END_TOKEN]




In [61]:


index_to_telugu = {k:v for k,v in enumerate(telugu_vocabulary)}
telugu_to_index = {v:k for k,v in enumerate(telugu_vocabulary)}
index_to_english = {k:v for k,v in enumerate(english_vocabulary)}
english_to_index = {v:k for k,v in enumerate(english_vocabulary)}

In [62]:

with open(english_file, 'r', encoding='utf-8', errors='ignore') as file:
    english_sentences = file.readlines()
with open(telugu_file, 'r', encoding='utf-8', errors='ignore') as file:
    telugu_sentences = file.readlines()

# Limit Number of sentences
TOTAL_SENTENCES = 200000
english_sentences = english_sentences[:TOTAL_SENTENCES]
telugu_sentences = telugu_sentences[:TOTAL_SENTENCES]
english_sentences = [sentence.rstrip('\n').lower() for sentence in english_sentences]
telugu_sentences = [sentence.rstrip('\n') for sentence in telugu_sentences]

In [63]:
english_sentences[:10]


['have you heard about foie gras?',
 'i never thought of acting in films.',
 'installed software',
 'a case has been registered under sections 302 and 376, ipc.',
 'of this, 10 people succumbed to the injuries.',
 'her acting has been praised by critics.',
 'the bibles viewpoint on this is clearly indicated at colossians 3: 9: do not be lying to one another.',
 'the incident was recorded in the cctv footage.',
 'respect privacy',
 '5 lakh would be provided.']

In [64]:
telugu_sentences[:10]

['ఇక ఫ్రూట్ ఫ్లైస్ గురించి మీరు విన్నారా?',
 'సూర్య సినిమాల్లో నటించాలని ఎప్పుడూ అనుకోలేదు.',
 'స్థాపించబడిన సాఫ్ట్\u200dవేర్',
 'నిందితులపై సెక్షన్ 376 మరియు 302ల కింద కేసు నమోదు చేశాం.',
 'అందులో 10 మంది తీవ్రంగా గాయపడ్డారు.',
 'నటనకు గాను విమర్శకుల నుంచి ప్రశంసలు పొందింది.',
 'ఈ విషయంపై బైబిలు దృక్కోణం కొలొస్సయులు 3 :\u2060 9లో “ఒకనితో ఒకడు అబద్ధమాడకుడి ” అని స్పష్టంగా సూచించబడింది.',
 'ఈ ప్రమాద దృశ్యాలు సీసీటీవీ ఫుటేజ్\u200cలో రికార్డ్ అయ్యాయి.',
 'గోప్యత పాటించండి',
 '5లక్షలు సాయం అందజేశారు.']

In [65]:
import numpy as np
PERCENTILE = 97
print( f"{PERCENTILE}th percentile length telugu: {np.percentile([len(x) for x in telugu_sentences], PERCENTILE)}" )
print( f"{PERCENTILE}th percentile length English: {np.percentile([len(x) for x in english_sentences], PERCENTILE)}" )

97th percentile length telugu: 169.0
97th percentile length English: 178.0


In [66]:
max_sequence_length = 200

def is_valid_tokens(sentence, vocab):
    for token in list(set(sentence)):
        if token not in vocab:
            return False
    return True

def is_valid_length(sentence, max_sequence_length):
    return len(list(sentence)) < (max_sequence_length - 1) # need to re-add the end token so leaving 1 space

valid_sentence_indicies = []
for index in range(len(telugu_sentences)):
    telugu_sentence, english_sentence = telugu_sentences[index], english_sentences[index]
    if is_valid_length(telugu_sentence, max_sequence_length) \
      and is_valid_length(english_sentence, max_sequence_length) \
      and is_valid_tokens(telugu_sentence, telugu_vocabulary):
        valid_sentence_indicies.append(index)

print(f"Number of sentences: {len(telugu_sentences)}")
print(f"Number of valid sentences: {len(valid_sentence_indicies)}")

Number of sentences: 200000
Number of valid sentences: 16103


In [67]:


telugu_sentences = [telugu_sentences[i] for i in valid_sentence_indicies]
english_sentences = [english_sentences[i] for i in valid_sentence_indicies]

In [68]:
telugu_sentences[:3]


['"""సూపర్ బౌల్."', 'ఏ పనిచేయలేరు.', '02 లక్షల వరకు తగ్గించింది.']

In [69]:
import torch

d_model = 512
batch_size = 30
ffn_hidden = 2048
num_heads = 8
drop_prob = 0.1
num_layers = 1
max_sequence_length = 200
kn_vocab_size = len(telugu_vocabulary)

transformer = Transformer(d_model,
                          ffn_hidden,
                          num_heads,
                          drop_prob,
                          num_layers,
                          max_sequence_length,
                          kn_vocab_size,
                          english_to_index,
                          telugu_to_index,
                          START_TOKEN,
                          END_TOKEN,
                          PADDING_TOKEN)

TypeError: SentenceEmbedding.__init__() takes 3 positional arguments but 7 were given

In [50]:
transformer


NameError: name 'transformer' is not defined

In [51]:


from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):

    def __init__(self, english_sentences, telugu_sentences):
        self.english_sentences = english_sentences
        self.telugu_sentences = telugu_sentences

    def __len__(self):
        return len(self.english_sentences)

    def __getitem__(self, idx):
        return self.english_sentences[idx], self.telugu_sentences[idx]


In [52]:
dataset = TextDataset(english_sentences, telugu_sentences)


In [53]:
len(dataset)


16103

In [54]:
dataset[1]


('it cannot work.', 'ఏ పనిచేయలేరు.')

In [55]:

train_loader = DataLoader(dataset, batch_size)
iterator = iter(train_loader)

In [56]:
for batch_num, batch in enumerate(iterator):
    print(batch)
    if batch_num > 3:
        break

[('super bowl.', 'it cannot work.', 'rs 6.02 crore.', 'he is just one...', 'i have never sulked.', 'inability to control urination', 'prices drop', 'it means elder sister.', 'the hair is porous.', 'the girl died in the incident.', 'nuzivedu road', 'why do you need the power?', 'however, police were silent on the matter.', 'we can do this thing.', 'advantage bjp!', 'floods in kerala', 'under this scheme, government earmarked rs.', 'this is a major blow', 'availability & pricing', 'go well', 'yet another couple commit suicide', 'step two: segregation', '49 people dead.', 'what is the matter?', 'category: telugu stage actors', 'come back!', 'smith hit', 'the process', 'whats the venue?', 'its gone.'), ('"""సూపర్ బౌల్."', 'ఏ పనిచేయలేరు.', '02 లక్షల వరకు తగ్గించింది.', 'అతనొక్కడే .', 'నేనెప్పుడూ చీట్ చేయలేదు', 'మూత్రవిసర్జన నియంత్రణ కోల్పోవడం', 'ధరలు తగ్గేవి ఇవే', 'అంటే పెద్దన్నయ్య అని అర్థం.', 'జుట్టు బోలెడంత ఉంది.', 'ఈ ఘటనలో కూతురు చనిపోయింది.', 'నూజివీడు రోడ్డు', 'నీకు శక్తి ఎందుకు అవసరం

In [57]:


from torch import nn

criterian = nn.CrossEntropyLoss(ignore_index=telugu_to_index[PADDING_TOKEN],
                                reduction='none')

# When computing the loss, we are ignoring cases when the label is the padding token
for params in transformer.parameters():
    if params.dim() > 1:
        nn.init.xavier_uniform_(params)

optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

NameError: name 'transformer' is not defined

In [58]:
NEG_INFTY = -1e9

def create_masks(eng_batch, kn_batch):
    num_sentences = len(eng_batch)
    look_ahead_mask = torch.full([max_sequence_length, max_sequence_length] , True)
    look_ahead_mask = torch.triu(look_ahead_mask, diagonal=1)
    encoder_padding_mask = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_self_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)
    decoder_padding_mask_cross_attention = torch.full([num_sentences, max_sequence_length, max_sequence_length] , False)

    for idx in range(num_sentences):
      eng_sentence_length, kn_sentence_length = len(eng_batch[idx]), len(kn_batch[idx])
      eng_chars_to_padding_mask = np.arange(eng_sentence_length + 1, max_sequence_length)
      kn_chars_to_padding_mask = np.arange(kn_sentence_length + 1, max_sequence_length)
      encoder_padding_mask[idx, :, eng_chars_to_padding_mask] = True
      encoder_padding_mask[idx, eng_chars_to_padding_mask, :] = True
      decoder_padding_mask_self_attention[idx, :, kn_chars_to_padding_mask] = True
      decoder_padding_mask_self_attention[idx, kn_chars_to_padding_mask, :] = True
      decoder_padding_mask_cross_attention[idx, :, eng_chars_to_padding_mask] = True
      decoder_padding_mask_cross_attention[idx, kn_chars_to_padding_mask, :] = True

    encoder_self_attention_mask = torch.where(encoder_padding_mask, NEG_INFTY, 0)
    decoder_self_attention_mask =  torch.where(look_ahead_mask + decoder_padding_mask_self_attention, NEG_INFTY, 0)
    decoder_cross_attention_mask = torch.where(decoder_padding_mask_cross_attention, NEG_INFTY, 0)
    return encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask

In [64]:


transformer.train()
transformer.to(device)
total_loss = 0
num_epochs = 10

for epoch in range(num_epochs):
    print(f"Epoch {epoch}")
    iterator = iter(train_loader)
    for batch_num, batch in enumerate(iterator):
        transformer.train()
        eng_batch, te_batch = batch
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch, te_batch)
        optim.zero_grad()
        te_predictions = transformer(eng_batch,
                                     te_batch,
                                     encoder_self_attention_mask.to(device),
                                     decoder_self_attention_mask.to(device),
                                     decoder_cross_attention_mask.to(device),
                                     enc_start_token=False,
                                     enc_end_token=False,
                                     dec_start_token=True,
                                     dec_end_token=True)
        labels = transformer.decoder.sentence_embedding.batch_tokenize(kn_batch, start_token=False, end_token=True)
        loss = criterian(
            te_predictions.view(-1, kn_vocab_size).to(device),
            labels.view(-1).to(device)
        ).to(device)
        valid_indicies = torch.where(labels.view(-1) == telugu_to_index[PADDING_TOKEN], False, True)
        loss = loss.sum() / valid_indicies.sum()
        loss.backward()
        optim.step()
        #train_losses.append(loss.item())
        if batch_num % 100 == 0:
            print(f"Iteration {batch_num} : {loss.item()}")
            print(f"English: {eng_batch[0]}")
            print(f"Kannada Translation: {te_batch[0]}")
            te_sentence_predicted = torch.argmax(te_predictions[0], axis=1)
            predicted_sentence = ""
            for idx in te_sentence_predicted:
              if idx == telugu_to_index[END_TOKEN]:
                break
              predicted_sentence += index_to_telugu[idx.item()]
            print(f"Kannada Prediction: {predicted_sentence}")


            transformer.eval()
            te_sentence = ("",)
            eng_sentence = ("should we go to the mall?",)
            for word_counter in range(max_sequence_length):
                encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask= create_masks(eng_sentence, te_sentence)
                predictions = transformer(eng_sentence,
                                          te_sentence,
                                          encoder_self_attention_mask.to(device),
                                          decoder_self_attention_mask.to(device),
                                          decoder_cross_attention_mask.to(device),
                                          enc_start_token=False,
                                          enc_end_token=False,
                                          dec_start_token=True,
                                          dec_end_token=False)
                next_token_prob_distribution = predictions[0][word_counter] # not actual probs
                next_token_index = torch.argmax(next_token_prob_distribution).item()
                next_token = index_to_telugu[next_token_index]
                te_sentence = (te_sentence[0] + next_token, )
                if next_token == END_TOKEN:
                  break

            print(f"Evaluation translation (should we go to the mall?) : {te_sentence}")
            print("-------------------------------------------")

Epoch 0


IndexError: index out of range in self

In [1]:
import torch
import torch.optim as optim

# Define your transformer model

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
transformer.to(device)

# Define optimizer and learning rate
optimizer = optim.Adam(transformer.parameters(), lr=0.001)

# Assuming `criterion` is your loss function (e.g., CrossEntropyLoss)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
transformer.train()
num_epochs = 10

for epoch in range(num_epochs):
    print(f"Epoch {epoch}")
    iterator = iter(train_loader)
    
    for batch_num, batch in enumerate(iterator):
        eng_batch, te_batch = batch
        encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_batch, te_batch)
        
        # Forward pass
        optimizer.zero_grad()
        te_predictions = transformer(eng_batch,
                                     te_batch,
                                     encoder_self_attention_mask.to(device),
                                     decoder_self_attention_mask.to(device),
                                     decoder_cross_attention_mask.to(device),
                                     enc_start_token=False,
                                     enc_end_token=False,
                                     dec_start_token=True,
                                     dec_end_token=True)
        
        # Compute loss
        labels = transformer.decoder.sentence_embedding.batch_tokenize(te_batch, start_token=False, end_token=True)
        loss = criterion(te_predictions.view(-1, kn_vocab_size).to(device),
                         labels.view(-1).to(device))
        
        # Apply masking to ignore padding tokens
        valid_indices = (labels.view(-1) != telugu_to_index[PADDING_TOKEN])
        masked_loss = (loss * valid_indices.float()).sum() / valid_indices.sum()
        
        # Backward pass and optimization
        masked_loss.backward()
        optimizer.step()
        
        # Logging
        if batch_num % 100 == 0:
            print(f"Iteration {batch_num} : {masked_loss.item()}")
            print(f"English: {eng_batch[0]}")
            print(f"telugu Translation: {te_batch[0]}")
            
            # Inference (Evaluation mode)
            transformer.eval()
            with torch.no_grad():
                eng_sentence = ("should we go to the mall?",)
                te_sentence = ("",)
                
                for word_counter in range(max_sequence_length):
                    encoder_self_attention_mask, decoder_self_attention_mask, decoder_cross_attention_mask = create_masks(eng_sentence, te_sentence)
                    
                    predictions = transformer(eng_sentence,
                                              te_sentence,
                                              encoder_self_attention_mask.to(device),
                                              decoder_self_attention_mask.to(device),
                                              decoder_cross_attention_mask.to(device),
                                              enc_start_token=False,
                                              enc_end_token=False,
                                              dec_start_token=True,
                                              dec_end_token=False)
                    
                    next_token_prob_distribution = predictions[0][word_counter]
                    next_token_index = torch.argmax(next_token_prob_distribution).item()
                    next_token = index_to_telugu[next_token_index]
                    te_sentence = (te_sentence[0] + next_token,)
                    
                    if next_token == END_TOKEN:
                        break
                
                print(f"Evaluation translation (should we go to the mall?) : {te_sentence}")
                print("-------------------------------------------")
                
            transformer.train()  # Switch back to training mode


NameError: name 'transformer' is not defined