In [72]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab, build_vocab_from_iterator, vocab
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Subset
from collections import Counter
import os
import spacy
import csv
from sklearn.model_selection import train_test_split

In [73]:
! spacy download en_core_web_lg

Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [74]:
nlp = spacy.load("en_core_web_lg")

In [75]:
 ! pip install -q kaggle

In [None]:
os.environ['KAGGLE_USERNAME'] = "iiitomiii" # username from the json file
os.environ['KAGGLE_KEY'] = "f0df173a0107862ad5018d7a1ef47736" # key from the json file
!kaggle datasets download -d rtatman/ubuntu-dialogue-corpus

Downloading ubuntu-dialogue-corpus.zip to /content
100% 799M/799M [00:27<00:00, 39.0MB/s]
100% 799M/799M [00:27<00:00, 30.6MB/s]


In [None]:
! unzip ubuntu-dialogue-corpus.zip

Archive:  ubuntu-dialogue-corpus.zip
  inflating: Ubuntu-dialogue-corpus/dialogueText.csv  
  inflating: Ubuntu-dialogue-corpus/dialogueText_196.csv  
  inflating: Ubuntu-dialogue-corpus/dialogueText_301.csv  
  inflating: toc.csv                 


In [151]:

def question_tokenizer(text):
    return [token.text.strip().lower() for token in nlp.tokenizer(text) if not token.is_punct and not token.like_url and token.text.strip() != '']

def answer_tokenizer(text):
    return [token.text.strip().lower() for token in nlp.tokenizer(text) if not token.is_punct and not token.like_url and token.text.strip() != '']

In [152]:
FRM_IDX = 0
TO_IDX = 1
TEXT_IDX = 2

question_answer_pairs = []
failed_rows = []
with open('Ubuntu-dialogue-corpus/dialogueText.csv', newline='') as csvfile:
    dialogue = csv.reader(csvfile)
    question = ''
    answer = ''
    current_dialog_id = ''
    current_dialog = []

    for idx, row in enumerate(dialogue):
      # skip the first row
      if idx ==0:
        continue

      if current_dialog_id != '' and row[1] != current_dialog_id:
        # 2 answers 1 question
        if(current_dialog[1][TO_IDX] == current_dialog[0][FRM_IDX]
           and current_dialog[2][TO_IDX] == current_dialog[0][FRM_IDX]):
           question = current_dialog[0][TEXT_IDX]
           answer = current_dialog[1][TEXT_IDX]
        # 1 answer 2 questions
        elif(current_dialog[2][TO_IDX] == current_dialog[1][FRM_IDX]):
           question = f'{current_dialog[0][TEXT_IDX]} {current_dialog[1][TEXT_IDX]}'
           answer = current_dialog[2][TEXT_IDX]
        # 0 answers 3 questions
        elif(len(set([current_dialog[0][FRM_IDX],
                     current_dialog[1][FRM_IDX],
                     current_dialog[2][FRM_IDX]])) == 1):
          question = ''
          answer = ''
        # invalid combination
        else:
          question = ''
          answer = ''
          failed_rows.append(current_dialog)

        if question != '' and answer != '' \
           and len(answer.split(' ')) < 20 and len(question.split(' ')) < 20:
          question_answer_pairs.append((question, answer))
        current_dialog = []


      current_dialog_id = row[1]
      current_dialog.append((row[3], row[4], row[5]))



print(len(question_answer_pairs))

179264


In [None]:
print(failed_rows[:10])

[[('psusi', 'maxxism', 'there is no need... the proper drivers will be used with the standard cd'), ('maxxism', 'soundray', ' well I like to play a bit too much.  and sometimes when things get borked a reinstall is easiest'), ('psusi', 'maxxism', 'also you might try using backups... a restore from backup is easier than reinstall')], [('trism', 'sacarlson1', 'https://launchpad.net/ubuntu/+source/linux-lts-backport-natty'), ('sacarlson1', 'jarray52', 'I assume they forgot to add the ppa to add, oh yes as trism just published for us'), ('trism', 'sacarlson1', 'not a ppa, it is in main')], [('DarkMageZ', 'dbe', 'non-free software is available in the multiverse & restricted repositories'), ('dbe', 'DarkAudit', 'Not in UTUTO'), ('DarkMageZ', 'dbe', 'oh, opps :)')], [('selig5', 'ksbalaji', "it should be 'connect irc.freenode.com'"), ('ksbalaji', 'tunys', 'done  /connect irc.freenode.net Irssi says Not connected to server.'), ('selig5', 'ksbalaji', "hey, it's freenode.com not freenode.net")], 

In [146]:
with open('pairs4.csv','w') as myfile:
  wr = csv.writer(myfile) #, quoting=csv.QUOTE_ALL)
  for row in question_answer_pairs:
    wr.writerow(row)


In [None]:
len(question_answer_pairs)
#question_answer_pairs = question_answer_pairs[:10]

In [153]:


def question_iter():
  for question, _ in question_answer_pairs:
    yield question_tokenizer(question)

def answer_iter():
  for _, answer in question_answer_pairs:
    yield answer_tokenizer(answer)

vocab_q = build_vocab_from_iterator(question_iter(), specials=['<pad>','<bos>','<eos>'])
vocab_a = build_vocab_from_iterator(answer_iter(), specials=['<pad>','<bos>','<eos>'])

In [None]:
# different way to build vocab.
def build_vocab():
  counter_q = Counter()
  counter_a = Counter()
  for question, answer in question_answer_pairs:
      counter_q.update(question_tokenizer(question))
      counter_a.update(answer_tokenizer(answer))
  return vocab(counter_q, specials=['<unk>', '<pad>', '<bos>', '<eos>']), vocab(counter_a, specials=['<unk>', '<pad>', '<bos>', '<eos>'])


vocab_q, vocab_a = build_vocab()

In [None]:
vocab_a['<eos>']

2

In [None]:
max(vocab_q.get_stoi().values())

3

In [154]:
data = []
for question, answer in question_answer_pairs:
  question_tensor_ = torch.tensor([vocab_q[token] for token in question_tokenizer(question)],
                            dtype=torch.long)
  answer_tensor_ = torch.tensor([vocab_a[token] for token in answer_tokenizer(answer)],
                            dtype=torch.long)
  data.append((question_tensor_, answer_tensor_))

In [None]:
torch.cuda.is_available()

True

In [155]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')

BATCH_SIZE = 64
PAD_IDX = vocab_a['<pad>']
BOS_IDX = vocab_a['<bos>']
EOS_IDX = vocab_a['<eos>']

def generate_batch(data_batch):
  batch_question, batch_answer = [], []
  for question, answer in data_batch:
    batch_answer.append(torch.cat([torch.tensor([BOS_IDX]), answer, torch.tensor([EOS_IDX])], dim=0))
    batch_question.append(torch.cat([torch.tensor([BOS_IDX]), question, torch.tensor([EOS_IDX])], dim=0))
  batch_answer = pad_sequence(batch_answer, padding_value=PAD_IDX)
  batch_question = pad_sequence(batch_question, padding_value=PAD_IDX)
  return batch_question, batch_answer

train_idx, test_idx = train_test_split(list(range(len(data))), test_size=0.1)
train_idx, val_idx = train_test_split(train_idx, test_size=0.1)


train_iter = DataLoader(Subset(data, train_idx), batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)
test_iter = DataLoader(Subset(data, test_idx), batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)
val_iter = DataLoader(Subset(data, val_idx), batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)

In [None]:
_, value = next(enumerate(train_iter))

In [156]:
vocab_q.lookup_tokens([i[0].item() for i in value[0]])

['<bos>',
 'why',
 'is',
 'it',
 'when',
 'i',
 'reduce',
 'fine',
 'programs',
 'it',
 'says',
 'c',
 'and',
 'webcam',
 'are',
 'nt',
 'found',
 '<eos>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']

In [157]:
vocab_a.lookup_tokens([i[0].item() for i in value[1]])

['<bos>',
 'what',
 'are',
 'you',
 'trying',
 'to',
 've',
 '<eos>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']

In [84]:
from binascii import Error
import random
from typing import Tuple

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor


class Encoder(nn.Module):
    def __init__(self,
                 input_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: float):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)

        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self,
                src: Tensor) -> Tuple[Tensor]:
        try:
          embedded = self.dropout(self.embedding(src))

          outputs, hidden = self.rnn(embedded)

          hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))

          return outputs, hidden
        except Error as e:
          print(e)
          raise


class Attention(nn.Module):
    def __init__(self,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 attn_dim: int):
        super().__init__()

        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim

        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim

        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tensor:

        src_len = encoder_outputs.shape[0]

        repeated_decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, src_len, 1)

        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        energy = torch.tanh(self.attn(torch.cat((
            repeated_decoder_hidden,
            encoder_outputs),
            dim = 2)))

        attention = torch.sum(energy, dim=2)

        return F.softmax(attention, dim=1)


class Decoder(nn.Module):
    def __init__(self,
                 output_dim: int,
                 emb_dim: int,
                 enc_hid_dim: int,
                 dec_hid_dim: int,
                 dropout: int,
                 attention: nn.Module):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.dropout = dropout
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)

        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)

        self.dropout = nn.Dropout(dropout)


    def _weighted_encoder_rep(self,
                              decoder_hidden: Tensor,
                              encoder_outputs: Tensor) -> Tensor:

        a = self.attention(decoder_hidden, encoder_outputs)

        a = a.unsqueeze(1)

        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        weighted_encoder_rep = torch.bmm(a, encoder_outputs)

        weighted_encoder_rep = weighted_encoder_rep.permute(1, 0, 2)

        return weighted_encoder_rep


    def forward(self,
                input: Tensor,
                decoder_hidden: Tensor,
                encoder_outputs: Tensor) -> Tuple[Tensor]:

        input = input.unsqueeze(0)

        embedded = self.dropout(self.embedding(input))

        weighted_encoder_rep = self._weighted_encoder_rep(decoder_hidden,
                                                          encoder_outputs)

        rnn_input = torch.cat((embedded, weighted_encoder_rep), dim = 2)

        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted_encoder_rep = weighted_encoder_rep.squeeze(0)

        output = self.out(torch.cat((output,
                                     weighted_encoder_rep,
                                     embedded), dim = 1))

        return output, decoder_hidden.squeeze(0)


class Seq2Seq(nn.Module):
    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 device: torch.device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,
                src: Tensor,
                trg: Tensor,
                teacher_forcing_ratio: float = 0.5) -> Tensor:

        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim




        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(src)

        # first input to the decoder is the <sos> token
        output = trg[0,:]

        for t in range(1, max_len):
            output, hidden = self.decoder(output, hidden, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            output = (trg[t] if teacher_force else top1)
            #print(vocab_a.lookup_token(trg[t][0].item()))
        return outputs


INPUT_DIM = len(vocab_q)
OUTPUT_DIM = len(vocab_a)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ATTN_DIM = 64
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

#ENC_EMB_DIM = 64
#DEC_EMB_DIM = 64
#ENC_HID_DIM = 128
#DEC_HID_DIM = 128
#ATTN_DIM = 16
#ENC_DROPOUT = 0.5
#DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)

attn = Attention(ENC_HID_DIM, DEC_HID_DIM, ATTN_DIM)

dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, device).to(device)


def init_weights(m: nn.Module):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)


model.apply(init_weights)

optimizer = optim.Adam(model.parameters())


def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 108,736,997 trainable parameters


In [None]:
print(OUTPUT_DIM)

69114


In [82]:
PAD_IDX = vocab_a['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [85]:
import math
import time


def train(model: nn.Module,
          iterator: torch.utils.data.DataLoader,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):

    model.train()

    epoch_loss = 0

    for _, (src, trg) in enumerate(iterator):
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()

        output = model(src, trg)

        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def evaluate(model: nn.Module,
             iterator: torch.utils.data.DataLoader,
             criterion: nn.Module):

    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for _, (src, trg) in enumerate(iterator):
            src, trg = src.to(device), trg.to(device)

            output = model(src, trg, 0) #turn off teacher forcing

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)


            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

    torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/model.data')

test_loss = evaluate(model, test_iter, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

Epoch: 01 | Time: 11m 59s
	Train Loss: 5.779 | Train PPL: 323.472
	 Val. Loss: 5.978 |  Val. PPL: 394.653
Epoch: 02 | Time: 12m 0s
	Train Loss: 5.184 | Train PPL: 178.439
	 Val. Loss: 5.995 |  Val. PPL: 401.227
Epoch: 03 | Time: 12m 8s
	Train Loss: 4.784 | Train PPL: 119.562
	 Val. Loss: 6.158 |  Val. PPL: 472.370
Epoch: 04 | Time: 12m 13s
	Train Loss: 4.448 | Train PPL:  85.462
	 Val. Loss: 6.342 |  Val. PPL: 568.185
Epoch: 05 | Time: 12m 7s
	Train Loss: 4.159 | Train PPL:  63.976
	 Val. Loss: 6.427 |  Val. PPL: 618.491
Epoch: 06 | Time: 12m 10s
	Train Loss: 3.989 | Train PPL:  53.983
	 Val. Loss: 6.536 |  Val. PPL: 689.704
Epoch: 07 | Time: 12m 18s
	Train Loss: 3.849 | Train PPL:  46.969
	 Val. Loss: 6.623 |  Val. PPL: 751.879
Epoch: 08 | Time: 12m 16s
	Train Loss: 3.703 | Train PPL:  40.563
	 Val. Loss: 6.689 |  Val. PPL: 803.155
Epoch: 09 | Time: 12m 10s
	Train Loss: 3.577 | Train PPL:  35.752
	 Val. Loss: 6.848 |  Val. PPL: 941.664
Epoch: 10 | Time: 12m 13s
	Train Loss: 3.469 | Tr

In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/Colab Notebooks/model.data')

In [163]:
#model = Seq2Seq(*args, **kwargs)
model.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/model.data'))
model.eval()

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(73155, 256)
    (rnn): GRU(256, 512, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=64, bias=True)
    )
    (embedding): Embedding(41125, 256)
    (rnn): GRU(1280, 512)
    (out): Linear(in_features=1792, out_features=41125, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [171]:

def generate_response(sentence, src_field, trg_field, model, device, max_len = 50):

    model.eval()

    if isinstance(sentence, str):
      tokens = question_tokenizer(sentence)
    else:
      tokens = sentence


    tokens = ['<bos>'] + tokens + ['<eos>']

    src_indexes = [src_field[token] for token in tokens]

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    src_len = torch.LongTensor([len(src_indexes)])

    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor)

    trg_indexes = [trg_field['<bos>']]

    for i in range(max_len):

        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)

        with torch.no_grad():
            output, hidden = model.decoder(trg_tensor, hidden, encoder_outputs)


        pred_token = output.argmax(1).item()

        trg_indexes.append(pred_token)

        if pred_token == trg_field['<eos>']:
            break
    try:
      trg_tokens = trg_field.lookup_tokens(trg_indexes)
    except:
      return []

    return trg_tokens[1:]

In [164]:
generate_response('is ubuntu good?', vocab_q, vocab_a, model, device)

['ubuntu', 'is', 'great', '<eos>']

In [172]:
expected_responses = []
actual_responses = []

def filter_specials(tokens):
  return [tok for tok in tokens if tok not in ('<bos>', '<eos>', '<pad>') ]

for test in test_iter:

  for idx in range(0, 63):

    question_tokens = filter_specials(vocab_q.lookup_tokens([i[idx].item() for i in test[0]]))
    expected_answer_tokens = filter_specials(vocab_a.lookup_tokens([i[idx].item() for i in test[1]]))
    if len(expected_answer_tokens) != 0:
      actual_answer_tokens = filter_specials(generate_response(question_tokens, vocab_q, vocab_a, model, device))
      expected_responses.append(expected_answer_tokens)
      actual_responses.append(actual_answer_tokens)
  value

IndexError: ignored

In [162]:
filter_specials(vocab_q.lookup_tokens([i[idx].item() for i in test[1]]))

['possible',
 'can',
 'thanks',
 'ubuntu',
 'kernel',
 'been',
 'join',
 'the',
 'brb',
 'a']

In [169]:
question_tokens

['help', 'help', 'help']

In [177]:
generate_response('help me', vocab_q, vocab_a, model, device)

['what', "'s", 'your', 'issue', '<eos>']

In [173]:
import nltk
from nltk.translate.bleu_score import corpus_bleu

bleu_score = corpus_bleu(expected_responses, actual_responses)
print("BLEU score:", bleu_score)

BLEU score: 7.088753671315663e-156


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
exact_match_count = sum([1 if ref == gen else 0 for ref, gen in zip(reference_translations, generated_outputs)])
exact_match_ratio = exact_match_count / len(reference_translations) * 100
print("Exact Match Ratio:", exact_match_ratio)

In [None]:
def wer(r, h):
    # Initialize the dynamic programming table
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]

    for i in range(len(r) + 1):
        for j in range(len(h) + 1):
            if i == 0:
                d[i][j] = j
            elif j == 0:
                d[i][j] = i
            elif r[i - 1] == h[j - 1]:
                d[i][j] = d[i - 1][j - 1]
            else:
                d[i][j] = 1 + min(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1])

    return float(d[len(r)][len(h)]) / len(r)

wer_score = wer([word for ref in reference_translations for word in nltk.word_tokenize(ref[0])],
               [word for gen in generated_outputs for word in nltk.word_tokenize(gen)])
print("Word Error Rate (WER):", wer_score)

In [None]:
def calculate_token_overlap(reference_tokens, generated_tokens):
    common_tokens = set(reference_tokens) & set(generated_tokens)
    precision = len(common_tokens) / len(generated_tokens) if len(generated_tokens) > 0 else 0
    recall = len(common_tokens) / len(reference_tokens) if len(reference_tokens) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1_score

all_reference_tokens = [word for ref in reference_translations for word in nltk.word_tokenize(ref[0])]
all_generated_tokens = [word for gen in generated_outputs for word in nltk.word_tokenize(gen)]

precision, recall, f1_score = calculate_token_overlap(all_reference_tokens, all_generated_tokens)
print("Token Precision:", precision)
print("Token Recall:", recall)
print("Token F1-score:", f1_score)

In [None]:
import nltk
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

reference_translations_tokenized = [[token.lower() for token in nltk.word_tokenize(sentence)] for sentence in [ref[0] for ref in reference_translations]]
generated_outputs_tokenized = [[token.lower() for token in nltk.word_tokenize(sentence)] for sentence in generated_outputs]

# Train a Word2Vec model on a large corpus (not provided in this example)
model = Word2Vec(sentences=reference_translations_tokenized + generated_outputs_tokenized, vector_size=100, window=5, min_count=1, sg=1)

# Get word embeddings for each token in the reference translations and generated outputs
reference_embeddings = [model.wv[token] for ref in reference_translations_tokenized for token in ref]
generated_embeddings = [model.wv[token] for token in generated_outputs_tokenized]

# Calculate the semantic similarity (cosine similarity) between the embeddings
cosine_similarities = cosine_similarity(reference_embeddings, generated_embeddings)
average_cosine_similarity = cosine_similarities.mean()

print("Average Semantic Similarity (Cosine Similarity):", average_cosine_similarity)