This notebook will operate on the created datasets to:
1. Filter sentence-pairs based on edit distance. Only sentences for which the log probability of the counterfactual sentence does not decrease by more than 10 points relative to the original sentence are kept.

2. Perform blanking/masking based on a fixed strategy. Keep common words in both sentences unmasked as long as they also appear in the same order in both sentences and create a prompt that can adequately represent both the original sentence and the counterfactual, were the blanks to be filled.

3. Create final dataset by merging original sentence with the masked version and control-code token.

4. Finetune IndicBART on this dataset.

Install necessary libraries

In [1]:
!pip install transformers
!pip install sentencepiece==0.1.95
!pip install numpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m82.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://us

In [2]:
!ls

10.208.38.94	    finetune_model.py  masked_hindi.txt
assam_input.txt     hindi_input.txt    sentence_score.ipynb
combined_hindi.txt  hindi_output.txt   tamil_input.txt


In [35]:
#!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
#!pip install indic-nlp-library

from indicnlp import common
common.set_resources_path("../indic_nlp_resources")

from indicnlp import loader
loader.load()
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator

In [36]:
# Import all the necessary classes and initialize the tokenizer and model.
from transformers import AlbertTokenizer, MBartForConditionalGeneration

tokenizer = AlbertTokenizer.from_pretrained("ai4bharat/IndicBART", do_lower_case=False, use_fast=False, keep_accents=True)

model = MBartForConditionalGeneration.from_pretrained("ai4bharat/IndicBART")

In [37]:
bos_id = tokenizer._convert_token_to_id_with_added_voc("<s>")
eos_id = tokenizer._convert_token_to_id_with_added_voc("</s>")
pad_id = tokenizer._convert_token_to_id_with_added_voc("<pad>")
# To get lang_id use any of ['<2as>', '<2bn>', '<2en>', '<2gu>', '<2hi>', '<2kn>', '<2ml>', '<2mr>', '<2or>', '<2pa>', '<2ta>', '<2te>']

Testing tokenizer

In [38]:
# First tokenize the input and outputs. 
# The format below is how IndicBART was trained so the input should be "Sentence </s> <2xx>" where xx is the language code. 
# Similarly, the output should be "<2yy> Sentence </s>". 

input_sentence = "I am a boy"
output_sentence = "मैं एक लड़का हूँ"

# The input and output sentence is what the model needs for learning.
# Current example is translation but consider the following examples:
# Summarization: 
# Input is "There was a child. She liked food. She decided to learn to make food. She became a world class chef. She won an award." 
# Output is "A child learned cooking and won an award."
# Paraphrasing: 
# Input is "I love to eat food."
# Output is "I am a foodie"

inp = tokenizer(input_sentence, add_special_tokens=False, return_tensors="pt", padding=True).input_ids

out = tokenizer(output_sentence, add_special_tokens=False, return_tensors="pt", padding=True).input_ids

print("Original input sentence:", input_sentence)
print("Segmented input sentence:", tokenizer.convert_ids_to_tokens(inp[0]))
print("Input sentence as tensor: ", inp)
print("Original output sentence:", output_sentence)
print("Segmented output sentence:", tokenizer.convert_ids_to_tokens(out[0]))
print("Output sentence as tensor:", out)

Original input sentence: I am a boy
Segmented input sentence: ['▁I', '▁am', '▁a', '▁boy']
Input sentence as tensor:  tensor([[  466,  1981,    80, 25573]])
Original output sentence: मैं एक लड़का हूँ
Segmented output sentence: ['▁मैं', '▁एक', '▁लड़का', '▁हूँ']
Output sentence as tensor: tensor([[  942,    43, 32720,  8384]])


In [39]:
import torch
import pandas as pd
import math

In [22]:
# def sent_scoring(model_tokenizer, text, cuda):
#     model = model_tokenizer[0]
#     tokenizer = model_tokenizer[1]
#     assert model is not None
#     assert tokenizer is not None
#     input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # Batch size 1
#     if cuda:
#         input_ids = input_ids.to('cuda')
#     with torch.no_grad():
#         outputs = model(input_ids, labels=input_ids)
#     loss, logits = outputs[:2]
#     sentence_prob = loss.item()
#     return sentence_prob

Scoring sentences by log-probability

In [66]:
import sys
import numpy as np
 
import torch
from transformers import BertTokenizer,BertForMaskedLM
# Load pre-trained model (weights)
with torch.no_grad():
    model = MBartForConditionalGeneration.from_pretrained("ai4bharat/IndicBART")
    #model = BertForMaskedLM.from_pretrained('bert-large-cased')
    model.eval()
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = AlbertTokenizer.from_pretrained("ai4bharat/IndicBART", do_lower_case=False, use_fast=False, keep_accents=True)
    #tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
def score(sentence):
    tensor_input = tokenizer(sentence, add_special_tokens=False, return_tensors="pt", padding=True).input_ids
    #tokenize_input = ["[CLS]"]+tokenize_input+["[SEP]"]
    #tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    with torch.no_grad():
        loss=model(tensor_input, labels=tensor_input)[0]
    return np.exp(loss.detach().numpy())
 
if __name__=='__main__':
    for line in ["मैं एक लड़का हूँ","वह एक लड़का है","वह आदमी कहाँ है"]:
        if line.strip() !='':
            print(line.strip()+'\t'+ str(score(line.strip())))
        else:
            break

मैं एक लड़का हूँ	2.8959212
वह एक लड़का है	2.9155743
वह आदमी कहाँ है	4.5577245


In [67]:
l1=[]
l2=[]
with open('../hindi_input_nonpara.txt') as f:
    l1 = f.readlines()

with open('../hindi_output_nonpara.txt') as f:
    l2 = f.readlines()

In [68]:
for i in range(0,len(l1)):
  l1[i]=l1[i].strip()
  l2[i]=l2[i].strip()  

In [30]:
l1[2]

'बुलिंगस - धारणा का परिचय सन्\u200c 1900 के बीच, टेक्सस सेंटर में किया जा रहा है ।'

In [28]:
l2[2]

'राष्ट्रीय बास्केटबॉल संघ का 1975-76 सत्र एनबीए का 30 वां सत्र था।'

In [9]:
from tqdm import tqdm

In [30]:
len(l1)

5000

Filtering by sentence score

In [69]:
c=0
for i in tqdm(range(0,
                    30)):
  s1=score(l1[i].strip())
  s2=score(l2[i].strip())
  if s2<0.9*s1:
    c+=1
print(c)
print(c/30)
print(1-(c/30)) 

100%|██████████| 30/30 [00:04<00:00,  6.85it/s]

5
0.16666666666666666
0.8333333333333334





In [32]:
tokenizer.tokenize(l1[2])

['▁1975',
 '-',
 '76',
 '▁का',
 '▁एन',
 'बीए',
 '▁सत्र',
 '▁राष्ट्रीय',
 '▁बास्केट',
 'बॉल',
 '▁संघ',
 '▁का',
 '▁30',
 'वां',
 '▁सत्र',
 '▁था',
 '।']

In [33]:
t1=tokenizer(l1[2], add_special_tokens=False, return_tensors="pt", padding=True).input_ids
#t1=tokenizer.tokenize(l1[2])

In [34]:
t1

tensor([[36019,    16, 12985,    44,   796, 31884,  5492,  1134, 41483, 41665,
           869,    44,   921,  1412,  5492,   243,     8]])

In [35]:
t2=tokenizer(l2[2], add_special_tokens=False, return_tensors="pt", padding=True).input_ids
#t2=tokenizer.tokenize(l2[2])

In [36]:
t2

tensor([[ 1134, 41483, 41665,   869,    44, 36019,    16, 12985,  5492,   796,
         31884,    44,   921, 10248,  5492,   243,     8]])

In [41]:
def longest_common_subsequence(lst1, lst2):
    m, n = len(lst1), len(lst2)
    jh = [[0 for j in range(n+1)] for i in range(m+1)]
    
    for i in range(1, m+1):
        for j in range(1, n+1):
            if lst1[i-1] == lst2[j-1]:
                jh[i][j] = 1 + jh[i-1][j-1]
            else:
                jh[i][j] = max(jh[i-1][j], jh[i][j-1])
    
    result = []
    i, j = m, n
    c=0 
    minind=-1
    maxind=-1
    while i > 0 and j > 0:
        c+=1
        if lst1[i-1] == lst2[j-1]:
            if c==1:
              minind=j
            result.append(lst1[i-1])
            maxind=j
            i -= 1
            j -= 1
        elif jh[i-1][j] > jh[i][j-1]:
            i -= 1
        else:
            j -= 1
    
    return result[::-1]

In [44]:
def find_common_subsequences(list1, list2):
    m = len(list1)
    n = len(list2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    max_len = 0
    common_subsequences = []

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if list1[i - 1] == list2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
                if dp[i][j] > max_len:
                    max_len = dp[i][j]
                    common_subsequences = [[list1[i - max_len:i]]]
                elif dp[i][j] == max_len:
                    common_subsequences.append([list1[i - max_len:i]])
            else:
                dp[i][j] = 0

    return common_subsequences


In [24]:
#I have to maintain the relative order of unblanked words
#find the longest possible subsequence that maintains relative order
def common_elements(l1,l2):
  indices_list=[]
  c_ref=-1
  for i in l1:
    if i in l2:
      c=l2.index(i)
      if c>c_ref:
        indices_list.append(i)
        c_ref=c
  return indices_list

In [47]:
common_elements(t1[0].tolist(),t2[0].tolist())
#common_elements(t1,t2)

[36019, 16, 12985, 796, 31884, 921, 243, 8]

In [49]:
t1=["yes","what","am","the","yes","in","why","what"]
t2=["where","are","yes","what","am","the","in"]

In [50]:
l_final=[]
mask_token="[MASK]"
c=0
c_former=0
for i in t1:
  # print(i)
  # print(c)
  if i in t2[c:]:
    c=t2.index(i)
    if c!=0:
      #l_final.append("[MASK]")
      l_final.extend(["[MASK]"]*(c-c_former-1))
    l_final.append(i)
    c_former=c
  else:
    l_final.append("[MASK]")
l_final

['[MASK]', 'yes', 'what', 'am', 'the', '[MASK]', 'in', '[MASK]', '[MASK]']

In [51]:
new_list=[l_final[0]]
for i in range(1, len(l_final)):
    if l_final[i] == l_final[i-1] and l_final[i]=='[MASK]':
        continue  # If the current element is the same as the previous, skip it
    else:
        new_list.append(l_final[i])  # Otherwise, add it to the new list

print(new_list)  # Output: [1, 2, 3, 4, 5, 6]


['[MASK]', 'yes', 'what', 'am', 'the', '[MASK]', 'in', '[MASK]']


In [54]:
l_t1=[]
c=0
for i in t1:
  if i in common_elements(t1,t2):
    l_t1.append(i)
  else:
    if len(l_t1)!=0:
      if l_t1[-1]!='[MASK]':
        l_t1.append('[MASK]')
    else:
      l_t1.append('[MASK]')
l_t1


['yes', 'what', 'am', 'the', 'yes', 'in', '[MASK]', 'what']

In [25]:
def blanking_sentences(t1,t2):
  l_t1=[]
  c=0
  for i in t1:
    if i in common_elements(t1,t2):
      l_t1.append(i)
      c=0
    else:
      if c==0:
        l_t1.append('[MASK]')
        c=1
  return l_t1


In [14]:
masked_sentences=[]
for i in tqdm(range(0,len(l1))):
  t1=tokenizer(l1[i], add_special_tokens=False, return_tensors="pt", padding=True).input_ids#tokenizer.tokenize(l1[i])
  t2=tokenizer(l2[i], add_special_tokens=False, return_tensors="pt", padding=True).input_ids#tokenizer.tokenize(l2[i])
  t1=t1[0].tolist()
  t2=t2[0].tolist()
  masked_sentences.append(blanking_sentences(t1,t2))


  0%|          | 0/49402 [00:00<?, ?it/s]


NameError: name 'blanking_sentences' is not defined

In [58]:
common_elements(tokenizer.tokenize(l1[3]),tokenizer.tokenize(l2[3]))

['▁विशिष्ट', '▁और', '▁परियोजना', '▁भी', '▁होती', '।']

In [59]:
masked_sentences[3]

['[MASK]', 5094, '[MASK]', 81, 14457, '[MASK]', 143, 649, '[MASK]', 8]

In [60]:
l1[3]

'इस दौरान विशिष्ट चर्चाएं, सार्वजनिक प्रोफाइल वाद-विवाद और परियोजना पर चर्चा भी होती है।'

In [61]:
l2[3]

'सार्वजनिक चर्चाएं, प्रोफाइल विशिष्ट चर्चाएं और परियोजना चर्चाएं भी होती हैं।'

In [70]:
def blanking_sentences_updated(t1,t2):
  l_final=[]
  mask_token=4 #"[MASK]"
  c=0
  c_former=0
  for i in t1:
    # print(i)
    # print(c)
    if i in t2[c:]:
      c=t2.index(i)
      if c!=0:
        #l_final.append("[MASK]")
        l_final.extend([4]*(c-c_former-1))
      l_final.append(i)
      c_former=c
    else:
      l_final.append(4)
  new_list=[l_final[0]]
  for i in range(1, len(l_final)):
      if l_final[i] == l_final[i-1] and l_final[i]==4:
          continue  # If the current element is the same as the previous, skip it
      else:
          new_list.append(l_final[i])  # Otherwise, add it to the new list

  return new_list  # Output: [1, 2, 3, 4, 5, 6]


In [71]:
masked_sentences=[]
for i in tqdm(range(0,len(l1))):
  # t1=tokenizer.tokenize(l1[i])
  # t2=tokenizer.tokenize(l2[i])
  t1=tokenizer(l1[i], add_special_tokens=False, return_tensors="pt", padding=True).input_ids#tokenizer.tokenize(l1[i])
  t2=tokenizer(l2[i], add_special_tokens=False, return_tensors="pt", padding=True).input_ids#tokenizer.tokenize(l2[i])
  t1=t1[0].tolist()
  t2=t2[0].tolist()
  masked_sentences.append(blanking_sentences_updated(t1,t2))


100%|██████████| 4000/4000 [00:02<00:00, 1596.27it/s]


In [64]:
masked_sentences[4]

[954,
 4,
 6512,
 4,
 38058,
 45,
 6263,
 4166,
 178,
 1385,
 15,
 4,
 107,
 1546,
 822,
 479,
 122,
 8]

In [72]:
blanked_list=[]
for i in tqdm(range(0,len(l1))):
  decoded_output=tokenizer.decode(masked_sentences[i], skip_special_tokens=False, clean_up_tokenization_spaces=False)
  blanked_list.append(decoded_output)

100%|██████████| 4000/4000 [00:01<00:00, 2577.74it/s]


In [15]:
!ls

10.208.38.94		     finetune_model.py		masked_hindi.txt
assam_input.txt		     hindi_input.txt		sentence_score.ipynb
combined_hindi_helsinki.txt  hindi_output.txt		tamil_input.txt
combined_hindi.txt	     masked_hindi_helsinki.txt


In [66]:
blanked_list[2]

'[MASK] 1975-76 का[MASK] एनबीए सत्र[MASK] का[MASK] 30[MASK] सत्र[MASK]था।'

In [73]:
file = open('masked_hindi_nonpara.txt','w')
for item in blanked_list:
	file.write(item+"\n")
file.close()

In [74]:
combined_list=[]
for i in tqdm(range(0,len(l1))):
  combined_sentence=l1[i]+" "+blanked_list[i]
  combined_list.append(combined_sentence)

100%|██████████| 4000/4000 [00:00<00:00, 577887.02it/s]


In [69]:
combined_list[4]

'जब प्रवाह की तुलनात्मक दरों को बनाए रखा जा सकता है, तो परिणाम उच्च होते हैं। जब[MASK] प्रवाह[MASK] दरों को बनाए रखा जा सकता है[MASK]तो परिणाम उच्च होते हैं।'

In [75]:
file = open('combined_hindi_nonpara.txt','w')
for item in combined_list:
	file.write(item+"\n")
file.close()

In [58]:
decoded_output=tokenizer.decode(masked_sentences[4], skip_special_tokens=False, clean_up_tokenization_spaces=False)

print("Models actual output:", decoded_output) 

Models actual output: [MASK] 2[MASK] 3[MASK] असतत[MASK] को[MASK] परिभाषित किया[MASK] है[MASK]


In [72]:
hindi_input_sentence = "मैं एक [MASK] हूँ"
hin_inp = tokenizer(hindi_input_sentence, add_special_tokens=False, return_tensors="pt", padding=True).input_ids

# What does this look like?

print("Original input sentence:", hindi_input_sentence)
print("Segmented input sentence:", tokenizer.convert_ids_to_tokens(hin_inp[0]))
print("Input sentence as tensor: ", hin_inp)

Original input sentence: मैं एक [MASK] हूँ
Segmented input sentence: ['▁मैं', '▁एक', '[MASK]', '▁हूँ']
Input sentence as tensor:  tensor([[ 942,   43,    4, 8384]])


In [73]:
decoded_output=tokenizer.decode(hin_inp[0], skip_special_tokens=False, clean_up_tokenization_spaces=False)

print("Models actual output:", decoded_output) 

Models actual output: मैं एक[MASK]हूँ


In [74]:
l1[4]

'जब प्रवाह की तुलनात्मक दरों को बनाए रखा जा सकता है, तो परिणाम उच्च होते हैं।'

In [75]:
l2[4]

'जब तुलनात्मक प्रवाह दरों को बनाए रखा जा सकता है तो परिणाम उच्च होते हैं।'

Finetune

In [76]:
# Tokenize 
hindi_input_sentence = "मैं एक [MASK] हूँ </s> <2hi>"
hin_inp = tokenizer(hindi_input_sentence, add_special_tokens=False, return_tensors="pt", padding=True).input_ids

# What does this look like?

print("Original input sentence:", hindi_input_sentence)
print("Segmented input sentence:", tokenizer.convert_ids_to_tokens(hin_inp[0]))
print("Input sentence as tensor: ", hin_inp)

# Generate

model_output=model.generate(hin_inp, use_cache=True, num_beams=4, max_length=20, min_length=1, early_stopping=True, pad_token_id=pad_id, bos_token_id=bos_id, eos_token_id=eos_id, decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc("<2hi>"))
# Output is a tensor and we need to convert it back into a sentence.

print("Model output as tensor:", model_output)

# What is this in tokens?

print("Model output as segmented sentence:", tokenizer.convert_ids_to_tokens(model_output[0]))

# Decode using the tokenizer to get output strings.

decoded_output=tokenizer.decode(model_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)

print("Models actual output:", decoded_output) 

Original input sentence: मैं एक [MASK] हूँ </s> <2hi>
Segmented input sentence: ['▁मैं', '▁एक', '[MASK]', '▁हूँ', '</s>', '<2hi>']
Input sentence as tensor:  tensor([[  942,    43,     4,  8384, 64001, 64006]])
Model output as tensor: tensor([[64006,   942,    43,  6540,  8384, 64001]])
Model output as segmented sentence: ['<2hi>', '▁मैं', '▁एक', '▁कलाकार', '▁हूँ', '</s>']
Models actual output: मैं एक कलाकार हूँ


In [77]:
# Lets use num_return_sequences=4 to get all 4 options that the model generated.

model_output=model.generate(hin_inp, use_cache=True, num_beams=4, max_length=20, min_length=1, num_return_sequences=4, early_stopping=True, pad_token_id=pad_id, bos_token_id=bos_id, eos_token_id=eos_id, decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc("<2hi>"))

for i, mod_out in enumerate(model_output):
  print("Output ", i)
  print("Model output as tensor:", mod_out)

  print("Model output as segmented sentence:", tokenizer.convert_ids_to_tokens(mod_out))

  decoded_output=tokenizer.decode(mod_out, skip_special_tokens=True, clean_up_tokenization_spaces=False)

  print("Models actual output:", decoded_output) 

Output  0
Model output as tensor: tensor([64006,   942,    43,  6540,  8384, 64001])
Model output as segmented sentence: ['<2hi>', '▁मैं', '▁एक', '▁कलाकार', '▁हूँ', '</s>']
Models actual output: मैं एक कलाकार हूँ
Output  1
Model output as tensor: tensor([64006,   942,    43,  4316,  8384, 64001])
Model output as segmented sentence: ['<2hi>', '▁मैं', '▁एक', '▁पत्रकार', '▁हूँ', '</s>']
Models actual output: मैं एक पत्रकार हूँ
Output  2
Model output as tensor: tensor([64006,   942,    43,   530,  8384, 64001])
Model output as segmented sentence: ['<2hi>', '▁मैं', '▁एक', '▁महिला', '▁हूँ', '</s>']
Models actual output: मैं एक महिला हूँ
Output  3
Model output as tensor: tensor([64006,   942,    43, 18460,  8384, 64001])
Model output as segmented sentence: ['<2hi>', '▁मैं', '▁एक', '▁इंसान', '▁हूँ', '</s>']
Models actual output: मैं एक इंसान हूँ


In [78]:
## Batching
import sys
import random
import numpy as np
import torch

def yield_corpus_indefinitely_bi(corpus, language):
    """This shuffles the corpus at the beginning of each epoch and returns sentences indefinitely."""
    epoch_counter = 0
    num_lines = len(corpus)
    num_sentences_before_sort = 5000
    num_sorted_segments = (num_lines // num_sentences_before_sort) + 1
    while True:
        print("Shuffling corpus:", language)
        random.shuffle(corpus)
        for src_line, tgt_line in corpus:
            yield src_line, tgt_line
        epoch_counter += 1
        print("Finished epoch", epoch_counter, "for language:", language)
    return None, None ## We should never reach this point.


def generate_batches_bilingual(tok, num_batches, is_summarization=False, batch_size=16, src_lang="en", tgt_lang="hi", src_file_prefix="TED2020.en-hi", tgt_file_prefix="TED2020.en-hi"):
    """Generates the source, target and source attention masks for the training set. 
    The source and target sentences are ignored if empty and are truncated if longer than a 
    threshold. The batch size in this context is the maximum number of tokens in the batch post padding."""
    batch_count = 0
    mask_tok = "[MASK]"

    language_list = [src_lang+"-"+tgt_lang]
    print("Training for:", language_list)
    src_file_content = open(src_file_prefix).readlines()
    tgt_file_content = open(tgt_file_prefix).readlines()
    file_content = list(zip(src_file_content, tgt_file_content))
    file_iterator = yield_corpus_indefinitely_bi(file_content, src_lang+"-"+tgt_lang)
    slang = "<2"+src_lang+">"
    tlang = "<2"+tgt_lang+">"
            
    while batch_count != num_batches:
        curr_batch_count = 0
        encoder_input_batch = []
        decoder_input_batch = []
        decoder_label_batch = []
        batch_count += 1
        max_src_sent_len = 0
        max_tgt_sent_len = 0
        sents_in_batch = 0
        while True:
            src_sent, tgt_sent = next(file_iterator)
            src_sent = src_sent.strip()
            tgt_sent = tgt_sent.strip()
            # if slang != "<2en>" and slang != "<2hi>": # Transliterate to Devanagari
            #     src_sent = UnicodeIndicTransliterator.transliterate(src_sent, slang[2:4], "hi")
            # if tlang != "<2en>" and tlang != "<2hi>": # Transliterate to Devanagari
            #     tgt_sent = UnicodeIndicTransliterator.transliterate(tgt_sent, tlang[2:4], "hi")
            src_sent_split = src_sent.split(" ")
            tgt_sent_split = tgt_sent.split(" ")
            tgt_sent_len = len(tgt_sent_split)
            src_sent_len = len(src_sent_split)
            
            if src_sent_len < 1 or tgt_sent_len < 1:
                continue
            else:   # Initial truncation
                if src_sent_len >= 100:
                    src_sent_split = src_sent_split[:100]
                    src_sent = " ".join(src_sent_split)
                    src_sent_len = 100
                if tgt_sent_len >= 100:
                    tgt_sent_split = tgt_sent_split[:100]
                    tgt_sent = " ".join(tgt_sent_split)
                    tgt_sent_len = 100
            
            iids = tok(src_sent + " </s> " + slang, add_special_tokens=False, return_tensors="pt").input_ids
            curr_src_sent_len = len(iids[0])
            if curr_src_sent_len > 256:
                src_sent = tok.decode(iids[0][0:256-2], skip_special_tokens=True, clean_up_tokenization_spaces=False)
                curr_src_sent_len = 256
            
            iids = tok(tlang + " " + tgt_sent, add_special_tokens=False, return_tensors="pt").input_ids
            curr_tgt_sent_len = len(iids[0])
            if curr_tgt_sent_len > 256:
                tgt_sent = tok.decode(iids[0][1:256], skip_special_tokens=True, clean_up_tokenization_spaces=False)
                curr_tgt_sent_len = 256
            
            
            encoder_input_batch.append(src_sent + " </s> " + slang)
            decoder_input_batch.append(tlang + " " + tgt_sent)
            decoder_label_batch.append(tgt_sent + " </s>")
            sents_in_batch += 1
            if sents_in_batch == batch_size:
                    break
                
        input_ids = tok(encoder_input_batch, add_special_tokens=False, return_tensors="pt", padding=True).input_ids
        input_masks = (input_ids != 4).int()
        decoder_input_ids = tok(decoder_input_batch, add_special_tokens=False, return_tensors="pt", padding=True).input_ids
        labels = tok(decoder_label_batch, add_special_tokens=False, return_tensors="pt", padding=True).input_ids
        yield input_ids, input_masks, decoder_input_ids, labels

## Loss

def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None):
    """From fairseq. This returns the label smoothed cross entropy loss."""
    if target.dim() == lprobs.dim() - 1:
        target = target.unsqueeze(-1)
    nll_loss = -lprobs.gather(dim=-1, index=target)
    smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
    if ignore_index is not None:
        pad_mask = target.eq(ignore_index)
        nll_loss.masked_fill_(pad_mask, 0.0)
        smooth_loss.masked_fill_(pad_mask, 0.0)
        denominator = (1.0 - 1.0*pad_mask)
        denominator = denominator.sum()
    else:
        nll_loss = nll_loss.squeeze(-1)
        smooth_loss = smooth_loss.squeeze(-1)
        denominator = 1.0
    
    if ignore_index is not None:
        nll_loss = nll_loss.sum()
        smooth_loss = smooth_loss.sum()
    else:
        nll_loss = nll_loss.mean()
        smooth_loss = smooth_loss.mean()
        
    eps_i = epsilon / lprobs.size(-1)
    loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss
    loss = loss/denominator
    return loss

In [79]:
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW

In [80]:
tokenizer = AlbertTokenizer.from_pretrained("ai4bharat/IndicBART", do_lower_case=False, use_fast=False, keep_accents=True)

bos_id = tokenizer._convert_token_to_id_with_added_voc("<s>")
eos_id = tokenizer._convert_token_to_id_with_added_voc("</s>")
pad_id = tokenizer._convert_token_to_id_with_added_voc("<pad>")

model = MBartForConditionalGeneration.from_pretrained("ai4bharat/IndicBART")
model.to(0)

no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad],
            "weight_decay": 0.00001,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad],
            "weight_decay": 0.0,
        },
    ] ## We suppose that weight decay will be used except for biases and layer norm weights.

print("Optimizing", [n for n, p in model.named_parameters() if p.requires_grad])
num_params_to_optimize = sum(p.numel() for p in model.parameters() if p.requires_grad)
num_model_params = sum(p.numel() for p in model.parameters())
print("Number of model parameters:", num_model_params)
print("Total number of params to be optimized are: ", num_params_to_optimize)

print("Percentage of parameters to be optimized: ", 100*num_params_to_optimize/num_model_params)
    
optimizer = AdamW(optimizer_grouped_parameters, lr=0.001, eps=1e-8)

scheduler = get_linear_schedule_with_warmup(optimizer, 4000, 200) ## A warmup and decay scheduler. We use the linear scheduler for now. TODO: Enable other schedulers with a flag.

Optimizing ['model.shared.weight', 'model.encoder.embed_positions.weight', 'model.encoder.layers.0.self_attn.k_proj.weight', 'model.encoder.layers.0.self_attn.k_proj.bias', 'model.encoder.layers.0.self_attn.v_proj.weight', 'model.encoder.layers.0.self_attn.v_proj.bias', 'model.encoder.layers.0.self_attn.q_proj.weight', 'model.encoder.layers.0.self_attn.q_proj.bias', 'model.encoder.layers.0.self_attn.out_proj.weight', 'model.encoder.layers.0.self_attn.out_proj.bias', 'model.encoder.layers.0.self_attn_layer_norm.weight', 'model.encoder.layers.0.self_attn_layer_norm.bias', 'model.encoder.layers.0.fc1.weight', 'model.encoder.layers.0.fc1.bias', 'model.encoder.layers.0.fc2.weight', 'model.encoder.layers.0.fc2.bias', 'model.encoder.layers.0.final_layer_norm.weight', 'model.encoder.layers.0.final_layer_norm.bias', 'model.encoder.layers.1.self_attn.k_proj.weight', 'model.encoder.layers.1.self_attn.k_proj.bias', 'model.encoder.layers.1.self_attn.v_proj.weight', 'model.encoder.layers.1.self_attn

In [40]:
tokenizer.pad_token_id

0

In [41]:
tokenizer

AlbertTokenizer(name_or_path='ai4bharat/IndicBART', vocab_size=64000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '<unk>', 'sep_token': '[SEP]', 'pad_token': '<pad>', 'cls_token': '[CLS]', 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True), 'additional_special_tokens': ['<s>', '</s>', '<2as>', '<2bn>', '<2en>', '<2gu>', '<2hi>', '<2kn>', '<2ml>', '<2mr>', '<2or>', '<2pa>', '<2ta>', '<2te>']}, clean_up_tokenization_spaces=True)

In [81]:
## Lets set the model to training mode
model.train()
ctr=0
for i, (input_ids, input_masks, decoder_input_ids, labels) in enumerate(generate_batches_bilingual(tokenizer, 500, is_summarization=True, batch_size=16, src_lang="hi", tgt_lang="hi", src_file_prefix="combined_hindi_nonpara.txt", tgt_file_prefix="../hindi_output_nonpara.txt")):
  ctr=i
  if i%100 == 0:
    print()
    model.eval() # Set dropouts to zero
    print("Lets see how well the model is doing after ", i, "iterations of training")

    model.train() # back to training mode

  input_ids = input_ids.to(0)
  input_masks = input_masks.to(0)
  decoder_input_ids = decoder_input_ids.to(0)
  labels = labels.to(0)
  mod_compute = model(input_ids=input_ids, attention_mask=input_masks, decoder_input_ids=decoder_input_ids) ## Run the model and get logits.
  logits = mod_compute.logits
  lprobs = torch.nn.functional.log_softmax(logits, dim=-1) ## Softmax tempering of logits if needed.
  loss = label_smoothed_nll_loss(
      lprobs, labels, 0.1, ignore_index=0
  ) ## Label smoothed cross entropy loss.
  del input_ids ## Delete to avoid retention.
  del input_masks ## Delete to avoid retention.
  del decoder_input_ids ## Delete to avoid retention.
  del labels ## Delete to avoid retention.
  loss.backward()
  torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
  optimizer.step()
  scheduler.step()
  if i % 100 == 0:
    print("Loss for batch ", i+1, "is", round(loss.detach().cpu().numpy().item(), 2))
    print()

Training for: ['hi-hi']
Shuffling corpus: hi-hi

Lets see how well the model is doing after  0 iterations of training
Loss for batch  1 is 4.13


Lets see how well the model is doing after  100 iterations of training
Loss for batch  101 is 2.71


Lets see how well the model is doing after  200 iterations of training
Loss for batch  201 is 2.88

Finished epoch 1 for language: hi-hi
Shuffling corpus: hi-hi

Lets see how well the model is doing after  300 iterations of training
Loss for batch  301 is 2.97


Lets see how well the model is doing after  400 iterations of training
Loss for batch  401 is 2.73



In [33]:
!ls

10.208.38.94		     finetune_model.py		masked_hindi.txt
assam_input.txt		     hindi_input.txt		sentence_score.ipynb
combined_hindi_helsinki.txt  hindi_output.txt		tamil_input.txt
combined_hindi.txt	     masked_hindi_helsinki.txt


In [82]:
torch.save(model.state_dict(), '../normal_nonpara_model.pt')


In [43]:
torch.cuda.is_available()

True

Downstream tasks: Paraphrase Detection

In [27]:
l1=[]
l2=[]
with open('../hindi_input_para.txt') as f:
    l1 = f.readlines()

with open('../hindi_output_para.txt') as f:
    l2 = f.readlines()

In [28]:
for i in range(0,len(l1)):
  l1[i]=str(l1[i].strip())
  l2[i]=str(l2[i].strip())  

In [29]:
l3=[]
l4=[]
with open('../hindi_input_nonpara.txt') as f:
    l3 = f.readlines()

with open('../hindi_output_nonpara.txt') as f:
    l4 = f.readlines()

In [30]:
for i in range(0,len(l3)):
  l3[i]=str(l3[i].strip())
  l4[i]=str(l4[i].strip())  

In [31]:
l_train=[]
for i in range(0,len(l1)):
    temp_var=(l1[i],l2[i],1)
    l_train.append(temp_var)
for i in range(0,len(l3)):
    temp_var=(l3[i],l4[i],0)
    l_train.append(temp_var)

In [32]:
first_elements = []
second_elements =[]
third_elements = []

# Loop through each tuple in the list
for my_tuple in l_train:
    # Get the third element of the tuple (at index 2)
    first_element = my_tuple[0]
    second_element = my_tuple[1]
    third_element = my_tuple[2]
    # Add the third element to the list
    first_elements.append(first_element)
    second_elements.append(second_element)
    third_elements.append(third_element)

In [34]:
import pandas as pd

In [35]:
d = {'text1': first_elements, 'text2': second_elements, 'text3': third_elements} 
test_df = pd.DataFrame(d)
test_df = test_df.dropna()
tt1 = test_df["text1"].tolist()
tt2 = test_df["text2"].tolist()
tt3 = test_df["text3"].tolist()
# Define the three lists
# Create an empty list to store the tuples
combined_list = []

# Loop through the lists using the zip() function
for item1, item2, item3 in zip(tt1, tt2, tt3):
    # Create a tuple containing the current elements
    current_tuple = (item1, item2, item3)
    # Append the tuple to the combined list
    combined_list.append(current_tuple)

In [38]:
combined_list[53]

('गिमनासिया एस्ग्रिमा (एलपी) ने 3-2 से जीत हासिल की और वह प्राइमरा डिविज़न में रह गई।',
 'जिम्नासिया एस्ग्रिमा (एलपी) ने 3-2 से जीत हासिल की और प्राइमेरा डिविसिमिनेस एन में रह गई।',
 1)

In [37]:
len(combined_list)

8000

In [46]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

# Load the pre-trained Indic-BERT model and tokenizer
model_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Prepare the training data
train_sentence_pairs = combined_list
# The third element in each tuple indicates whether the two sentences are paraphrases (1) or not paraphrases (0)

# Tokenize the training data
train_encodings = tokenizer([sp[0] for sp in train_sentence_pairs], [sp[1] for sp in train_sentence_pairs], truncation=True, padding=True, return_tensors='pt')
train_encodings['labels'] = torch.tensor([sp[2] for sp in train_sentence_pairs])

# Create a PyTorch DataLoader object for the training data
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_encodings['labels'])
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)

# Define the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Move the model to the appropriate device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Train the model
model.train()
for epoch in range(3):
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the finetuned model
model.save_pretrained("finetuned_model")


Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.weight', 'sop_classifier.classifier.bias', 'predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.bias', 'sop_classifier.classifier.weight', 'predictions.dense.weight', 'predictions.bias', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indi

Downstream tasks: Sentiment Analysis

In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

text = "Good night 😊"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# Print labels and scores
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")


  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)lve/main/config.json: 100%|██████████| 841/841 [00:00<00:00, 91.3kB/s]
Downloading (…)tencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 53.3MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 150/150 [00:00<00:00, 45.4kB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.11G/1.11G [00:10<00:00, 102MB/s] 


1) positive 0.7673
2) neutral 0.2015
3) negative 0.0313


In [2]:
print(scores)

[0.03125937 0.20148002 0.7672607 ]


In [2]:
from transformers import pipeline
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)
sentiment_task("T'estimo!")[0]['label']


[{'label': 'positive', 'score': 0.660058319568634}]

In [6]:
sentiment_task("T'estimo!")[0]['label']

'positive'

In [20]:
l1=[]
l2=[]
with open('../hindi_input_para.txt') as f:
    l1 = f.readlines()

with open('../hindi_output_para.txt') as f:
    l2 = f.readlines()

In [40]:
model = MBartForConditionalGeneration.from_pretrained("ai4bharat/IndicBART")
model.load_state_dict(torch.load('../normal_para_model.pt'))

<All keys matched successfully>

In [None]:
hindi_input_sentence = "मैं एक [MASK] हूँ </s> <2hi>"
hin_inp = tokenizer(hindi_input_sentence, add_special_tokens=False, return_tensors="pt", padding=True).input_ids.to(0)

# What does this look like?

print("Original input sentence:", hindi_input_sentence)
print("Segmented input sentence:", tokenizer.convert_ids_to_tokens(hin_inp[0]))
print("Input sentence as tensor: ", hin_inp)

# Generate

model_output=model.generate(hin_inp, use_cache=True, num_beams=4, max_length=20, min_length=1, early_stopping=True, pad_token_id=pad_id, bos_token_id=bos_id, eos_token_id=eos_id, decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc("<2hi>"))
# Output is a tensor and we need to convert it back into a sentence.

print("Model output as tensor:", model_output)

# What is this in tokens?

print("Model output as segmented sentence:", tokenizer.convert_ids_to_tokens(model_output[0]))

# Decode using the tokenizer to get output strings.

decoded_output=tokenizer.decode(model_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)

print("Models actual output:", decoded_output)

In [43]:
created_counterfactuals=[]
model.to(0)
for i in tqdm(range(0,len(l3))):
   hin_inp = tokenizer(l3[i], add_special_tokens=False, return_tensors="pt", padding=True).input_ids.to(0)
   model_output=model.generate(hin_inp, use_cache=True, num_beams=4, max_length=20, min_length=1, early_stopping=True, pad_token_id=pad_id, bos_token_id=bos_id, eos_token_id=eos_id, decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc("<2hi>"))
   decoded_output=tokenizer.decode(model_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
   created_counterfactuals.append(decoded_output)

100%|██████████| 999/999 [02:44<00:00,  6.09it/s]


In [44]:
file = open('generated_data.txt','w')
for item in created_counterfactuals:
	file.write(item+"\n")
file.close()

In [21]:
for i in range(0,len(l1)):
  l1[i]=l1[i].strip()
  l2[i]=l2[i].strip()  

In [22]:
l3=l1[3001:]
l4=l2[3001:]

In [23]:
l5=[]
for i in tqdm(range(0,len(l3))):
    a=sentiment_task(l3[i])
    if a[0]['label']=='positive':
        l5.append(1)
    elif a[0]['label']=='negative':
        l5.append(-1)
    elif a[0]['label']=='neutral':
        l5.append(0)

100%|██████████| 999/999 [00:27<00:00, 36.13it/s]


In [24]:
l6=[]
for i in tqdm(range(0,len(l4))):
    a=sentiment_task(l4[i])
    if a[0]['label']=='positive':
        l6.append(1)
    elif a[0]['label']=='negative':
        l6.append(-1)
    elif a[0]['label']=='neutral':
        l6.append(0)

100%|██████████| 999/999 [00:27<00:00, 36.91it/s]


In [25]:
c=0
for i in range(0, len(l5)):
    if l5[i]==l6[i]:
        c+=1
print(c/len(l5))

0.954954954954955


In [13]:
l7=[]
l8=[]
with open('../hindi_input_nonpara.txt') as f:
    l7 = f.readlines()

with open('../hindi_output_nonpara.txt') as f:
    l8 = f.readlines()

In [14]:
for i in range(0,len(l7)):
  l7[i]=l7[i].strip()
  l8[i]=l8[i].strip()  

In [15]:
l_3=l7[:3000]
l_4=l8[:3000]

In [16]:
l_5=[]
for i in tqdm(range(0,len(l_3))):
    a=sentiment_task(l_3[i])
    if a[0]['label']=='positive':
        l_5.append(1)
    elif a[0]['label']=='negative':
        l_5.append(-1)
    elif a[0]['label']=='neutral':
        l_5.append(0)

100%|██████████| 3000/3000 [01:33<00:00, 31.94it/s]


In [17]:
l_6=[]
for i in tqdm(range(0,len(l_4))):
    a=sentiment_task(l_4[i])
    if a[0]['label']=='positive':
        l_6.append(1)
    elif a[0]['label']=='negative':
        l_6.append(-1)
    elif a[0]['label']=='neutral':
        l_6.append(0)

100%|██████████| 3000/3000 [01:29<00:00, 33.63it/s]


In [19]:
c=0
for i in range(0, len(l_5)):
    if l_5[i]==l_6[i]:
        c+=1
print(1-(c/len(l_5)))
print(c/len(l_5))

0.05166666666666664
0.9483333333333334


In [56]:
l=list(zip(['a','b','c'],[1,2,3]))
print(l)

[('a', 1), ('b', 2), ('c', 3)]


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader
from transformers import Trainer, TrainingArguments
import numpy as np

tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-xlm-roberta-base-sentiment')

model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-xlm-roberta-base-sentiment', num_labels=3)

train_dataset = [
    {'text': 'यह फोन बहुत अच्छा है।', 'label': 1},
    {'text': 'मैं इस फोन से नाखुश हूँ।', 'label': -1},
    {'text': 'फोन की बैटरी लाइफ काफी अच्छी है।', 'label': 1},
    {'text': 'फोन का कैमरा बहुत बेकार है।', 'label': -1},
]

def encode_text(text):
    return tokenizer(text, padding=True, truncation=True, max_length=512)

def encode_label(label):
    return label + 1

def encode_example(example):
    return {'input_ids': encode_text(example['text'])['input_ids'],
            'attention_mask': encode_text(example['text'])['attention_mask'],
            'label': encode_label(example['label'])}

encoded_train_dataset = [encode_example(example) for example in train_dataset]

train_dataloader = DataLoader(encoded_train_dataset, batch_size=2, shuffle=True)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

trainer = Trainer(model=model, args=training_args)

for epoch in range(training_args.num_train_epochs):
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
        if step % 100 == 0:
            print(f"Epoch {epoch}, Step {step}, Loss {total_loss / (step+1)}")


In [65]:
len(l5)

999

In [66]:
len(created_counterfactuals)

999

In [52]:
created_counterfactuals[56]

'ग्रेवाल दूसरा परिवार है जो लोकप्रिय श्रृंखला द फैमिली ऑफ द ब्रिटिश चैनल 4 श्रृंखला में दिखाई'

In [46]:
encoded_dataset

{'input_ids': tensor([[     0,    967,  47186,  ...,      1,      1,      1],
        [     0,  78161,   1480,  ...,      1,      1,      1],
        [     0, 102110,  39021,  ...,      1,      1,      1],
        ...,
        [     0,  38731,   6690,  ...,      1,      1,      1],
        [     0,   4993,   4010,  ...,      1,      1,      1],
        [     0,   9331,   5278,  ...,      1,      1,      1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([ 0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,
         0,  0,  0, 