In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BartForConditionalGeneration, AutoTokenizer, BartTokenizer
import torch.optim
import accelerate
import os
import json
from tqdm import tqdm
from time import time
from random import sample
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Locations used throughout the notebook, both resolved against the current
# working directory: the folder holding the train.en / train.hi files, and the
# folder the fine-tuned model is written to and later loaded from.
data_root_path = os.path.join(os.getcwd(), 'v3/v2/en-hi')
model_path = os.path.join(os.getcwd(), 'Models', 'Machine_Translation')

In [3]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes English/Hindi sentence pairs on access.

    Args:
        tokenizer: Tokenizer object. It is called directly to encode the source
            text, and its ``encode`` method is used for the target text.
        data: Indexable collection of records, each providing ``'English'``
            and ``'Hindi'`` entries.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, tokenizer, data, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the pair stored at ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (from the English
            text) and ``labels`` (token ids of the Hindi text), each with the
            leading batch axis of the tokenizer output removed.
        """
        item = self.data[idx]
        encode_options = dict(max_length=self.max_length, padding='max_length',
                              truncation=True, return_tensors='pt')

        # Calling the tokenizer is the supported replacement for encode_plus.
        inputs = self.tokenizer(item['English'], **encode_options)
        targets = self.tokenizer.encode(item['Hindi'], **encode_options)

        # squeeze(0) removes only the batch axis added by return_tensors='pt';
        # a bare squeeze() would also drop the sequence axis when it has size 1.
        return {'input_ids': inputs['input_ids'].squeeze(0),
                'attention_mask': inputs['attention_mask'].squeeze(0),
                'labels': targets.squeeze(0)
               }

In [4]:
# Checkpoint identifier and the tokenizer loaded from it. `tokenizer` is the
# module-level object handed to CustomDataset and used by Generate_translation
# to decode token ids back to text.
# NOTE(review): confirm this checkpoint's vocabulary is adequate for the Hindi
# target sentences.
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)

In [5]:
def load_data(data_path, sample_size, seed=None):
    """Read the parallel ``train.en`` / ``train.hi`` files and sample sentence pairs.

    Args:
        data_path: Directory containing ``train.en`` and ``train.hi``.
        sample_size: Number of sentence pairs to draw without replacement.
        seed: Optional seed for a private random generator so the draw is
            reproducible. When ``None`` the module-level ``sample`` is used,
            as before.

    Returns:
        list of ``sample_size`` dicts of the form
        ``{'English': <line>, 'Hindi': <line>}`` with newlines removed.

    Raises:
        ValueError: Propagated from the sampler when ``sample_size`` is
            negative or larger than the number of available pairs.
    """
    from random import Random  # local import: only needed for seeded draws

    en_path = os.path.join(data_path, 'train.en')
    hi_path = os.path.join(data_path, 'train.hi')

    # Stream both files line by line instead of materialising them first.
    with open(en_path, 'r', encoding="utf8") as en_file, \
            open(hi_path, 'r', encoding="utf8") as hi_file:
        # zip stops at the shorter file, so unmatched trailing lines are dropped.
        data = [
            dict(English=en_line.replace('\n', ''), Hindi=hi_line.replace('\n', ''))
            for en_line, hi_line in zip(en_file, hi_file)
        ]

    if seed is None:
        return sample(data, sample_size)
    return Random(seed).sample(data, sample_size)

In [6]:
# Draw a random subset of sentence pairs from the corpus directory.
data = load_data(data_path=data_root_path, sample_size=5600)

In [7]:
# Two-stage, order-preserving split: hold out the test portion first, then
# carve the validation portion out of what remains.
# NOTE(review): with the 5600 sampled pairs these fractions appear to yield
# 4000 / 600 / 1000 examples (consistent with 160 training batches of 25) —
# confirm if the sample size changes.
train_val_data, test_data = train_test_split(data, test_size=0.1785, shuffle=False)
train_data, val_data = train_test_split(train_val_data, test_size=0.1304, shuffle=False)

In [8]:
# Wrap each split in a CustomDataset sharing the same tokenizer.
train_dataset, dev_dataset, test_dataset = (
    CustomDataset(tokenizer, split)
    for split in (train_data, val_data, test_data)
)

In [9]:
# Batch the three splits. Training and validation batches are shuffled; the
# test loader keeps dataset order.
# NOTE(review): shuffling is not required for the validation loader.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=25)
val_loader = DataLoader(dataset=dev_dataset, shuffle=True, batch_size=25)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=25)

In [10]:
def _prepare_batch(batch, device, pad_token_id):
    """Move one batch to ``device`` and mask padded label positions with -100."""
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    if pad_token_id is not None:
        # -100 is the label value the model's loss ignores; without this the
        # loss is averaged over padding positions as well as real tokens.
        labels = labels.masked_fill(labels == pad_token_id, -100)
    return input_ids, attention_mask, labels


def Fine_Tune(train_loader, val_loader, num_epochs, num_layers_to_finetune):
    """Fine-tune the last layers of ``facebook/bart-base`` and save the result.

    Every parameter is frozen first; then the final ``num_layers_to_finetune``
    layers of both the encoder and the decoder are unfrozen and trained with
    Adam. After each epoch the average validation loss is printed. When
    training finishes the model is written to the module-level ``model_path``.

    Args:
        train_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors) used for training.
        val_loader: Batches in the same format, used for the validation loss.
        num_epochs: Number of passes over ``train_loader``.
        num_layers_to_finetune: How many trailing encoder and decoder layers
            are trained.
    """
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

    # Freeze everything, then re-enable gradients for the trailing layers only.
    for param in model.parameters():
        param.requires_grad = False

    for i in range(1, num_layers_to_finetune + 1):
        for stack in (model.model.encoder, model.model.decoder):
            for param in stack.layers[-i].parameters():
                param.requires_grad = True

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Hand the optimizer only the parameters that will actually receive gradients.
    trainable_params = [param for param in model.parameters() if param.requires_grad]
    optimizer = torch.optim.Adam(trainable_params, lr=7e-5, weight_decay=3e-6)

    pad_token_id = model.config.pad_token_id

    for epoch in tqdm(range(num_epochs)):
        model.train()
        print("Training phase")
        for batch in tqdm(train_loader):
            input_ids, attention_mask, labels = _prepare_batch(batch, device, pad_token_id)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        model.eval()
        print("Validation phase")
        with torch.no_grad():
            total_val_loss = 0.0
            for val_batch in tqdm(val_loader):
                input_ids, attention_mask, labels = _prepare_batch(val_batch, device, pad_token_id)
                val_output = model(input_ids, attention_mask=attention_mask, labels=labels)
                total_val_loss += val_output.loss.item()
            num_val_batches = len(val_loader)
            # An empty validation loader yields NaN instead of a ZeroDivisionError.
            average_val_loss = (total_val_loss / num_val_batches
                                if num_val_batches else float('nan'))

        print(f'Epoch: {epoch+1}/{num_epochs}, Validation Loss: {average_val_loss}')

    os.makedirs(model_path, exist_ok=True)
    # `from_pt` is a loading option, not a save_pretrained argument, so it is dropped.
    model.save_pretrained(model_path)
    # Kept so a plain state-dict file named pytorch_model.bin is still written
    # next to whatever save_pretrained produces.
    torch.save(model.state_dict(), os.path.join(model_path, 'pytorch_model.bin'))

In [11]:
def Generate_translation(val_loader):
    """Print one sample translation per batch from the saved fine-tuned model.

    Loads the model from the module-level ``model_path`` and, for the first
    example of every batch in ``val_loader``, prints the decoded input text,
    the generated output and the decoded reference labels. Decoding uses the
    module-level ``tokenizer``.

    Args:
        val_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors). Despite the name, any
            loader with that batch format can be passed.
    """
    model = BartForConditionalGeneration.from_pretrained(model_path)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()  # make inference mode explicit

    with torch.no_grad():
        for val_batch in val_loader:
            # Only the first example of each batch is printed, so generate for
            # that example alone instead of the whole batch.
            input_ids = val_batch['input_ids'][:1].to(device)
            attention_mask = val_batch['attention_mask'][:1].to(device)
            # Labels are only decoded for display; they never need to be on the device.
            labels = val_batch['labels'][:1]

            generated_ids = model.generate(input_ids, attention_mask=attention_mask,
                                           max_length=60, num_beams=2, repetition_penalty=2.0,
                                           length_penalty=2.0, early_stopping=True
                                          )

            # skip_special_tokens drops the padding and start/end markers that
            # would otherwise flood the printed text.
            generated_title = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
            actual_title = tokenizer.decode(labels[0], skip_special_tokens=True)

            print(f'Input Text: {input_text}\nGenerated Title: {generated_title}\nActual Title: {actual_title}')
            print('\n'+'='*50+'\n')

In [None]:
# Train for 5 epochs, updating only the last 2 encoder and decoder layers.
Fine_Tune(train_loader, val_loader, num_epochs=5, num_layers_to_finetune=2)

  0%|                                                                                                                                                                                                                                                   | 0/5 [00:00<?, ?it/s]

Training phase



  attn_output = torch.nn.functional.scaled_dot_product_attention(

  1%|█▍                                                                                                                                                                                                                                       | 1/160 [00:07<20:48,  7.85s/it][A
  1%|██▉                                                                                                                                                                                                                                      | 2/160 [00:08<09:11,  3.49s/it][A
  2%|████▎                                                                                                                                                                                                                                    | 3/160 [00:16<14:17,  5.46s/it][A
  2%|█████▊                                                                                                   

In [None]:
# Show sample translations for the held-out test split (the parameter is named
# val_loader, but here it receives the test loader).
Generate_translation(val_loader=test_loader)