In [1]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install progress

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, T5ForConditionalGeneration, T5Config, AdamW
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd
#!pip install SentencePiece 

In [4]:
# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

https://github.com/shahparth123/eng_guj_parallel_corpus

In [5]:
# Load the dataset for Fr to English translation
# Read the text file and split lines
with open('Data/gujarati.txt', "r") as text_file:
    guj_lines = text_file.read().splitlines()

# Create DataFrame from lines
guj_data= pd.DataFrame(guj_lines, columns=["text"])


with open('Data/english.txt', "r") as text_file:
    eng_lines = text_file.read().splitlines()

# Create DataFrame from lines
eng_data = pd.DataFrame(eng_lines, columns=["text"])

In [6]:
eng_data.head()

Unnamed: 0,text
0,A bicycle replica with a clock as the front wh...
1,A black Honda motorcycle parked in front of a ...
2,A room with blue walls and a white sink and door.
3,A car that seems to be parked illegally behind...
4,A large passenger airplane flying through the ...


In [7]:
guj_data.head()

Unnamed: 0,text
0,ફ્રન્ટ વ્હીલ તરીકે ઘડિયાળ સાથે સાયકલ પ્રતિકૃતિ.
1,ગેરેજની સામે પાર્ક કરેલી બ્લેક હોન્ડા મોટરસાયકલ
2,વાદળી દિવાલો અને સફેદ સિંક અને બારણું ધરાવતી ખંડ
3,એક કાર કે જે કાનૂની રીતે પાર્ક કરેલી કારની પાછ...
4,હવામાં ઉડતી મોટી પેસેન્જર વિમાન.


## Merge Columns in one DF

In [8]:
 # Merge the DataFrames based on their columns
data = pd.merge(eng_data, guj_data, left_index=True, right_index=True)
data.columns = ['en','gu']
data.head()

Unnamed: 0,en,gu
0,A bicycle replica with a clock as the front wh...,ફ્રન્ટ વ્હીલ તરીકે ઘડિયાળ સાથે સાયકલ પ્રતિકૃતિ.
1,A black Honda motorcycle parked in front of a ...,ગેરેજની સામે પાર્ક કરેલી બ્લેક હોન્ડા મોટરસાયકલ
2,A room with blue walls and a white sink and door.,વાદળી દિવાલો અને સફેદ સિંક અને બારણું ધરાવતી ખંડ
3,A car that seems to be parked illegally behind...,એક કાર કે જે કાનૂની રીતે પાર્ક કરેલી કારની પાછ...
4,A large passenger airplane flying through the ...,હવામાં ઉડતી મોટી પેસેન્જર વિમાન.


In [9]:
# Split the data into train and test sets
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

# Split the train set further into train and validation sets
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [10]:
# Assuming you have a DataFrame named 'df'
def fun_to_dic(df):
  var_dic = {}
  lst = []
  for index, row in df.iterrows():
      # 'row' is a Series object representing a single row
      new_dict={'en':row['en'],'gu':row['gu']}
      lst.append(new_dict)
  var_dic['translation'] = lst
  return var_dic
    

In [11]:
# Create a Dataset object for each split
train_dataset = Dataset.from_dict(fun_to_dic(train_df))
val_dataset = Dataset.from_dict(fun_to_dic(val_df))
test_dataset = Dataset.from_dict(fun_to_dic(test_df))

In [12]:
# Create a DatasetDict object with the splits
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 41600
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 10400
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 13000
    })
})

In [14]:
# Extract the appropriate keys for source and target translations
source_key = "translation" if "translation" in dataset["train"][0].keys() else "translations"
target_key = "translation" if "translation" in dataset["train"][0].keys() else "translations"
target_key

'translation'

In [15]:
dataset["train"]['translation']

[{'en': 'Small jet sits on tarmac of airport on sunny day.',
  'gu': 'નાના જેટ સન્ની દિવસે એરપોર્ટની ડામર પર આવેલું છે.'},
 {'en': 'A cow standing next to a white wall and bright blue door.',
  'gu': 'એક સફેદ દિવાલ અને તેજસ્વી વાદળી બારણું આગળ એક ગાય ઊભા.'},
 {'en': 'A motorcycle and car parked on the street.',
  'gu': 'એક મોટરસાઇકલ અને કાર શેરીમાં પાર્ક છે.'},
 {'en': 'A man and woman are riding a motorcycle and the woman is carrying a DSW bag.',
  'gu': 'એક માણસ અને સ્ત્રી મોટરસાઇકલ પર સવારી કરે છે અને મહિલા એક ડીએસડબલ્યુ બેગ ધરાવે છે.'},
 {'en': 'A kitchen filled with wooden cabinets and a table.',
  'gu': 'લાકડાના કેબિનેટ્સ અને કોષ્ટકથી ભરપૂર રસોડું.'},
 {'en': 'Two motorcyclists near a purple motorcycle towing a trailer.',
  'gu': 'એક ટ્રેલર ટાઈપ જાંબલી મોટરસાઇકલ નજીક બે મોટરસાયક્લીસ્ટોના.'},
 {'en': ' A road with pavement on which street sign board is attached on a pole.A cycle is standing by the pole. ',
  'gu': '\xa0પેવમેન્ટ સાથેનો રસ્તો, જેના પર ધ્રુવ પર શેરી સાઇન બોર્ડ જોડાય 

In [16]:
# Access the training split of the dataset
train_dataset = dataset["train"]

# Select a subset of training samples
num_train_samples = 3000
train_subset = train_dataset.select(range(num_train_samples))
print(train_subset)

Dataset({
    features: ['translation'],
    num_rows: 3000
})


In [17]:
# Load the T5 tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

In [18]:
# Custom dataset class for translation
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data_items = data

    def __len__(self):
        return len(self.data_items)

    def __getitem__(self, index):
        example = self.data_items[index]
        source_text = example['translation']["en"]
        target_text = example['translation']["gu"]
        return source_text, target_text

    def collate_fn(self, batch):
        source_texts, target_texts = zip(*batch)
        source_inputs = tokenizer.batch_encode_plus(source_texts, padding="longest", truncation=True, max_length=512, return_tensors="pt")
        target_inputs = tokenizer.batch_encode_plus(target_texts, padding="longest", truncation=True, max_length=512, return_tensors="pt")
        input_ids = source_inputs["input_ids"].squeeze()
        attention_mask = source_inputs["attention_mask"].squeeze()
        labels = target_inputs["input_ids"].squeeze()
        decoder_attention_mask = target_inputs["attention_mask"].squeeze()
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
            "decoder_attention_mask": decoder_attention_mask,
        }

In [19]:
# Create the translation datasets and dataloaders
train_dataset = TranslationDataset(train_subset)
validation_dataset = TranslationDataset(dataset["validation"])
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=train_dataset.collate_fn)
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=8, collate_fn=validation_dataset.collate_fn)


In [20]:
# Set the training parameters
num_epochs = 4
learning_rate = 1e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)



In [21]:
# Prepare the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=learning_rate, total_steps=total_steps)

In [22]:
print(len(train_dataloader))

375


In [23]:

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        decoder_attention_mask = batch["decoder_attention_mask"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        progress_bar.set_postfix({"loss": loss.item()})

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss}")

    # Validation loop
    model.eval()
    with torch.no_grad():
        total_val_loss = 0.0
        for val_batch in validation_dataloader:
            input_ids = val_batch["input_ids"].to(device)
            attention_mask = val_batch["attention_mask"].to(device)
            labels = val_batch["labels"].to(device)
            decoder_attention_mask = val_batch["decoder_attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
                decoder_attention_mask=decoder_attention_mask
            )
            val_loss = outputs.loss
            total_val_loss += val_loss.item()

        average_val_loss = total_val_loss / len(validation_dataloader)
        print(f"Validation Loss: {average_val_loss}")

Epoch 1/4: 100%|██████████| 375/375 [14:14<00:00,  2.28s/batch, loss=2.84]


Epoch 1/4 - Average Loss: 3.867903246243795
Validation Loss: 3.030196715134841


Epoch 2/4: 100%|██████████| 375/375 [13:52<00:00,  2.22s/batch, loss=3.32]


Epoch 2/4 - Average Loss: 2.95347074731191
Validation Loss: 1.7310846930283768


Epoch 3/4: 100%|██████████| 375/375 [13:38<00:00,  2.18s/batch, loss=2.44]


Epoch 3/4 - Average Loss: 2.1284131943384805
Validation Loss: 0.9175038376679787


Epoch 4/4: 100%|██████████| 375/375 [13:40<00:00,  2.19s/batch, loss=1.43]


Epoch 4/4 - Average Loss: 1.6141133081118266
Validation Loss: 0.6037931420367497


In [24]:
test_dataset = TranslationDataset(dataset["validation"])
test_dataloader = DataLoader(test_dataset, batch_size=16, collate_fn=test_dataset.collate_fn)


In [25]:
test_dataset

<__main__.TranslationDataset at 0x7fd5ac6384f0>

In [26]:
model.save_pretrained("fine_tuned_t5")

In [None]:
model.eval()
total_correct = 0
total_samples = 0

  
for example in test_dataset:
    source_text = example[0]
    target_text = example[1]
    
    # Tokenize the input text
    input_ids = tokenizer.encode(source_text, return_tensors="pt").to(device)
    
    # Generate the predicted output
    with torch.no_grad():
        output_ids = model.generate(input_ids)
    
    # Decode the predicted output
    predicted_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    # Compute accuracy
    if predicted_text == target_text:
        total_correct += 1
    total_samples += 1

accuracy = total_correct / total_samples
print(f"Accuracy: {accuracy * 100:.2f}%")

