In [2]:
from datasets import load_dataset
from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import default_data_collator
from transformers import HfArgumentParser
import pandas as pd
import torch

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained checkpoint that supplies the config, tokenizer and initial weights.
model_name = "t5-small"
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config)
model = model.to(device)

# Alternative corpus source (Hugging Face Hub), kept for reference:
# dataset = load_dataset('findnitai/english-to-hinglish')
# master = [line['en'] for line in dataset['train']['translation']]
# master += [line['hi_ng'] for line in dataset['train']['translation']]

# Parallel corpus: headerless tab-separated file whose two columns are
# labelled 'eng' and 'hing' here.
df = pd.read_csv(
    "/kaggle/input/engtohing/train_new.txt",
    sep='\t',
    header=None,
    names=['eng', 'hing'],
)

# All sentences from both columns ('eng' first, then 'hing'); this flat list
# is what gen_training_data() batches for tokenizer training.
master = [*df['eng'].tolist(), *df['hing'].tolist()]

# dataset.head()

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [3]:
# Display the configuration object loaded for the pretrained checkpoint.
print(config)

T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
     

In [4]:
# Number of sentence pairs (rows) in the training corpus.
print(df.shape[0])

248330


In [5]:
from datasets import Dataset
# Build one {'en': ..., 'hi_ng': ...} record per corpus row. Renaming the two
# columns and exporting all records in one call yields the same list of dicts
# as looping over DataFrame.iterrows(), without constructing a Series per row
# (which is slow on a frame of this size).
data = df.rename(columns={'eng': 'en', 'hing': 'hi_ng'}).to_dict('records')

# Create a Dataset object with a single 'translation' column holding the records.
dataset = Dataset.from_dict({'translation': data})

print(dataset)

Dataset({
    features: ['translation'],
    num_rows: 248330
})


In [6]:
# Show the first example to check the nested {'translation': {'en': ..., 'hi_ng': ...}} layout.
print(dataset[0])

{'translation': {'en': 'Hindi Milaap From Hyderbad', 'hi_ng': 'Hindi Milaap From Hyderbad'}}


In [7]:
def gen_training_data():
    """Return a generator over consecutive slices of `master`, 500 items each.

    The final slice is shorter when len(master) is not a multiple of 500.
    The slice start offsets are fixed when this function is called; the
    slices themselves are taken lazily as the generator is consumed.
    """
    chunk = 500
    offsets = range(0, len(master), chunk)
    return (master[start : start + chunk] for start in offsets)

# Train a new tokenizer on the corpus batches and rebind `tokenizer` to it.
# NOTE(review): 32128 is presumably chosen to match the model's embedding
# table size — confirm. Because the name is rebound, re-running this cell
# starts from the already-retrained tokenizer rather than the pretrained one.
tokenizer_training_data = gen_training_data()
tokenizer = tokenizer.train_new_from_iterator(
    tokenizer_training_data,
    vocab_size=32128,
)

def preprocess(source_data):
    """Tokenize a batch of translation pairs into model inputs and labels.

    Args:
        source_data: batch mapping whose "translation" entry is a sequence of
            dicts with 'en' and 'hi_ng' keys.

    Returns:
        The tokenizer output for the prompted English sentences, extended with
        a "labels" entry holding the target token ids in which every pad
        token id has been replaced by -100 (the value the training loss is
        assumed to ignore).
    """
    pairs = source_data["translation"]
    prompts = ["Translate English to Hinglish: " + pair['en'] for pair in pairs]
    references = [pair['hi_ng'] for pair in pairs]

    # Inputs and targets share the same fixed-length tokenization settings.
    tokenize_kwargs = dict(max_length=128, padding="max_length", truncation=True)
    model_inputs = tokenizer(prompts, **tokenize_kwargs)
    labels = tokenizer(references, **tokenize_kwargs)

    pad_id = tokenizer.pad_token_id
    masked_labels = []
    for ids in labels["input_ids"]:
        masked_labels.append([-100 if tok == pad_id else tok for tok in ids])

    model_inputs["labels"] = masked_labels
    return model_inputs

# Tokenize the whole corpus batch by batch and drop the raw 'translation'
# column so only the tokenizer outputs and labels remain.
raw_dataset = dict(train=dataset)
train_dataset = raw_dataset["train"].map(
    preprocess,
    batched=True,
    remove_columns="translation",
)
print(train_dataset)





KeyboardInterrupt: 

In [None]:
# Slice the rows first and then pick the column, so only five examples are
# read instead of materialising the entire 'translation' column before slicing.
print(raw_dataset['train'][:5]['translation'])

In [None]:
# preprocess() pads every example to max_length, so the stock collator is used
# without any dynamic padding.
data_collator = default_data_collator

num_epochs = 3
trainer_args_in = dict(
    output_dir='my-t5-hinglish-translator',
    overwrite_output_dir=True,
    do_train=True,
    per_device_train_batch_size=8,
    num_train_epochs=num_epochs,
    save_strategy='no',
    report_to=[],
)

# parse_dict returns a tuple; its first element is the parsed
# Seq2SeqTrainingArguments instance handed to the trainer below.
parser = HfArgumentParser((Seq2SeqTrainingArguments,))
training_args = parser.parse_dict(trainer_args_in)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args[0],
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train from the current model weights (no checkpoint to resume from), then
# write the final model to the trainer's output directory.
train_result = trainer.train()
trainer.save_model()

# Reload the model that trainer.save_model() just wrote and run a
# one-sentence smoke test.
model = AutoModelForSeq2SeqLM.from_pretrained("my-t5-hinglish-translator").to(device)

# Use exactly the prompt prefix that preprocess() prepends to the training
# inputs ("Translate English to Hinglish: "); a differently-cased prefix is
# tokenized differently from what the model saw during training.
input_text = "Translate English to Hinglish: How is the weather?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

# Without an explicit budget generate() falls back to a short default length
# and cuts longer translations off; allow up to the 128-token target length
# used when the labels were tokenized.
outputs = model.generate(input_ids, max_new_tokens=128)
print("Test Output: " + tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# import torch
# from datasets import load_dataset, Dataset
# from transformers import AutoConfig, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
# from transformers import default_data_collator
# import pandas as pd
# from transformers import HfArgumentParser

# # Check if GPU is available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Model configuration
# model_name = "t5-small"
# config = AutoConfig.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name, config=config).to(device)

# df = pd.read_csv("/kaggle/input/engtohing/train_new.txt", sep='\t', header=None, names=['eng', 'hing'])
# master = df['eng'].tolist()
# master+= df['hing'].tolist()

# data = []
# for index, row in df.iterrows():
#     data.append({
#         'en': row['eng'],
#         'hi_ng': row['hing']
#     })

# # Create a Dataset object
# dataset = Dataset.from_dict({'translation': data})

# # Tokenizer training data
# def gen_training_data():
#     return (master[i : i+500] for i in range(0, len(master), 500))

# tokenizer_training_data = gen_training_data()
# tokenizer = tokenizer.train_new_from_iterator(tokenizer_training_data, 32128)

# # Data preprocessing
# def preprocess(source_data):
#     inputs = [sample['en'] for sample in source_data["translation"]]
#     targets = [sample['hi_ng'] for sample in source_data["translation"]]
#     inputs = ["Translate English to Hinglish: " + inp for inp in inputs]
#     model_inputs = tokenizer(inputs, max_length=128, padding="max_length", truncation=True)
#     labels = tokenizer(targets, max_length=128, padding="max_length", truncation=True)
#     labels["input_ids"] = [[l if l != tokenizer.pad_token_id else -100 for l in label] for label in labels["input_ids"]]
#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

# raw_dataset = {"train": dataset}
# train_dataset = raw_dataset["train"].map(preprocess, batched=True, remove_columns="translation")
# data_collator = default_data_collator

# # # Training arguments
# # num_epochs = 3
# # training_args = Seq2SeqTrainingArguments(
# #     output_dir='my-t5-hinglish-translator',
# #     overwrite_output_dir=True,
# #     do_train=True,
# #     per_device_train_batch_size=8,
# #     num_train_epochs=num_epochs,
# #     save_strategy='no'
# # )

# num_epochs = 2
# trainer_args_in = {
#     'output_dir': 'my-t5-hinglish-translator',
#     'overwrite_output_dir': True,
#     'do_train': True,
#     'per_device_train_batch_size': 8,
#     'num_train_epochs': num_epochs,
#     'save_strategy': 'no'
# }

# parser = HfArgumentParser((Seq2SeqTrainingArguments,))
# training_args = parser.parse_dict(trainer_args_in)
# # trainer = Seq2SeqTrainer(model=model, args=training_args[0], train_dataset=train_dataset, tokenizer=tokenizer, data_collator=data_collator)


# # Load checkpoint if available
# checkpoint_path = '/kaggle/input/t5-3epoch/kaggle/working/my-t5-hinglish-translator'  # Update with your checkpoint folder path
# if checkpoint_path:
#     trainer = Seq2SeqTrainer(
#         model=model,
#         args=training_args[0],
#         train_dataset=train_dataset,
#         tokenizer=tokenizer,
#         data_collator=data_collator
#     )
# else:
#     trainer = Seq2SeqTrainer(
#         model=model,
#         args=training_args[0],
#         train_dataset=train_dataset,
#         tokenizer=tokenizer,
#         data_collator=data_collator
#     )

# # Continue training
# trainer.train(resume_from_checkpoint=checkpoint_path)
# trainer.save_model()

# # Inference
# model = AutoModelForSeq2SeqLM.from_pretrained("my-t5-hinglish-translator").to(device)
# input_text = "translate English to Hinglish: How is the weather?"
# input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
# outputs = model.generate(input_ids)
# print("Test Output: " + tokenizer.decode(outputs[0], skip_special_tokens=True))

In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and model from the same saved checkpoint directory,
# then move the model to the selected device.
checkpoint_dir = "/kaggle/input/t5-3epoch/kaggle/working/my-t5-hinglish-translator"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)
model = model.to(device)

In [19]:
# Generate output for a single hand-written sentence.
# NOTE(review): this prefix is all lower-case while preprocess() trains with
# "Translate English to Hinglish: " — confirm which prefix the loaded
# checkpoint was actually trained with.
input_text = "translate english to hinglish: This fact is based on possibility"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

# Give generate() an explicit length budget; relying on its short default
# truncates longer translations.
outputs = model.generate(input_ids, max_new_tokens=128)
output_string = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Input:", input_text)
print("Output:", output_string)

Input: translate english to hinglish: This fact is based on possibility
Output: यह fact possibility पर based है ।


In [16]:
import locale
# Replacement for locale.getpreferredencoding that ignores the system locale.
def getpreferredencoding(do_setlocale = True):
    """Always report "UTF-8"; `do_setlocale` is accepted but not used."""
    return "UTF-8"
# Monkeypatch the stdlib function for the rest of the kernel session.
# NOTE(review): presumably a workaround so the `!` shell command below runs in
# this environment — confirm it is still needed.
locale.getpreferredencoding = getpreferredencoding
# Recursively zip the saved model directory into an archive named
# my-t5-hinglish-translator in the current working directory.
!zip -r my-t5-hinglish-translator /kaggle/working/my-t5-hinglish-translator
from IPython.display import FileLink
# Last expression of the cell: a FileLink object pointing at the archive.
FileLink(r'my-t5-hinglish-translator.zip')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/my-t5-hinglish-translator/ (stored 0%)
  adding: kaggle/working/my-t5-hinglish-translator/runs/ (stored 0%)
  adding: kaggle/working/my-t5-hinglish-translator/runs/May07_15-25-02_b42c6a7ce932/ (stored 0%)
  adding: kaggle/working/my-t5-hinglish-translator/runs/May07_15-25-02_b42c6a7ce932/events.out.tfevents.1715095503.b42c6a7ce932.34.0 (deflated 69%)


In [21]:
from tqdm import tqdm
def translate_text_pytorch(text, model, tokenizer, max_new_tokens=128,
                           prefix="translate english to hinglish: "):
    """Translate one English sentence with a seq2seq model and return the text.

    Args:
        text: English source sentence (str).
        model: model exposing `.device` and `.generate(input_ids, ...)`.
        tokenizer: tokenizer matching `model`; called with
            `return_tensors="pt"` and used to decode the generated ids.
        max_new_tokens: upper bound on the number of generated tokens. Without
            an explicit bound `generate` falls back to a short default and
            silently truncates longer translations; 128 mirrors the target
            length used when tokenizing the training labels.
        prefix: task prompt prepended to `text`.
            NOTE(review): training in this notebook uses
            "Translate English to Hinglish: " — confirm which prefix the
            loaded checkpoint expects and pass it here if it differs.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # Follow the model's own device rather than a notebook-level global, so
    # the function keeps working if the defining cell is run out of order.
    target_device = model.device
    input_ids = tokenizer(prefix + text, return_tensors="pt").input_ids.to(target_device)
    outputs = model.generate(input_ids, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Held-out pairs: tab-separated file read with the two columns labelled
# 'en' and 'hing'.
test_columns = ['en', 'hing']
df_test = pd.read_csv(
    '/kaggle/input/engtohing/test_new.txt',
    sep='\t',
    names=test_columns,
    usecols=test_columns,
)
en_test, hing_test = df_test['en'], df_test['hing']

# Plain Python list of the source sentences to translate.
texts = list(en_test)

In [20]:
# Translate the first test sentence inline as a quick sanity check.
input_text = "translate english to hinglish: "+texts[0]
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

# Give generate() an explicit length budget; relying on its short default
# truncates longer translations.
outputs = model.generate(input_ids, max_new_tokens=128)
output_string = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(input_text ,output_string)

translate english to hinglish: This fact is based on possibility यह fact possibility पर based है ।


In [22]:

# Translate every test sentence in order, with a progress bar.
translated = [
    translate_text_pytorch(sentence, model, tokenizer)
    for sentence in tqdm(texts)
]

100%|██████████| 2000/2000 [03:13<00:00, 10.36it/s]


In [23]:
# Write the translations to translated.txt, one per line, UTF-8 encoded.
with open('translated.txt', 'w', encoding='utf-8') as file:
    file.writelines(line + '\n' for line in translated)