In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import KFold

# Load the training and testing dataset from CSV
# `df` is the frame that the K-fold loop splits for training; the dataset class
# reads its 'input' and 'receiver' columns.
# NOTE(review): both paths are absolute under /content (Colab's working area) —
# adjust them when running the notebook elsewhere.
df = pd.read_csv('/content/10_13.csv')
# `df_test` backs the test dataset that is built after training.
df_test = pd.read_csv('/content/test_data.csv')

In [None]:
# Define the dataset class
class GreetingDataset(Dataset):
    """Map rows of a DataFrame to tokenized sequence-to-sequence training examples.

    For each row, the ``'input'`` column is tokenized as the source sequence and
    the ``'receiver'`` column as the target sequence. Both are padded/truncated
    to a fixed length so that items can be stacked into batches.

    Args:
        data: DataFrame with ``'input'`` and ``'receiver'`` columns.
        tokenizer: Callable tokenizer that returns ``'input_ids'`` and
            ``'attention_mask'`` tensors and exposes ``pad_token_id``.
        source_max_token_len: Fixed token length of the source sequence.
        target_max_token_len: Fixed token length of the target sequence.
    """

    # Label value written over padding positions so the loss skips them.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, source_max_token_len=512, target_max_token_len=128):
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        """Return the number of rows (examples) in the underlying frame."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` tokens with the injected tokenizer."""
        # Use the tokenizer handed to the constructor, not a module-level global,
        # so the dataset works with whatever tokenizer the caller supplies.
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` as 1-D tensors.

        Padding positions in ``labels`` are replaced by ``IGNORE_INDEX``.
        """
        data_row = self.data.iloc[index]

        source_encoding = self._encode(data_row['input'], self.source_max_token_len)
        target_encoding = self._encode(data_row['receiver'], self.target_max_token_len)

        target_ids = target_encoding['input_ids']
        # Ask the tokenizer for its pad id instead of assuming it is 0, and build
        # a new tensor rather than mutating the encoding in place.
        labels = target_ids.masked_fill(
            target_ids == self.tokenizer.pad_token_id, self.IGNORE_INDEX
        )

        return dict(
            input_ids=source_encoding['input_ids'].flatten(),
            attention_mask=source_encoding['attention_mask'].flatten(),
            labels=labels.flatten()
        )

In [None]:
# Initialize tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# KFold Cross Validation setup
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=200)
epochs = 10

# The device does not depend on the fold, so resolve it once up front.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Final-epoch validation loss of every fold, used for the summary at the end.
fold_val_losses = []

# Loop for each fold
for fold, (train_ids, val_ids) in enumerate(kfold.split(df)):
    print(f'FOLD {fold}')
    print('--------------------------------')

    # Split data into this fold's training rows and its held-out validation rows.
    # Without the validation part the K-fold split would only train on 80% of
    # the data five times and never measure anything.
    df_train = df.iloc[train_ids]
    df_val = df.iloc[val_ids]

    # Create datasets for training and validation
    train_dataset = GreetingDataset(df_train, tokenizer)
    val_dataset = GreetingDataset(df_val, tokenizer)

    # DataLoaders: shuffle only the training data
    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=8)

    # Initialize a fresh model for each fold so folds do not leak into each other
    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    # Use PyTorch's AdamW rather than the deprecated transformers implementation;
    # weight_decay=0.0 keeps the "no weight decay" setting the loop had before.
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.0)

    # Linear decay over every optimizer step of this fold, no warmup
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_dataloader) * epochs
    )
    model.to(device)

    val_loss = float('nan')

    # Training loop
    for epoch in range(epochs):
        model.train()
        for batch in train_dataloader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            scheduler.step()

            print(f'Epoch: {epoch}, Loss: {loss.item()}')

        # Evaluate on the held-out fold after every epoch (no gradients, eval mode).
        model.eval()
        val_loss_sum = 0.0
        with torch.no_grad():
            for batch in val_dataloader:
                outputs = model(
                    input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device),
                    labels=batch['labels'].to(device)
                )
                val_loss_sum += outputs.loss.item()
        # Mean of per-batch losses; a smaller last batch is weighted like a full one.
        val_loss = val_loss_sum / max(len(val_dataloader), 1)
        print(f'Epoch: {epoch}, Validation Loss: {val_loss}')

    fold_val_losses.append(val_loss)

# Cross-validation summary across all folds
print(f'Validation loss per fold: {fold_val_losses}')
print(f'Mean validation loss: {sum(fold_val_losses) / max(len(fold_val_losses), 1)}')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


FOLD 0
--------------------------------


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



Epoch: 0, Loss: 3.56543231010437
Epoch: 0, Loss: 4.65068244934082
Epoch: 0, Loss: 3.352097988128662
Epoch: 0, Loss: 1.7584164142608643
Epoch: 0, Loss: 2.076038360595703
Epoch: 0, Loss: 1.7067160606384277
Epoch: 0, Loss: 1.4559040069580078
Epoch: 0, Loss: 1.7029590606689453
Epoch: 0, Loss: 1.2621036767959595
Epoch: 0, Loss: 1.4953712224960327
Epoch: 0, Loss: 2.1406726837158203
Epoch: 0, Loss: 1.4473797082901
Epoch: 0, Loss: 1.977662444114685
Epoch: 0, Loss: 0.478817880153656
Epoch: 0, Loss: 1.1092517375946045
Epoch: 0, Loss: 0.8908511996269226
Epoch: 1, Loss: 0.8216361403465271
Epoch: 1, Loss: 1.0410709381103516
Epoch: 1, Loss: 0.5712700486183167
Epoch: 1, Loss: 0.6259547472000122
Epoch: 1, Loss: 0.6932999491691589
Epoch: 1, Loss: 0.7687098979949951
Epoch: 1, Loss: 0.779318630695343
Epoch: 1, Loss: 0.6173781752586365
Epoch: 1, Loss: 0.8844025135040283
Epoch: 1, Loss: 0.41836974024772644
Epoch: 1, Loss: 0.5081225633621216
Epoch: 1, Loss: 0.6838318705558777
Epoch: 1, Loss: 0.5815823674201



Epoch: 0, Loss: 4.438751697540283
Epoch: 0, Loss: 6.570211410522461
Epoch: 0, Loss: 2.2465429306030273
Epoch: 0, Loss: 2.952369451522827
Epoch: 0, Loss: 2.880758047103882
Epoch: 0, Loss: 1.7975518703460693
Epoch: 0, Loss: 2.2151756286621094
Epoch: 0, Loss: 1.2347761392593384
Epoch: 0, Loss: 1.4845783710479736
Epoch: 0, Loss: 1.8011620044708252
Epoch: 0, Loss: 1.915417194366455
Epoch: 0, Loss: 1.4737944602966309
Epoch: 0, Loss: 1.0984866619110107
Epoch: 0, Loss: 2.458838939666748
Epoch: 0, Loss: 1.1978048086166382
Epoch: 0, Loss: 0.5479729175567627
Epoch: 1, Loss: 0.7807000279426575
Epoch: 1, Loss: 1.2006402015686035
Epoch: 1, Loss: 0.8238429427146912
Epoch: 1, Loss: 1.6387063264846802
Epoch: 1, Loss: 0.827042818069458
Epoch: 1, Loss: 1.0848315954208374
Epoch: 1, Loss: 0.6879200339317322
Epoch: 1, Loss: 0.8193394541740417
Epoch: 1, Loss: 0.7248915433883667
Epoch: 1, Loss: 0.7292153835296631
Epoch: 1, Loss: 0.981360673904419
Epoch: 1, Loss: 0.894077479839325
Epoch: 1, Loss: 0.77160936594

In [None]:
# Persist the model currently bound to `model` and load it back from disk.
# NOTE(review): the training loop keeps no checkpoint and selects no fold, so this
# is whichever model the most recently executed fold left behind, not a "best" one.
# Saving the model
model.save_pretrained('./model_2')  # This will create a folder 'model_2' in the current directory
model = T5ForConditionalGeneration.from_pretrained('./model_2')
model.eval()
model.to(device)

# Create a DataLoader for the test dataset
# NOTE(review): GreetingDataset reads a 'receiver' column when an item is fetched,
# so iterating this loader requires df_test to contain one — confirm. A later cell
# rebinds `test_dataset` / `test_dataloader` using TestGreetingDataset.
test_dataset = GreetingDataset(df_test, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=8)

In [None]:
# Define the dataset class for test data
class TestGreetingDataset(Dataset):
    """Map rows of a DataFrame to tokenized inputs for inference (no labels).

    Each item tokenizes the row's ``'input'`` column, padded/truncated to a fixed
    length so that items can be stacked into batches.

    Args:
        data: DataFrame with an ``'input'`` column.
        tokenizer: Callable tokenizer that returns ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_token_len: Fixed token length of the encoded input.
    """

    def __init__(self, data, tokenizer, max_token_len=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len

    def __len__(self):
        """Return the number of rows (examples) in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``input_ids`` and ``attention_mask`` for one row as 1-D tensors."""
        data_row = self.data.iloc[index]

        # Use the tokenizer handed to the constructor, not a module-level global,
        # so the dataset honours whatever tokenizer the caller supplies.
        encoding = self.tokenizer(
            data_row['input'],
            max_length=self.max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        return dict(
            input_ids=encoding['input_ids'].flatten(),
            attention_mask=encoding['attention_mask'].flatten()
        )
def generate_predictions(dataloader, model, tokenizer, device, max_length=512):
    """Generate text for every batch of ``dataloader`` and return it in order.

    Args:
        dataloader: Iterable of batches, each a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        model: Model exposing ``eval()`` and ``generate(...)``.
        tokenizer: Tokenizer whose ``decode`` turns one generated id sequence
            into text.
        device: Device the batch tensors are moved to before generation.
        max_length: Upper bound passed to ``model.generate``.

    Returns:
        A flat list of decoded strings, one per input example, in iteration order.
    """
    model.eval()
    decoded_texts = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for features in dataloader:
            ids_on_device = features['input_ids'].to(device)
            mask_on_device = features['attention_mask'].to(device)

            generated = model.generate(
                input_ids=ids_on_device,
                attention_mask=mask_on_device,
                max_length=max_length,
            )

            # One decoded string per generated sequence, special tokens stripped.
            for sequence in generated:
                decoded_texts.append(tokenizer.decode(sequence, skip_special_tokens=True))

    return decoded_texts

# Reuse the test frame already loaded into `df_test` instead of re-reading the CSV
# through a second, differently spelled path. Work on a copy so that adding the
# prediction column leaves `df_test` itself untouched.
test_df = df_test.copy()
test_dataset = TestGreetingDataset(test_df, tokenizer)
# The loader is not shuffled, so predictions come back in row order and can be
# assigned straight onto the frame below.
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Generate predictions for test data
test_predictions = generate_predictions(test_dataloader, model, tokenizer, device, max_length=512)

# Write the test rows plus a 'predicted_receiver' column to a new CSV
# (the original test file is not modified).
test_df['predicted_receiver'] = test_predictions
test_df.to_csv('test_data_with_predictions.csv', index=False)
