In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import KFold

# Read the training table and the held-out test table from their CSV files.
# Both paths are absolute and point into /content.
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/10_13.csv', '/content/test_data.csv')
)

In [None]:
# Define the dataset class
class GreetingDataset(Dataset):
    """Dataset that turns DataFrame rows into tokenized seq2seq training examples.

    For every row, the 'input' column is tokenized as the encoder input and the
    'sender' column is tokenized as the target sequence.

    Args:
        data: DataFrame-like object supporting ``len()`` and ``.iloc[index]``,
            whose rows expose the 'input' and 'sender' keys.
        tokenizer: Callable tokenizer; it is invoked with the text plus
            ``max_length``/``padding``/``truncation`` keyword arguments and must
            return a mapping with 'input_ids' and 'attention_mask' tensors.
        source_max_token_len: Length the source text is padded/truncated to.
        target_max_token_len: Length the target text is padded/truncated to.
    """

    def __init__(self, data, tokenizer, source_max_token_len=512, target_max_token_len=128):
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        """Return the number of rows (examples) in the underlying data."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return a dict of flattened tensors.

        Returns:
            dict with 'input_ids' and 'attention_mask' for the source text and
            'labels' for the target text, where padding positions in 'labels'
            are replaced by -100.
        """
        data_row = self.data.iloc[index]

        # Use the tokenizer given to the constructor, not a notebook global, so
        # the dataset works regardless of which cells have been executed.
        source_encoding = self.tokenizer(
            data_row['input'],
            max_length=self.source_max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        target_encoding = self.tokenizer(
            data_row['sender'],
            max_length=self.target_max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        # Ask the tokenizer for its pad id instead of assuming it; fall back to
        # 0 (the value previously hard-coded here) when it does not define one.
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is None:
            pad_token_id = 0

        # -100 marks padded target positions so they are excluded from the loss.
        labels = target_encoding['input_ids']
        labels[labels == pad_token_id] = -100
        return dict(
            input_ids=source_encoding['input_ids'].flatten(),
            attention_mask=source_encoding['attention_mask'].flatten(),
            labels=labels.flatten()
        )

In [None]:
# Initialize tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# KFold Cross Validation setup
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=200)
epochs = 10

# The device does not depend on the fold, so select it once up front.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Loop for each fold. The second element of each split holds the held-out row
# positions; they are used below to measure a validation loss, which is the
# point of running cross-validation in the first place.
for fold, (train_ids, val_ids) in enumerate(kfold.split(df)):
    print(f'FOLD {fold}')
    print('--------------------------------')

    # Split data into the training part and the held-out part of this fold
    df_train = df.iloc[train_ids]
    df_val = df.iloc[val_ids]

    # Create datasets
    train_dataset = GreetingDataset(df_train, tokenizer)
    val_dataset = GreetingDataset(df_val, tokenizer)

    # DataLoaders: shuffle only the training data
    train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=8)

    # Initialize a fresh model for each fold and move it to the device before
    # the optimizer is built on its parameters.
    # NOTE: `model` is rebound on every fold, so after the loop it refers to
    # the model trained on the last fold only.
    model = T5ForConditionalGeneration.from_pretrained('t5-base')
    model.to(device)
    # NOTE(review): AdamW here is the one imported from transformers; consider
    # torch.optim.AdamW if the installed transformers version no longer ships it.
    optimizer = AdamW(model.parameters(), lr=1e-4)

    # Linear decay over the total number of optimizer steps, without warmup
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_dataloader) * epochs
    )

    # Training loop
    for epoch in range(epochs):
        model.train()
        for batch in train_dataloader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            scheduler.step()

            print(f'Epoch: {epoch}, Loss: {loss.item()}')

        # Evaluate on the held-out fold: no gradients, dropout disabled.
        model.eval()
        val_loss_total = 0.0
        with torch.no_grad():
            for batch in val_dataloader:
                outputs = model(
                    input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device),
                    labels=batch['labels'].to(device)
                )
                val_loss_total += outputs.loss.item()
        # Mean of the per-batch losses; guard against an empty validation loader.
        val_loss = val_loss_total / max(len(val_dataloader), 1)
        print(f'Epoch: {epoch}, Validation Loss: {val_loss}')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


FOLD 0
--------------------------------


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



Epoch: 0, Loss: 2.563680410385132
Epoch: 0, Loss: 6.042134761810303
Epoch: 0, Loss: 6.385861873626709
Epoch: 0, Loss: 2.498853921890259
Epoch: 0, Loss: 2.4448015689849854
Epoch: 0, Loss: 3.4361572265625
Epoch: 0, Loss: 2.9386754035949707
Epoch: 0, Loss: 2.5575716495513916
Epoch: 0, Loss: 2.8600621223449707
Epoch: 0, Loss: 2.6525213718414307
Epoch: 0, Loss: 1.7925846576690674
Epoch: 0, Loss: 4.005783557891846
Epoch: 0, Loss: 2.275695323944092
Epoch: 0, Loss: 2.2652721405029297
Epoch: 0, Loss: 1.657443642616272
Epoch: 0, Loss: 1.5459929704666138
Epoch: 1, Loss: 1.6536266803741455
Epoch: 1, Loss: 1.850138545036316
Epoch: 1, Loss: 1.9947962760925293
Epoch: 1, Loss: 1.2297544479370117
Epoch: 1, Loss: 1.047934889793396
Epoch: 1, Loss: 0.9879496693611145
Epoch: 1, Loss: 2.027306318283081
Epoch: 1, Loss: 1.4720319509506226
Epoch: 1, Loss: 0.8650949597358704
Epoch: 1, Loss: 0.6868875622749329
Epoch: 1, Loss: 1.2688758373260498
Epoch: 1, Loss: 1.2132747173309326
Epoch: 1, Loss: 1.835708022117614



Epoch: 0, Loss: 7.373545169830322
Epoch: 0, Loss: 2.605210542678833
Epoch: 0, Loss: 6.012519836425781
Epoch: 0, Loss: 1.8318023681640625
Epoch: 0, Loss: 4.741080284118652
Epoch: 0, Loss: 3.6350529193878174
Epoch: 0, Loss: 4.09054708480835
Epoch: 0, Loss: 5.071191310882568
Epoch: 0, Loss: 3.8250207901000977
Epoch: 0, Loss: 1.5468313694000244
Epoch: 0, Loss: 1.3734002113342285
Epoch: 0, Loss: 2.495683431625366
Epoch: 0, Loss: 2.5366897583007812
Epoch: 0, Loss: 1.5996365547180176
Epoch: 0, Loss: 1.449724793434143
Epoch: 0, Loss: 1.1363747119903564
Epoch: 1, Loss: 2.16517972946167
Epoch: 1, Loss: 0.8638656139373779
Epoch: 1, Loss: 1.3215471506118774
Epoch: 1, Loss: 1.482641339302063
Epoch: 1, Loss: 1.338900089263916
Epoch: 1, Loss: 2.08589768409729
Epoch: 1, Loss: 1.6292191743850708
Epoch: 1, Loss: 1.1938562393188477
Epoch: 1, Loss: 1.1898525953292847
Epoch: 1, Loss: 1.1769102811813354
Epoch: 1, Loss: 1.0044230222702026
Epoch: 1, Loss: 0.44829630851745605
Epoch: 1, Loss: 0.6412529349327087

In [None]:
# Write the current `model` to disk and read that checkpoint straight back, so
# inference uses exactly the weights that were saved. No model selection
# happens here: whatever object `model` is bound to is the one that is saved.
checkpoint_dir = './model_1'
model.save_pretrained(checkpoint_dir)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_dir).to(device)
model.eval()

# Wrap the test frame in a dataset and batch it for inference
test_dataset = GreetingDataset(df_test, tokenizer)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=8)

In [None]:
# Define the dataset class for test data
class TestGreetingDataset(Dataset):
    """Dataset that tokenizes only the 'input' column, for inference.

    Unlike the training dataset, no target text is read, so rows do not need a
    'sender' column.

    Args:
        data: DataFrame-like object supporting ``len()`` and ``.iloc[index]``,
            whose rows expose the 'input' key.
        tokenizer: Callable tokenizer; it is invoked with the text plus
            ``max_length``/``padding``/``truncation`` keyword arguments and must
            return a mapping with 'input_ids' and 'attention_mask' tensors.
        max_token_len: Length the input text is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_token_len=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len

    def __len__(self):
        """Return the number of rows (examples) in the underlying data."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return flattened 'input_ids' and 'attention_mask'."""
        data_row = self.data.iloc[index]

        # Use the tokenizer given to the constructor, not a notebook global, so
        # the dataset works regardless of which cells have been executed.
        encoding = self.tokenizer(
            data_row['input'],
            max_length=self.max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        return dict(
            input_ids=encoding['input_ids'].flatten(),
            attention_mask=encoding['attention_mask'].flatten()
        )


def generate_predictions(dataloader, model, tokenizer, device, max_length=512):
    """Generate text for every batch of ``dataloader`` and return it as strings.

    Defined here because the call below otherwise fails with NameError on a
    fresh kernel: no other visible cell provides this function.

    Args:
        dataloader: Iterable of batches, each a mapping with 'input_ids' and
            'attention_mask' tensors.
        model: Model exposing ``eval()`` and ``generate(...)``.
        tokenizer: Tokenizer exposing ``batch_decode(...)``.
        device: Device the batch tensors are moved to before generation.
        max_length: Maximum length passed to ``model.generate``.

    Returns:
        list[str]: One decoded prediction per example, in dataloader order.
    """
    model.eval()
    predictions = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch in dataloader:
            generated_ids = model.generate(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                max_length=max_length
            )
            # Drop pad/eos markers so only the predicted text remains.
            predictions.extend(
                tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            )
    return predictions


# Load test data
test_df = pd.read_csv('test_data.csv')
test_dataset = TestGreetingDataset(test_df, tokenizer)
# No shuffling, so predictions stay aligned with the rows of test_df
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Generate predictions for test data
test_predictions = generate_predictions(test_dataloader, model, tokenizer, device, max_length=512)

# Save predictions next to the original columns in a new CSV file
test_df['predicted_sender'] = test_predictions
test_df.to_csv('test_data_with_predictions.csv', index=False)
