In [None]:
!pip install transformers



In [None]:
import torch
from transformers import BertModel, BertTokenizer, BertConfig, BertForMaskedLM, BertTokenizerFast
from transformers import EncoderDecoderModel, EncoderDecoderConfig
from tqdm import tqdm
import os
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import zipfile
import io
import gc


FILEPATH = 'filtered_paranmt.zip'


def extract_tsv(file_path):
    """Parse a tab-separated file into a pandas DataFrame.

    Args:
        file_path: Location of the ``.tsv`` file (anything ``pd.read_csv``
            accepts as its first argument).

    Returns:
        The parsed table.
    """
    return pd.read_csv(file_path, sep='\t')


class ParaNMTDataset(Dataset):
    """Torch dataset yielding tokenized (source, target) pairs with toxicity scores.

    Rows are read from the ``reference``, ``translation``, ``ref_tox`` and
    ``trn_tox`` columns of the underlying DataFrame.
    """

    def __init__(self, dataframe=None, filepath=None, max_length=512):
        """Load the data and the BERT tokenizer.

        Args:
            dataframe: Already-loaded DataFrame. Ignored when ``filepath`` is given.
            filepath: Path to a ``.tsv`` file; takes precedence over ``dataframe``.
            max_length: Maximum number of token ids kept per sentence
                (special tokens included).

        Raises:
            ValueError: If neither ``dataframe`` nor ``filepath`` is provided.
        """
        if filepath is not None:
            self.data = extract_tsv(filepath)
        elif dataframe is not None:
            self.data = dataframe
        else:
            # Fail at construction instead of leaving self.data = None,
            # which would only surface later as a TypeError in __len__.
            raise ValueError('No file or dataframe were provided to Dataset constructor')
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        # len is obligatory method for the torch-formatted dataset
        return len(self.data)

    def _encode(self, text):
        """Convert one sentence into a 1-D ``long`` tensor of token ids."""
        # Same encode settings as the model's translate(): special tokens are
        # added and over-long sentences are truncated.
        ids = self.tokenizer.encode(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
        )
        # Explicit dtype: torch.tensor([]) would otherwise default to float.
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, index):
        """Return one training example.

        Returns:
            Tuple ``(input_ids, attention_mask, decoder_input_ids,
            decoder_attention_mask, ref_tox, trn_tox)``. Both masks are
            all-ones here; padding positions are added at collate time.
        """
        row = self.data.iloc[index]  # single positional lookup per item

        input_ids = self._encode(row['reference'])
        decoder_input_ids = self._encode(row['translation'])

        attention_mask = torch.ones_like(input_ids)
        decoder_attention_mask = torch.ones_like(decoder_input_ids)

        return (input_ids, attention_mask, decoder_input_ids, decoder_attention_mask,
                row['ref_tox'], row['trn_tox'])


def collate_fn(batch):
    """Merge a list of dataset items into padded batch tensors.

    Args:
        batch: Sequence of 6-tuples ``(input_ids, attention_mask,
            decoder_input_ids, decoder_attention_mask, ref_tox, trn_tox)``.

    Returns:
        A 6-tuple in the same field order. The four sequence fields are
        zero-padded to the longest sequence in the batch (batch-first);
        the two toxicity fields become 1-D tensors.
    """
    src_ids, src_mask, tgt_ids, tgt_mask, ref_scores, trn_scores = zip(*batch)

    def _pad(sequences):
        # pad_sequence fills with 0 by default
        return pad_sequence(list(sequences), batch_first=True)

    return (
        _pad(src_ids),
        _pad(src_mask),
        _pad(tgt_ids),
        _pad(tgt_mask),
        torch.tensor(list(ref_scores)),
        torch.tensor(list(trn_scores)),
    )


def get_dataloader(dataframe=None, train_test_split=True, test_size=0.2, filepath=None,
                   batch_size=32):
    """Build DataLoader(s) over a ``ParaNMTDataset``.

    Args:
        dataframe: DataFrame to wrap; takes precedence over ``filepath``.
        train_test_split: When true, return a ``(train, test)`` pair of
            loaders; otherwise a single shuffled loader over everything.
        test_size: Fraction of rows placed in the test split.
        filepath: Path to a ``.tsv`` file, used when ``dataframe`` is None.
        batch_size: Number of examples per batch for every returned loader.

    Returns:
        ``(train_dataloader, test_dataloader)``, a single ``DataLoader``,
        or ``None`` when no data source was given.
    """
    # Pass by keyword: the dataset's first positional parameter is
    # ``dataframe``, so a positional filepath would be treated as a DataFrame.
    if dataframe is not None:
        dataset_ = ParaNMTDataset(dataframe=dataframe)
    elif filepath is not None:
        dataset_ = ParaNMTDataset(filepath=filepath)
    else:
        print('No data was provided to get_dataloader')
        return None

    if not train_test_split:
        # create a DataLoader for the entire dataset
        return DataLoader(dataset_, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    # split dataset into train and test sets
    dataset_size = len(dataset_)
    train_size = int((1 - test_size) * dataset_size)
    holdout_size = dataset_size - train_size
    train_data, test_data = random_split(dataset_, [train_size, holdout_size])

    # create DataLoaders for train and test sets
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                                  collate_fn=collate_fn)
    test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False,
                                 collate_fn=collate_fn)
    return train_dataloader, test_dataloader


def unzip_tsv(filepath=FILEPATH, member="filtered.tsv"):
    """Read a tab-separated member of a zip archive into a DataFrame.

    Args:
        filepath: Path to the zip archive.
        member: Name of the ``.tsv`` file inside the archive.

    Returns:
        The parsed ``DataFrame``.
    """
    print(os.listdir('.'))
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        # Let pandas read straight from the archive member instead of first
        # materialising the whole file as bytes, then str, then StringIO.
        with zip_ref.open(member) as tsv_file:
            return pd.read_csv(tsv_file, sep="\t", encoding="utf-8")


def test_dataloader_creation():
    """Smoke-test the loaders by printing the first batch of each split.

    The data is loaded through ``unzip_tsv()``; calling ``get_dataloader``
    without data returns ``None`` and cannot be unpacked.
    """
    train_dataloader, test_dataloader = get_dataloader(unzip_tsv())
    for title, loader in (("TRAIN", train_dataloader), ("TEST", test_dataloader)):
        for batch in loader:
            # Batches are 6-tuples; the two attention masks are not printed.
            source_ids, _, target_ids, _, ref_tox, trn_tox = batch
            print(f"--- {title} ---")
            print("Source IDs:", source_ids)
            print("Target IDs:", target_ids)
            print("Reference Toxicity:", ref_tox)
            print("Translation Toxicity:", trn_tox)
            print("Batch Size:", len(source_ids))
            break  # only the first batch is inspected


def train_model(model, train_loader, batch_size, num_epochs, learning_rate):
    """Train ``model`` with teacher forcing and a next-token cross-entropy loss.

    Args:
        model: Callable as ``model(input_ids, attention_mask,
            decoder_input_ids, decoder_attention_mask)`` returning logits
            whose last dimension is the vocabulary.
        train_loader: Iterable of batches whose first four entries are the
            tensors listed above, in that order.
        batch_size: Unused here; the batch size is fixed by ``train_loader``.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: AdamW learning rate.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    # -100 marks positions that must not contribute to the loss.
    criterion = torch.nn.CrossEntropyLoss(ignore_index=-100)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_steps = 0

        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)

        for batch in progress_bar:
            # Kept from the original: release cached memory before each step.
            torch.cuda.empty_cache()
            gc.collect()

            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            decoder_input_ids = batch[2].to(device)
            decoder_attention_mask = batch[3].to(device)

            # Padded target positions are excluded from the loss.
            labels = decoder_input_ids.masked_fill(decoder_attention_mask == 0, -100)
            # Logits at position t are scored against the token at t + 1;
            # scoring them against token t would only teach the decoder to
            # copy its own input.
            shift_labels = labels[:, 1:]
            if not (shift_labels != -100).any():
                # No target left after shifting (sequences of length <= 1);
                # a mean over zero elements would be NaN.
                continue

            optimizer.zero_grad()

            logits = model(input_ids, attention_mask, decoder_input_ids, decoder_attention_mask)
            shift_logits = logits[:, :-1, :]
            loss = criterion(
                shift_logits.reshape(-1, shift_logits.shape[-1]),
                shift_labels.reshape(-1),
            )

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_steps += 1

            progress_bar.set_postfix({'Loss': loss.item()})

        # Guard against an empty loader / all batches skipped.
        avg_loss = total_loss / max(num_steps, 1)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}')


class ToxicityTranslationModel(torch.nn.Module):
    """BERT-to-BERT encoder-decoder used to rewrite toxic sentences.

    A single ``EncoderDecoderModel`` is used for both training (``forward``)
    and generation (``translate``), so the weights that are trained are the
    ones that generate. Building it with ``from_encoder_decoder_pretrained``
    loads the ``bert-base-uncased`` weights on both sides and configures the
    decoder side as a decoder with cross-attention.
    """

    def __init__(self):
        super(ToxicityTranslationModel, self).__init__()
        self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            'bert-base-uncased', 'bert-base-uncased'
        )
        self.config = self.model.config
        # BERT has no dedicated BOS/EOS tokens; [CLS]/[SEP] play those roles.
        self.config.decoder_start_token_id = self.tokenizer.cls_token_id
        self.config.eos_token_id = self.tokenizer.sep_token_id
        self.config.pad_token_id = self.tokenizer.pad_token_id

    @property
    def encoder(self):
        """Encoder half of the wrapped model (kept for backward compatibility)."""
        return self.model.encoder

    @property
    def decoder(self):
        """Decoder half of the wrapped model (kept for backward compatibility)."""
        return self.model.decoder

    def forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask):
        """Run a teacher-forced pass and return the decoder logits."""
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask
        )
        return outputs.logits

    def translate(self, input_text):
        """Generate a rewrite of ``input_text`` with beam search.

        Args:
            input_text: Sentence to rewrite.

        Returns:
            The decoded output string with special tokens removed.
        """
        # The model may have been moved to GPU by training; inputs must follow.
        device = next(self.model.parameters()).device

        input_ids = self.tokenizer.encode(input_text, add_special_tokens=True, truncation=True, max_length=512)
        input_ids = torch.tensor(input_ids, device=device).unsqueeze(0)
        attention_mask = torch.ones_like(input_ids)

        self.model.eval()
        with torch.no_grad():
            translated_ids = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                # Passed explicitly so generation does not depend on how the
                # installed library version derives its generation settings.
                decoder_start_token_id=self.tokenizer.cls_token_id,
                eos_token_id=self.tokenizer.sep_token_id,
                pad_token_id=self.tokenizer.pad_token_id,
                max_length=512,
                num_beams=5,
                early_stopping=True
            )

        translated_text = self.tokenizer.decode(translated_ids[0], skip_special_tokens=True)
        return translated_text


# End-to-end run: build the model and loaders, train, then rewrite one sentence.
model = ToxicityTranslationModel()
train_loader, test_loader = get_dataloader(unzip_tsv())
# NOTE(review): train_model never reads its batch_size argument; the loaders
# above are built with batch_size=32.
# NOTE(review): 0.003 looks high for fine-tuning a pretrained BERT — confirm.
train_model(model, train_loader, batch_size=1, num_epochs=1, learning_rate=0.003)
# No saved weights are loaded here; the model is used as train_model left it.

input_text = "This is a toxic sentence."
translated_text = model.translate(input_text)

print(f"Input: {input_text}", f"Translated: {translated_text}", sep="\n")



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['.config', 'filtered_paranmt.zip', 'sample_data']




OutOfMemoryError: ignored

In [None]:
import torch
from transformers import GPT2Model, GPT2Tokenizer, GPT2Config, GPT2LMHeadModel, GPT2TokenizerFast
from transformers import EncoderDecoderModel, EncoderDecoderConfig
from tqdm import tqdm
import os
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import zipfile
import io


FILEPATH = 'filtered_paranmt.zip'


def extract_tsv(file_path):
    """Load a ``.tsv`` file as a pandas DataFrame.

    Args:
        file_path: Where to read the tab-separated data from.

    Returns:
        DataFrame with one row per line of the file.
    """
    frame = pd.read_csv(filepath_or_buffer=file_path, sep='\t')
    return frame


class ParaNMTDataset(Dataset):
    """Torch dataset yielding GPT-2-tokenized (source, target) pairs with toxicity scores.

    Rows are read from the ``reference``, ``translation``, ``ref_tox`` and
    ``trn_tox`` columns of the underlying DataFrame.
    """

    def __init__(self, dataframe=None, filepath=None, max_length=512):
        """Load the data and the GPT-2 tokenizer.

        Args:
            dataframe: Already-loaded DataFrame. Ignored when ``filepath`` is given.
            filepath: Path to a ``.tsv`` file; takes precedence over ``dataframe``.
            max_length: Maximum number of token ids kept per sentence.

        Raises:
            ValueError: If neither ``dataframe`` nor ``filepath`` is provided.
        """
        if filepath is not None:
            self.data = extract_tsv(filepath)
        elif dataframe is not None:
            self.data = dataframe
        else:
            # Fail at construction instead of leaving self.data = None,
            # which would only surface later as a TypeError in __len__.
            raise ValueError('No file or dataframe were provided to Dataset constructor')
        self.max_length = max_length
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Convert one sentence into a 1-D ``long`` tensor of token ids."""
        # Same encode settings as the model's translate(); over-long
        # sentences are truncated to max_length.
        ids = self.tokenizer.encode(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
        )
        # Explicit dtype: torch.tensor([]) would otherwise default to float.
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, index):
        """Return one training example.

        Returns:
            Tuple ``(input_ids, attention_mask, decoder_input_ids,
            decoder_attention_mask, ref_tox, trn_tox)``. Both masks are
            all-ones here; padding positions are added at collate time.
        """
        row = self.data.iloc[index]  # single positional lookup per item

        input_ids = self._encode(row['reference'])
        decoder_input_ids = self._encode(row['translation'])

        attention_mask = torch.ones_like(input_ids)
        decoder_attention_mask = torch.ones_like(decoder_input_ids)

        return (input_ids, attention_mask, decoder_input_ids, decoder_attention_mask,
                row['ref_tox'], row['trn_tox'])


def collate_fn(batch):
    """Stack individual dataset items into one padded batch.

    Args:
        batch: Sequence of 6-tuples ``(input_ids, attention_mask,
            decoder_input_ids, decoder_attention_mask, ref_tox, trn_tox)``.

    Returns:
        A 6-tuple in the same order: four batch-first tensors padded with
        zeros up to the longest sequence, then two 1-D score tensors.
    """
    fields = list(zip(*batch))

    # The first four fields are variable-length sequences and need padding.
    padded = [pad_sequence(list(field), batch_first=True) for field in fields[:4]]

    ref_tox = torch.tensor(list(fields[4]))
    trn_tox = torch.tensor(list(fields[5]))

    return padded[0], padded[1], padded[2], padded[3], ref_tox, trn_tox


def get_dataloader(dataframe=None, train_test_split=True, test_size=0.2, filepath=None,
                   batch_size=32):
    """Build DataLoader(s) over a ``ParaNMTDataset``.

    Args:
        dataframe: DataFrame to wrap; takes precedence over ``filepath``.
        train_test_split: When true, return a ``(train, test)`` pair of
            loaders; otherwise a single shuffled loader over everything.
        test_size: Fraction of rows placed in the test split.
        filepath: Path to a ``.tsv`` file, used when ``dataframe`` is None.
        batch_size: Number of examples per batch for every returned loader.

    Returns:
        ``(train_dataloader, test_dataloader)``, a single ``DataLoader``,
        or ``None`` when no data source was given.
    """
    # Pass by keyword: the dataset's first positional parameter is
    # ``dataframe``, so a positional filepath would be treated as a DataFrame.
    if dataframe is not None:
        dataset_ = ParaNMTDataset(dataframe=dataframe)
    elif filepath is not None:
        dataset_ = ParaNMTDataset(filepath=filepath)
    else:
        print('No data was provided to get_dataloader')
        return None

    if not train_test_split:
        return DataLoader(dataset_, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    dataset_size = len(dataset_)
    train_size = int((1 - test_size) * dataset_size)
    holdout_size = dataset_size - train_size
    train_data, test_data = random_split(dataset_, [train_size, holdout_size])

    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                                  collate_fn=collate_fn)
    test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False,
                                 collate_fn=collate_fn)
    return train_dataloader, test_dataloader


def unzip_tsv(filepath=FILEPATH, member="filtered.tsv"):
    """Read a tab-separated member of a zip archive into a DataFrame.

    Args:
        filepath: Path to the zip archive.
        member: Name of the ``.tsv`` file inside the archive.

    Returns:
        The parsed ``DataFrame``.
    """
    print(os.listdir('.'))
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        # Let pandas read straight from the archive member instead of first
        # materialising the whole file as bytes, then str, then StringIO.
        with zip_ref.open(member) as tsv_file:
            return pd.read_csv(tsv_file, sep="\t", encoding="utf-8")


def determine_toxic(df):
    """Re-orient each sentence pair into (toxic, neutral) columns.

    For every row the sentence with the strictly higher toxicity score goes
    to ``toxic`` and the other one to ``neutral``; on a tie the reference is
    treated as neutral. ``toxic_tox`` / ``neutral_tox`` hold the row-wise
    max / min of the two scores.

    Args:
        df: DataFrame with ``reference``, ``translation``, ``ref_tox`` and
            ``trn_tox`` columns. It is not modified.

    Returns:
        A new DataFrame without the four input columns and with ``toxic``,
        ``toxic_tox``, ``neutral`` and ``neutral_tox`` appended.
    """
    # Two separate masks (rather than one and its negation) so that rows with
    # a missing score fall back to ``translation`` in both columns.
    reference_is_toxic = df['ref_tox'] > df['trn_tox']
    reference_is_neutral = df['ref_tox'] <= df['trn_tox']
    scores = df[['ref_tox', 'trn_tox']]

    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    return df.drop(['reference', 'translation', 'ref_tox', 'trn_tox'], axis=1).assign(
        toxic=df['reference'].where(reference_is_toxic, df['translation']),
        toxic_tox=scores.max(axis=1),
        neutral=df['reference'].where(reference_is_neutral, df['translation']),
        neutral_tox=scores.min(axis=1),
    )


def train_model(model, train_loader, batch_size, num_epochs, learning_rate):
    """Train ``model`` with teacher forcing and a next-token cross-entropy loss.

    Args:
        model: Callable as ``model(input_ids, attention_mask,
            decoder_input_ids, decoder_attention_mask)`` returning logits
            whose last dimension is the vocabulary.
        train_loader: Iterable of batches whose first four entries are the
            tensors listed above, in that order.
        batch_size: Unused here; the batch size is fixed by ``train_loader``.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: AdamW learning rate.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    # -100 marks positions that must not contribute to the loss.
    criterion = torch.nn.CrossEntropyLoss(ignore_index=-100)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_steps = 0

        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False)

        for batch in progress_bar:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            decoder_input_ids = batch[2].to(device)
            decoder_attention_mask = batch[3].to(device)

            # Padding uses id 0, which is an ordinary token in the GPT-2
            # vocabulary, so padded positions are identified by the mask and
            # excluded from the loss.
            labels = decoder_input_ids.masked_fill(decoder_attention_mask == 0, -100)
            # Logits at position t are scored against the token at t + 1;
            # scoring them against token t would only teach the decoder to
            # copy its own input.
            shift_labels = labels[:, 1:]
            if not (shift_labels != -100).any():
                # No target left after shifting (sequences of length <= 1);
                # a mean over zero elements would be NaN.
                continue

            optimizer.zero_grad()

            logits = model(input_ids, attention_mask, decoder_input_ids, decoder_attention_mask)
            shift_logits = logits[:, :-1, :]
            loss = criterion(
                shift_logits.reshape(-1, shift_logits.shape[-1]),
                shift_labels.reshape(-1),
            )

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_steps += 1

            progress_bar.set_postfix({'Loss': loss.item()})

        # Guard against an empty loader / all batches skipped.
        avg_loss = total_loss / max(num_steps, 1)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}')


class ToxicityTranslationModel2(torch.nn.Module):
    """GPT-2-to-GPT-2 encoder-decoder used to rewrite toxic sentences.

    The wrapped ``EncoderDecoderModel`` is constructed from configuration
    objects only; no pretrained checkpoint is loaded into it here.
    """

    def __init__(self):
        super(ToxicityTranslationModel2, self).__init__()
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.config = EncoderDecoderConfig.from_encoder_decoder_configs(
            encoder_config=GPT2Config.from_pretrained('gpt2'),
            decoder_config=GPT2Config.from_pretrained('gpt2'),
            add_cross_attention=True
        )
        # The GPT-2 tokenizer defines no pad token; reuse EOS for padding and
        # start decoding from BOS.
        self.config.decoder_start_token_id = self.tokenizer.bos_token_id
        self.config.eos_token_id = self.tokenizer.eos_token_id
        self.config.pad_token_id = self.tokenizer.eos_token_id
        self.model = EncoderDecoderModel(config=self.config)

    def forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask):
        """Run a teacher-forced pass and return the decoder logits."""
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask
        )
        return outputs.logits

    def translate(self, input_text):
        """Generate a rewrite of ``input_text`` with beam search.

        Args:
            input_text: Sentence to rewrite.

        Returns:
            The decoded output string with special tokens removed.
        """
        # The model may have been moved to GPU by training; inputs must follow.
        device = next(self.model.parameters()).device

        input_ids = self.tokenizer.encode(input_text, add_special_tokens=True, truncation=True, max_length=512)
        input_ids = torch.tensor(input_ids, device=device).unsqueeze(0)
        attention_mask = torch.ones_like(input_ids)

        self.model.eval()
        with torch.no_grad():
            translated_ids = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                # "[MASK]" is not a GPT-2 token (it would be split into
                # several BPE pieces), so decoding starts from BOS instead.
                decoder_start_token_id=self.tokenizer.bos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id,
                max_length=512,
                num_beams=5,
                early_stopping=True
            )

        translated_text = self.tokenizer.decode(translated_ids[0], skip_special_tokens=True)
        return translated_text


# End-to-end run: build the model and loaders, train, then rewrite one sentence.
model = ToxicityTranslationModel2()
train_loader, test_loader = get_dataloader(unzip_tsv())
# NOTE(review): train_model never reads its batch_size argument; the loaders
# above are built with batch_size=32.
# NOTE(review): 0.003 looks high for a transformer of this size — confirm.
train_model(model, train_loader, batch_size=16, num_epochs=1, learning_rate=0.003)
# No saved weights are loaded here; the model is used as train_model left it.

input_text = "This is a toxic sentence."
translated_text = model.translate(input_text)

print(f"Input: {input_text}", f"Translated: {translated_text}", sep="\n")


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

['.config', 'filtered_paranmt.zip', 'sample_data']


OutOfMemoryError: ignored