In [75]:
import pandas as pd

In [76]:
path = '/content/drive/MyDrive/filtered_and_prepocessed.csv'


In [77]:
data = pd.read_csv(path)

In [78]:
data.head()

Unnamed: 0,reference,translation
0,alkar floods mental waste would explain high l...,alkar flooding psychic waste explains high lev...
1,youre becoming disgusting,youre getting nasty
2,well spare life,well could spare life one
3,monkey wake,ah monkey youve got snap
4,orders kill,ive got orders put


## Model Building

In [79]:
input_texts = list(data['reference'])
target_texts = list(data['translation'])

In [82]:
input_texts = list(filter(lambda x: isinstance(x, str), input_texts))[:1000]
target_texts = list(filter(lambda x: isinstance(x, str), target_texts))[:1000]

In [83]:
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence

word_counter = Counter()

for sentence in input_texts + target_texts:
    word_counter.update(sentence.lower().split())


vocab = {word: index + 4 for index, word in enumerate(word_counter)}  # +4 for special tokens
vocab['<pad>'] = 0
vocab['<unk>'] = 1
vocab['<eos>'] = 2
vocab['<sos>'] = 3

# Determine the size of your vocabulary
num_tokens = len(vocab)

# Function to tokenize sentences
def tokenize(sentence, vocab, max_length):
    return [vocab.get(token, vocab['<unk>']) for token in sentence.lower().split()[:max_length]]

class TextDataset(Dataset):
    def __init__(self, input_texts, target_texts, vocab, max_length=512):
        self.input_texts = [torch.tensor(tokenize(text, vocab, max_length)) for text in input_texts]
        self.target_texts = [torch.tensor(tokenize(text, vocab, max_length)) for text in target_texts]
        self.pad_idx = vocab['<pad>']

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        return self.input_texts[idx], self.target_texts[idx]

    def collate_fn(self, batch):
        input_texts, target_texts = zip(*batch)
        # Padding sequences to the max length in each batch
        input_texts = pad_sequence(input_texts, batch_first=True, padding_value=self.pad_idx)
        target_texts = pad_sequence(target_texts, batch_first=True, padding_value=self.pad_idx)
        return input_texts, target_texts

# Split the dataset
input_train, input_val, target_train, target_val = train_test_split(input_texts, target_texts, test_size=0.1)

# Create datasets and dataloaders
train_dataset = TextDataset(input_train, target_train, vocab, max_length=500)
val_dataset = TextDataset(input_val, target_val, vocab, max_length=500)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=train_dataset.collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=32, collate_fn=val_dataset.collate_fn)



In [84]:
print(vocab)

{'alkar': 4, 'floods': 5, 'mental': 6, 'waste': 7, 'would': 8, 'explain': 9, 'high': 10, 'levels': 11, 'neurotransmitter': 12, 'youre': 13, 'becoming': 14, 'disgusting': 15, 'well': 16, 'spare': 17, 'life': 18, 'monkey': 19, 'wake': 20, 'orders': 21, 'kill': 22, 'im': 23, 'gon': 24, 'na': 25, 'child': 26, 'genetic': 27, 'disorder': 28, 'whos': 29, 'die': 30, 'l': 31, 'theyre': 32, 'laughing': 33, 'us': 34, 'kick': 35, 'ass': 36, 'maine': 37, 'short': 38, 'black': 39, 'people': 40, 'back': 41, 'briggs': 42, 'hell': 43, 'going': 44, 'another': 45, 'simply': 46, 'didnt': 47, 'know': 48, 'whenever': 49, 'met': 50, 'brother': 51, 'nearly': 52, 'beat': 53, 'shit': 54, 'youd': 55, 'probably': 56, 'want': 57, 'buy': 58, 'chocolates': 59, 'flowers': 60, 'whispered': 61, 'pretty': 62, 'rubbish': 63, 'spirits': 64, 'cursed': 65, 'walking': 66, 'roads': 67, 'waterways': 68, 'find': 69, 'unfaithful': 70, 'man': 71, 'never': 72, 'seen': 73, 'ill': 74, 'freeze': 75, 'come': 76, 'cal': 77, 'leave': 78

In [85]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
import random

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_len):
        # Embedding
        embedded = self.dropout(self.embedding(src))

        # Pack sequence
        packed_embedded = pack_padded_sequence(embedded, src_len.to('cpu'), batch_first=True, enforce_sorted=False)

        # Pass packed sequence through rnn
        packed_outputs, (hidden, cell) = self.rnn(packed_embedded)

        # Unpack sequence
        outputs, _ = pad_packed_sequence(packed_outputs, batch_first=True)

        # outputs is now a padded sequence
        return outputs, hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        # input = [batch size]
        # hidden = [n layers * n directions, batch size, hid dim]
        # cell = [n layers * n directions, batch size, hid dim]

        #input = input.unsqueeze(1)  # input = [batch size, 1]

        embedded = self.dropout(self.embedding(input))

        # embedded = [batch size, 1, emb dim]

        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # output = [batch size, seq len, hid dim * n directions]
        # hidden = [n layers * n directions, batch size, hid dim]
        # cell = [n layers * n directions, batch size, hid dim]

        # seq len will always be 1 in the decoder, therefore:
        # output = [batch size, 1, hid dim]
        # hidden = [n layers, batch size, hid dim]
        # cell = [n layers, batch size, hid dim]

        prediction = self.fc_out(output.squeeze(1))

        # prediction = [batch size, output dim]

        return prediction, hidden, cell

teacher_forcing_ratio = 0.5

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, src_len):
        batch_size = src.shape[0]
        trg_len = trg.shape[1] if trg is not None else None
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder outputs
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        # Encode source sequence
        encoder_outputs, hidden, cell = self.encoder(src, src_len)

        # First input to the decoder is the <sos> token, assume it's always the first in the vocab
        input = trg[:, 0]

        for t in range(1, trg_len):
            # Decode one step at a time
            # The decoder takes in the previous target token and the hidden states
            # We unsqueeze(1) to add the sequence length dimension (which is 1)
            output, hidden, cell = self.decoder(input.unsqueeze(1), hidden, cell)

            # Save the output
            outputs[:, t, :] = output.squeeze(1)

            # Get the most probable next token
            top1 = output.squeeze(1).argmax(1)

            # Decide whether to use teacher forcing
            teacher_force = random.random() < teacher_forcing_ratio
            input = trg[:, t] if teacher_force else top1

        return outputs



In [87]:
num_tokens = len(vocab)
emb_dim = 256
hid_dim = 512
n_layers = 2
dropout = 0.5

# Initialize encoder and decoder
encoder = Encoder(num_tokens, emb_dim, hid_dim, n_layers, dropout)
decoder = Decoder(num_tokens, emb_dim, hid_dim, n_layers, dropout)

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the seq2seq model
model = Seq2Seq(encoder, decoder, device).to(device)


In [88]:
# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Define the loss function, ignoring the padded elements in the output sequence
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])


In [89]:
# Ensure that target_texts is not empty and contains the correct data
assert len(target_texts) > 0, "target_texts is empty"
assert all(isinstance(t, str) and t for t in target_texts), "target_texts should contain non-empty strings"


In [96]:
# Training and evaluation loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    # Training loop
    for src, trg in train_dataloader:
        src, trg = src.to(device), trg.to(device)
        assert trg.ndim == 2, "Target tensor trg should have 2 dimensions [batch_size, sequence_length]"
        src_len = torch.sum(src != vocab['<pad>'], dim=1)
        # src: [batch_size, src_len], trg: [batch_size, trg_len]
        src, trg = src.to(device), trg.to(device)
        # Calculate the length of each sentence in the src batch
        src_len = torch.sum(src != vocab['<pad>'], dim=1)  # [batch_size]
        optimizer.zero_grad()
        # Forward pass
        output = model.forward(src, trg, src_len)  # trg is not shifted here, assuming trg[:, 0] is <sos> in Seq2Seq model

        # trg is shifted inside the model, so we don't consider the first token (<sos>) in the loss
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        # Calculate loss
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    average_train_loss = total_train_loss / len(train_dataloader)
    print(f'Epoch {epoch+1} Train Loss: {average_train_loss:.4f}')

    # Evaluation loop
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for src, trg in val_dataloader:
            src, trg = src.to(device), trg.to(device)
            src_len = torch.sum(src != vocab['<pad>'], dim=1)  # [batch_size]

            # Forward pass
            output = model.forward(src, trg, src_len)

            # Calculate loss
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)
            loss = criterion(output, trg)
            total_val_loss += loss.item()

    average_val_loss = total_val_loss / len(val_dataloader)
    print(f'Epoch {epoch+1} Validation Loss: {average_val_loss:.4f}')


Epoch 1 Train Loss: 6.9332
Epoch 1 Validation Loss: 8.4971
Epoch 2 Train Loss: 6.6905
Epoch 2 Validation Loss: 8.7705
Epoch 3 Train Loss: 6.3947
Epoch 3 Validation Loss: 9.1231
Epoch 4 Train Loss: 5.9973
Epoch 4 Validation Loss: 9.4947
Epoch 5 Train Loss: 5.5134
Epoch 5 Validation Loss: 9.6672
Epoch 6 Train Loss: 4.9690
Epoch 6 Validation Loss: 10.0068
Epoch 7 Train Loss: 4.3247
Epoch 7 Validation Loss: 10.3512
Epoch 8 Train Loss: 3.7892
Epoch 8 Validation Loss: 10.4063
Epoch 9 Train Loss: 3.2228
Epoch 9 Validation Loss: 10.8692
Epoch 10 Train Loss: 2.6363
Epoch 10 Validation Loss: 11.0922


In [97]:
torch.save(model.state_dict(), 'seq2seq_model.pth')
torch.save(model, 'seq2seq_model_complete.pth')


In [98]:
vocab_inv = {index: token for token, index in vocab.items()}

In [99]:
def post_process(words):
    # Split the sentence into words and initialize an empty list for processed words
    processed_words = []
    not_allowed_words = ['<pad>', '<unk>', '<eos>', '<sos>']
    for word in words:
        if (not processed_words or word not in processed_words) and word not in not_allowed_words:
            processed_words.append(word)
    return processed_words

In [100]:
def translate_sentence(model, sentence, vocab, device):
    model.eval()
    tokens = [vocab['<sos>']] + [vocab.get(word, vocab['<unk>']) for word in sentence.lower().split()] + [vocab['<eos>']]
    src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)
    src_len = torch.LongTensor([len(tokens)]).to(device)

    with torch.no_grad():
        encoder_outputs, hidden, cell = model.encoder(src_tensor, src_len)

    trg_tokens = [vocab['<sos>']]

    for i in range(200):
        trg_tensor = torch.LongTensor([trg_tokens[-1]]).unsqueeze(0).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(trg_tensor, hidden, cell)

            # Ensure output is 2D before calling argmax
            output = output.squeeze(1) if output.dim() == 3 else output
            pred_token = output.argmax(1).item()
            trg_tokens.append(pred_token)

            if pred_token == vocab['<eos>']:
                break

    translated_sentence = [vocab_inv[token] for token in trg_tokens if token in vocab_inv]

    return post_process(translated_sentence)


In [101]:
model.load_state_dict(torch.load('seq2seq_model.pth', map_location=device))
model = model.to(device)

# Example sentence to translate (detoxify)
example_sentence = "Now you're getting nasty."

translated_sentence_tokens = translate_sentence(model, example_sentence, vocab, device)

translated_sentence = ' '.join(translated_sentence_tokens)
print(f"Original sentence: {example_sentence}")
print(f"Detoxified sentence: {translated_sentence}")


Original sentence: Now you're getting nasty.
Detoxified sentence: personality woman back sack crack stand cutter opened would done
