In [None]:
#Importing libraries

from datasets import load_dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import ast

In [None]:
#Loading Data and Data Preparation
# Fetch the paraphrase corpus, drop the two metadata columns that are not
# used below, and keep only the 'train' split.
dataset = load_dataset("humarin/chatgpt-paraphrases")
dataset = dataset.remove_columns(['category', 'source'])
dataset = dataset['train']

# Build (source text, first paraphrase) pairs. The 'paraphrases' field is
# parsed with ast.literal_eval; rows whose parsed value is empty are skipped.
new_dataset = []
for example in dataset:
    paraphrases = ast.literal_eval(example['paraphrases'])
    if not paraphrases:
        continue
    new_dataset.append((example['text'], paraphrases[0]))

# Split the pairs into two parallel lists: sources (X) and targets (Y).
X = [source for source, _ in new_dataset]
Y = [target for _, target in new_dataset]

In [None]:
#Data Preprocessing
# Fit the vocabulary on both the sources and the targets. No oov_token is
# configured, so texts_to_sequences drops every word that is missing from the
# fitted vocabulary; fitting on X alone would silently delete words that only
# occur in the paraphrases.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X + Y)
X_seq = tokenizer.texts_to_sequences(X)
Y_seq = tokenizer.texts_to_sequences(Y)

# Pad to the longest sequence on either side. pad_sequences truncates anything
# longer than maxlen, so a length derived from X_seq alone would cut targets
# that are longer than every source sentence.
max_length = max(len(seq) for seqs in (X_seq, Y_seq) for seq in seqs)
X_padded = pad_sequences(X_seq, maxlen=max_length, padding='post')
Y_padded = pad_sequences(Y_seq, maxlen=max_length, padding='post')

In [None]:
# Hold out 20% of the padded pairs for testing (fixed seed) and convert each
# of the four resulting arrays to a LongTensor while unpacking them.
X_train, X_test, Y_train, Y_test = (
    torch.tensor(split, dtype=torch.long)
    for split in train_test_split(X_padded, Y_padded, test_size=0.2, random_state=42)
)

# Pair every input row with its target row.
train_dataset, test_dataset = (
    TensorDataset(X_train, Y_train),
    TensorDataset(X_test, Y_test),
)

# Only the training loader shuffles; the test loader keeps dataset order.
batch_size = 16
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
#Defining Model Architecture

class ParaphraserModel(nn.Module):
    """BiLSTM -> self-attention -> LSTM stack that scores one token per input.

    Args:
        vocab_size: Number of rows in the embedding table and width of the
            returned logits.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of each LSTM direction; the attention layer
            operates on ``2 * hidden_dim`` features.
        num_layers: Number of stacked layers in each LSTM.
        num_heads: Number of attention heads; must divide ``2 * hidden_dim``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, num_heads):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pre_attn_bilstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True)
        self.post_attn_lstm = nn.LSTM(hidden_dim * 2, hidden_dim, num_layers=num_layers, batch_first=True)
        # batch_first=True keeps attention on the same (batch, seq, feature)
        # layout the batch-first LSTMs use. With the default (seq-first) layout
        # the batch axis is read as the sequence axis and samples in a batch
        # attend to each other.
        self.attention = nn.MultiheadAttention(hidden_dim * 2, num_heads, batch_first=True)
        # Not applied inside forward(); kept so callers can turn the returned
        # logits into probabilities.
        self.softmax = nn.Softmax(dim=-1)
        self.dense = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return unnormalised vocabulary logits for each row of ``x``.

        Args:
            x: Integer token ids, indexed as (batch, sequence).

        Returns:
            Tensor of shape (batch, vocab_size) computed from the last time
            step of the post-attention LSTM. The values are raw logits, which
            is what ``nn.CrossEntropyLoss`` expects.
        """
        embedded = self.embedding(x)
        pre_attn_output, _ = self.pre_attn_bilstm(embedded)
        attn_output, _ = self.attention(pre_attn_output, pre_attn_output, pre_attn_output)
        post_attn_output, _ = self.post_attn_lstm(attn_output)
        # Project the last hidden state directly. A softmax over the hidden
        # features before the linear layer would squash them into a probability
        # vector and starve the projection of signal.
        last_state = post_attn_output[:, -1, :]
        return self.dense(last_state)



In [None]:
#Training the model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters. vocab_size is one larger than the word index so that id 0,
# the fill value pad_sequences writes, also has an embedding row.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128
hidden_dim = 256
num_layers = 2
num_heads = 8

model = ParaphraserModel(vocab_size, embedding_dim, hidden_dim, num_layers, num_heads).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    # Sample-weighted running totals so the printed value is the mean loss
    # over the whole epoch rather than the loss of whichever batch came last.
    running_loss = 0.0
    seen_samples = 0

    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        outputs = model(batch_inputs)
        # NOTE(review): the target is the final position of the padded
        # paraphrase. The sequences are padded with padding='post', so that
        # position holds the pad id 0 unless the paraphrase fills max_length.
        # Confirm this is the intended training signal.
        loss = criterion(outputs, batch_targets[:, -1])  # Use the last token as the target
        loss.backward()
        optimizer.step()

        samples_in_batch = batch_inputs.size(0)
        running_loss += loss.item() * samples_in_batch
        seen_samples += samples_in_batch

    # max(..., 1) keeps an empty loader from raising; the loss then prints as 0.
    epoch_loss = running_loss / max(seen_samples, 1)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

In [None]:
# Evaluation

model.eval()

# Running totals over the whole test set.
total_loss, total_correct, total_samples = 0, 0, 0

with torch.no_grad():
    for batch_inputs, batch_targets in test_loader:
        batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)
        # As in training, only the final position of each target row is scored.
        last_targets = batch_targets[:, -1]
        samples_in_batch = batch_inputs.size(0)

        outputs = model(batch_inputs)
        # criterion returns a batch mean; re-weight it by the batch size so
        # the final division yields a per-sample average.
        total_loss += criterion(outputs, last_targets).item() * samples_in_batch

        predicted = outputs.argmax(dim=-1)
        total_correct += (predicted == last_targets).sum().item()
        total_samples += samples_in_batch

avg_loss = total_loss / len(test_dataset)
accuracy = total_correct / total_samples

print(f"Test Loss: {avg_loss:.4f}, Test Accuracy: {accuracy:.4f}")

In [None]:
# Evaluation metrics

from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

def _token_ids_to_text(tokenizer, token_ids):
    """Turn a tensor of token ids (any shape, flattened here) into one string.

    Uses ``tokenizer.decode`` when the tokenizer offers it. Otherwise falls
    back to ``sequences_to_texts``, the decoding method of the Keras
    ``Tokenizer`` this notebook builds.
    """
    ids = token_ids.reshape(-1).tolist()
    if hasattr(tokenizer, "decode"):
        return tokenizer.decode(ids, skip_special_tokens=True)
    # Drop 0 entries first: they are pad_sequences fill values, not words.
    return tokenizer.sequences_to_texts([[i for i in ids if i != 0]])[0]


def _mean_or_nan(values):
    """Arithmetic mean of ``values``; NaN when the list is empty."""
    return sum(values) / len(values) if values else float("nan")


def calculate_metrics(model, tokenizer, test_loader, device):
    """Score the model's generations with BLEU, embedding similarity and perplexity.

    Args:
        model: Callable mapping a (1, seq) id tensor to vocabulary logits.
        tokenizer: Tokenizer used to build the data; needs either ``decode``
            or ``sequences_to_texts``.
        test_loader: Iterable of ``(batch_inputs, batch_targets)`` id tensors.
        device: Device the model inputs are moved to.

    Returns:
        Tuple ``(avg_bleu_score, avg_semantic_similarity, avg_perplexity)``.
        An average is NaN when no value was collected for that metric.
    """
    model.eval()
    bleu_scores = []
    semantic_similarities = []
    perplexities = []

    sentence_transformer = SentenceTransformer('paraphrase-distilroberta-base-v1')
    gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    with torch.no_grad():
        for batch_inputs, batch_targets in test_loader:
            batch_inputs = batch_inputs.to(device)

            # Generate one candidate per input row from the arg-max logits.
            generated_paraphrases = []
            for input_seq in batch_inputs:
                output = model(input_seq.unsqueeze(0))
                predicted_ids = torch.argmax(output, dim=-1)
                generated_paraphrases.append(_token_ids_to_text(tokenizer, predicted_ids))

            # Decode the references once; BLEU and similarity both use them.
            reference_texts = [_token_ids_to_text(tokenizer, ref) for ref in batch_targets]

            # BLEU of each candidate against its single reference.
            for reference_text, candidate_text in zip(reference_texts, generated_paraphrases):
                bleu_scores.append(sentence_bleu([reference_text.split()], candidate_text.split()))

            # Cosine similarity between sentence embeddings.
            reference_embeddings = sentence_transformer.encode(reference_texts)
            candidate_embeddings = sentence_transformer.encode(generated_paraphrases)
            for reference_embedding, candidate_embedding in zip(reference_embeddings, candidate_embeddings):
                similarity_score = torch.cosine_similarity(
                    torch.tensor(reference_embedding), torch.tensor(candidate_embedding), dim=0
                )
                semantic_similarities.append(similarity_score.item())

            # GPT-2 perplexity of each candidate.
            for paraphrase in generated_paraphrases:
                input_ids = gpt2_tokenizer.encode(paraphrase, return_tensors='pt')
                # The language-model loss scores each token against the one
                # before it, so it is undefined for fewer than two tokens.
                if input_ids.size(-1) < 2:
                    continue
                loss = gpt2_model(input_ids, labels=input_ids).loss
                perplexities.append(torch.exp(loss).item())

    avg_bleu_score = _mean_or_nan(bleu_scores)
    avg_semantic_similarity = _mean_or_nan(semantic_similarities)
    avg_perplexity = _mean_or_nan(perplexities)

    return avg_bleu_score, avg_semantic_similarity, avg_perplexity

# Example usage
# Choose the GPU when one is available and make sure the model is on it.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
model.to(device)

# Compute the three aggregate metrics over the test loader, then unpack them.
metrics = calculate_metrics(model, tokenizer, test_loader, device)
avg_bleu_score, avg_semantic_similarity, avg_perplexity = metrics

print(f"Average BLEU Score: {avg_bleu_score:.4f}")
print(f"Average Semantic Similarity: {avg_semantic_similarity:.4f}")
print(f"Average Perplexity: {avg_perplexity:.4f}")

In [None]:
#Human Evaluation through Implementation
def paraphrase_sentence(model, tokenizer, input_sentence, max_length=50):
    """Greedily extend a tokenized sentence with the model's predicted tokens.

    Args:
        model: Callable mapping a (1, seq) id tensor to vocabulary logits.
        tokenizer: Object providing ``texts_to_sequences``,
            ``sequences_to_texts`` and a ``word_index`` mapping.
        input_sentence: Raw text to start from.
        max_length: Maximum number of tokens to append.

    Returns:
        The decoded text of the input tokens followed by the generated tokens.
        Returns an empty string when the tokenizer yields no ids for
        ``input_sentence``.
    """
    model.eval()

    # Run on whatever device the model lives on, rather than depending on a
    # notebook-level ``device`` variable being defined.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    token_ids = tokenizer.texts_to_sequences([input_sentence])
    if not token_ids or not token_ids[0]:
        # There is nothing to feed the model.
        return ""

    # Looked up once; None means the vocabulary has no '<end>' entry, so
    # generation always runs for max_length steps.
    end_index = tokenizer.word_index.get('<end>', None)

    with torch.no_grad():
        output_sentence = torch.tensor(token_ids, dtype=torch.long, device=model_device)

        for _ in range(max_length):
            predictions = model(output_sentence)
            predicted_index = predictions.argmax(dim=-1)

            # Append the new token so the next step conditions on it.
            output_sentence = torch.cat([output_sentence, predicted_index.unsqueeze(1)], dim=-1)

            if end_index is not None and predicted_index.item() == end_index:
                break

    return tokenizer.sequences_to_texts(output_sentence.cpu().numpy())[0]

sentence = "The bank shall, however, continue to provide services to its existing customers, including its credit card customers"
# paraphrase_sentence takes the model and tokenizer ahead of the sentence;
# calling it with the sentence alone raises a TypeError for the missing arguments.
paraphrase = paraphrase_sentence(model, tokenizer, sentence)
print("Original Sentence:")
print(sentence)
print("Paraphrased Sentence:")
print(paraphrase)
