## 1. About this notebook

This notebook uses a Kaggle dataset of Wikipedia sentences to create paraphrases and non-paraphrases that will later be used to train a transformer-based paraphrase-detection network. The Wikipedia sentences dataset can be found [here](https://www.kaggle.com/datasets/mikeortman/wikipedia-sentences).

In [1]:
import os
from random import shuffle
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

In [4]:
# Import Wikipedia sentences dataset from Kaggle
sentences_path = "/kaggle/input/wikipedia-sentences/wikisent2.txt"

# The file is read as one sentence per line. Only the line terminator is
# stripped, so a final line without a trailing newline keeps its last character.
with open(sentences_path, encoding="utf-8") as file:
    sentences = [item.rstrip("\n") for item in file]

# Shuffle in place so the two halves below are a random split of the corpus.
# NOTE(review): no seed is set in this cell, so the split differs between runs
# unless one is set elsewhere; the files written below record the split used.
shuffle(sentences)

half = len(sentences) // 2

# Save sentences used to generate paraphrases (first half of the shuffled list)
sentences_for_paraphrase = sentences[:half]
with open('/kaggle/working/sentences_for_paraphrase.txt', 'w', encoding="utf-8") as f:
    f.writelines(f"{sentence}\n" for sentence in sentences_for_paraphrase)

# Save sentences used to generate non-paraphrases (second half of the shuffled list)
sentences_for_non_paraphrase = sentences[half:]
with open('/kaggle/working/sentences_for_non_paraphrase.txt', 'w', encoding="utf-8") as f:
    f.writelines(f"{sentence}\n" for sentence in sentences_for_non_paraphrase)

In [5]:
# Load the pretrained T5 paraphrasing checkpoint together with its tokenizer
tokenizer_paraphrase_generation = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")
model_paraphrase_generation = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")

# Use the GPU when torch reports one as available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Last expression of the cell: moves the model to `device` and displays its repr
model_paraphrase_generation.to(device)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [6]:
# Generating paraphrases
# Each source sentence is sent through the paraphrase model with sampling
# enabled (temperature 1.3) and the decoded output is collected in
# `sentences_paraphrased`, in the same order as the source sentences.
sentences_paraphrased = []

paraphrase_sources = sentences_for_paraphrase[:100000]
last_index = len(paraphrase_sources) - 1

for index, source_sentence in enumerate(paraphrase_sources):
    # Prompt format: task prefix, the sentence, then an explicit end-of-sequence marker
    text =  "paraphrase: " + source_sentence + " </s>"

    # `padding="longest"` replaces the deprecated `pad_to_max_length=True`;
    # with a single input and no `max_length` both add no padding.
    encoding = tokenizer_paraphrase_generation.encode_plus(text, padding="longest", return_tensors="pt")

    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    outputs = model_paraphrase_generation.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        max_length=40,
        do_sample=True,
        top_k=200,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=1,
        temperature=1.3
    )

    sentences_paraphrased.append(tokenizer_paraphrase_generation.decode(outputs[0], skip_special_tokens=True,clean_up_tokenization_spaces=True))

    # Progress is reported every 10000 sentences.
    at_checkpoint = (index % 10000) == 0
    if at_checkpoint:
        print(index)

    # The file is rewritten in full at every checkpoint and once more after the
    # last sentence, so the results generated after the final checkpoint are
    # saved as well.
    if at_checkpoint or index == last_index:
        with open('/kaggle/working/sentences_paraphrased.txt', 'w', encoding="utf-8") as f:
            f.writelines(f"{paraphrase}\n" for paraphrase in sentences_paraphrased)



0


KeyboardInterrupt: 

In [None]:
# Generating non-paraphrases
# Each source sentence is sent through the same model with the same
# "paraphrase: " prompt, but sampled at temperature 2.0; the decoded output is
# collected in `sentences_non_paraphrased`, in source order.
# NOTE(review): presumably the higher temperature is meant to push the output
# away from the source meaning — confirm the outputs really are non-paraphrases.
sentences_non_paraphrased = []

non_paraphrase_sources = sentences_for_non_paraphrase[:100000]
last_index = len(non_paraphrase_sources) - 1

for index, source_sentence in enumerate(non_paraphrase_sources):
    # Prompt format: task prefix, the sentence, then an explicit end-of-sequence marker
    text =  "paraphrase: " + source_sentence + " </s>"

    # `padding="longest"` replaces the deprecated `pad_to_max_length=True`;
    # with a single input and no `max_length` both add no padding.
    encoding = tokenizer_paraphrase_generation.encode_plus(text, padding="longest", return_tensors="pt")

    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    outputs = model_paraphrase_generation.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        max_length=40,
        do_sample=True,
        top_k=200,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=1,
        temperature=2.0
    )

    sentences_non_paraphrased.append(tokenizer_paraphrase_generation.decode(outputs[0], skip_special_tokens=True,clean_up_tokenization_spaces=True))

    # Progress is reported every 10000 sentences.
    at_checkpoint = (index % 10000) == 0
    if at_checkpoint:
        print(index)

    # The file is rewritten in full at every checkpoint and once more after the
    # last sentence, so the results generated after the final checkpoint are
    # saved as well.
    if at_checkpoint or index == last_index:
        with open('/kaggle/working/sentences_non_paraphrased.txt', 'w', encoding="utf-8") as f:
            f.writelines(f"{non_paraphrase}\n" for non_paraphrase in sentences_non_paraphrased)