In [2]:
%%capture
!pip install transformers datasets sentencepiece

In [3]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset

In [4]:
# Load the model and tokenizer
model_name = "t5-small"  # You can choose t5-base or t5-large as well
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
# Load a paraphrase dataset (Quora Question Pairs)
dataset = load_dataset("quora")

Downloading builder script:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.69k [00:00<?, ?B/s]

The repository for quora contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/quora.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/404290 [00:00<?, ? examples/s]

In [6]:
# dataset = dataset['train'].to_dict()

In [7]:
# dataset['questions'][0]['text'][0], dataset['questions'][0]['text'][1]

In [8]:
# Preprocess the data
def preprocess_function(examples):
    inputs = ["paraphrase: " + q['text'][0] for q in examples['questions']]  # Access the first question
    targets = [q['text'][1] for q in examples['questions']]  # Access the paraphrased question
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length").input_ids
    model_inputs["labels"] = labels
    return model_inputs

In [9]:
# Apply preprocessing to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/404290 [00:00<?, ? examples/s]

In [10]:
from datasets import DatasetDict
train_dataset = tokenized_dataset["train"]

# Split the 'train' dataset into train and validation sets (80% train, 20% validation)
train_test_split = train_dataset.train_test_split(test_size=0.2)

# Create a new DatasetDict with 'train' and 'validation' splits
tokenized_dataset_ = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test']
})

In [11]:
# Prepare training and validation data
train_dataset = tokenized_dataset_["train"].shuffle(seed=42).select(range(10000))  # Take a subset for quick training
validation_dataset = tokenized_dataset_["validation"]

In [12]:
# Define PyTorch dataloader
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(validation_dataset, batch_size=8)

In [13]:
# Define optimizer
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)




In [14]:
# Training loop
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in tqdm(range(1)):  # Train for 3 epochs
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = torch.stack(batch['input_ids']).to(device)
        attention_mask = torch.stack(batch['attention_mask']).to(device)
        labels = torch.stack(batch['labels']).to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")



 33%|███▎      | 1/3 [10:22<20:45, 622.73s/it]

Epoch 1, Loss: 0.219134122133255


 67%|██████▋   | 2/3 [20:53<10:27, 627.60s/it]

Epoch 2, Loss: 0.2358573079109192


100%|██████████| 3/3 [31:24<00:00, 628.14s/it]

Epoch 3, Loss: 0.4410006105899811





In [15]:
# Evaluation
model.eval()
for batch in val_dataloader:
    input_ids = torch.stack(batch['input_ids']).to(device)
    attention_mask = torch.stack(batch['attention_mask']).to(device)

    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=512)
    decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    print(decoded_outputs[:5])  # Print the first 5 generated paraphrases



KeyboardInterrupt: 

In [25]:
# Inference Function
def paraphrase_text(input_text, min_length=0.8):
    model.eval()
    input_text = "paraphrase: " + input_text
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    # print(inputs)
    # Generate paraphrase
    outputs = model.generate(**inputs, max_length=512)
    print(outputs)
    paraphrase = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return paraphrase
    # # Length check (80% constraint)
    # if len(paraphrase.split()) < min_length * len(input_text.split()):
    #     print(f"Paraphrase is shorter than 80% of the input length.")
    # return paraphrase



In [27]:
# Example usage
input_text = """A cover letter is a formal document that accompanies your resume when you apply for a job. It serves as
an introduction and provides additional context for your application. Here’s a breakdown of its various
aspects:
Purpose
The primary purpose of a cover letter is to introduce yourself to the hiring manager and to provide context
for your resume. It allows you to elaborate on your qualifications, skills, and experiences in a way that
your resume may not fully capture. It’s also an opportunity to express your enthusiasm for the role and the
company, and to explain why you would be a good fit.
Content
A typical cover letter includes the following sections:
1. Header: Includes your contact information, the date, and the employer’s contact information.
2. Salutation: A greeting to the hiring manager, preferably personalized with their name.
3. Introduction: Briefly introduces who you are and the position you’re applying for.
4. Body: This is the core of your cover letter where you discuss your qualifications, experiences, and
skills that make you suitable for the job. You can also mention how you can contribute to the company.
5. Conclusion: Summarizes your points and reiterates your enthusiasm for the role. You can also include
a call to action, like asking for an interview.
6. Signature: A polite closing (“Sincerely,” “Best regards,” etc.) followed by your name.
Significance in the Job Application Process
The cover letter is often the first document that a hiring manager will read, so it sets the tone for your
entire application. It provides you with a chance to stand out among other applicants and to make a
strong first impression. Some employers specifically require a cover letter, and failing to include one could
result in your application being disregarded.
In summary, a cover letter is an essential component of a job application that serves to introduce you,
elaborate on your qualifications, and make a compelling case for why you should be considered for the
position."""
paraphrased_text = paraphrase_text(input_text)
print(f"Original: {input_text}")
print(f"Paraphrase: {paraphrased_text}")

In [None]:
chatgpt_output = """A cover letter is a formal document that accompanies your resume when applying for a job. It serves as an introduction and offers extra details about your application. Here's an overview of its key components:

Purpose
The main goal of a cover letter is to introduce yourself to the hiring manager and provide background on your resume. It allows you to expand on your qualifications, skills, and experience, highlighting things that your resume alone might not cover. It's also a chance to convey your enthusiasm for the position and explain why you're a great fit for the company.

Content
A standard cover letter includes the following elements:

Header: Contains your contact details, the date, and the employer's information.
Salutation: A greeting directed to the hiring manager, ideally personalized with their name.
Introduction: Briefly explains who you are and the job you are applying for.
Body: This section highlights your relevant qualifications, experience, and skills that make you a strong candidate for the role. You can also describe how you can add value to the company.
Conclusion: Summarizes your key points, reiterates your interest in the role, and may include a call to action, such as requesting an interview.
Signature: Ends with a courteous closing phrase (e.g., "Sincerely," "Best regards") followed by your name.
Importance in the Application Process
Since a cover letter is often the first document a hiring manager reviews, it shapes the initial impression of your application. It gives you an opportunity to distinguish yourself from other candidates and create a positive impression. Some employers specifically ask for a cover letter, and neglecting to include one could result in your application being overlooked.

In conclusion, a cover letter is a vital part of the job application process, helping you introduce yourself, emphasize your qualifications, and present a strong case for being considered for the role."""

We can evaluate the quality of the generated paraphrases using text similarity metrics such as BLEU, ROUGE, and METEOR. These metrics compare the generated text to reference text based on n-gram overlaps and other linguistic features.


**BLEU Score:**

`sentence_bleu()` computes the BLEU score by comparing the n-grams in the reference text and the hypothesis (paraphrased text).
We use the SmoothingFunction to avoid very low BLEU scores for short sentences.

**ROUGE Score:**

`rouge_scorer.RougeScorer` computes ROUGE-1, ROUGE-2, and ROUGE-L scores, which represent unigram, bigram, and longest common subsequence overlaps, respectively.

**METEOR Score:**

`nltk.meteor_score()` computes the METEOR score, which accounts for synonymy and stemming, making it more flexible than BLEU.

In [None]:
%%capture
!pip install nltk rouge-score
import nltk
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score

nltk.download('wordnet')  # For METEOR


In [None]:
# BLEU Score Calculation
def compute_bleu(reference, hypothesis):
    reference = [reference.split()]  # BLEU expects references as a list of lists
    hypothesis = hypothesis.split()
    smoothie = SmoothingFunction().method4  # To avoid zero scores for small samples
    bleu = sentence_bleu(reference, hypothesis, smoothing_function=smoothie)
    return bleu

# ROUGE Score Calculation
def compute_rouge(reference, hypothesis):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, hypothesis)
    return scores

# METEOR Score Calculation
def compute_meteor(reference, hypothesis):
    meteor = meteor_score([reference], hypothesis)
    return meteor



In [None]:
# Compute metrics for our model
reference_text = input_text
generated_paraphrase = paraphrased_text

# Evaluate Paraphrase Quality
bleu = compute_bleu(reference_text, generated_paraphrase)
rouge_scores = compute_rouge(reference_text, generated_paraphrase)
meteor = compute_meteor(reference_text, generated_paraphrase)

# Print evaluation results
print("\nEvaluation Results for trained model:")
print(f"BLEU Score: {bleu:.4f}")
print(f"ROUGE-1: {rouge_scores['rouge1'].fmeasure:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2'].fmeasure:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL'].fmeasure:.4f}")
print(f"METEOR Score: {meteor:.4f}")


In [None]:
# Compute metrics for our model
reference_text = input_text
generated_paraphrase = chatgpt_output

# Evaluate Paraphrase Quality
bleu = compute_bleu(reference_text, generated_paraphrase)
rouge_scores = compute_rouge(reference_text, generated_paraphrase)
meteor = compute_meteor(reference_text, generated_paraphrase)

# Print evaluation results
print("\nEvaluation Results for chatgpt:")
print(f"BLEU Score: {bleu:.4f}")
print(f"ROUGE-1: {rouge_scores['rouge1'].fmeasure:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge2'].fmeasure:.4f}")
print(f"ROUGE-L: {rouge_scores['rougeL'].fmeasure:.4f}")
print(f"METEOR Score: {meteor:.4f}")
