In [4]:
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import pandas as pd

In [6]:
df = pd.read_csv(r"D:\CloudSEK\train.csv")

In [37]:
df = df.drop_duplicates()
df.dropna(inplace = True)
unique_categories = list(df['Category'].unique())
l =[]
for i in (unique_categories):
    a = df[df['Category'] == i]
    a = a.sample(frac=0.20, random_state=42)
    l.append(a)

df = pd.concat(l)
df.reset_index(inplace = True)
df.drop(columns =  'index',inplace = True)
df.dropna(inplace = True)

In [38]:
# Tokenize and preprocess the dummy data
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
input_sequences = []
target_sequences = []

In [39]:
df.iloc[0]

Headline    Couple Married For 68 Years Dies In Colorado W...
Category                                            U.S. NEWS
Summary     A relative said they were found in each other'...
Name: 0, dtype: object

In [40]:
tokenizer.pad_token = tokenizer.eos_token

In [41]:
# Define the device for the data tensors
device = 'cuda'

# Tokenize and preprocess the dummy data on the GPU
input_sequences = []
target_sequences = []
for i in range(len(df)):
    input_text = df.iloc[i]['Summary']
    target_text = df.iloc[i]['Headline']
    input_encoded = tokenizer.encode(input_text, truncation=True, padding='max_length', max_length=50, return_tensors='pt').to(device)
    target_encoded = tokenizer.encode(target_text, truncation=True, padding='max_length', max_length=50, return_tensors='pt').to(device)
    input_sequences.append(input_encoded)
    target_sequences.append(target_encoded)

In [42]:
# Create DataLoader for training with tensors on GPU
train_dataset = TensorDataset(
    torch.cat(input_sequences).to('cuda'),  # Move input sequences to GPU
    torch.cat(target_sequences).to('cuda')  # Move target sequences to GPU
)

batch_size = 8  # Small batch size for the small dataset
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [43]:
# Load pre-trained GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50257. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(50257, 768)

In [44]:
class HeadlineGenerator(nn.Module):
    def __init__(self, base_model):
        super(HeadlineGenerator, self).__init__()
        self.base_model = base_model.to("cuda")
        self.lm_head = nn.Linear(base_model.config.hidden_size, base_model.config.vocab_size, bias=True)

    def forward(self, input_ids, attention_mask, labels=None):
        # Forward pass for headline generation
        outputs = self.base_model(
            input_ids=input_ids.to('cuda'),
            attention_mask=attention_mask.to('cuda'),
            labels=labels.to('cuda')  # Pass labels here
        )
        return outputs

In [45]:
# Instantiate the custom model
headline_generator_model = HeadlineGenerator(model)

In [46]:
# Define optimizer and learning rate scheduler
optimizer = AdamW(headline_generator_model.parameters(), lr=1e-4)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader))



In [None]:
# Training loop
num_epochs = 3  # Small number of epochs for the example
for epoch in range(num_epochs):
    headline_generator_model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
        # Unpack the batch
        # Move input and target tensors to GPU inside the training loop
        input_ids_batch, target_ids_batch = batch
        input_ids_batch = input_ids_batch.to('cuda')
        target_ids_batch = target_ids_batch.to('cuda')

        # Forward pass, compute loss
        outputs = headline_generator_model(input_ids_batch, attention_mask=input_ids_batch.ne(0), labels=target_ids_batch)
        loss = outputs.loss

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

Epoch 0: 100%|██████████| 3755/3755 [09:14<00:00,  6.77it/s]
Epoch 1: 100%|██████████| 3755/3755 [09:12<00:00,  6.79it/s]
Epoch 2: 100%|██████████| 3755/3755 [09:11<00:00,  6.80it/s]


In [None]:
# Define the path where you want to save the model
model_save_path = "/content/drive/MyDrive/Colab Notebooks/CloudSEK/headline_generator_model.pth"

# Save the model state dictionary and other necessary information
torch.save({
    'model_state_dict': headline_generator_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
}, model_save_path)

print(f"Model saved to {model_save_path}")

Model saved to /content/drive/MyDrive/Colab Notebooks/CloudSEK/headline_generator_model.pth


In [47]:
loaded_model = HeadlineGenerator(model)
model_save_path = "/content/drive/MyDrive/Colab Notebooks/CloudSEK/headline_generator_model.pth"
# Load the saved model checkpoint
checkpoint = torch.load(model_save_path)

# Load the state dictionaries into the new model
loaded_model.load_state_dict(checkpoint['model_state_dict'])

# Set the model to evaluation mode (important for inference)
loaded_model.eval()

print(f"Model loaded from {model_save_path}")

Model loaded from /content/drive/MyDrive/Colab Notebooks/CloudSEK/headline_generator_model.pth


In [48]:
# Set the device to 'cuda' if a GPU is available, else use 'cpu'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)

HeadlineGenerator(
  (base_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  )
  (lm_head): Linear(in_fe

In [7]:
df.iloc[3124]['Summary']

'Pontins kept a list of Irish last names in an attempt to keep out members of the Irish Traveler community, the U.K. equalities watchdog said.'

In [8]:
df.iloc[3124]['Headline']

"UK Resort Chain Created An 'Undesirable Guests' List -- Featuring All Irish Names"

In [59]:
def generate_headline(loaded_model, tokenizer, input_text, max_length=50):
    # Manually pad the input text with spaces on the left
    while len(input_text) < max_length:
        input_text = " " + input_text

    # Encode the input text
    input_encoded = tokenizer.encode(input_text, truncation=True, max_length=max_length, return_tensors='pt').to(device)
    tokenizer.pad_token_id = tokenizer.eos_token_id
    # Create an attention mask to ignore padding tokens
    attention_mask = input_encoded != tokenizer.pad_token_id

    # Generate a headline using the original GPT-2 model (base_model)
    with torch.no_grad():
        output = loaded_model.base_model.generate(input_encoded, attention_mask=attention_mask, max_length=max_length, num_return_sequences=1, no_repeat_ngram_size=2, top_k=50, top_p=0.95)

    # Decode the generated headline and return it as a string
    generated_headline = tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_headline

# Example usage:
input_summary = df.iloc[314]['Summary']
generated_headline = generate_headline(loaded_model, tokenizer, input_summary)
print("Generated Headline:", generated_headline)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Headline: The former 'Tonight Show' host returned once again to do his shtick.


In [65]:
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_cosine_similarity(reference, candidate):
    """
    Calculate cosine similarity between a reference and a candidate sentence.

    Args:
        reference (str): The reference headline.
        candidate (str): The generated headline.

    Returns:
        float: The cosine similarity score.
    """
    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Fit and transform the vectorizer on the reference and candidate
    tfidf_matrix = vectorizer.fit_transform([reference, candidate])

    # Calculate the cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])

    return cosine_sim[0][0]

# Assuming you have a test dataset with summaries and reference headlines

# Initialize variables to keep track of evaluation results
cosine_similarity_scores = []

for i in range(1000):
    input_summary = df.iloc[i]["Summary"]
    reference_headline = df.iloc[i]["Headline"]

    # Generate a headline
    generated_headline = generate_headline(loaded_model, tokenizer, input_summary)

    # Calculate BLEU score for this sample
    cosine_similarity_score = calculate_cosine_similarity(reference_headline, generated_headline)

    cosine_similarity_scores.append(cosine_similarity_score)

print("Cosine Similarity Scores:", cosine_similarity_scores)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Cosine Similarity Scores: [0.04723062045463175, 0.0, 0.0, 0.0, 0.0867205950866215, 0.0, 0.06997254341808111, 0.03291927102997832, 0.23790309463326234, 0.09269789668627061, 0.17496389928471745, 0.14443048485126755, 0.08926582357187389, 0.050656087645095244, 0.09429894142444722, 0.03673406111884345, 0.0, 0.03894906299301254, 0.13947951652652732, 0.03371450987326244, 0.0693959471906616, 0.04604360647426782, 0.03918461513531269, 0.16036743554083258, 0.08288681245523305, 0.0, 0.15062714099404245, 0.05961489684039471, 0.0396627494204973, 0.0, 0.03461018129193951, 0.0, 0.0, 0.19228885214978728, 0.0, 0.0, 0.15272516469408268, 0.0, 0.0, 0.11031569691241044, 0.048679579010973785, 0.04220166890638128, 0.11521554337793126, 0.17389613837457454, 0.06804963087832475, 0.0, 0.058407397662648505, 0.16679989272321652, 0.0, 0.06655924660794857, 0.16499519418432368, 0.0329571777403657, 0.0, 0.039184615135312684, 0.0, 0.05951439576030657, 0.06980396906720633, 0.046135533763374155, 0.09555392669038872, 0.143

In [68]:
sum(cosine_similarity_scores)/len(cosine_similarity_scores)*100

5.926442070659209