<a href="https://colab.research.google.com/github/vishnushukla1729/GenAI---BERT-and-T5-Summariation/blob/main/Dialougue_Summarization_GPT_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Cell A: Build a tiny in-memory corpus as a plain Python list of paragraph strings.
# Keeping the data inline avoids any dataset loader (the original motivation was to
# bypass pyarrow issues). Five examples are enough for a demonstration; prepare_data()
# later splits each paragraph in half to form an (input, target) pair.

dummy_paragraphs = [
    "Machine learning is a method of data analysis that automates analytical model building. It is a branch of artificial intelligence based on the idea that systems can learn from data, identify patterns and make decisions with minimal human intervention.",
    "Deep learning is a subset of machine learning that uses neural networks with many layers. It learns representations of data with multiple levels of abstraction. It has been applied to many fields including speech recognition, computer vision, and natural language processing.",
    "Natural language processing enables computers to understand, interpret and generate human language in a valuable way. NLP combines computational linguistics—rule-based modeling of human language—with statistical, machine learning, and deep learning models.",
    "Reinforcement learning is an area of machine learning concerned with how agents ought to take actions in an environment in order to maximize some notion of cumulative reward. It is employed in various applications such as gaming and robotics.",
    "Supervised learning is the machine learning task of learning a function that maps an input to an output based on example input-output pairs. It infers a function from labeled training data consisting of a set of training examples."
]

# Report the corpus size so the cell output confirms the data was created.
print('Dummy summarization dataset created with', len(dummy_paragraphs), 'examples.')

# Cell B: Build vocabulary and prepare data_pairs from dummy_paragraphs
import re

def build_vocab(texts, vocab_size=5000):
    r"""Build word<->index lookup tables from an iterable of strings.

    Each text is lower-cased and split into ``\w+`` runs. The most frequent
    words receive indices starting at 2; index 0 is reserved for ``<PAD>``
    and index 1 for ``<UNK>``.

    Args:
        texts: Iterable of strings to count words from.
        vocab_size: Upper bound on the table size, including the two special
            tokens. Values below 2 yield a table holding only the specials.

    Returns:
        A ``(word2idx, idx2word)`` tuple of dicts that are inverses of each
        other.
    """
    from collections import Counter

    word_counts = Counter()
    for text in texts:
        word_counts.update(re.findall(r'\w+', text.lower()))

    # Two slots are reserved for the special tokens. Clamp at zero: with
    # vocab_size < 2 a negative slice bound would keep all-but-the-last
    # words instead of none.
    keep = max(vocab_size - 2, 0)
    # sorted() is stable, so equally frequent words keep first-seen order.
    sorted_words = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)[:keep]

    word2idx = {word: idx + 2 for idx, (word, _count) in enumerate(sorted_words)}
    word2idx['<PAD>'] = 0
    word2idx['<UNK>'] = 1
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

# Build the lookup tables from the whole dummy corpus (table capped at 5000 entries).
word2idx, idx2word = build_vocab(dummy_paragraphs, vocab_size=5000)


def tokenize(text, word2idx, max_length=128):
    r"""Convert *text* into a fixed-length list of vocabulary indices.

    The text is lower-cased and split into ``\w+`` runs. Words missing from
    *word2idx* map to the ``<UNK>`` index. The result is right-padded with
    the ``<PAD>`` index, or cut off, so it has exactly *max_length* entries.
    """
    ids = [word2idx.get(w, word2idx['<UNK>']) for w in re.findall(r'\w+', text.lower())]
    shortfall = max_length - len(ids)
    if shortfall <= 0:
        # Already long enough: keep only the leading max_length ids.
        return ids[:max_length]
    return ids + [word2idx['<PAD>']] * shortfall


def prepare_data(text, word2idx, max_length=128):
    """Split *text* at its word midpoint into an (input, target) id pair.

    The first half of the words becomes the model input and the second half
    becomes the target. Each half is tokenized and padded/truncated to
    *max_length* by tokenize().

    Returns:
        A ``(input_ids, target_ids)`` tuple of equal-length lists.
    """
    tokens = re.findall(r'\w+', text.lower())
    midpoint = len(tokens) // 2
    first_half, second_half = tokens[:midpoint], tokens[midpoint:]
    return (
        tokenize(' '.join(first_half), word2idx, max_length),
        tokenize(' '.join(second_half), word2idx, max_length),
    )


# One (input_ids, target_ids) tuple per paragraph, each side fixed at 128 ids.
data_pairs = [prepare_data(text, word2idx, max_length=128) for text in dummy_paragraphs]
# Show the first 20 input ids of the first example as a sanity check.
print('Data pairs prepared. Sample input_ids:', data_pairs[0][0][:20])

# Cell C: Define the SimpleGPT model (using a small architecture as before)
import torch
import torch.nn as nn

class SimpleGPT(nn.Module):
    """Small transformer model that maps token ids to per-position vocabulary logits.

    Architecture: token embedding + learned position embedding, dropout, a stack
    of ``nn.TransformerEncoderLayer`` blocks, and a bias-free linear output head.
    forward() passes no attention or padding mask to the encoder. The position
    table has ``max_seq_length`` rows, so inputs must not be longer than that.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, max_seq_length, dropout=0.1):
        super().__init__()
        # Plain configuration values kept on the instance for reference.
        self.embed_dim = embed_dim
        self.max_seq_length = max_seq_length

        # Sub-modules are created in a fixed order so parameter registration is stable.
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_seq_length, embed_dim)
        self.dropout = nn.Dropout(dropout)
        block = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
        )
        self.transformer = nn.TransformerEncoder(block, num_layers=num_layers)
        self.lm_head = nn.Linear(embed_dim, vocab_size, bias=False)

    def forward(self, x):
        """Return logits of shape (batch, seq_length, vocab_size) for ids *x* of shape (batch, seq_length)."""
        seq_len = x.size(1)
        # (1, seq_length) position ids, broadcast over the batch when added below.
        pos_ids = torch.arange(seq_len, device=x.device)[None, :]
        hidden = self.dropout(self.token_embedding(x) + self.position_embedding(pos_ids))
        # The encoder was built without batch_first, so it takes (seq_length, batch, embed_dim).
        hidden = self.transformer(hidden.permute(1, 0, 2)).permute(1, 0, 2)
        return self.lm_head(hidden)

# Hyperparameters for the demo model. vocab_size comes from the table built above so the
# embedding and output head cover every index that tokenize() can emit.
vocab_size = len(word2idx)
embed_dim = 256
num_heads = 4
hidden_dim = 512
num_layers = 2  # fewer layers since data is small
max_seq_length = 128  # same value as the max_length passed to prepare_data above

# Instantiate the model with our vocabulary size
model = SimpleGPT(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, max_seq_length)
print('SimpleGPT model defined.')

# Cell D: Train the model on our dummy summarization dataset
import torch.optim as optim

# Stack the tokenized pairs into two LongTensors, one row per example.
inputs_tensor = torch.tensor([src for src, _ in data_pairs], dtype=torch.long)
targets_tensor = torch.tensor([tgt for _, tgt in data_pairs], dtype=torch.long)

batch_size = 2
num_samples = inputs_tensor.size(0)
num_epochs = 5

# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=1e-3)

model.train()

for epoch in range(num_epochs):
    # Running sum of the per-batch losses for this epoch.
    epoch_loss = 0.0
    for start in range(0, num_samples, batch_size):
        stop = start + batch_size
        inp_batch, tar_batch = inputs_tensor[start:stop], targets_tensor[start:stop]

        optimizer.zero_grad()
        outputs = model(inp_batch)  # (batch, seq_length, vocab_size)
        # Flatten batch and sequence dimensions so each position is one classification.
        flat_logits = outputs.view(-1, outputs.size(-1))
        flat_targets = tar_batch.view(-1)
        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    print(f'Epoch {epoch+1} Loss: {epoch_loss}')

print('Model trained on dummy summarization dataset.')

# Cell E: Generate summary using the trained model
model.eval()

# Feed the first training example through the model as a batch of one.
demo_input = torch.tensor([data_pairs[0][0]], dtype=torch.long)
with torch.no_grad():
    out_logits = model(demo_input)  # (1, seq_length, vocab_size)

# Single forward pass: pick the highest-scoring token at every position.
out_tokens = out_logits.argmax(dim=-1)[0].tolist()

# Map ids back to words and keep only what precedes the first <PAD>.
generated_words = [idx2word.get(token, '<UNK>') for token in out_tokens]
if '<PAD>' in generated_words:
    generated_words = generated_words[:generated_words.index('<PAD>')]

generated_summary = ' '.join(generated_words)


# End of demonstration.

Dummy summarization dataset created with 5 examples.
Data pairs prepared. Sample input_ids: [6, 3, 5, 4, 29, 2, 11, 30, 12, 31, 32, 33, 34, 7, 5, 4, 35, 2, 36, 0]
SimpleGPT model defined.




Epoch 1 Loss: 15.011993885040283
Epoch 2 Loss: 11.237849950790405
Epoch 3 Loss: 9.115229606628418
Epoch 4 Loss: 6.854491233825684
Epoch 5 Loss: 4.865272879600525
Model trained on dummy summarization dataset.


In [15]:
# Display the text decoded from the model's per-position argmax predictions.
print('Generated summary:\n')
print(generated_summary)

Generated summary:

intelligence example on the idea it systems can learn from data identify patterns and make decisions with minimal human examples processing examples examples examples processing processing examples of examples examples examples examples with examples examples of examples examples examples examples examples processing intervention processing examples examples processing examples examples examples examples examples examples examples examples examples examples intervention examples examples examples examples examples examples examples intervention examples examples intervention examples with examples examples examples examples intervention intervention examples with examples examples intervention examples intervention intervention examples intervention intervention examples examples examples examples examples in intervention examples examples examples examples examples examples examples processing examples examples intervention examples examples examples data examples 

In [26]:
from transformers import pipeline

# Load GPT-2 model for text generation (dialogue generation)
dialogue_generator = pipeline("text-generation", model="gpt2", tokenizer="gpt2")

# Sample dialogue prompt
dialogue_prompt = "User: Hi, how are you doing today , I hope you are enjoying session ? \n Assistant:"

# Generate dialogue.
# truncation=True and an explicit pad_token_id spell out the fallbacks the library
# otherwise applies with warnings ("Truncation was not explicitly activated ..." and
# "Setting `pad_token_id` to `eos_token_id`"), so the choices are visible in the code.
generated_dialogue = dialogue_generator(
    dialogue_prompt,
    max_length=100,
    num_return_sequences=1,
    truncation=True,
    pad_token_id=dialogue_generator.tokenizer.eos_token_id,
)
print("Generated Dialogue:")
# The pipeline returns one dict per returned sequence; print the text of the first.
print(generated_dialogue[0]['generated_text'])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Dialogue:
User: Hi, how are you doing today , I hope you are enjoying session ? 
 Assistant: Ok

: Hi, how all is done today, I hope you are enjoying session?  Assistant: 

: Hey, here's the best way I know how:

Interviewed by:

1. ikr_1

2. ikr_2

3. ikr_2

4. ikr_2


In [19]:
from transformers import pipeline

# Load a pretrained summarization pipeline (BART model)
summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn")

# Sample long paragraph for summarization
# (adjacent string literals are concatenated into a single string)
long_paragraph = (
    "Artificial intelligence (AI) has seen tremendous growth over the past decade, "
    "leading to significant advancements in a multitude of fields, including natural language "
    "processing, computer vision, and robotics. These advancements have been driven by the "
    "development of deep learning algorithms, which are capable of processing vast amounts of data "
    "and making complex inferences. As a result, AI-powered technologies are being deployed in various "
    "applications, such as self-driving cars, voice-controlled assistants, and recommendation systems. "
    "The continued evolution of AI promises to further transform industries and create new opportunities "
    "in research and innovation."
)

# Generate summary
# NOTE: this rebinds `generated_summary`, which an earlier cell assigned a plain string;
# here it holds the pipeline's result, indexed below as a list of dicts.
generated_summary = summarizer(long_paragraph, max_length=60, min_length=30, do_sample=False)
print("Generated Summary:")
print(generated_summary[0]['summary_text'])

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


Generated Summary:
Artificial intelligence (AI) has seen tremendous growth over the past decade. AI-powered technologies are being deployed in various applications, such as self-driving cars and voice-controlled assistants.


In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Use a smaller open-source model for local execution
MODEL_NAME = "gpt2-medium"  # Alternatives: "gpt2-large", "EleutherAI/gpt-j-6B", or "EleutherAI/gpt-neox-20b"

# Load tokenizer and model
# NOTE: these are module-level names; generate_dialogue() and summarize_text() read them
# as globals, and `model` is the same name other cells use for the SimpleGPT instance.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Set pad token: reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token


In [10]:
def generate_dialogue(prompt, max_length=20):
    """Continue *prompt* with the module-level causal LM and return the decoded text.

    Uses the global ``tokenizer`` and ``model``. The returned string is the
    decoded first generated sequence with special tokens removed.

    Args:
        prompt: Text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The decoded text of the first generated sequence.
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        # Name the padding id explicitly. Without it the library falls back to the
        # EOS id on every call and logs "Setting `pad_token_id` to `eos_token_id`".
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke-test the helper with a short greeting prompt.
print("Dialogue Response:", generate_dialogue("Hello, how are you?"))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Dialogue Response: Hello, how are you?

I'm fine.

I'm fine.




In [13]:
def summarize_text(text, max_length=150):
    """Generate a beam-search continuation of *text* with the module-level causal LM.

    Uses the global ``tokenizer`` and ``model``. The input is truncated to
    1024 tokens before generation, and the first generated sequence is
    decoded and returned with special tokens removed.

    Args:
        text: Source text fed to the model.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The decoded text of the first generated sequence.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        # Forward the mask produced by the tokenizer. Omitting it triggers the
        # "attention mask and the pad token id were not set" warning.
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=50,
        num_beams=4,
        # Same id the library would otherwise fall back to, with a warning.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Run the helper on a sample paragraph and print whatever text it returns.
print("Summary:", summarize_text("A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. Almost every piece of writing you do that is longer than a few sentences should be organized into paragraphs. This is because paragraphs show a reader where the subdivisions of an essay begin and end, and thus help the reader see the organization of the essay and grasp its main points."


))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Summary: A paragraph is a series of sentences that are organized and coherent, and are all related to a single topic. Almost every piece of writing you do that is longer than a few sentences should be organized into paragraphs. This is because paragraphs show a reader where the subdivisions of an essay begin and end, and thus help the reader see the organization of the essay and grasp its main points.

Punctuation

Punctuation is used to mark the beginning and end of a sentence. It is also used to mark the beginning and end of a paragraph.

Punctuation is used to mark the beginning and end of a sentence. It is also used to mark the beginning and end of a paragraph.

Punctuation is


In [16]:
# Dummy dialogue dataset (context-response pairs)
# Each entry is a (user_prompt, bot_reply) tuple of plain strings.
dummy_dialogues = [
    ("Hello, how are you?", "I'm good! How about you?"),
    ("What is AI?", "AI stands for Artificial Intelligence. It enables machines to learn and make decisions."),
    ("Tell me a joke.", "Why did the computer catch a cold? Because it had too many windows open!"),
    ("What is your favorite color?", "I like blue because it's calming."),
    ("Who invented the light bulb?", "Thomas Edison is credited with inventing the first practical light bulb.")
]

# Report the dataset size so the cell output confirms the data was created.
print('Dummy dialogue dataset created with', len(dummy_dialogues), 'examples.')


Dummy dialogue dataset created with 5 examples.


In [17]:
import re

# Function to build a vocabulary
def build_vocab(texts, vocab_size=5000):
    r"""Create ``word2idx`` / ``idx2word`` tables from an iterable of strings.

    Texts are lower-cased and split into ``\w+`` runs. Words are ranked by
    frequency and numbered from 2 upward; ``<PAD>`` is always index 0 and
    ``<UNK>`` is always index 1.

    Args:
        texts: Iterable of strings to count words from.
        vocab_size: Maximum table size, counting the two special tokens.
            Anything below 2 produces a table with only the special tokens.

    Returns:
        ``(word2idx, idx2word)``, two dicts that invert each other.
    """
    from collections import Counter

    frequencies = Counter()
    for text in texts:
        frequencies.update(re.findall(r'\w+', text.lower()))

    # Reserve two slots for <PAD>/<UNK>. Never let the bound go negative: a
    # negative slice end would drop only the trailing words rather than all.
    word_budget = max(vocab_size - 2, 0)
    # Stable sort: ties in frequency keep the order in which words first appeared.
    ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)[:word_budget]

    word2idx = {word: rank + 2 for rank, (word, _count) in enumerate(ranked)}
    word2idx['<PAD>'] = 0
    word2idx['<UNK>'] = 1
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

# Combine input and responses for vocab building, so words from both sides of every
# pair get an index.
all_texts = [pair[0] + " " + pair[1] for pair in dummy_dialogues]
word2idx, idx2word = build_vocab(all_texts, vocab_size=5000)

# Function to tokenize a sentence
def tokenize(text, word2idx, max_length=64):
    r"""Encode *text* as exactly *max_length* vocabulary indices.

    Lower-cases the text, splits it into ``\w+`` runs and looks each word up
    in *word2idx*, using the ``<UNK>`` index for unknown words. Short results
    are right-padded with the ``<PAD>`` index; long ones are truncated.
    """
    tokens = []
    for word in re.findall(r'\w+', text.lower()):
        tokens.append(word2idx.get(word, word2idx['<UNK>']))

    missing = max_length - len(tokens)
    if missing > 0:
        tokens.extend([word2idx['<PAD>']] * missing)
        return tokens
    return tokens[:max_length]

# Prepare (input-response) pairs
def prepare_data(input_text, response_text, word2idx, max_length=64):
    """Tokenize one dialogue turn pair.

    Both strings go through tokenize() with the same vocabulary and
    *max_length*.

    Returns:
        A ``(input_ids, response_ids)`` tuple of equal-length lists.
    """
    return tuple(
        tokenize(utterance, word2idx, max_length)
        for utterance in (input_text, response_text)
    )

# Convert dataset to tokenized format: one (input_ids, response_ids) tuple per dialogue,
# using prepare_data's default max_length.
data_pairs = [prepare_data(inp, resp, word2idx) for inp, resp in dummy_dialogues]
# Show the first 10 input ids of the first example as a sanity check.
print('Dialogue data prepared. Sample input tokens:', data_pairs[0][0][:10])


Dialogue data prepared. Sample input tokens: [14, 5, 15, 6, 0, 0, 0, 0, 0, 0]


In [18]:
import torch
import torch.nn as nn

class SimpleGPT(nn.Module):
    """Compact transformer that turns a batch of token ids into per-position logits.

    Token and learned position embeddings are summed, passed through dropout
    and a stack of ``nn.TransformerEncoderLayer`` blocks, then projected to the
    vocabulary by a bias-free linear layer. The encoder is called without any
    mask. Inputs may be at most ``max_seq_length`` tokens long, because that is
    the size of the position table.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, max_seq_length, dropout=0.1):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_seq_length, embed_dim)
        self.dropout = nn.Dropout(dropout)
        layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=dropout,
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.lm_head = nn.Linear(embed_dim, vocab_size, bias=False)

    def forward(self, x):
        """Map ids *x* of shape (batch, seq_length) to logits of shape (batch, seq_length, vocab_size)."""
        # Row vector of position ids, shape (1, seq_length); broadcasts across the batch.
        pos_ids = torch.arange(x.size(1), device=x.device)[None, :]
        embedded = self.token_embedding(x) + self.position_embedding(pos_ids)
        # The encoder was built without batch_first, so swap to (seq_length, batch, embed_dim).
        seq_first = self.dropout(embedded).permute(1, 0, 2)
        encoded = self.transformer(seq_first)
        batch_first = encoded.permute(1, 0, 2)
        return self.lm_head(batch_first)

# Model hyperparameters
vocab_size = len(word2idx)  # covers every index tokenize() can emit, including <PAD>/<UNK>
embed_dim = 256
num_heads = 4
hidden_dim = 512
num_layers = 2
max_seq_length = 64  # same value as the default max_length of tokenize/prepare_data above

# Instantiate the model
model = SimpleGPT(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, max_seq_length)
print('SimpleGPT model defined.')


SimpleGPT model defined.


In [19]:
import torch.optim as optim

# Stack the tokenized prompts and replies into two LongTensors, one row per dialogue.
inputs_tensor = torch.tensor([prompt_ids for prompt_ids, _ in data_pairs], dtype=torch.long)
responses_tensor = torch.tensor([reply_ids for _, reply_ids in data_pairs], dtype=torch.long)

batch_size = 2
num_samples = inputs_tensor.size(0)
num_epochs = 5

# Padding positions in the reply do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=1e-3)

model.train()

for epoch in range(num_epochs):
    # Running sum of the per-batch losses for this epoch.
    epoch_loss = 0.0
    # split() yields consecutive row chunks of batch_size (the last one may be smaller).
    batches = zip(inputs_tensor.split(batch_size), responses_tensor.split(batch_size))
    for inp_batch, tar_batch in batches:
        optimizer.zero_grad()
        outputs = model(inp_batch)  # (batch, seq_length, vocab_size)
        # Treat every position as an independent classification over the vocabulary.
        flat_logits = outputs.view(-1, outputs.size(-1))
        loss = criterion(flat_logits, tar_batch.view(-1))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f'Epoch {epoch+1} Loss: {epoch_loss:.4f}')

print('Model training completed.')


Epoch 1 Loss: 13.3755
Epoch 2 Loss: 9.4343
Epoch 3 Loss: 7.1032
Epoch 4 Loss: 5.0579
Epoch 5 Loss: 3.6558
Model training completed.


In [21]:
# Put the model in evaluation mode before generating responses.
model.eval()

def generate_response(input_text, model, word2idx, idx2word, max_length=20):
    """Produce a reply string for *input_text* from one forward pass of *model*.

    The input is tokenized and padded/truncated to *max_length*, run through
    the model as a batch of one, and the highest-scoring token is taken at
    every position. Ids are mapped back to words through *idx2word* (unknown
    ids become ``<UNK>``), and the output stops at the first ``<PAD>``.

    Returns:
        The predicted words joined by single spaces.
    """
    token_ids = tokenize(input_text, word2idx, max_length)
    batch = torch.tensor([token_ids], dtype=torch.long)  # (1, seq_length)

    with torch.no_grad():
        logits = model(batch)  # (1, seq_length, vocab_size)

    predicted_ids = logits.argmax(dim=-1)[0].tolist()

    # Map ids to words, then keep only what precedes the first <PAD>.
    words = [idx2word.get(token_id, '<UNK>') for token_id in predicted_ids]
    if '<PAD>' in words:
        words = words[:words.index('<PAD>')]
    return ' '.join(words)

# Test the model on a prompt that also appears in dummy_dialogues.
test_input = "What is AI?"
response = generate_response(test_input, model, word2idx, idx2word)
print("Bot Response:", response)


Bot Response: i stands for artificial intelligence inventing enables machines practical light bulb make decisions open machines artificial make calming and machines
