## Step 1: Scraping web data using BeautifulSoup

In [1]:
%pip install bs4
import requests
from bs4 import BeautifulSoup

Note: you may need to restart the kernel to use updated packages.


In [2]:
# URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/Pittsburgh"
url2="https://en.wikipedia.org/wiki/History_of_Pittsburgh"

In [3]:
# Send a GET request to the URL
response = requests.get(url)
if response.status_code == 200:
    html_content = response.text
else:
    print("Failed to retrieve the page")
    exit()

In [4]:
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, "html.parser")

In [5]:
# Extract the main content of the page; Wikipedia's content is inside <div id="bodyContent"> or <div id="mw-content-text">
content_div = soup.find("div", id="mw-content-text")
if content_div is None:
    print("Could not find the main content!")
    exit()

In [6]:
# Get all paragraph texts from the content
paragraphs = content_div.find_all("p")
page_text = "\n".join([p.get_text().strip() for p in paragraphs if p.get_text().strip() != ""])

# Print or save the extracted text
print(page_text)

Pittsburgh (/Ààp…™tsb…úÀêr…°/ PITS-burg) is a city in and the county seat of Allegheny County, Pennsylvania, United States. It is the second-most populous city in Pennsylvania (after Philadelphia) and the 68th-most populous city in the U.S., with a population of 302,971 as of the 2020 census. The city is located in southwestern Pennsylvania at the confluence of the Allegheny River and Monongahela River, which combine to form the Ohio River.[7] It anchors the Pittsburgh metropolitan area, which had a population of 2.457 million residents and is the largest metro area in both the Ohio Valley and Appalachia, the second-largest in Pennsylvania, and the 26th-largest in the U.S. Pittsburgh is the principal city of the greater Pittsburgh‚ÄìWeirton‚ÄìSteubenville combined statistical area which includes parts of Ohio and West Virginia.
Pittsburgh is known as "the Steel City" for its dominant role in the history of the U.S. steel industry.[8] It developed as a vital link of the Atlantic coast a

In [7]:
# Optionally, save the extracted text to a file
with open("pittsburgh_wikipedia.txt", "w", encoding="utf-8") as f:
    f.write(page_text)

## Step 2: Converting .txt file into Chunks

In [8]:
def chunk_text(text, chunk_size=300):
    """
    Splits the text into chunks, each containing approximately chunk_size words.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

In [9]:
# Read the text from the file we saved earlier
with open("pittsburgh_wikipedia.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Split the text into chunks of 300 words each
chunks = chunk_text(text, chunk_size=300)

In [10]:
# Let's print out some info about our chunks
print("Total number of chunks:", len(chunks))
print("First chunk:\n", chunks[0])


Total number of chunks: 42
First chunk:
 Pittsburgh (/Ààp…™tsb…úÀêr…°/ PITS-burg) is a city in and the county seat of Allegheny County, Pennsylvania, United States. It is the second-most populous city in Pennsylvania (after Philadelphia) and the 68th-most populous city in the U.S., with a population of 302,971 as of the 2020 census. The city is located in southwestern Pennsylvania at the confluence of the Allegheny River and Monongahela River, which combine to form the Ohio River.[7] It anchors the Pittsburgh metropolitan area, which had a population of 2.457 million residents and is the largest metro area in both the Ohio Valley and Appalachia, the second-largest in Pennsylvania, and the 26th-largest in the U.S. Pittsburgh is the principal city of the greater Pittsburgh‚ÄìWeirton‚ÄìSteubenville combined statistical area which includes parts of Ohio and West Virginia. Pittsburgh is known as "the Steel City" for its dominant role in the history of the U.S. steel industry.[8] It develope

## Step 3: Converting Chunks to Embeddings

In [11]:
%pip install sentence-transformers


Note: you may need to restart the kernel to use updated packages.


In [12]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Encode the chunks into embeddings
chunk_embeddings = model.encode(chunks)

In [14]:
# Let's see the shape of our embeddings
print("Number of chunks:", len(chunk_embeddings))
print("Embedding size for each chunk:", len(chunk_embeddings[0]))
print(chunk_embeddings[0])


Number of chunks: 42
Embedding size for each chunk: 384
[ 6.69733360e-02  9.13290586e-03  4.58831228e-02  5.95601015e-02
 -4.00966182e-02  1.80845428e-02 -7.55067915e-02 -2.86436491e-02
 -5.51363081e-02 -9.17582437e-02 -6.63802922e-02 -9.03211683e-02
  2.78344061e-02  1.20455278e-02 -4.44494896e-02  4.48295958e-02
  6.27688095e-02  5.60258701e-03  8.31092000e-02  5.27451895e-02
 -1.73825864e-02 -6.21511154e-02  3.66939902e-02 -2.30070259e-02
 -2.59499848e-02  7.56240040e-02 -2.11047214e-02  1.07153900e-01
 -3.77437449e-03  4.42617089e-02  3.32766213e-03 -3.95799130e-02
 -5.24191260e-02  2.78451554e-02  6.73767179e-02 -2.08626762e-02
 -7.25351200e-02  5.74528091e-02  4.32409253e-03  6.90284073e-02
 -9.64344293e-03  1.39100011e-02 -8.71037990e-02  5.46867959e-03
  4.81062643e-02  9.01905820e-03 -6.50698915e-02  4.83859926e-02
 -7.85996299e-03 -7.95857832e-02  3.13218683e-02  2.03666482e-02
  2.75037661e-02  4.57434766e-02  3.13159148e-03  1.47119537e-02
  3.88591029e-02  8.17928985e-02 -

## Storing in Vector Database

In [15]:
import numpy as np
import json

# Convert embeddings to numpy array
embeddings = np.array(chunk_embeddings)

# Save the embeddings to a .npy file
np.save('pittsburgh_embeddings.npy', embeddings)

# Create a mapping of index to text chunk
index_to_chunk = {i: chunks[i] for i in range(len(chunks))}

# Save the mapping as a JSON file
with open("index_to_chunk.json", "w") as f:
    json.dump(index_to_chunk, f, indent=4)

print("NumPy embeddings and chunk mapping saved successfully.")




NumPy embeddings and chunk mapping saved successfully.


In [16]:
from sentence_transformers import SentenceTransformer

# Load the embeddings and index-to-chunk mapping
embeddings = np.load('pittsburgh_embeddings.npy')
with open("index_to_chunk.json", "r") as f:
    index_to_chunk = json.load(f)

# Load the same model used for encoding the chunks
model = SentenceTransformer('all-MiniLM-L6-v2')

# User query
query = "Where is Pittsburgh located geographically?"

# Encode the query to an embedding
query_embedding = model.encode([query])

# Compute cosine similarity
cosine_similarities = np.dot(embeddings, query_embedding.T).flatten()

# Get the index of the most similar chunk
most_similar_idx = np.argmax(cosine_similarities)

# Retrieve the corresponding text chunk
most_relevant_chunk = index_to_chunk[str(most_similar_idx)]

print("Most relevant chunk for the query:\n", most_relevant_chunk)



Most relevant chunk for the query:
 Pittsburgh (/Ààp…™tsb…úÀêr…°/ PITS-burg) is a city in and the county seat of Allegheny County, Pennsylvania, United States. It is the second-most populous city in Pennsylvania (after Philadelphia) and the 68th-most populous city in the U.S., with a population of 302,971 as of the 2020 census. The city is located in southwestern Pennsylvania at the confluence of the Allegheny River and Monongahela River, which combine to form the Ohio River.[7] It anchors the Pittsburgh metropolitan area, which had a population of 2.457 million residents and is the largest metro area in both the Ohio Valley and Appalachia, the second-largest in Pennsylvania, and the 26th-largest in the U.S. Pittsburgh is the principal city of the greater Pittsburgh‚ÄìWeirton‚ÄìSteubenville combined statistical area which includes parts of Ohio and West Virginia. Pittsburgh is known as "the Steel City" for its dominant role in the history of the U.S. steel industry.[8] It developed as 

## Feed to Model

In [17]:
%pip install torch transformers
%pip install sentencepiece



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [30]:
import json
import torch
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer

# Load a more powerful T5 model
model_name = "google/flan-t5-large"  # Upgrade from 'base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load embeddings and index-to-chunk mapping
embeddings = np.load('pittsburgh_embeddings.npy')
with open("index_to_chunk.json", "r") as f:
    index_to_chunk = json.load(f)

# Load Sentence Transformer for retrieval
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

# Read questions from questions.txt
with open("data/train/questions.txt", "r", encoding="utf-8") as f:
    questions = [line.strip() for line in f.readlines() if line.strip()]

# Dictionary to store answers
system_output = {}

# Process each question
for idx, question in enumerate(questions, start=1):
    # Encode question
    query_embedding = sentence_model.encode([question])

    # Compute cosine similarity for top-k retrieval
    cosine_similarities = np.dot(embeddings, query_embedding.T).flatten()
    top_k_indices = np.argsort(cosine_similarities)[-5:]  # Retrieve top 3 chunks

    # Combine retrieved chunks
    retrieved_chunks = " ".join([index_to_chunk[str(i)] for i in top_k_indices])

    # **Truncate to fit 512 tokens**
    max_context_tokens = 450
    context_tokens = tokenizer.tokenize(retrieved_chunks)[:max_context_tokens]
    truncated_context = tokenizer.convert_tokens_to_string(context_tokens)

    # Construct prompt
    prompt = f"question: {question} context: {truncated_context}"

    # Tokenize and generate an answer
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    output_ids = model.generate(
        input_ids, 
        max_length=100,  
        num_beams=7,  # More diverse answers
        early_stopping=True
    )
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Store the answer
    system_output[str(idx)] = answer

    print(f"Q: {question}")
    print(f"A: {answer}\n")

# Save all answers to system_output.json
with open("system_output.json", "w") as f:
    json.dump(system_output, f, indent=4)

print("All answers saved in system_output.json.")


Q: Where is Pittsburgh located geographically?
A: on the Allegheny Plateau

Q: What are the major rivers that converge in Pittsburgh?
A: Monongahela

Q: Why is Pittsburgh known as the 'Steel City'?
A: many Welsh people from the Merthyr steelworks

Q: What is the historical significance of Pittsburgh in the industrial revolution?
A: Andrew Carnegie opened the Pittsburgh Locomotive and Car Works, which manufactured for the industry until 1919

Q: Who were the early settlers of Pittsburgh?
A: Virginians, Whiskey Rebels, and Civil War raiders

Q: What are the major educational institutions in Pittsburgh?
A: Carnegie Museums of Pittsburgh

Q: How has Pittsburgh's economy transitioned from manufacturing to technology and healthcare?
A: manufacturing was key to growth of Pittsburgh

Q: What are the popular cultural and tourist attractions in Pittsburgh?
A: Harris as well as various African-American jazz clubs

Q: What are the major sports teams based in Pittsburgh?
A: Pittsburgh area, the Pit

## Evaluating the Current Model

In [25]:
%pip install nltk -q
%pip install rouge -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [31]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
import json

# Load system-generated answers
with open("system_output.json", "r") as f:
    system_output = json.load(f)

# Load reference answers
with open("data/train/reference_answers.json", "r") as f:
    reference_answers = json.load(f)

# Initialize metrics
rouge = Rouge()
total_bleu = 0
total_rouge = 0
exact_match = 0
num_questions = len(reference_answers)

# Smoothing function for BLEU
smooth_fn = SmoothingFunction().method1  # Prevents BLEU from being 0 due to missing n-grams

# Compare each generated answer with the reference
for idx, ref_list in reference_answers.items():
    generated_answer = system_output.get(idx, "").strip()

    # Extract the reference answer
    ref_answer = ref_list[0].strip()

    # Compute BLEU score with smoothing
    bleu_score = sentence_bleu([ref_answer.split()], generated_answer.split(), smoothing_function=smooth_fn)
    total_bleu += bleu_score

    # Compute ROUGE score
    rouge_score = rouge.get_scores(generated_answer, ref_answer)[0]["rouge-l"]["f"]
    total_rouge += rouge_score

    # Compute Exact Match
    if generated_answer.lower() == ref_answer.lower():
        exact_match += 1

# Compute final averages
avg_bleu = total_bleu / num_questions
avg_rouge = total_rouge / num_questions
exact_match_score = exact_match / num_questions

# Print results
print(f"üîπ Average BLEU Score (with smoothing): {avg_bleu:.4f}")
print(f"üîπ Average ROUGE Score: {avg_rouge:.4f}")
print(f"üîπ Exact Match Accuracy: {exact_match_score:.4f}")


üîπ Average BLEU Score (with smoothing): 0.0110
üîπ Average ROUGE Score: 0.1085
üîπ Exact Match Accuracy: 0.0000


## Fine tuning the model

In [32]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset
import torch

# Load pre-trained model
model_name = "google/flan-t5-large"  
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Load questions & reference answers
with open("data/train/questions.txt", "r", encoding="utf-8") as f:
    questions = [line.strip() for line in f.readlines() if line.strip()]

with open("data/train/reference_answers.json", "r") as f:
    reference_answers = json.load(f)

# Convert data to a format suitable for training
class QADataset(Dataset):
    def __init__(self, tokenizer, questions, answers, max_length=512):
        self.tokenizer = tokenizer
        self.questions = questions
        self.answers = answers
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = reference_answers[str(idx + 1)][0]  # Extract reference answer

        # Encode question and answer
        inputs = tokenizer(
            f"question: {question}", 
            padding="max_length", 
            truncation=True, 
            max_length=self.max_length, 
            return_tensors="pt"
        )

        targets = tokenizer(
            answer, 
            padding="max_length", 
            truncation=True, 
            max_length=100, 
            return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": targets["input_ids"].squeeze(),
        }

# Create dataset
dataset = QADataset(tokenizer, questions, reference_answers)

# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,  # Increase to 5 epochs for better learning
    logging_dir="./logs",
    logging_steps=10,
    fp16=torch.cuda.is_available(),
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save model
model.save_pretrained("./flan-t5-finetuned")
tokenizer.save_pretrained("./flan-t5-finetuned")
print("Fine-tuning completed. Model saved!")


  trainer = Seq2SeqTrainer(


ValueError: You have set `args.eval_strategy` to IntervalStrategy.EPOCH but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`. 

## Using the fine-tuned model

In [None]:
import json
import torch
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer

# Load a more powerful T5 model
model_name = "./flan-t5-finetuned"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load embeddings and index-to-chunk mapping
embeddings = np.load('pittsburgh_embeddings.npy')
with open("index_to_chunk.json", "r") as f:
    index_to_chunk = json.load(f)

# Load Sentence Transformer for retrieval
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

# Read questions from questions.txt
with open("data/train/questions.txt", "r", encoding="utf-8") as f:
    questions = [line.strip() for line in f.readlines() if line.strip()]

# Dictionary to store answers
system_output = {}

# Process each question
for idx, question in enumerate(questions, start=1):
    # Encode question
    query_embedding = sentence_model.encode([question])

    # Compute cosine similarity for top-k retrieval
    cosine_similarities = np.dot(embeddings, query_embedding.T).flatten()
    top_k_indices = np.argsort(cosine_similarities)[-5:]  # Retrieve top 3 chunks

    # Combine retrieved chunks
    retrieved_chunks = " ".join([index_to_chunk[str(i)] for i in top_k_indices])

    # **Truncate to fit 512 tokens**
    max_context_tokens = 450
    context_tokens = tokenizer.tokenize(retrieved_chunks)[:max_context_tokens]
    truncated_context = tokenizer.convert_tokens_to_string(context_tokens)

    # Construct prompt
    prompt = f"question: {question} context: {truncated_context}"

    # Tokenize and generate an answer
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    output_ids = model.generate(
        input_ids, 
        max_length=100,  
        num_beams=7,  # More diverse answers
        early_stopping=True
    )
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Store the answer
    system_output[str(idx)] = answer

    print(f"Q: {question}")
    print(f"A: {answer}\n")

# Save all answers to system_output.json
with open("system_output.json", "w") as f:
    json.dump(system_output, f, indent=4)

print("All answers saved in system_output.json.")


# Approach 2

In [18]:
%pip install transformers sentencepiece

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sentencepiece
  Downloading sentencepiece-0.2.0.tar.gz (2.6 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.6/2.6 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: sentencepiece
  Building wheel for sentencepiece (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m√ó[0m [32mBuilding wheel for sentencepiece [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m‚îÇ[0m exit code: [1;36m1[0m
  [31m‚ï∞‚îÄ>[0m [31m[108 lines of output][0m
  [31m   [0m running bdist_wheel
  [31m   [0m running build
  [31m   [0m running build_py
  [31m   [0m creating build/lib.macosx-10.13-un

In [19]:
import requests
from bs4 import BeautifulSoup

def scrape_wikipedia(url="https://en.wikipedia.org/wiki/Pittsburgh"):
    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to retrieve the page.")
        return ""
    soup = BeautifulSoup(response.text, "html.parser")
    content_div = soup.find("div", id="mw-content-text")
    if not content_div:
        return ""
    paragraphs = content_div.find_all("p")
    text = "\n".join([p.get_text().strip() for p in paragraphs if p.get_text().strip() != ""])
    return text

def chunk_text(text, chunk_size=250):  # using a smaller chunk size
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks



# Scrape Pittsburgh Wikipedia page and chunk the text
full_text = scrape_wikipedia("https://en.wikipedia.org/wiki/Pittsburgh")
chunks = chunk_text(full_text, chunk_size=250)


In [20]:
from transformers import pipeline

# Initialize the question generation pipeline
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-small-qg-prepend")

# Initialize the question answering pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

all_qa_pairs = []
for i, chunk in enumerate(chunks):
    try:
        # Generate a question from the chunk.
        prompt = "generate question: " + chunk
        qg_outputs = qg_pipeline(prompt)
        for out in qg_outputs:
            question = out['generated_text'].strip()
            
            # Now extract an answer from the same chunk using the generated question.
            qa_result = qa_pipeline(question=question, context=chunk)
            answer = qa_result.get('answer', "No answer found")
            
            qa_pair = {
                "question": question,
                "answer": answer,
                "chunk_index": i
            }
            all_qa_pairs.append(qa_pair)
    except Exception as e:
        print(f"Error generating Q&A for chunk {i}: {e}")

print(f"Total Q&A pairs generated: {len(all_qa_pairs)}")

# Show a few examples
for idx, qa in enumerate(all_qa_pairs[:100], start=1):
    print(f"Q{idx}: {qa['question']}\nA{idx}: {qa['answer']}\n")


ValueError: Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast convertors: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']

In [29]:
import requests
from bs4 import BeautifulSoup
import json
from transformers import pipeline

def scrape_wikipedia(url="https://en.wikipedia.org/wiki/Pittsburgh"):
    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to retrieve the page.")
        return ""
    soup = BeautifulSoup(response.text, "html.parser")
    content_div = soup.find("div", id="mw-content-text")
    if not content_div:
        return ""
    paragraphs = content_div.find_all("p")
    text = "\n".join([p.get_text().strip() for p in paragraphs if p.get_text().strip() != ""])
    return text

def chunk_text(text, chunk_size=250):  # using a smaller chunk size
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

# Scrape Pittsburgh Wikipedia page and chunk the text
full_text = scrape_wikipedia("https://en.wikipedia.org/wiki/Pittsburgh")
chunks = chunk_text(full_text, chunk_size=250)

# Initialize the question generation pipeline (using text2text-generation with a prompt)
qg_pipeline = pipeline("text2text-generation", model="valhalla/t5-small-qg-prepend")

# Initialize the question answering pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

all_qa_pairs = []
for i, chunk in enumerate(chunks):
    try:
        # Generate a question from the chunk.
        prompt = "generate question: " + chunk
        qg_outputs = qg_pipeline(prompt)
        for out in qg_outputs:
            question = out['generated_text'].strip()
            
            # Now extract an answer from the same chunk using the generated question.
            qa_result = qa_pipeline(question=question, context=chunk)
            answer = qa_result.get('answer', "No answer found")
            
            qa_pair = {
                "question": question,
                "answer": answer,
                "chunk_index": i
            }
            all_qa_pairs.append(qa_pair)
    except Exception as e:
        print(f"Error generating Q&A for chunk {i}: {e}")

print(f"Total Q&A pairs generated: {len(all_qa_pairs)}")

# Show a few examples
for idx, qa in enumerate(all_qa_pairs[:5], start=1):
    print(f"Q{idx}: {qa['question']}\nA{idx}: {qa['answer']}\n")

# ----- Now write out the files -----

# Write questions.txt (one question per line)
with open("questions.txt", "w", encoding="utf-8") as f:
    for qa in all_qa_pairs:
        f.write(qa["question"] + "\n")

# Write system_output.json: mapping from question number to the generated answer
system_output = {str(idx + 1): qa["answer"] for idx, qa in enumerate(all_qa_pairs)}
with open("system_output.json", "w", encoding="utf-8") as f:
    json.dump(system_output, f, indent=2)

# Write reference_answers.json: here we use the same answers as placeholders.
# In practice, you would update this file with manually verified or annotated answers.
reference_answers = {str(idx + 1): qa["answer"] for idx, qa in enumerate(all_qa_pairs)}
with open("reference_answers.json", "w", encoding="utf-8") as f:
    json.dump(reference_answers, f, indent=2)


Device set to use mps:0
Device set to use mps:0


Total Q&A pairs generated: 50
Q1: What is the name of the city that is located in Pennsylvania?
A1: Philadelphia

Q2: What did the city focus on?
A2: healthcare, education, and technology industries

Q3: What is the name of the borough of Pittsburgh?
A3: Pittsburgh for ever

Q4: What did the British build Fort Prince George?
A4: hastily

Q5: What direction did Pittsburgh's first civilian local government go?
A5: westward



In [30]:
from transformers import pipeline

# For example, using a text generation model
llm_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")

# A function that generates an answer using the LLM
def generate_answer(question, context):
    prompt = f"Context: {context}\nQuestion: {question}\nAnswer:"
    output = llm_pipeline(prompt, max_length=100, truncation=True)
    return output[0]['generated_text'].strip()

# Now, assuming you already generated your questions (e.g., using the qg_pipeline), you can generate answers:
all_qa_pairs = []
for i, chunk in enumerate(chunks):
    try:
        # Generate a question from the chunk.
        prompt = "generate question: " + chunk
        qg_outputs = qg_pipeline(prompt)
        for out in qg_outputs:
            question = out['generated_text'].strip()
            
            # Now generate an answer using the LLM and the chunk as context.
            answer = generate_answer(question, chunk)
            
            qa_pair = {
                "question": question,
                "answer": answer,
                "chunk_index": i
            }
            all_qa_pairs.append(qa_pair)
    except Exception as e:
        print(f"Error generating Q&A for chunk {i}: {e}")

print(f"Total Q&A pairs generated: {len(all_qa_pairs)}")


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use mps:0


Total Q&A pairs generated: 50


In [1]:
import json

# List of questions from the Pittsburgh Wikipedia page
questions = [
    "Where is Pittsburgh located geographically?",
    "What are the major rivers that converge in Pittsburgh?",
    "Why is Pittsburgh known as the 'Steel City'?",
    "What is the historical significance of Pittsburgh in the industrial revolution?",
    "Who were the early settlers of Pittsburgh?",
    "What are the major educational institutions in Pittsburgh?",
    "How has Pittsburgh's economy transitioned from manufacturing to technology and healthcare?",
    "What are the popular cultural and tourist attractions in Pittsburgh?",
    "What are the major sports teams based in Pittsburgh?",
    "How does Pittsburgh contribute to arts and theater?"
]

# Corresponding answers (one or more per question)
answers = {
    "1": ["Pittsburgh is located in western Pennsylvania, at the confluence of the Allegheny, Monongahela, and Ohio rivers."],
    "2": ["The Allegheny, Monongahela, and Ohio rivers converge in Pittsburgh."],
    "3": ["Pittsburgh is known as the 'Steel City' due to its historical role as a major steel manufacturing hub."],
    "4": ["Pittsburgh was a key industrial center during the industrial revolution, known for its steel production and manufacturing."],
    "5": ["The early settlers of Pittsburgh were Native Americans, followed by French and British colonists."],
    "6": ["Major educational institutions in Pittsburgh include the University of Pittsburgh and Carnegie Mellon University."],
    "7": ["Pittsburgh's economy has transitioned from manufacturing to focus on technology, healthcare, and education."],
    "8": ["Popular attractions in Pittsburgh include the Carnegie Museum of Art, Phipps Conservatory, and the Andy Warhol Museum."],
    "9": ["Major sports teams in Pittsburgh are the Pittsburgh Steelers (NFL), Pittsburgh Penguins (NHL), and Pittsburgh Pirates (MLB)."],
    "10": ["Pittsburgh contributes to arts and theater through institutions like the Pittsburgh Symphony Orchestra and the Benedum Center."]
}

# Write questions to questions.txt
with open('questions.txt', 'w') as file:
    for question in questions:
        file.write(question + '\n')

# Write reference answers to reference_answers.json
with open('reference_answers.json', 'w') as file:
    json.dump(answers, file, indent=4)

print("Files 'questions.txt' and 'reference_answers.json' have been created successfully.")


Files 'questions.txt' and 'reference_answers.json' have been created successfully.
