In [1]:
import pandas as pd
import json
import threading
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer
)
import torch
import re
from tqdm import tqdm, trange
import gc

gc.collect()
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
df = pd.read_csv('../data/texts_urls_filtered.csv')

In [4]:
df.head()

Unnamed: 0,URL,SUCCESS,TEXT
0,https://3riversstudios.com,YES,» Skip to content Search Toggle Production Ser...
1,https://60minutemissions.com/book,YES,Mission Scheduler • 60 Minute Missions Escape ...
2,https://abcnews.go.com/wnt/video/superhero-win...,YES,Video Superhero Window Washers - ABC News ABC ...
3,https://acfree.librarycalendar.com/event/pitts...,YES,Pittsburgh Classic Movie Club presents Spring ...
4,https://acrisurestadium.com,YES,"Home - Acrisure Stadium in Pittsburgh, PA Skip..."


In [5]:
df.iloc[11]['TEXT']

'Maps and Seating Charts - Acrisure Stadium in Pittsburgh, PA Skip to content Getting Here Contact Us Facebook Twitter Instagram Menu Stadium Parking & Directions Seating Charts Food and Beverage 100 Level 200 Level Club Level 500 Level Employment Suites Tours & Experiences Hall of Honor Museum Stadium Tours Plan Your Day Clear Bag Policy Transformation of the North Shore Team Member Login Football Pittsburgh Steelers My Steelers Account Buy Steelers Tickets Steelers Mobile Tickets Premium Seating Home Game Packages Season Ticket Transfers Steelers Hall of Honor Museum Shop the Steelers Pro Shop Pitt Panthers Buy Pitt Football Tickets Buy Panthers Gear WPIAL Football Championships Concerts & Events Pittsburgh Steelers Football Games Pitt Panthers Football Games Concerts Kickoff and Rib Festival Rib Fest Schedule Rib Fest Ribs Book a Private Event Audio / Visual Services Contact Us Event Spaces UPMC Club West Club PNC Champions Club FedEx Great Hall North Club Press Box Dining Room Ford

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "meta-llama/Llama-3.1-8B-Instruct"
# model_name = "unsloth/phi-4-bnb-4bit"
custom_cache_dir = "/mnt/new_volume"

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=custom_cache_dir)
tokenizer.pad_token = tokenizer.eos_token  # set the pad token to be the same as the eos token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    cache_dir=custom_cache_dir
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# def generate_qa_pairs_streaming(document_text, model, tokenizer,  num_pairs=10):

#     prompt = f"""
#     Generate {num_pairs} high-quality question-answer pairs about anything related Pittsburgh and Carnegie Mellon University 
#     from the following document:
    
#     ---
#     {document_text}
#     ---
    
#     Format the output as a JSON list with each entry containing a 'question' and an 'answer'.
#     Example:
#     [
#         {{"question": "Who is Pittsburgh named after?", "answer": "William Pitt"}},
#         {{"question": "What famous machine learning venue had its first conference in Pittsburgh in 1980?", "answer": "ICML"}},
#         {{"question": "What musical artist is performing at PPG Arena on October 13?", "answer": "Billie Eilish"}}
#     ]
#     """
    
#     inputs = tokenizer(prompt, return_tensors="pt").to(device)

#     # Create the streamer object.
#     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

#     # Define the generation arguments.
#     generation_kwargs = dict(
#         **inputs,
#         max_new_tokens=200,
#         do_sample=True,
#         streamer=streamer
#     )

#     # Run generation in a separate thread.
#     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
#     thread.start()
    
#     generated_text = ""
#     # Read the tokens from the streamer as they arrive.
#     for new_text in streamer:
#         # print(new_text, end="", flush=True)  # Print tokens in real time
#         generated_text += new_text

#     pattern = r'\[.*?\]'
#     matches = re.findall(pattern, generated_text, flags=re.DOTALL)
#     # Attempt to parse as JSON:

#     del thread
#     del streamer
#     del generation_kwargs
#     del inputs
#     del prompt
#     # del document_text
    
#     error = None
#     if len(matches) <= 1:
#         # print("No QA pairs generated")
#         error = (generated_text, document_text)
    
#     del generated_text
#     del document_text
    
#     return matches, error

In [None]:
def generate_qa_pairs(document_text, model, tokenizer, num_pairs=10, use_streamer=True):
    # prompt = f"""
    # Generate {num_pairs} high-quality question-answer pairs about anything related to Pittsburgh and Carnegie Mellon University 
    # from the following document:
    
    # ---
    # {document_text}
    # ---
    
    # Format the output as a JSON list with each entry containing a 'question' and an 'answer'.
    # Example:
    # [
    #     {{"question": "Who is Pittsburgh named after?", "answer": "William Pitt"}},
    #     {{"question": "What famous machine learning venue had its first conference in Pittsburgh in 1980?", "answer": "ICML"}},
    #     {{"question": "What musical artist is performing at PPG Arena on October 13?", "answer": "Billie Eilish"}}
    # ]
    # """
    prompt = f"""
    Generate {num_pairs} high-quality question-answer pairs based on the given document about Pittsburgh and Carnegie Mellon University (CMU).

    # from the following document:
    
    # ---
    # {document_text}
    # ---
    
    **Requirements:**
    - Ensure questions are diverse in type, including but not limited to:
    - Historical facts
    - Notable events
    - Landmarks and locations
    - University-specific information
    - Cultural or entertainment-related facts
    - Important figures related to CMU or Pittsburgh
    - Each question should have a **direct answer** based on the content provided.
    - Ensure questions are **concise** and **clear**.
    - Answers should be **factual** and **direct**.
    - Avoid simple question like "What is the name of"

    **Output Format:**
    Return a JSON-formatted list where each entry contains:
    - `"question"`: The generated question.
    - `"answer"`: The corresponding answer.

    **Example Output:**

    [
        {{"question": "Who is Pittsburgh named after?", "answer": "William Pitt"}},
        {{"question": "What year was Carnegie Mellon University founded?", "answer": "1900"}},
        {{"question": "Which bridge in Pittsburgh is famously yellow?", "answer": "Roberto Clemente Bridge"}},
        {{"question": "Which famous AI professor at CMU co-founded Duolingo?", "answer": "Luis von Ahn"}},
        {{"question": "Who hosts the Burgh Bus comedy tour in Pittsburgh?", "answer": "Matt Light."}}
    ]

    Now it's your turn.

    Output:
    
    """
    # print(prompt)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    if use_streamer:
        streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
        generation_kwargs = dict(
            **inputs,
            max_new_tokens=350,
            do_sample=True,
            streamer=streamer
        )
        
        thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()
        
        generated_text = ""
        for new_text in streamer:
            print(new_text, end="", flush=True)
            generated_text += new_text
        thread.join()
        
    else:
        

        output = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True
        )
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        # print(generated_text)
    
    pattern = r'\[.*?\]'
    matches = re.findall(pattern, generated_text, flags=re.DOTALL)

    error = None
    if len(matches) <= 1:
        # print("No QA pairs generated")
        error = (generated_text, document_text)
    
    del generated_text
    del document_text
    del inputs
    del prompt
    
    return matches, error
    
    # try:
    #     qa_pairs = json.loads(matches[0]) if matches else []
    # except json.JSONDecodeError:
    #     qa_pairs = []
    
    # error = None if qa_pairs else (generated_text, document_text)
    
    # return qa_pairs, error

In [49]:
import random
# add random seed
random.seed(42)

idx = list(range(0, len(df)))
# shuffle the list
random.shuffle(idx)


In [50]:
all_matches = []
error_responses = []

In [None]:
for i in trange(100,700):
    document_text = df.iloc[idx[i]]['TEXT']
    url = df.iloc[idx[i]]['URL']
    if len(document_text) > 40000:
        continue
    qa_pairs_matches, error = generate_qa_pairs(document_text, model, tokenizer, num_pairs=5, use_streamer=False)
    # print(qa_pairs_matches)
    if error:
        error_responses.append(error)

    all_matches.append((url, qa_pairs_matches))

  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 1/100 [00:08<14:09,  8.58s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 2/100 [00:18<15:01,  9.20s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 3/100 [00:26<14:20,  8.87s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 4/100 [00:35<13:55,  8.71s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  5%|▌         | 5/100 [00:44<14:18,  9.03s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  6%|▌         | 6/100 [00:52<13:36,  8.68s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  7%|▋         | 7/100 [01:01<13:16,  8.57s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  8%|▊         | 8/100 [01:10<13:46,  8.98s/it]S

In [52]:
# create a pd dataframe from a list of json objects

questions = []
answers = []
url_li = []

wrong_json = []

for url, sub_list in all_matches:
    for sub_sub_list in sub_list:
        try:
            sub_sub_list = json.loads(sub_sub_list)
        except:
            wrong_json.append(sub_sub_list)
            continue
        for qa in sub_sub_list:
            if type(qa) != dict:
                continue
            if 'question' not in qa or 'answer' not in qa:
                continue

            questions.append(qa['question'])
            answers.append(qa['answer'])
            url_li.append(url)

qa_df = pd.DataFrame({'url': url_li, 'question': questions, 'reference_answer': answers})


In [53]:
qa_df

Unnamed: 0,url,question,reference_answer
0,https://trustarts.org/pct_home/support/planned...,Who is Pittsburgh named after?,William Pitt
1,https://trustarts.org/pct_home/support/planned...,What year was Carnegie Mellon University founded?,1900
2,https://trustarts.org/pct_home/support/planned...,Which bridge in Pittsburgh is famously yellow?,Roberto Clemente Bridge
3,https://trustarts.org/pct_home/support/planned...,Which famous AI professor at CMU co-founded Du...,Luis von Ahn
4,https://trustarts.org/pct_home/support/planned...,Who hosts the Burgh Bus comedy tour in Pittsbu...,Matt Light.
...,...,...,...
950,https://www.mlb.com/angels,What is the name of the famous university in P...,Carnegie Mellon University
951,https://www.mlb.com/angels,Who is the founder of Carnegie Mellon University?,Andrew Carnegie and Andrew Mellon
952,https://www.mlb.com/angels,What is the name of the famous bridge in Pitts...,Roberto Clemente Bridge
953,https://www.mlb.com/angels,What is the name of the famous comedy tour in ...,Burgh Bus


In [54]:
qa_df.drop_duplicates(subset=['question'], inplace=True)



In [None]:
qa_df.to_csv('qa_pairs_new_600_38.csv', index=False)

In [None]:
# df_loaded_new = pd.read_csv('qa_pairs_new_500.csv')

In [1]:
# df_loaded_new.iloc[2089]['url']

In [2]:
# qa_df.to_csv('qa_pairs_new.csv', index=False)

In [3]:
# df_loaded = pd.read_csv('qa_pairs_new.csv')
# df_loaded