In [1]:
import pandas as pd
import json
import threading
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer
)
import torch
import re
from tqdm import tqdm, trange
import gc

gc.collect()
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
df = pd.read_csv('../data/texts_urls_filtered.csv')

In [4]:
df.head()

Unnamed: 0,URL,SUCCESS,TEXT
0,https://3riversstudios.com,YES,» Skip to content Search Toggle Production Ser...
1,https://60minutemissions.com/book,YES,Mission Scheduler • 60 Minute Missions Escape ...
2,https://abcnews.go.com/wnt/video/superhero-win...,YES,Video Superhero Window Washers - ABC News ABC ...
3,https://acfree.librarycalendar.com/event/pitts...,YES,Pittsburgh Classic Movie Club presents Spring ...
4,https://acrisurestadium.com,YES,"Home - Acrisure Stadium in Pittsburgh, PA Skip..."


In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "meta-llama/Llama-3.1-8B-Instruct"
custom_cache_dir = "/mnt/new_volume"

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=custom_cache_dir)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    cache_dir=custom_cache_dir
)


Loading checkpoint shards: 100%|██████████| 4/4 [02:01<00:00, 30.38s/it]


In [None]:
# def generate_qa_pairs_streaming(document_text, model, tokenizer,  num_pairs=10):

#     prompt = f"""
#     Generate {num_pairs} high-quality question-answer pairs about anything related Pittsburgh and Carnegie Mellon University 
#     from the following document:
    
#     ---
#     {document_text}
#     ---
    
#     Format the output as a JSON list with each entry containing a 'question' and an 'answer'.
#     Example:
#     [
#         {{"question": "Who is Pittsburgh named after?", "answer": "William Pitt"}},
#         {{"question": "What famous machine learning venue had its first conference in Pittsburgh in 1980?", "answer": "ICML"}},
#         {{"question": "What musical artist is performing at PPG Arena on October 13?", "answer": "Billie Eilish"}}
#     ]
#     """
    
#     inputs = tokenizer(prompt, return_tensors="pt").to(device)

#     # Create the streamer object.
#     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

#     # Define the generation arguments.
#     generation_kwargs = dict(
#         **inputs,
#         max_new_tokens=200,
#         do_sample=True,
#         streamer=streamer
#     )

#     # Run generation in a separate thread.
#     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
#     thread.start()
    
#     generated_text = ""
#     # Read the tokens from the streamer as they arrive.
#     for new_text in streamer:
#         # print(new_text, end="", flush=True)  # Print tokens in real time
#         generated_text += new_text

#     pattern = r'\[.*?\]'
#     matches = re.findall(pattern, generated_text, flags=re.DOTALL)
#     # Attempt to parse as JSON:

#     del thread
#     del streamer
#     del generation_kwargs
#     del inputs
#     del prompt
#     # del document_text
    
#     error = None
#     if len(matches) <= 1:
#         # print("No QA pairs generated")
#         error = (generated_text, document_text)
    
#     del generated_text
#     del document_text
    
#     return matches, error

In [46]:
def generate_qa_pairs(document_text, model, tokenizer, num_pairs=10, use_streamer=True):
    prompt = f"""
    Generate {num_pairs} high-quality question-answer pairs about anything related to Pittsburgh and Carnegie Mellon University 
    from the following document:
    
    ---
    {document_text}
    ---
    
    Format the output as a JSON list with each entry containing a 'question' and an 'answer'.
    Example:
    [
        {{"question": "Who is Pittsburgh named after?", "answer": "William Pitt"}},
        {{"question": "What famous machine learning venue had its first conference in Pittsburgh in 1980?", "answer": "ICML"}},
        {{"question": "What musical artist is performing at PPG Arena on October 13?", "answer": "Billie Eilish"}}
    ]
    """
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    if use_streamer:
        streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
        generation_kwargs = dict(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            streamer=streamer
        )
        
        thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()
        
        generated_text = ""
        for new_text in streamer:
            print(new_text, end="", flush=True)
            generated_text += new_text
        thread.join()
        
    else:
        

        output = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True
        )
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        # print(generated_text)
    
    pattern = r'\[.*?\]'
    matches = re.findall(pattern, generated_text, flags=re.DOTALL)

    error = None
    if len(matches) <= 1:
        # print("No QA pairs generated")
        error = (generated_text, document_text)
    
    del generated_text
    del document_text
    del inputs
    del prompt
    
    return matches, error
    
    # try:
    #     qa_pairs = json.loads(matches[0]) if matches else []
    # except json.JSONDecodeError:
    #     qa_pairs = []
    
    # error = None if qa_pairs else (generated_text, document_text)
    
    # return qa_pairs, error

In [47]:
import random
# add random seed
random.seed(42)

idx = list(range(0, len(df)))
# shuffle the list
random.shuffle(idx)


In [62]:
all_matches = []
error_responses = []

In [63]:
for i in trange(100, 700):
    document_text = df.iloc[idx[i]]['TEXT']
    url = df.iloc[idx[i]]['URL']
    if len(document_text) > 40000:
        continue
    qa_pairs_matches, error = generate_qa_pairs(document_text, model, tokenizer, num_pairs=5, use_streamer=False)
    if error:
        error_responses.append(error)

    all_matches.append((url, qa_pairs_matches))

  0%|          | 0/600 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 1/600 [00:09<1:36:33,  9.67s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 2/600 [00:19<1:35:00,  9.53s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 3/600 [00:28<1:35:12,  9.57s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 4/600 [00:38<1:35:46,  9.64s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 5/600 [00:46<1:31:22,  9.21s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 6/600 [00:55<1:28:04,  8.90s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 7/600 [01:03<1:27:33,  8.86s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|▏         | 8/600 [01:12<1:27

In [64]:
# create a pd dataframe from a list of json objects

questions = []
answers = []
url_li = []

wrong_json = []

for url, sub_list in all_matches:
    for sub_sub_list in sub_list:
        try:
            sub_sub_list = json.loads(sub_sub_list)
        except:
            wrong_json.append(sub_sub_list)
            continue
        for qa in sub_sub_list:
            if type(qa) != dict:
                continue
            if 'question' not in qa or 'answer' not in qa:
                continue

            questions.append(qa['question'])
            answers.append(qa['answer'])
            url_li.append(url)

qa_df = pd.DataFrame({'url': url_li, 'question': questions, 'reference_answer': answers})


In [65]:
qa_df.drop_duplicates(subset=['question'], inplace=True)



In [66]:
qa_df.to_csv('qa_pairs_new_500.csv', index=False)

In [58]:
qa_df

Unnamed: 0,url,question,reference_answer
0,https://trustarts.org/pct_home/support/planned...,Who is Pittsburgh named after?,William Pitt
1,https://trustarts.org/pct_home/support/planned...,What famous machine learning venue had its fir...,ICML
2,https://trustarts.org/pct_home/support/planned...,What musical artist is performing at PPG Arena...,Billie Eilish
3,https://trustarts.org/pct_home/support/planned...,What is the name of the largest cultural arts ...,The Pittsburgh Cultural Trust
4,https://trustarts.org/pct_home/support/planned...,What is the name of the street where the Pitts...,Liberty Avenue
...,...,...,...
693,https://www.pittsburghpa.gov/safety/fire/burea...,What is the phone number of the Pittsburgh Bur...,412-255-2860
694,https://www.pittsburghpa.gov/safety/fire/burea...,What is the name of the Pittsburgh University ...,Carnegie Mellon University
699,https://www.mlb.com/angels,What is the name of the Pittsburgh Pirates' ho...,PNC Park
700,https://www.mlb.com/angels,What is the name of the university that is a m...,Carnegie Mellon University


In [67]:
df_loaded_new = pd.read_csv('qa_pairs_new_500.csv')

In [70]:
df_loaded_new.iloc[2089]['url']

'https://www.pittsburghsymphony.org/pso_home/biographies/musicians/drew-collins'

In [59]:
qa_df.to_csv('qa_pairs_new.csv', index=False)

In [60]:
df_loaded = pd.read_csv('qa_pairs_new.csv')
df_loaded

Unnamed: 0,url,question,reference_answer
0,https://trustarts.org/pct_home/support/planned...,Who is Pittsburgh named after?,William Pitt
1,https://trustarts.org/pct_home/support/planned...,What famous machine learning venue had its fir...,ICML
2,https://trustarts.org/pct_home/support/planned...,What musical artist is performing at PPG Arena...,Billie Eilish
3,https://trustarts.org/pct_home/support/planned...,What is the name of the largest cultural arts ...,The Pittsburgh Cultural Trust
4,https://trustarts.org/pct_home/support/planned...,What is the name of the street where the Pitts...,Liberty Avenue
...,...,...,...
382,https://www.pittsburghpa.gov/safety/fire/burea...,What is the phone number of the Pittsburgh Bur...,412-255-2860
383,https://www.pittsburghpa.gov/safety/fire/burea...,What is the name of the Pittsburgh University ...,Carnegie Mellon University
384,https://www.mlb.com/angels,What is the name of the Pittsburgh Pirates' ho...,PNC Park
385,https://www.mlb.com/angels,What is the name of the university that is a m...,Carnegie Mellon University
