In [1]:
import pandas as pd
import json
import threading
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer
)
import torch
import re
from tqdm import tqdm, trange
import gc

gc.collect()
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [50]:
df = pd.read_csv('../data/texts_urls_filtered.csv')

In [51]:
df.head()

Unnamed: 0,URL,SUCCESS,TEXT
0,https://3riversstudios.com,YES,» Skip to content Search Toggle Production Ser...
1,https://60minutemissions.com/book,YES,Mission Scheduler • 60 Minute Missions Escape ...
2,https://abcnews.go.com/wnt/video/superhero-win...,YES,Video Superhero Window Washers - ABC News ABC ...
3,https://acfree.librarycalendar.com/event/pitts...,YES,Pittsburgh Classic Movie Club presents Spring ...
4,https://acrisurestadium.com,YES,"Home - Acrisure Stadium in Pittsburgh, PA Skip..."


In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "meta-llama/Llama-3.1-8B-Instruct"
custom_cache_dir = "/mnt/new_volume"

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=custom_cache_dir)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    cache_dir=custom_cache_dir
)


Downloading shards: 100%|██████████| 4/4 [01:50<00:00, 27.58s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [02:16<00:00, 34.20s/it]


In [12]:
# model_name="meta-llama/Llama-3.1-8B-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     torch_dtype=torch.float16,
#     device_map= device
# )


In [53]:
def generate_qa_pairs_streaming(document_text, model, tokenizer,  num_pairs=10):

    prompt = f"""
    Generate {num_pairs} high-quality question-answer pairs about anything related Pittsburgh and Carnegie Mellon University 
    from the following document:
    
    ---
    {document_text}
    ---
    
    Format the output as a JSON list with each entry containing a 'question' and an 'answer'.
    Example:
    [
        {{"question": "Who is Pittsburgh named after?", "answer": "William Pitt"}},
        {{"question": "What famous machine learning venue had its first conference in Pittsburgh in 1980?", "answer": "ICML"}},
        {{"question": "What musical artist is performing at PPG Arena on October 13?", "answer": "Billie Eilish"}}
    ]
    """
    
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Create the streamer object.
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

    # Define the generation arguments.
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        streamer=streamer
    )

    # Run generation in a separate thread.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    
    generated_text = ""
    # Read the tokens from the streamer as they arrive.
    for new_text in streamer:
        # print(new_text, end="", flush=True)  # Print tokens in real time
        generated_text += new_text

    pattern = r'\[.*?\]'
    matches = re.findall(pattern, generated_text, flags=re.DOTALL)
    # Attempt to parse as JSON:

    error = None
    if len(matches) <= 1:
        # print("No QA pairs generated")
        error = (generated_text, document_text)
    
    return matches, error

In [54]:
idx = list(range(0, len(df)))

# shuffle the list
import random
random.shuffle(idx)


In [55]:
all_matches = []
error_responses = []

for i in trange(100):
    document_text = df.iloc[idx[i]]['TEXT']
    url = df.iloc[i]['URL']
    qa_pairs_matches, error = generate_qa_pairs_streaming(document_text, model, tokenizer, num_pairs=5)
    if error:
        error_responses.append(error)

    all_matches.append((url, qa_pairs_matches))

  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 1/100 [00:10<17:44, 10.75s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 2/100 [00:18<15:04,  9.23s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 3/100 [00:27<14:14,  8.81s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 4/100 [00:37<14:53,  9.31s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  5%|▌         | 5/100 [00:46<14:39,  9.26s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  6%|▌         | 6/100 [00:56<14:45,  9.42s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  7%|▋         | 7/100 [01:04<14:16,  9.21s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  8%|▊         | 8/100 [01:12<13:31,  8.82s/it]S

In [61]:
# create a pd dataframe from a list of json objects

questions = []
answers = []
url_li = []

wrong_json = []

for url, sub_list in all_matches:
    for sub_sub_list in sub_list:
        try:
            sub_sub_list = json.loads(sub_sub_list)
        except:
            wrong_json.append(sub_sub_list)
            continue
        for qa in sub_sub_list:
            if type(qa) != dict:
                continue
            if 'question' not in qa or 'answer' not in qa:
                continue

            questions.append(qa['question'])
            answers.append(qa['answer'])
            url_li.append(url)

qa_df = pd.DataFrame({'url': url_li, 'question': questions, 'reference_answer': answers})


In [62]:
qa_df.drop_duplicates(subset=['question'], inplace=True)



In [63]:
qa_df.to_csv('qa_pairs.csv', index=False)

In [64]:
df_loaded = pd.read_csv('qa_pairs.csv')
df_loaded

Unnamed: 0,url,question,reference_answer
0,https://3riversstudios.com,Who is Pittsburgh named after?,William Pitt
1,https://3riversstudios.com,What famous machine learning venue had its fir...,ICML
2,https://3riversstudios.com,What musical artist is performing at PPG Arena...,Billie Eilish
3,https://3riversstudios.com,What is the name of the city where Carnegie Me...,Pittsburgh
4,https://3riversstudios.com,What is the name of the university where the R...,Carnegie Mellon University
...,...,...,...
386,https://carnegiemuseums.org/carnegie-magazine/...,What is Picklesburgh?,The destination for all things pickled
387,https://carnegiemuseums.org/carnegie-magazine/...,Where is Picklesburgh produced by?,The Pittsburgh Downtown Partnership
388,https://carnegiemuseums.org/carnegie-magazine/...,When will Picklesburgh return in 2025?,July
389,https://carnegiemuseums.org/carnegie-magazine/...,What is the name of the organization that prod...,The Pittsburgh Downtown Partnership
