In [1]:
pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [4]:
from transformers import pipeline

# Load the pipeline for text-to-text generation
# Specify the model and tokenizer explicitly
question_generator = pipeline(
    "text2text-generation",
    model="valhalla/t5-base-qa-qg-hl",  # A model fine-tuned for question generation
    tokenizer="valhalla/t5-base-qa-qg-hl",
    device=device  # Runs on the `device` selected earlier (CUDA when available, otherwise CPU)
)

# Extractive question-answering pipeline, used to answer the generated questions
question_answerer = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",  # A model fine-tuned for question answering
    tokenizer="deepset/roberta-base-squad2",
    device=device  # Same device as the question generator
)

# Define the input context
context = "The Kepler spacecraft identified over 2,600 exoplanets during its mission."

# Format the input text as required by the model
formatted_input = f"generate question: {context}"

# Generate text with beam search
results = question_generator(
    formatted_input,
    max_length=64,
    num_return_sequences=3,  # Number of outputs to generate
    num_beams=3  # Enable beam search with the same number as return sequences
)

# Print the generated questions and retrieve answers
for idx, result in enumerate(results, 1):
    question = result['generated_text']
    print(f"Question {idx}: {question}")
    
    # Use the question-answering pipeline to retrieve the answer
    answer = question_answerer(question=question, context=context)
    print(f"Answer {idx}: {answer['answer']}")
    # Also show the full result returned by the pipeline (not just the answer text)
    print(answer)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Question 1: How many exoplanets did the Kepler spacecraft identify?
Answer 1: 2,600
{'score': 0.5196231007575989, 'start': 38, 'end': 43, 'answer': '2,600'}
Question 2: How many exoplanets did the Kepler spacecraft identify during its mission?
Answer 2: over 2,600
{'score': 0.5593836903572083, 'start': 33, 'end': 43, 'answer': 'over 2,600'}
Question 3: How many exoplanets did the Kepler spacecraft find?
Answer 3: 2,600
{'score': 0.5790000557899475, 'start': 38, 'end': 43, 'answer': '2,600'}


## Variation: using only the abstracts to generate QA pairs

In [5]:
def generate_qa_pairs_from_abstract(chunks, abstract, max_length=32,
                                    num_return_sequences=3, num_beams=5):
    """Generate question/answer pairs for every chunk of an abstract.

    For each chunk, the module-level ``question_generator`` pipeline proposes
    ``num_return_sequences`` questions via beam search, and the module-level
    ``question_answerer`` pipeline answers each question using that chunk as
    its context.

    Args:
        chunks: Iterable of text chunks taken from ``abstract``.
        abstract: The full abstract; stored unchanged with every pair.
        max_length: ``max_length`` forwarded to the question generator.
        num_return_sequences: Number of questions requested per chunk.
        num_beams: Beam width forwarded to the question generator.

    Returns:
        A list of dicts with the keys ``"context"`` (the chunk),
        ``"question"`` (generated text), ``"answer"`` (the value returned by
        ``question_answerer``) and ``"large context"`` (the full abstract).

    Errors raised while processing a chunk are printed and the function
    moves on to the next chunk; pairs collected before the error are kept.
    """
    qa_pairs = []
    for chunk in chunks:
        # A blank chunk gives the models nothing to work with, so skip it
        # instead of sending it through both pipelines.
        if not chunk or not chunk.strip():
            continue

        try:
            # Generate candidate questions with beam search
            results = question_generator(
                f"generate question: {chunk}",
                max_length=max_length,
                num_return_sequences=num_return_sequences,  # Number of outputs to generate
                num_beams=num_beams  # Beam width; may exceed the number of returned sequences
            )

            for result in results:
                question = result['generated_text']
                # Use the question-answering pipeline to retrieve the answer
                answer = question_answerer(question=question, context=chunk)

                qa_pairs.append({
                    "context": chunk,
                    "question": question,
                    "answer": answer,
                    "large context": abstract
                })

        except Exception as e:
            # Best effort: report the failure and continue with the next chunk
            print(f"Error generating QA: {e}")
    return qa_pairs



In [6]:
import json

file = 'data_paper_arXiv_all.json'

# Read and parse the JSON file once. The handle is bound to `f` so that
# `file` keeps holding the path instead of being rebound to a closed file.
with open(file, 'r', encoding='utf-8') as f:
    dict_all_papers = json.load(f)

# `data` is kept as a second name for the same dictionary because later
# cells read the papers through it.
data = dict_all_papers

# Number of papers loaded
len(dict_all_papers)

30204

In [7]:
# Gather the 'abstract' field of every paper, in the dictionary's order
abstracts = [paper['abstract'] for paper in data.values()]

## To avoid data leakage, we split the abstracts into train/test/eval sets


In [8]:
from sklearn.model_selection import train_test_split

# 80/10/10 partition: hold out 20% of the abstracts first, then cut that
# hold-out in half to obtain the test and validation sets.
train_data, temp_data = train_test_split(abstracts, test_size=0.2, random_state=42)
test_data, eval_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Persist each split to its own JSON file (optional)
for split_path, split_abstracts in (
    ("train_abstracts.json", train_data),
    ("test_abstracts.json", test_data),
    ("eval_abstracts.json", eval_data),
):
    with open(split_path, "w") as file:
        json.dump(split_abstracts, file, indent=4)


In [9]:
import math
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def split_text_into_chunks(text, num_chunks=4, sentence_tokenizer=None):
    """Split ``text`` into at most ``num_chunks`` chunks of whole sentences.

    The text is tokenized into sentences, which are then grouped in order
    into chunks of ``ceil(n_sentences / num_chunks)`` sentences each. Because
    the chunk size is rounded up, fewer than ``num_chunks`` chunks may be
    returned (e.g. 9 sentences with ``num_chunks=4`` yield 3 chunks of 3).

    Args:
        text: The text to split. ``None``, empty or whitespace-only text
            yields an empty list.
        num_chunks: Maximum number of chunks to produce; must be >= 1.
        sentence_tokenizer: Optional callable mapping a string to a list of
            sentences. Defaults to ``sent_tokenize``.

    Returns:
        A list of strings, each made of consecutive sentences joined by a
        single space.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        raise ValueError("num_chunks must be at least 1")

    # Nothing to split: return early so that the chunk size below can never
    # be 0 (a zero step would make `range` raise).
    if not text or not text.strip():
        return []

    tokenize = sent_tokenize if sentence_tokenizer is None else sentence_tokenizer

    # Tokenize the text into sentences
    sentences = tokenize(text)
    if not sentences:
        return []

    # Number of sentences per chunk, rounded up so no more than
    # `num_chunks` chunks are ever produced
    chunk_size = math.ceil(len(sentences) / num_chunks)

    # Group consecutive sentences and join each group back into text
    return [
        ' '.join(sentences[start:start + chunk_size])
        for start in range(0, len(sentences), chunk_size)
    ]

# Demo: chunk a short multi-sentence passage with the default settings
text = """The Kepler spacecraft was launched in 2009. Its primary mission was to discover Earth-sized planets in the habitable zones of other stars. 
It observed over 150,000 stars and identified thousands of exoplanet candidates. 
The mission was initially planned for 3.5 years but extended due to its success. 
Kepler's observations revolutionized our understanding of planetary systems. 
In 2013, it suffered a mechanical failure, ending its primary mission. 
However, the spacecraft continued its work under the K2 mission. 
The K2 mission focused on different regions of the sky. 
Kepler ultimately discovered over 2,600 confirmed exoplanets."""

chunks = split_text_into_chunks(text)

# Show every chunk together with its zero-based position
for i, chunk in enumerate(chunks):
    print(f"Chunk {i}:\n{chunk}\n")


Chunk 0:
The Kepler spacecraft was launched in 2009. Its primary mission was to discover Earth-sized planets in the habitable zones of other stars. It observed over 150,000 stars and identified thousands of exoplanet candidates.

Chunk 1:
The mission was initially planned for 3.5 years but extended due to its success. Kepler's observations revolutionized our understanding of planetary systems. In 2013, it suffered a mechanical failure, ending its primary mission.

Chunk 2:
However, the spacecraft continued its work under the K2 mission. The K2 mission focused on different regions of the sky. Kepler ultimately discovered over 2,600 confirmed exoplanets.



[nltk_data] Downloading package punkt to
[nltk_data]     /storage/homefs/alibert/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
import tqdm

In [None]:
# Build the Q&A dataset for the training split
dataset = train_data
qa_pairs_fromabstracts = []

for abstract in tqdm.tqdm(dataset):
    # Chunk the abstract, then collect the Q&A pairs produced for its chunks
    abstract_chunks = split_text_into_chunks(abstract)
    qa_pairs_fromabstracts += generate_qa_pairs_from_abstract(abstract_chunks, abstract)

print(f"Generated {len(qa_pairs_fromabstracts)} Q&A pairs.")

# Write all collected pairs to disk once the loop has finished
with open("train_dataset.json", "w") as f:
    json.dump(qa_pairs_fromabstracts, f, indent=2)



  0%|          | 0/24163 [00:00<?, ?it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
  0%|          | 58/24163 [00:42<5:09:54,  1.30it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (944 > 512). Running this sequence through the model will result in indexing errors
 32%|███▏      | 7819/24163 [1:34:46<3:23:22,  1.34it/s]

In [None]:
# Build the Q&A dataset for the test split
dataset = test_data
qa_pairs_fromabstracts = []

for abstract in tqdm.tqdm(dataset):
    # Chunk the abstract, then collect the Q&A pairs produced for its chunks
    abstract_chunks = split_text_into_chunks(abstract)
    qa_pairs_fromabstracts += generate_qa_pairs_from_abstract(abstract_chunks, abstract)

print(f"Generated {len(qa_pairs_fromabstracts)} Q&A pairs.")

# Write all collected pairs to disk once the loop has finished
with open("test_dataset.json", "w") as f:
    json.dump(qa_pairs_fromabstracts, f, indent=2)



In [None]:
# Build the Q&A dataset for the evaluation split
dataset = eval_data
qa_pairs_fromabstracts = []

for abstract in tqdm.tqdm(dataset):
    # Chunk the abstract, then collect the Q&A pairs produced for its chunks
    abstract_chunks = split_text_into_chunks(abstract)
    qa_pairs_fromabstracts += generate_qa_pairs_from_abstract(abstract_chunks, abstract)

print(f"Generated {len(qa_pairs_fromabstracts)} Q&A pairs.")

# Write all collected pairs to disk once the loop has finished
with open("eval_dataset.json", "w") as f:
    json.dump(qa_pairs_fromabstracts, f, indent=2)

