In [1]:
import os
import PyPDF2

def extract_text_from_pdfs(folder_path):
    pdf_texts = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_file = os.path.join(folder_path, filename)
            with open(pdf_file, "rb") as file:
                pdf_reader = PyPDF2.PdfReader(file)
                pdf_text = ""
                for page_num in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_num]
                    pdf_text += page.extract_text()
                pdf_texts.append({"filename": filename, "text": pdf_text})
    return pdf_texts

# Usage
folder_path = "./Data"  # Replace with the path where your PDFs are stored
pdf_texts = extract_text_from_pdfs(folder_path)

for pdf in pdf_texts:
    print(f"First 1000 characters of {pdf['filename']}:")
    print(pdf['text'][:1000])  # Print the first 1000 characters for review


First 1000 characters of CNT growth functions.pdf:
Applied  Surface  Science  332  (2015)  756–760
Contents  lists  available  at  ScienceDirect
Applied  Surface  Science
journal  h  om  epa  ge:  www.elsevier.com/locate/apsusc
A  new understanding of  carbon  nanotube  growth:  Different  functions
of
 carbon species
Yueling  Zhanga,∗,  Baojun  Wangb,  Qing  Yuc,  Yajun  Tiand
aCollege  of  Engineering,  Peking  University,  Summer  Palace  Road  5,  Beijing  100871,  PR  China
bKey  Laboratory  of  Coal  Science  and  Technology,  Taiyuan  University  of  Technology,  Yingze  West  Street  79,  Taiyuan  030024,  PR  China
cSchool  of  Chemistry  and  Chemical  Engineer  and  Center  of  Modern  Analysis,  Nanjing  University,  Hankou  Road  22,  Nanjing  210093,  PR  China
dNational  Institute  of  Clean-and-low-carbon  Energy,  P.O.  Box  001  Shenhua  NICE,  Future  Science  &  Technology  Park,  Beijing  102209,  PR  China
a r t  i  c  l  e  i  n  f  o
Article history:
Received
 1

In [2]:
import re

def generate_qa_pairs(pdf_texts):
    qa_pairs = []
    
    for pdf in pdf_texts:
        text = pdf['text']
        
        # Example 1: Temperature ranges for CNT growth
        temp_match = re.search(r'temperature range.*?(\d+°C–\d+°C)', text)
        if temp_match:
            question = "What is the temperature range for CNT growth in the study?"
            answer = f"The temperature range for CNT growth is {temp_match.group(1)}."
            qa_pairs.append({"question": question, "answer": answer})

        # Example 2: CNT growth mechanism
        if "growth mechanism" in text.lower():
            question = "What mechanism was used for carbon nanotube growth?"
            answer = "The CNT growth mechanism involves chemical vapor deposition (CVD) with acetylene and a catalyst."
            qa_pairs.append({"question": question, "answer": answer})

        # Example 3: Characterization methods for CNT
        if "characterized by" in text.lower():
            question = "What techniques were used to characterize CNTs?"
            answer = "The CNTs were characterized by scanning electron microscopy (SEM), transmission electron microscopy (TEM), and Raman spectroscopy."
            qa_pairs.append({"question": question, "answer": answer})

        # Example 4: Impact of catalyst lifetime
        if "catalyst lifetime" in text.lower():
            question = "How does catalyst lifetime affect CNT growth?"
            answer = "Catalyst lifetime affects CNT growth by limiting the time during which the catalyst can facilitate CNT formation before deactivation."
            qa_pairs.append({"question": question, "answer": answer})

        # Add more Q&A patterns based on the structure of your texts
    
    return qa_pairs

# Example usage
qa_pairs = generate_qa_pairs(pdf_texts)

# Output the generated Q&A pairs
for pair in qa_pairs:
    print(f"Q: {pair['question']}")
    print(f"A: {pair['answer']}")
    print()


Q: What mechanism was used for carbon nanotube growth?
A: The CNT growth mechanism involves chemical vapor deposition (CVD) with acetylene and a catalyst.

Q: What techniques were used to characterize CNTs?
A: The CNTs were characterized by scanning electron microscopy (SEM), transmission electron microscopy (TEM), and Raman spectroscopy.

Q: How does catalyst lifetime affect CNT growth?
A: Catalyst lifetime affects CNT growth by limiting the time during which the catalyst can facilitate CNT formation before deactivation.



In [3]:
import pandas as pd
from datasets import Dataset

# Convert Q&A pairs to DataFrame
df = pd.DataFrame(qa_pairs)

# Create a Hugging Face dataset
dataset = Dataset.from_pandas(df)
print(dataset)


  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['question', 'answer'],
    num_rows: 3
})


In [4]:
df

Unnamed: 0,question,answer
0,What mechanism was used for carbon nanotube gr...,The CNT growth mechanism involves chemical vap...
1,What techniques were used to characterize CNTs?,The CNTs were characterized by scanning electr...
2,How does catalyst lifetime affect CNT growth?,Catalyst lifetime affects CNT growth by limiti...


In [5]:
from transformers import BertTokenizer

# Load SciBERT tokenizer
tokenizer = BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

# Tokenize the dataset
def preprocess_function(examples):
    return tokenizer(examples["question"], examples["answer"], truncation=True, padding=True)

tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/3 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 3/3 [00:00<00:00, 60.22 examples/s]


In [6]:
import os
import PyPDF2
import torch
from transformers import BertForQuestionAnswering, BertTokenizer, T5Tokenizer, T5ForConditionalGeneration

# Step 1: Extract Text from PDFs
def extract_text_from_pdfs(folder_path):
    pdf_texts = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_file = os.path.join(folder_path, filename)
            with open(pdf_file, "rb") as file:
                pdf_reader = PyPDF2.PdfReader(file)
                pdf_text = ""
                for page_num in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_num]
                    pdf_text += page.extract_text()
                pdf_texts.append({"filename": filename, "text": pdf_text})
    return pdf_texts

# Step 2: SciBERT for Question Answering
class SciBERTQA:
    def __init__(self, model_name="allenai/scibert_scivocab_uncased"):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForQuestionAnswering.from_pretrained(model_name)

    def answer_question(self, question, context, max_length=512):
        # Tokenize with truncation to avoid long input issues
        inputs = self.tokenizer(question, context, return_tensors="pt", truncation=True, max_length=max_length, padding=True)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
            start_scores = outputs.start_logits
            end_scores = outputs.end_logits
            all_tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
            answer = ' '.join(all_tokens[torch.argmax(start_scores): torch.argmax(end_scores)+1])
        return answer.replace(" ##", "")  # Fix tokenization artifacts

# Step 3: T5 for Generating Fluent Responses
class T5ResponseGenerator:
    def __init__(self, model_name="t5-base"):
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def generate_response(self, question, answer, max_length=150):
        input_text = f"question: {question} context: {answer}"
        inputs = self.tokenizer.encode(input_text, return_tensors="pt", truncation=True)
        
        outputs = self.model.generate(inputs, max_length=max_length, num_beams=5, early_stopping=True)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

# Step 4: Full Pipeline - Combine SciBERT and T5
def qa_pipeline(folder_path):
    # Extract text from PDFs
    pdf_texts = extract_text_from_pdfs(folder_path)

    # Initialize SciBERT QA model and T5 generator
    scibert_qa = SciBERTQA()
    t5_generator = T5ResponseGenerator()

    for pdf in pdf_texts:
        print(f"\n--- Processing PDF: {pdf['filename']} ---\n")
        
        # Define your question (this can be dynamic or user-input)
        question = "What is the growth temperature for carbon nanotubes?"

        # Step 1: Use SciBERT to get a concise answer
        answer = scibert_qa.answer_question(question, pdf['text'])
        print(f"SciBERT Answer: {answer}")

        # Step 2: Use T5 to generate a more detailed response
        detailed_response = t5_generator.generate_response(question, answer)
        print(f"T5 Generated Response: {detailed_response}\n")

# Run the pipeline
folder_path = "./Data"  # Replace with the actual folder path containing your PDFs
qa_pipeline(folder_path)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized f


--- Processing PDF: CNT growth functions.pdf ---

SciBERT Answer: 
T5 Generated Response: 0.4


--- Processing PDF: CNT growth mechanism.pdf ---



Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


SciBERT Answer: reserved . unraveling the mechanisms of carbon nanotube growth by chemical vapor deposition georgios p . gakis , stefania termine , aikaterini - flora a . trompeta , ioannis g . aviziotis , costas a . charitidis * research lab of advanced , composite , nano - materials and nanotechnology , materials science and engineering department , school of chemical engineering , national technical university of athens , 9 heroon polytechneiou street , zografos , athens 15780 , greece article info keywords : cvd of cnts macroscopic model growth mechanism competitive phenomena carbon diffusion catalyst lifetime abstract the mechanisms of carbon nanotube ( cnt ) growth by chemical vapor deposition of acetylene on fe / sio 2 : al2o
T5 Generated Response: al2o



Updated Pipeline

In [7]:
import os
import PyPDF2
import torch
from transformers import BertForQuestionAnswering, BertTokenizer, T5Tokenizer, T5ForConditionalGeneration

# Step 1: Extract Text from PDFs
def extract_text_from_pdfs(folder_path):
    pdf_texts = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_file = os.path.join(folder_path, filename)
            with open(pdf_file, "rb") as file:
                pdf_reader = PyPDF2.PdfReader(file)
                pdf_text = ""
                for page_num in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_num]
                    pdf_text += page.extract_text()
                pdf_texts.append({"filename": filename, "text": pdf_text})
    return pdf_texts

# Step 2: SciBERT for Question Answering
class SciBERTQA:
    def __init__(self, model_name="allenai/scibert_scivocab_uncased"):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForQuestionAnswering.from_pretrained(model_name)

    def answer_question(self, question, context, max_length=512):
        # Tokenize with truncation to avoid long input issues
        inputs = self.tokenizer(question, context, return_tensors="pt", truncation=True, max_length=max_length, padding=True)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
            start_scores = outputs.start_logits
            end_scores = outputs.end_logits
            all_tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
            answer = ' '.join(all_tokens[torch.argmax(start_scores): torch.argmax(end_scores)+1])
        return answer.replace(" ##", "")  # Fix tokenization artifacts

# Split long text into smaller chunks
def split_text_into_chunks(text, chunk_size=512):
    words = text.split()
    for i in range(0, len(words), chunk_size):
        yield " ".join(words[i:i + chunk_size])

# Function to handle chunked input for SciBERT
def answer_question_from_chunks(question, context, model, tokenizer, max_length=512):
    best_answer = ""
    
    # Split the context into smaller chunks
    chunks = split_text_into_chunks(context)
    
    for chunk in chunks:
        inputs = tokenizer(question, chunk, return_tensors="pt", truncation=True, max_length=max_length, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
            start_scores = outputs.start_logits
            end_scores = outputs.end_logits
            all_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
            answer = ' '.join(all_tokens[torch.argmax(start_scores): torch.argmax(end_scores)+1])
            answer = answer.replace(" ##", "")  # Fix tokenization artifacts

        # Replace with the first non-empty answer
        if answer.strip():
            best_answer = answer
            break
    
    return best_answer

# Step 3: T5 for Generating Fluent Responses
class T5ResponseGenerator:
    def __init__(self, model_name="t5-base"):
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def generate_response(self, question, answer, max_length=200):
        input_text = f"question: {question} context: {answer}"
        inputs = self.tokenizer.encode(input_text, return_tensors="pt", truncation=True)
        
        # Adjust generation parameters
        outputs = self.model.generate(
            inputs,
            max_length=max_length,
            num_beams=5,            # Use beam search to improve quality
            no_repeat_ngram_size=2, # Avoid repetition
            early_stopping=True,
            length_penalty=1.0,     # Penalize short answers
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

# Step 4: Full Pipeline - Combine SciBERT and T5
def qa_pipeline(folder_path):
    # Extract text from PDFs
    pdf_texts = extract_text_from_pdfs(folder_path)

    # Initialize SciBERT QA model and T5 generator
    scibert_qa = SciBERTQA()
    t5_generator = T5ResponseGenerator()

    for pdf in pdf_texts:
        print(f"\n--- Processing PDF: {pdf['filename']} ---\n")
        
        # Define your question (this can be dynamic or user-input)
        question = "What is the growth temperature for carbon nanotubes?"

        # Step 1: Use SciBERT to get a concise answer from chunks
        answer = answer_question_from_chunks(question, pdf['text'], scibert_qa.model, scibert_qa.tokenizer)
        print(f"SciBERT Answer: {answer}")

        if not answer.strip():
            print("No valid answer found from SciBERT.")
            continue

        # Step 2: Use T5 to generate a more detailed response
        detailed_response = t5_generator.generate_response(question, answer)
        print(f"T5 Generated Response: {detailed_response}\n")

# Run the pipeline
folder_path = "./Data"  # Replace with the actual folder path containing your PDFs
qa_pipeline(folder_path)


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized f


--- Processing PDF: CNT growth functions.pdf ---

SciBERT Answer: pr china dnational institute of clean - and - low - carbon energy , p . o . box 001 shenhua nice , future science & technology park , beijing 102209 , pr china a r t i c l e i n f o article history : received 13 november 2014 received in revised form 12 january 2015 accepted 18 january 2015 available online 24 january 2015 keywords : chemical vapor deposition carbon nanotubes interface dynamics carbon species functiona b s t r a c t understanding the formation mechanism of carbon nanotubes ( cnts ) from carbon source is critical for controlled - production of cnts . in this study , the functions of carbon species were investigated by a thermogravimetric analyzer coupled with a mass spectroscope in using methane as carbon source of cnt growth in chemical vapor deposition ( cvd ) . it was found that a negative peak of c2h2species and a positive peak of c2h4species appeared at the cnt growth moment . accordingly it is dedu

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


T5 Generated Response: cnt growth moment


--- Processing PDF: CNT growth mechanism.pdf ---

SciBERT Answer: al2o3 ( zeolite y ) catalyst are
T5 Generated Response: 0.4

