In [None]:
# Install the client libraries used by this notebook into the kernel's environment.
# Make sure you are working from a conda environment before proceeding!
# NOTE(review): the packages are unpinned — consider pinning versions so re-runs are reproducible.
%pip install huggingface_hub hf_transfer openai ollama

Collecting openai
  Downloading openai-1.61.1-py3-none-any.whl.metadata (27 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Downloading anyio-4.8.0-py3-none-any.whl.metadata (4.6 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting sniffio (from openai)
  Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.7-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Collecti

In [None]:
# Optional, for faster downloading: uncomment the two lines below before running the download.
# import os
# os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import snapshot_download
# Download only the files matching "*Q8_0*" from the model repo into the local folder.
# The call's return value (the local directory path) is shown as the cell output.
snapshot_download(
  repo_id = "bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF",
  local_dir = "DeepSeek-R1-Qwen-32B-GGUF",
  allow_patterns = ["*Q8_0*"],
)

# After downloading, run `ollama serve` in a terminal, then continue onto the next step.
# NOTE(review): later cells request the model name "DeepSeek-R1-Qwen-32B"; presumably the
# downloaded GGUF has to be registered with Ollama under that name first — confirm.

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

DeepSeek-R1-Distill-Qwen-32B-Q8_0.gguf:   0%|          | 0.00/34.8G [00:00<?, ?B/s]

'/home/a4969/dev/DeepSeek-R1-Qwen-32B-GGUF'

In [None]:
# TEST RUN
# Smoke test: send one short excerpt to the locally served model and print the raw reply.

from openai import OpenAI
# OpenAI-compatible client pointed at a server on localhost:11434.
# NOTE(review): "ollama" looks like a placeholder API key for the local server — confirm it is not validated.
client = OpenAI(
    api_key="ollama",
    base_url="http://localhost:11434/v1",
)

# Sample excerpt interpolated into the user prompt below (it ends mid-sentence).
context = """
Rechargeable alkali metal−CO2 batteries, which combine
high theoretical energy density and environmentally friendly CO2 fixation
ability, have attracted worldwide attention. Unfortunately, their electro-
chemical performances are usually inferior for practical applications.
Aiming to reveal the underlying causes, a combinatorial usage of
advanced nondestructive and postmortem characterization tools is used
to intensively study the failure mechanisms of Li/Na−CO2 batteries. It is
found that a porous interphase layer is formed between the separator and
the Li/Na anode during the overvoltage rising and battery performance
decaying process. A series of control experiments are designed to identify
the underlying mechanisms dictating the observed morphological
evolution of Li/Na anodes, and it is found that the CO2 synergist
"""

system_prompt = "You are a material's science expert"
# adapted prompt from SciQAG paper
# The prompt asks for 10 Q&A pairs in a fixed "###QUESTION: / ###ANSWER:" layout.
user_prompt = f"""
    Attached is a detailed scientific paper.\n\n{context}\n\n
    Your task is to formulate 10 sophisticated Q&A pairs that delve into the underlying 
    scientific principles and knowledge presented in this paper, focusing specifically on extract keywords. 
    Steer clear of questions that are purely section-specific (e.g., 'What does Figure 5 represent?') 
    or basic or definitional questions (e.g., 'What is XXX?'). Instead, focus on questions that require a 
    deeper understanding of the subject matter, especially those relating to the main idea in this paragraph. 
    Ensure diversity in your Q&A pairs, avoiding any duplication. Answers should be rich in detail, drawing 
    on specific data, material properties, and contextual insights from the paper. Strive for clarity and depth 
    in your responses, aiming to enhance the reader's comprehension of the intricate concepts discussed.

    output must not include any end of generation summary or start of generation preamble
    output must only include the questions and answers and be formatted like this: 

    ###QUESTION: <question here>
    ###ANSWER: <answer here>

"""

# One chat completion with a system message and a user message.
response = client.chat.completions.create(
    model="DeepSeek-R1-Qwen-32B",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
)

# The reply is printed unfiltered, so the model's reasoning text and the
# "</think>" marker appear in the output ahead of the Q&A pairs.
print(response.choices[0].message.content)

     Repeat for 10 times.

</think>

###QUESTION: What role does the porous interphase layer play in the degradation of Li/Na−CO2 batteries?
###ANSWER: The porous interphase layer, formed between the separator and the anode during battery operation, contributes to performance decay by causing overvoltage increases. This layer likely hinders ion transport, leading to poor electrochemical performance.

###QUESTION: How do the control experiments explain the evolution of Li/Na anodes?
###ANSWER: Control experiments demonstrate that CO2 acts as a synergist, influencing the structural changes in Li and Na anodes. These insights reveal how operational conditions affect anode morphologies, impacting overall battery functionality.

###QUESTION: What challenges persist with rechargeable alkali metal−CO2 batteries for practical use?
###ANSWER: Despite their high theoretical energy density and CO2 fixation benefits, these batteries face performance limitations and durability issues, hindering rea

In [None]:

# marker-pdf: the "marker" converter referred to in the next cell.
%pip install marker-pdf

In [3]:
# for marker conversion use the command line interface to avoid GPU memory overload
# https://github.com/VikParuchuri/marker/tree/master?tab=readme-ov-file#convert-a-single-file

from pathlib import Path

def load_markdown_file(filepath:str) -> str:
    """
    Read a file and return its whole content as one string.

    :param filepath: Location of the file to read
    :return: The file content, decoded as UTF-8
    """
    md_path = Path(filepath)
    with md_path.open(encoding="utf-8") as handle:
        return handle.read()

# NOTE(review): absolute, machine-specific path — point it at your own converted Markdown file.
text = load_markdown_file("/home/a4969/dev/marker_out/codependent_failure/codependent_failure.md")

# removing asterisks to remove additional characters that the llm would 
# have to go over when analysing the headings in the next step
# (every '*' is dropped, and the lines are re-joined with '\n')
text = "\n".join([line.replace('*', '') for line in text.splitlines()])

# Preview the first 500 characters as a sanity check.
print(text[:500])

[Science Bulletin 68 (2023) 813–825](https://doi.org/10.1016/j.scib.2023.03.021)

Contents lists available at [ScienceDirect](http://www.sciencedirect.com/science/journal/20959273)

# Science Bulletin

journal homepage: [www.elsevier.com/locate/scib](http://www.elsevier.com/locate/scib)

# Article

# Codependent failure mechanisms between cathode and anode in solid state lithium metal batteries: mediated by uneven ion flux

Yue Zheng a,b , Shu Zhang a , Jun Ma a,c,⇑ , Fu Sun a,⇑ , Markus Osenber


In [4]:
import re

def extract_md_headings(text:str) -> str:
    """
    Collect the markdown heading lines found in ``text``.

    A heading is a line that, once surrounding whitespace is stripped, starts
    with one to four '#' characters followed by whitespace and some text.
    The result is the unfiltered heading and subheading list.

    :param text: Markdown document
    :return: The stripped heading lines in document order, each followed by a newline
    """
    heading_pattern = re.compile(r'^(#{1,4})\s+(.+)')
    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
    # group(0) is the whole matched heading line, '#' markers included
    found = [hit.group(0) for hit in map(heading_pattern.match, stripped_lines) if hit]
    return "".join(f"{heading}\n" for heading in found)

# Collect every heading of the loaded document; the list is filtered by the LLM in the next cell.
md_headings = extract_md_headings(text)
print(md_headings)

# Science Bulletin
# Article
# Codependent failure mechanisms between cathode and anode in solid state lithium metal batteries: mediated by uneven ion flux
# article info
# abstract
### 1. Introduction
### 2. Experimental
### 2.1. Material synthesis
# 2.2. Material characterization
# 2.3. Fabrication of sulfide-based ASSLMBs using the swage-cell design and their electrochemical measurements
### 2.4. Tomo-cell assembly and the electrochemical measurements
# 2.5. Synchrotron X-ray tomography measurement
## 2.6. 3D tomography data processing
### 2.7. Finite element method simulation
# 3. Results and discussion
# 3.1. Performance test of the ASSLMBs using swage-cells and tomo-cells
#### 3.2. SXCT results of the tomo-cells using bare NCM
# 3.3. SXCT results of the tomo-cells using LZP-NCM
# 3.4. Postmortem analysis of the cycled tomo-cells and finite element simulation of the codependent failure mechanism
# 3.5. A summary and further discussion of the codependent failure mechanism
# 4. Conc

In [8]:
from openai import OpenAI
# Same client configuration as in the test-run cell, re-created here.
client = OpenAI(
    api_key="ollama",
    base_url="http://localhost:11434/v1",
)

def extract_main_body_headings(md_headings:str) -> str:
    """
    Ask the LLM which of the given headings belong to the paper's main body.

    Uses the module-level ``client``. The reply is returned unprocessed.

    :param md_headings: Heading lines of the paper, interpolated into the prompt
    :return: The content of the first choice of the chat completion
    """
    headings_prompt = f"""
    here is a list of headings from a research paper, 
    your task is to return only the headings that are likely part of the main body text.  
    the paper subheadings must all be carried over if their heading is part of the body text.
    for example, Introduction, Conclusion and Methods are usually part of the main body, while References or Acknowledgments are not.
    the output must only include headings and subheadings and return them in verbatim, exactly as they appear in the input\n
    do not output any additional text other than the extracted headings, do not output a summary of any sort
    input: \n{md_headings}
    output:
    """

    chat_messages = [
        {"role": "system", "content": "You are a material science PHD researcher"},
        {"role": "user", "content": headings_prompt},
    ]
    completion = client.chat.completions.create(
        model="DeepSeek-R1-Qwen-32B",
        messages=chat_messages,
    )

    # Only the first choice of the completion is used.
    return completion.choices[0].message.content

# Raw LLM reply; as the printed output shows, it can still contain the model's
# reasoning text and the "</think>" marker, which are removed in the next cell.
headings_text = extract_main_body_headings(md_headings)

print(headings_text)

 (only the main body headings with their subheadings)
    
### 1. Introduction
### 2. Experimental
### 2.1. Material synthesis
# 2.2. Material characterization
# 2.3. Fabrication of sulfide- based ASSLMBs using swage-cell design and their electrochemical measurements
### 2.4. Tomo-cell assembly and the electrochemical measurements
# 2.5. Synchrotron X- ray tomography measurement
## 2.6. 3D tomography data processing
### 2.7. Finite element method simulation
# 3. Results and discussion
# 3.1. Performance test of ASSLMBs using swage-cells and tomo-cells.
#### 3.2. SXCT results of the tomo-cells using bare NCM
# 3.3. SXCT results of the tomo-cells using LZP-NCM
# 3.4. Postmortem analysis of cycled tomo-cells and finite element simulation of codependent failure mechanism
# 3.5. Summary and further discussion of codependent failure mechanism
# 4. Conclusion

</think>

### 1. Introduction  
### 2. Experimental  
### 2.1. Material synthesis  
# 2.2. Material characterization  
# 2.3. Fabricat

In [9]:
### Helper functions for cleaning up the output from the LLM heading extraction


def remove_think_text(input:str):
    """
    Drop the reasoning preamble that Deepseek models often emit.

    The model output frequently contains its thinking steps, closed by a
    </think> tag. For the Q&A generation only the text after that tag is
    wanted.

    :param input: Raw model output
    :return: Everything after the first </think> tag, or the input unchanged
             when the tag does not occur
    """
    _reasoning, closing_tag, answer = input.partition("</think>")
    # partition() yields an empty separator when the tag is absent
    return answer if closing_tag else input
    
def clean_lines(text: str) -> str:
    """
    Takes a string of text, separates it into lines, and removes surrounding
    whitespace and asterisks from the beginning and end of each line.

    Whitespace is stripped first so that trailing spaces (LLM replies often end
    heading lines with two spaces) cannot shield trailing asterisks from removal.
    Asterisks inside a line are left untouched.

    :param text: Input string with multiple lines
    :return: The cleaned lines joined back into a single newline-separated string
    """
    cleaned_lines = [line.strip().strip('*').strip() for line in text.split('\n')]
    return "\n".join(cleaned_lines)
    
# Post-process the LLM reply: drop the reasoning part, then strip asterisks around each line.
headings_no_think = remove_think_text(headings_text)
cleaned_headings = clean_lines(headings_no_think)

print(cleaned_headings)



### 1. Introduction  
### 2. Experimental  
### 2.1. Material synthesis  
# 2.2. Material characterization  
# 2.3. Fabrication of sulfide-based ASSLMBs using swage-cell design and their electrochemical measurements  
### 2.4. Tomo-cell assembly and the electrochemical measurements  
# 2.5. Synchrotron X-ray tomography measurement  
## 2.6. 3D tomography data processing  
### 2.7. Finite element method simulation  
# 3. Results and discussion  
# 3.1. Performance test of ASSLMBs using swage-cells and tomo-cells.  
#### 3.2. SXCT results of the tomo-cells using bare NCM  
# 3.3. SXCT results of the tomo-cells using LZP-NCM  
# 3.4. Postmortem analysis of cycled tomo-cells and finite element simulation of the codependent failure mechanism  
# 3.5. Summary and further discussion of the codependent failure mechanism.  
# 4. Conclusion


In [11]:
def extract_main_body_text(article_text: str, main_body_headings_text: str) -> str:
    """
    Extracts all text from the main body of the article. Capturing starts at the first
    main body heading and stops when the heading immediately following the last main body
    heading is encountered.

    Only the first and the last entry of the heading list act as boundaries. An article
    line matches a boundary heading when it is identical to it, or when it is a heading
    line (starts with '#') that equals it after normalisation: '#' markers, surrounding
    whitespace and a trailing period are dropped, inner whitespace is collapsed and case
    is ignored. This tolerates small changes an LLM may make when echoing headings.

    Parameters:
        article_text (str): The full publication text (with all headings).
        main_body_headings_text (str): A multiline string containing only the main body headings,
                                       in the order they appear.

    Returns:
        str: The concatenated text from the main body. Empty when no headings are given
             or the first heading is not found. If the last heading is never found,
             capturing continues to the end of the article.
    """
    def _normalize(heading: str) -> str:
        # Reduce a heading to its comparable core text.
        core = heading.strip().lstrip("#").strip().rstrip(".").strip()
        return " ".join(core.split()).casefold()

    def _same_heading(line: str, heading: str) -> bool:
        # Exact equality keeps the strict behaviour; the tolerant comparison is
        # restricted to real heading lines so body text cannot match by accident.
        if line == heading:
            return True
        if not line.startswith("#"):
            return False
        normalized_line = _normalize(line)
        return normalized_line != "" and normalized_line == _normalize(heading)

    # Create an ordered list of main body headings (after stripping whitespace)
    main_body_headings = [
        line.strip() for line in main_body_headings_text.strip().splitlines() if line.strip()
    ]
    if not main_body_headings:
        return ""

    first_heading = main_body_headings[0]
    last_heading = main_body_headings[-1]

    capturing = False
    saw_last_heading = False
    collected_lines = []

    for line in article_text.splitlines():
        stripped_line = line.strip()

        # Before capturing: look for the first main body heading.
        if not capturing:
            if _same_heading(stripped_line, first_heading):
                capturing = True
                collected_lines.append(line)
                # In case the first and last are the same heading:
                if _same_heading(stripped_line, last_heading):
                    saw_last_heading = True
            continue

        # When capturing:
        # If we've already seen the last main body heading and the current line is a heading,
        # then that signals the end of the main body.
        if saw_last_heading and stripped_line.startswith("#"):
            break

        collected_lines.append(line)

        # Mark that we have seen the last main body heading.
        if _same_heading(stripped_line, last_heading):
            saw_last_heading = True

    return "\n".join(collected_lines).strip()

# Use the asterisk-cleaned headings computed in the previous cell: asterisks were
# also removed from `text`, so the cleaned headings are the ones that can match it.
main_body_text = extract_main_body_text(text, cleaned_headings)
print(len(main_body_text))
print(main_body_text[:200])

61283
### 1. Introduction

All solid-state lithium metal battery (ASSLMB) technology, which features improved safety property and higher energy density compared with the state-of-the-art lithium ion battery


In [None]:
from openai import OpenAI
# Same client configuration as in the earlier cells, re-created here.
client = OpenAI(
    api_key="ollama",
    base_url="http://localhost:11434/v1",
)

def query_llm(text_chunk: str)->str:
    """
    Request a set of Q&A pairs from the LLM for one chunk of text.

    Uses the module-level ``client``. The chunk size and the cleaned reply are
    printed as progress output.

    :param text_chunk: Excerpt of the paper to generate questions and answers for
    :return: The model reply with the thinking text removed by ``remove_think_text``
    """
    # prompt adapted from SciQAG paper
    user_prompt = f"""
        Attached is a detailed excerpt from a scientific paper.\n\n{text_chunk}\n\n
        Your task is to formulate 10 thorough Q&A pairs that delve into the underlying 
        scientific principles and knowledge presented in this paper, focusing specifically on extract keywords. 
        Steer clear of questions that are purely section-specific (e.g., 'What does Figure 5 represent?') 
        or basic or definitional questions (e.g., 'What is XXX?'). Instead, focus on questions that require a 
        deeper understanding of the subject matter, especially those relating to the main idea in this paragraph. 
        Ensure diversity in your Q&A pairs, avoiding any duplication. Answers should be rich in detail, drawing 
        on specific data, material properties, and contextual insights from the paper. Strive for clarity and depth 
        in your responses, aiming to enhance the reader's comprehension of the intricate concepts discussed.

        output must be in english only
        output must not include any summary of the task or preamble

        output must only include the questions and answers and be formatted like this: 
        ------
        ###QUESTION: <question here> 
        ###ANSWER: <answer here>
        ------
    """
    chat_messages = [
        {"role": "system", "content": "You are a material's science expert"},
        {"role": "user", "content": user_prompt},
    ]

    print(f"generating QA pairs for chunk of size {len(text_chunk)}...")

    completion = client.chat.completions.create(
        model="DeepSeek-R1-Qwen-32B",
        messages=chat_messages,
    )

    # str() turns a missing (None) reply into text before the tag search
    raw_answer = str(completion.choices[0].message.content)
    clean_response = remove_think_text(raw_answer)

    print(clean_response)
    return clean_response


def generate_question_answer_list(text, chunk_size=2000, overlap=200) -> str:
    """
    Splits the full body of text into chunks and preserves word boundaries.
    Includes 'overlap' characters between chunks to preserve context.
    For each chunk, a list of questions and answers is generated with `query_llm`.

    The end of a chunk is moved to the first whitespace found from 10 characters
    before the nominal chunk end onwards; if there is none, the chunk is cut at
    the nominal end. Chunks that are empty after stripping are not sent to the
    LLM, and generation stops as soon as a chunk reaches the end of the text, so
    no extra request is made for a tail that consists only of overlap.

    :param text: Full text to generate Q&A pairs for
    :param chunk_size: Nominal chunk length in characters (must be positive)
    :param overlap: Characters shared between consecutive chunks
                    (must satisfy 0 <= overlap < chunk_size)
    :return: The concatenated LLM replies of all chunks
    :raises ValueError: If chunk_size or overlap is out of range (an overlap that
                        is not smaller than chunk_size could never advance)
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of characters")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size")

    full_text_length = len(text)
    print(f"starting generation of Q&A for full text with length {full_text_length}")

    whitespace = re.compile(r'\s')
    generated_parts = []
    start = 0
    while start < full_text_length:
        end = min(start + chunk_size, full_text_length)

        # Make sure we are not splitting words by moving to the next whitespace.
        # Searching with a start position avoids copying the text tail each time.
        if end < full_text_length:
            match = whitespace.search(text, max(end - 10, start))
            if match:
                end = match.start()

        chunk = text[start:end].strip()
        if chunk:
            generated_parts.append(query_llm(chunk))

        print(f"\nprocessed {end}/{full_text_length}")

        # The chunk that reaches the end of the text is the last one.
        if end >= full_text_length:
            break

        # Step back by the overlap, but always advance by at least one character.
        start = max(end - overlap, start + 1)

    return "".join(generated_parts)

# Generate Q&A pairs for the whole main body (one LLM request per chunk).
# As the last expression of the cell, the returned string is also shown as cell output.
generate_question_answer_list(main_body_text)

### PDF ingestion process

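The cell below chains the steps from the previous cells into one end-to-end pipeline that runs for every file in a folder: load the text, extract all headings, let the LLM select the main-body headings, and extract the main-body text.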

In [None]:
from pathlib import Path

def ingest_md_folder(folder_path, pattern="*.md"):
    """
    Run the ingestion pipeline for every Markdown file in a folder.

    For each matching file the text is loaded, asterisks are removed, all
    headings are extracted, the LLM selects the main-body headings and the
    main-body text is extracted. Progress is printed; nothing is returned.

    :param folder_path: Folder containing the converted Markdown files
    :param pattern: Glob pattern (relative to the folder, non-recursive by default)
                    selecting the files to process
    """
    md_folder = Path(folder_path)

    # sorted() makes the processing order deterministic
    for file in sorted(md_folder.glob(pattern)):

        print(f'processing file {file.name}\n')

        # full text
        text = file.read_text(encoding="utf-8")
        # removing asterisks, for easier processing downstream
        text = "\n".join(line.replace('*', '') for line in text.splitlines())

        all_text_headings = extract_md_headings(text)
        all_headings_count = len(all_text_headings.splitlines())
        print(f'found a total of {all_headings_count} headings')

        # get a list of the main body headings
        main_body_headings = extract_main_body_headings(all_text_headings)
        headings_no_think = remove_think_text(main_body_headings)
        cleaned_headings = clean_lines(headings_no_think)
        # cleaned_headings is one string: count its non-empty lines, not its characters
        main_body_heading_count = sum(
            1 for heading in cleaned_headings.splitlines() if heading.strip()
        )
        print(f'found a total of {main_body_heading_count} main body headings')

        # extract the main body text, to be used for Q&A generation
        main_body_text = extract_main_body_text(text, cleaned_headings)
        print(f'main body text contains {len(main_body_text)} characters\n')

        # TODO: the text is not written anywhere yet; only the message is printed.
        print('saving document text to local file')

        print(text[:200])

# Call the function under the name it is defined with in this cell
# (`ingest_pdf_folder` does not exist and raised a NameError).
# NOTE(review): absolute, machine-specific path — adjust to your own folder.
ingest_md_folder("/home/a4969/Downloads/publication_database")