# Collate PDFs

In [None]:
# @title 📥 Download and Extract Demo Papers
# @markdown This cell removes any existing alloy-papers folder, creates a fresh one,
# @markdown downloads a zipped archive of demo papers from Google Drive using gdown,
# @markdown and extracts the contents into the `alloy-papers/` directory.

!rm -rf alloy-papers
!mkdir alloy-papers
!cd alloy-papers

!pip install -q gdown
!gdown --id 1DvejY9En4cZlMlCs3Wgwspmjwmd8a902 --output alloy-papers/demo_papers.zip
!unzip alloy-papers/demo_papers.zip -d alloy-papers/

# Convert PDFs to raw text


In [None]:
# @title 🛠️ Install Nougat from GitHub
# Installs directly from the GitHub repository; the URL carries no tag or
# commit, so the installed revision is not pinned.
!pip install -q git+https://github.com/facebookresearch/nougat.git

In [None]:
# @title 🕵️ Convert PDFs to raw text using Nougat

# @markdown Please enter the paper PDF file name and the folder to store the output MMD file
pdf_filename = "alloy-papers/dummy_alloy_paper.pdf"  # @param {type:"string"}
output_folder = "alloy-papers/nougat"  # @param {type:"string"}

# Disabled alternative that additionally passes `-m 0.1.0-base`
# (presumably selects a specific Nougat model — confirm against the Nougat CLI help):
# !nougat "{pdf_filename}" -o "{output_folder}" -m 0.1.0-base --no-skipping
# Run the Nougat CLI; `{pdf_filename}` and `{output_folder}` are substituted
# from the Python variables defined above.
!nougat "{pdf_filename}" -o "{output_folder}" --no-skipping


# Force cleanup: run Python's garbage collector, then ask PyTorch to release
# its cached CUDA memory.
import gc
import torch
gc.collect()
torch.cuda.empty_cache()

In [None]:
# @title 📄 Preview the Extracted File
# @markdown Check the extracted MMD file.
# @markdown You can compare it with the PDF file.

from IPython.display import display, Markdown

mmd_filename = "alloy-papers/nougat/dummy_alloy_paper.mmd"  # @param {type:"string"}

# Read explicitly as UTF-8, matching how the cleaning step reads these same
# .mmd files, instead of relying on the platform's default encoding.
with open(mmd_filename, "r", encoding="utf-8") as file:
    latex_content = file.read()

# Render the MMD text as Markdown in the cell output.
display(Markdown(latex_content))


# Clean raw text


In [None]:
# @title 🧼 Data Cleaning for MMD Files
# @markdown Remove Acknowledgement and References.

# @markdown If you want to further remove Abstract and Introduction, please uncomment the relevant code.

# NOTE(review): process_files() in this cell already calls
# remove_abstract_introduction() unconditionally, so the "uncomment" hint
# above does not match the code as written — confirm the intended default.

# Folder that is scanned for `.mmd` files to clean.
input_folder = "alloy-papers/nougat"  # @param {type:"string"}
# Folder that receives the cleaned files (created by process_files if missing).
output_folder = "alloy-papers/cleaned"  # @param {type:"string"}

# --- Script starts here ---
from tqdm import tqdm
import os
import re

def remove_abstract_introduction(text):
    """Drop the front matter of a paper and return the stripped remainder.

    The text is cut at the start of one heading, chosen as follows:

    * an Introduction heading exists -> cut at the first Experiment-like
      heading if there is one, otherwise at the Introduction heading;
    * no Introduction but an Abstract heading exists -> cut at the *last*
      Abstract heading (an Experiment heading is not consulted here);
    * neither exists -> nothing is cut.

    Args:
        text: Markdown text of one paper.

    Returns:
        The text from the chosen heading onwards, with surrounding
        whitespace stripped.
    """
    regex_flags = re.DOTALL | re.MULTILINE

    # Headings may carry an optional roman/arabic section number before the title.
    abstract_pattern = re.compile(r"(?i)^#+\s*([ivxlcdm\d]*\s*)?a\s*bstract\s*(.*?)", regex_flags)
    introduction_pattern = re.compile(r"(?i)^#+\s*([ivxlcdm\d]*\s*)?introduction\s*(.*?)", regex_flags)
    experiment_pattern = re.compile(
        r"(?i)^#+\s*([ivxlcdm\d]*\s*)?(experiment(?:al|s)?|materials and methods|methodology)\s*(.*?)(?=#+|$)", regex_flags
    )

    # When several Abstract headings are present, only the last one counts.
    abstract_hits = list(abstract_pattern.finditer(text))
    last_abstract = abstract_hits[-1] if abstract_hits else None
    intro_hit = introduction_pattern.search(text)
    experiment_hit = experiment_pattern.search(text)

    if intro_hit is not None:
        # With an introduction present, prefer the experiment heading as cut point.
        anchor = experiment_hit if experiment_hit is not None else intro_hit
    elif last_abstract is not None:
        # Abstract only: keep everything from the abstract on, so useful
        # content is not thrown away.
        anchor = last_abstract
    else:
        # No recognizable heading: keep the content as it is.
        return text.strip()

    return text[anchor.start():].strip()


def remove_references_acknowledgement(text):
    """Strip the reference list and other back-matter sections from a paper.

    A section starts at a markdown heading (`#` to `######`) with an optional
    leading number whose title begins with one of the known names, and runs
    up to the next heading or the end of the text. Matching is
    case-insensitive. Both singular and plural spellings are recognized
    ("Acknowledgement(s)", "Acknowledgment(s)", "Conflict(s) of interest").

    Args:
        text: Markdown text of one paper.

    Returns:
        The text with every matching section removed.
    """
    flags = re.DOTALL | re.MULTILINE | re.IGNORECASE

    # Heading marker, optional numbering, then one of the section names ...
    heading_prefix = r"^#{1,6}\s*\d*\s*(?:"
    # ... followed lazily by the section body up to the next heading or end of text.
    section_suffix = r")\b.*?(?=^#{1,6}\s|\Z)"

    # Regex fragments (not literal strings): the optional letters cover the
    # singular/plural and British/American spellings.
    back_matter = [
        r"Acknowledge?ments?",
        r"Conflicts? of interest",
        r"Declaration of competing interest",
        r"Disclosure statement",
        r"Funding",
        r"Supporting Information",
        r"Author Contributions",
    ]

    # References are removed first, then the remaining back-matter sections.
    for names in ([r"References"], back_matter):
        pattern = heading_prefix + "|".join(names) + section_suffix
        text = re.sub(pattern, "", text, flags=flags)
    return text


def remove_repetitions(text, threshold=3):
    """Collapse a phrase repeated back-to-back into a single occurrence.

    A phrase counts as repeated when it appears at least `threshold` times in
    a row, separated only by whitespace.

    Args:
        text: Input text.
        threshold: Minimum number of consecutive occurrences that triggers
            the collapse.

    Returns:
        The text with each such run replaced by one copy of the phrase.
    """
    # One captured phrase followed by (threshold - 1) or more further copies.
    repeated_run = rf'\b(.+?)\b(?:\s+\1\b){{{threshold - 1},}}'
    return re.sub(repeated_run, r'\1', text)


def process_files(input_folder, output_folder):
    """Clean every `.mmd` file in `input_folder` and write it to `output_folder`.

    Each file is passed through remove_repetitions,
    remove_references_acknowledgement and remove_abstract_introduction, in
    that order, and saved under its original file name. Files are read and
    written as UTF-8. The output folder is created when it does not exist.

    Args:
        input_folder: Directory containing the `.mmd` files to clean.
        output_folder: Directory that receives the cleaned files.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Cleaning stages, applied in this order.
    stages = (
        remove_repetitions,
        remove_references_acknowledgement,
        remove_abstract_introduction,
    )

    for entry in tqdm(os.listdir(input_folder)):
        # Only MMD files are processed; anything else in the folder is skipped.
        if not entry.endswith(".mmd"):
            continue

        source_path = os.path.join(input_folder, entry)
        target_path = os.path.join(output_folder, f"{entry}")

        with open(source_path, "r", encoding="utf-8") as source:
            content = source.read()

        for stage in stages:
            content = stage(content)

        with open(target_path, "w", encoding="utf-8") as target:
            target.write(content)

# Run the cleaning over the folders chosen in the form fields of this cell.
process_files(input_folder, output_folder)

In [None]:
# @title 📄 Preview LaTeX from .mmd File
# @markdown Check the cleaned MMD file again.

from IPython.display import display, Markdown

mmd_filename = "alloy-papers/cleaned/dummy_alloy_paper.mmd"  # @param {type:"string"}

# The cleaning step writes these files as UTF-8, so read them back the same
# way instead of relying on the platform's default encoding.
with open(mmd_filename, "r", encoding="utf-8") as file:
    latex_content = file.read()

# Render the cleaned MMD text as Markdown in the cell output.
display(Markdown(latex_content))

# LLM Prompting


In [None]:
# @title 🔐 Login to Hugging Face
# @markdown Install the Hugging Face Hub and follow the instruction to generate your Access Token.

# @markdown  You need to have an HuggingFace account to get the Token.

!pip install -q huggingface_hub

from huggingface_hub import notebook_login

notebook_login()

print("Successful login.")

In [None]:
# @title 🚀 Install Transformers 4.49.0 & Restart Runtime
# @markdown This will install a specific version of Hugging Face Transformers library, used for working with large language models.
# @markdown After installation, the Colab runtime will restart automatically to apply the changes.
# @markdown Please note this Transformer version doesn't work with Nougat, so you should restart the session if you want to run Nougat again.

# transformers is pinned to 4.49.0; bitsandbytes is installed without a version pin.
!pip install -q transformers==4.49.0
!pip install -q bitsandbytes

import os
# Send signal 9 to this kernel process; per the cell description, the runtime
# then restarts so the newly installed packages are picked up. All variables
# defined so far are lost.
os.kill(os.getpid(), 9)

In [None]:
# @title 🔍 Extract Alloy & Phase Info from MMD Files
# @markdown This cell runs an LLM-based extraction pipeline on MMD files generated from academic PDFs.
# @markdown It identifies alloy compositions, and phase information based on structured prompts,
# @markdown then cleans and saves the output as JSON in the specified output folder.

# Model identifier handed to the text-generation pipeline.
llm_model = "meta-llama/Llama-3.2-3B-Instruct" # @param {type:"string"}
# Folder with the cleaned `.mmd` files to process.
input_folder = "alloy-papers/cleaned" # @param {type:"string"}
# Parent folder for results; main() creates a per-configuration subfolder inside it.
output_dir = "alloy-papers/output" # @param {type:"string"}
# Maximum number of tokens per text chunk; entered as a string and converted
# with int() before main() is called.
max_len = "120000" # @param {type:"string"}
# "4bit" or "8bit" turns on the matching load_in_* option in main(); any other
# value (including the empty default) leaves both options off.
quantize = "" # @param {type:"string"}

# @markdown LLM model options are [meta-llama/Llama-3.2-1B-Instruct, meta-llama/Llama-3.2-3B-Instruct]


# --- Script starts here ---
import json
import os
import re

import torch
from tqdm import tqdm
from transformers import pipeline


def split(tokenizer, text, max_len):
    """Split `text` into line-aligned chunks that fit within `max_len` tokens.

    Lines are accumulated until the tokenized chunk would exceed `max_len`;
    the lines gathered before the overflowing one are then emitted as a chunk
    and a new chunk starts with the overflowing line. Lines are never broken,
    so a single line longer than `max_len` tokens becomes a chunk of its own
    that exceeds the limit.

    Args:
        tokenizer: Object with a `tokenize(str)` method returning a sequence
            of tokens.
        text: Text to split.
        max_len: Maximum number of tokens per chunk.

    Returns:
        List of chunk strings; always contains at least one element (a single
        empty string for empty input).
    """
    chunks, current = [], []
    for line in text.split("\n"):
        current.append(line)
        if len(tokenizer.tokenize("\n".join(current))) > max_len:
            # Emit what was gathered before the overflowing line. If that line
            # overflowed on its own there is nothing before it, and emitting
            # would produce an empty chunk (i.e. an empty prompt downstream).
            if len(current) > 1:
                chunks.append("\n".join(current[:-1]))
            current = [line]
    chunks.append("\n".join(current))
    return chunks


def chat(generator, dialog):
    """Send `dialog` to the generator and return the text of the reply.

    Args:
        generator: Callable text-generation pipeline exposing a `tokenizer`
            attribute.
        dialog: List of chat messages (dicts with "role" and "content").

    Returns:
        The "content" of the last message in the first generated conversation.
    """
    tok = generator.tokenizer

    # Generation stops at the tokenizer's EOS token or at the end-of-turn marker.
    stop_ids = [tok.eos_token_id, tok.convert_tokens_to_ids("<|eot_id|>")]

    generation_options = {
        "max_new_tokens": 1024,
        "eos_token_id": stop_ids,
        "pad_token_id": tok.eos_token_id,
        "do_sample": True,
        "temperature": 0.1,
    }
    completions = generator(dialog, **generation_options)

    # The generated conversation ends with the newly produced message.
    reply = completions[0]["generated_text"][-1]
    return reply["content"]


def clean(text):
    """Extract the JSON objects from an LLM reply as a JSON array string.

    Every `{...}` span (shortest match, so nested objects are not supported)
    is treated as a candidate object. If all candidates together form valid
    JSON they are returned joined into one array. Otherwise only the
    candidates that parse on their own are kept, so one malformed object no
    longer causes the valid ones to be dropped.

    Args:
        text: Raw model output.

    Returns:
        A string holding a valid JSON array; "[]" when nothing usable is found.
    """
    candidates = re.findall(r"\{.*?\}", text, flags=re.DOTALL)
    if not candidates:
        return "[]"

    joined = "[" + ",".join(candidates) + "]"
    try:
        json.loads(joined)
        return joined
    except json.JSONDecodeError:
        pass

    # Fall back to per-object validation and keep whatever is well-formed.
    valid = []
    for candidate in candidates:
        try:
            json.loads(candidate)
        except json.JSONDecodeError:
            continue
        valid.append(candidate)
    return "[" + ",".join(valid) + "]"

def prompt_1(text):
    """Build the extraction prompt for one chunk of paper text.

    The prompt asks for a list of JSON objects with the keys "Alloy" and
    "Phase" and embeds `text` between two delimiter lines at the end.

    Args:
        text: Chunk of paper text to embed in the prompt.

    Returns:
        The complete prompt string.
    """
    return f"""You are a helpful assistant. Your task is to extract alloy and its corresponding phase information from the text.

    Please follow these instructions carefully:

    **Extract alloy and phase information** from the paper text, delimited by —————.
        Please use the following JSON schema to generate the output:
    {{
       "Alloy": "XXX",
       "Phase": "XXX"
    }}

    "Alloy": Elemental composition of the alloy mentioned in the paper. Alloy compositions contain multiple elements. Ignore alloys that include unknown placeholders (e.g., "X", "Y", or invalid symbols).
    "Phase": Phase information of the alloy in this paper with possible values: 'BCC', 'FCC', 'Im', 'FCC + BCC', 'BCC + Im', 'FCC + Im', 'FCC + BCC + Im'

    Return a list of JSON objects if there are several alloys. Only return valid JSON objects, with no additional text.
—————
{text}
—————
"""

def main(model, folder, output_dir, max_len, quantize):
    """Run the alloy/phase extraction over every `.mmd` file in `folder`.

    For each paper the text is split into chunks of at most `max_len` tokens,
    each chunk is sent to the model with the extraction prompt, the replies
    are reduced to valid JSON objects, and duplicates are removed. The result
    is written as a JSON array to
    `<output_dir>/<model name>-<quantize>-<max_len>/<paper name>`; the file
    name has no extension.

    Args:
        model: Model identifier passed to the text-generation pipeline.
        folder: Directory containing the cleaned `.mmd` files.
        output_dir: Parent directory for the results folder.
        max_len: Maximum number of tokens per chunk.
        quantize: "4bit" or "8bit" enables the matching load_in_* option; any
            other value leaves both disabled.
    """
    generator = pipeline(
        "text-generation",
        model=model,
        model_kwargs={
            "torch_dtype": torch.bfloat16,
            "load_in_4bit": quantize == "4bit",
            "load_in_8bit": quantize == "8bit",
        },
        device_map="auto",
    )

    results = "{}/{}-{}-{}".format(output_dir, model.split("/")[-1], quantize, max_len)
    print(f"Results will be saved in {results}")
    os.makedirs(results, exist_ok=True)

    # Step 1: Get all .mmd files in the folder (names without the extension)
    paper_list = [f[:-4] for f in os.listdir(folder) if f.endswith(".mmd")]

    # Step 2: Loop through them in sorted order
    for mmd in tqdm(sorted(paper_list)):
        print("Processing:", mmd)
        mmd_path = os.path.join(folder, mmd + ".mmd")

        if not os.path.exists(mmd_path):
            print("File not found:", mmd_path)
            continue

        # The cleaning step writes these files as UTF-8; read them the same way.
        with open(mmd_path, "r", encoding="utf-8") as f:
            ocr = f.read()
        paper = split(generator.tokenizer, ocr, max_len)

        outputs = []
        for part in paper:
            dialog = [{"role": "user", "content": prompt_1(part)}]
            outputs.append(clean(chat(generator, dialog)))
            # Release cached GPU memory between chunks.
            torch.cuda.empty_cache()

        merged = []
        for o in outputs:
            merged.extend(json.loads(o))

        # De-duplicate while keeping first-seen order so the output file is
        # deterministic. Records are keyed by their sorted items; list values
        # are turned into tuples so the key is hashable.
        seen = {}
        for record in merged:
            key = tuple(
                sorted((k, tuple(v) if isinstance(v, list) else v) for k, v in record.items())
            )
            seen.setdefault(key, None)
        unique = [dict(key) for key in seen]

        with open(os.path.join(results, mmd), "w", encoding="utf-8") as f:
            f.write(json.dumps(unique, indent=2))

# The form field holds max_len as a string; convert it before it is used as
# the per-chunk token limit.
max_len = int(max_len)
print("Files to process:", os.listdir(input_folder))
# Run the extraction with the values from the form fields of this cell.
main(llm_model, input_folder, output_dir, max_len, quantize)


In [None]:
# @title 📄 Preview Extracted Alloy data
# @markdown This cell reads and displays the extracted alloy information.

# Path layout produced by main(): <output_dir>/<model name>-<quantize>-<max_len>/<paper name>.
# The double dash in the default comes from the empty `quantize` setting, and
# the file has no extension.
output_file = "alloy-papers/output/Llama-3.2-3B-Instruct--120000/dummy_alloy_paper"  # @param {type:"string"}

import json, pandas as pd
with open(output_file, "r") as f:
    data = json.load(f)

# Last expression of the cell: shown as a table in the cell output.
pd.DataFrame(data)


# [Skip] Utilities

In [None]:
# @title 🧹 Free Up GPU Memory
# @markdown This cell clears unused variables and empties the GPU memory cache.
# @markdown Run this if you experience CUDA out-of-memory errors or after large model inference.

import gc
import torch

# Run Python's garbage collector first, then ask PyTorch to release its
# cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()


In [None]:
# @title 🧹 Free Up Hugging Face & PyTorch Cache
# @markdown This cell clears cache downloaded by Hugging Face (e.g., LLM models) and/or PyTorch (e.g., Nougat models).
# @markdown Tick one or both boxes and run the cell to clean them.

clean_huggingface_cache = True  # @param {type:"boolean"}
clean_torch_cache = True        # @param {type:"boolean"}

if clean_huggingface_cache:
    print("Cleaning Hugging Face cache...")
    !rm -rf /root/.cache/huggingface
    print("✅ Hugging Face cache removed.")

if clean_torch_cache:
    print("Cleaning PyTorch cache...")
    import torch, gc
    gc.collect()
    torch.cuda.empty_cache()
    print("✅ PyTorch cache cleared.")

if not clean_huggingface_cache and not clean_torch_cache:
    print("⚠️ Nothing selected to clean.")
