In [None]:
import importlib.metadata

    # LangChain check package version
def report_package_version(label, dist_name):
    """Print and return a one-line report about an installed distribution.

    Args:
        label: Human-readable package name used in the message.
        dist_name: Distribution name passed to ``importlib.metadata.version``.

    Returns:
        The printed message: ``"<label> version: <version>"`` when the
        distribution is installed, ``"<label> package is not installed."``
        otherwise.
    """
    try:
        version = importlib.metadata.version(dist_name)
    except importlib.metadata.PackageNotFoundError:
        message = f"{label} package is not installed."
    else:
        message = f"{label} version: {version}"
    print(message)
    return message


# One helper drives both checks so the two reports cannot drift apart.
for _label, _dist_name in (("LangChain", "langchain"), ("LangChain-core", "langchain-core")):
    report_package_version(_label, _dist_name)

LangChain version: 0.2.14


# 0. Enter username (the output folder will be named after it)

In [None]:
# Name of the person running the notebook; it is appended to "output_" to form the output folder name.
user_name = "user_name"

In [None]:
import time
# Wall-clock start time; subtracted from the end time later to report the running time.
notebook_start_time = time.time()

# 1. Basic prompt template of LangChain environment

In [None]:
# Shared prompt skeleton for every question sent to the LLM.
# {context}, {question} and {format_instructions} are filled in when the prompt is built.
# NOTE(review): the text asks for a JSON object even when the caller supplies a
# comma-separated-list parser — confirm that this combination is intended.
template = """
You are a data analyst specializing in nano-toxicity.
Your task is extracting data about the physicochemical and cytotoxic properties of nanomaterials used in the research paper.
Use the following pieces of instruction and format to answer the user's questions.
Don't be lazy.
Be very strict and answer the question very accurate and scientific manner.
If data is not specified in the paper, just answer 'None'.
Do not use a full sentence.
Please provide the output as a valid JSON object without any additional explanations.
Context: {context}
Question: {question}
Format instruction: {format_instructions}
"""

# 2. Material data extraction prompt

In [None]:
# Question text used to list the nanomaterials characterised in a paper.
# The expected answer format is 'Nanomaterial name (Nanomaterial type)', as stated on the first line.
q_mat = """
Just answer with following format: 'Nanomaterial name (Nanomaterial type)'. Do not use a full sentence.
Question: What nanoparticles were used in the characterization experiments in the research paper? Refer to the following descriptions.
Description 1. Provide the nanoparticles that were used for core size(i.e. primary size, nominal size) or hydrodynamic size(i.e. Z-average size) or surface charge(i.e. zeta potential) or surface area in the research paper.
Description 2. Provide information on all the nanoparticles actually used in the authors' experiments. Do not include nanoparticles used in other papers (i.e., mentioned in the reference) in your answer.
Description 3. If there were multiple nanoparticles of the same type, the author would have named them differently. What did the author name them?
Description 4. Write the name of the nanoparticle followed by its type in chemical formula within parentheses.
Description 5. Several examples of the format.
-Written in the document: Al2O3, GO, ZnO, SiO2, Fe2O3 (normal form), Response: Al2O3 (Al2O3), GO (GO), ZnO (ZnO), SiO2 (SiO2), Fe2O3 (Fe2O3).
-Written in the document: T10, T100 (labeled differently according to 'size'), Response: T10 (TiO2), T100 (TiO2).
-Written in the document: ZnAc, ZnChl (ZnO-Acetate, ZnO-Chloride; labeled differently according to 'chemical'), Response: ZnAc (ZnO), ZnChl (ZnO).
-Written in the document: TiO2-PVP, TiO2-Citrate (labeled differently according to 'coating'), Response: TiO2-PVP (TiO2), TiO2-Citrate (TiO2).
-Written in the document: P25, Nanofilament (labeled differently according to 'manufacturer'), Response: P25 (TiO2), Nanofilament (TiO2).
-Written in the document: CuO-USA, CuO-UK (labeled differently according to 'location'), Response: CuO-USA (CuO), CuO-UK (CuO).
Description 6. Do not omit the information.
Description 7. Do not write 'NP' or 'nanoparticles' followed by nanoparticles'name
"""

# 3. PChem/Tox data extraction prompt
- Pchem
- Tox

In [None]:
import pprint
from typing import Any, Dict
import datetime
from pytz import timezone

import pandas as pd
from langchain.output_parsers import PydanticOutputParser
#from langchain.output_parsers import PandasDataFrameParser

from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator

class pchem_product_info(BaseModel):
    # Answer schema for "which nanomaterials does the paper characterise".
    # The class is handed to PydanticOutputParser; the Field description is presumably
    # part of the format instructions inserted into the prompt, so treat it as prompt text.
    mat_name: str = Field(description="Question: What nanoparticles were used in the characterization experiments in the research paper? Refer to the following descriptions. Description 0. Do not include metal nanoparticles such as Silver (Ag) or Gold (Au). 1. Provide the nanoparticles that were used for core size(i.e. primary size, nominal size) or hydrodynamic size(i.e. Z-average size) or surface charge(i.e. zeta potential) or surface area in the research paper. Description 2. Provide information on all the nanoparticles actually used in the authors' experiments. Do not include nanoparticles used in other papers (i.e., mentioned in the reference) in your answer. Description 3. If there were multiple nanoparticles of the same type, the author would have named them differently. What did the author name them? Description 4. Write the name of the nanoparticle followed by its type in chemical formula within parentheses.   Description 5. Several examples of the format. -Written in the document: Al2O3 NPs, ZnO, SiO2, Fe2O3 (normal form), Response: Al2O3 (Al2O3), ZnO (ZnO), SiO2 (SiO2), Fe2O3 (Fe2O3). -Written in the document: T10, T100 (labeled differently according to 'size'), Response: T10 (TiO2), T100 (TiO2). -Written in the document: ZnAc, ZnChl (ZnO-Acetate, ZnO-Chloride; labeled differently according to 'chemical'), Response: ZnAc (ZnO), ZnChl (ZnO). -Written in the document: TiO2-PVP, TiO2-Citrate (labeled differently according to 'coating'), Response: TiO2-PVP (TiO2), TiO2-Citrate (TiO2). -Written in the document: P25, Nanofilament (labeled differently according to 'manufacturer'), Response: P25 (TiO2), Nanofilament (TiO2). -Written in the document: CuO-USA, CuO-UK (labeled differently according to 'location'), Response: CuO-USA (CuO), CuO-UK (CuO). Description 6. Do not omit the information. Description 7. Do not write 'NP' or 'nanoparticles' followed by nanoparticles'name")

    def to_dict(self):
        """Return the extracted material name as a single-entry dict."""
        return dict(mat_name=self.mat_name)

class pchem_mat_synthesis(BaseModel):
    # Answer schema for the origin of a material: synthesized or commercially available.
    mat_synthesis: str = Field(description="The following nanoparticles were synthesized by researcher? or commercially available? If nanoparticles were synthesized, just answer 'Synthesized'. If nanoparticles were commercially available, just answer 'Commercially available (with cat# or product # in parentheses).'")

    def to_dict(self):
        """Return the synthesis answer as a single-entry dict."""
        return dict(mat_synthesis=self.mat_synthesis)

class pchem_core_size(BaseModel):
    # Answer schema for the core (primary / nominal) size of a material.
    mat_core_size: str = Field(description="What is the value of core size or core size distribution (i.e. primary size, nominal size) of each material? Refer to the following format to answer. 0. TEM, SEM, AFM size. 1. Do not include unit. 2. Do not use a full sentence. 3. If there is no value, assign 'None'. 4. Do not include calculated size. 5. If the values are represented as a range, they are represented in the following format: value1-value2. ex) 50-100. 6. If the values are represented with an error rate '±', they are represented in the following format: value±error rate. ex) 35±10.")

    def to_dict(self):
        """Return the core-size answer as a single-entry dict."""
        return dict(mat_core_size=self.mat_core_size)

class pchem_hydrodynamic_size(BaseModel):
    # Answer schema for the hydrodynamic (DLS / Z-average) size of a material.
    # The description is prompt text: the truncated fragment "If t" has been removed and
    # item 7 is separated from the preceding "pH." so the numbered rules read cleanly.
    mat_hydrodynamic_size: str = Field(description="What is the value of hydrodynamic size (i.e., Z-average size, size in media) of each material? Please provide details on the sizes under various conditions or in different media. 0. DLS size. 1. Do not include unit. 2. Do not include an explanation about hydrodynamic size. Just give me the value. 3. Do not use a full sentence. 4. If there is no value, assign 'None'. 5. If multiple values exist for each material, divide the value using ';', add parentheses after the value, and write the conditions in the parentheses. 6. Please refer to the following format when you write down the conditions in parentheses. Format: [Classification: detailed conditions]. ex) 50 (Solvent: water); 100 (Solvent: medium), 30 (Time: 2 h); 50 (Time: 24 h). Kind of Classification: Solvent, Time, Concentration, pH. 7. If there are both water and culture media conditions, only the culture media condition is selected. However, if there are only conditions taken in water, select conditions taken in water. 8. If the values are represented as a range, they are represented in the following format: value1-value2. ex) 50-100. 9. If the values are represented with an error rate '±', they are represented in the following format: value±error rate. ex) 35±10.")

    def to_dict(self):
        """Return the hydrodynamic-size answer as a single-entry dict."""
        return {"mat_hydrodynamic_size": self.mat_hydrodynamic_size}

class pchem_surface_charge(BaseModel):
    # Answer schema for the surface charge (zeta potential) of a material.
    # The description is prompt text: wording that referred to "sizes" and
    # "hydrodynamic size" now refers to surface charge, the quantity actually asked for.
    mat_surface_charge: str = Field(description="What is the value of surface charge (i.e., Zeta potential) of each material? Please provide details on the surface charge under various conditions or in different media. 1. Do not include unit. 2. Do not include an explanation about surface charge. Just give me the value. 3. Do not use a full sentence. 4. If there is no value, assign 'None'. 5. If multiple values exist for each material, divide the value using ';', add parentheses after the value, and write the conditions in the parentheses. 6. Please refer to the following format when you write down the conditions in parentheses. Format: [Classification: detailed conditions]. ex) -10 (Solvent: water); 21 (Solvent: medium), 30 (Time: 2 h); 50 (Time: 24 h). Kind of Classification: Solvent, Time, Concentration, pH. 7. If the values are represented as a range, they are represented in the following format: value1-value2. ex) 50-100. 8. If the values are represented with an error rate '±', they are represented in the following format: value±error rate. ex) 35±10. 9. If there are both water and culture media conditions, only the culture media condition is selected. However, if there are only conditions taken in water, select conditions taken in water.")

    def to_dict(self):
        """Return the surface-charge answer as a single-entry dict."""
        return {"mat_surface_charge": self.mat_surface_charge}

class pchem_surface_area(BaseModel):
    # Answer schema for the surface area of a material.
    # NOTE(review): rule 4 in the description says "calculated size" — presumably carried
    # over from the core-size prompt; confirm whether "calculated surface area" was meant.
    mat_surface_area: str = Field(description="What is the value of surface area of each material? Refer to the following format to answer. 1. Do not include unit. 2. Do not use a full sentence. 3. If there is no value, assign 'None'. 4. Do not include calculated size. 5. If the values are represented as a range, they are represented in the following format: value1-value2. ex) 50-100. 6. If the values are represented with an error rate '±', they are represented in the following format: value±error rate. ex) 35±10.")

    def to_dict(self):
        """Return the surface-area answer as a single-entry dict."""
        return dict(mat_surface_area=self.mat_surface_area)

In [None]:
class tox_info(BaseModel):
    # Answer schema for the cytotoxicity question: cell line, species, organ, assay and
    # normal/cancer classification. The descriptions are prompt text.
    # Corrections to that text: cell_organ now asks for the organ (not the species)
    # in front of the parenthesised cell type, matching its own example, and the rules in
    # cell_assay are numbered 1-4 instead of repeating "2.".
    cell_type: str = Field(description="What cell lines were used in the cell viability assay? 1. Do not use a full sentence. 2. Please provide the abbreviation form of the cell name. ex) A549, THP-1, MRC-5, EA.hy926, BEAS-2B, HaCaT, L929, U87, etc. 3. If multiple cell lines were used, divide the value using ';'. ex) A549; THP-1; MRC-5.")
    cell_species: str = Field(description="What species the cell line originated from? 1. Do not use a full sentence. 2. Please refer to the following form. ex) Human, Rabbit, Mouse, Pig, etc. 3. If multiple cell lines were used and their species are different,  divide the value using ';' and write the name of the cell species followed by its cell type within parentheses. ex) Human (A549); Mouse (L929).")
    cell_organ: str = Field(description="What organ the cell line originated from? 1. Do not use a full sentence. 2. Please refer to the following form. ex) Lung, Breast, Kidney, Brain, Liver, Bronchial tube, Prostate, Spleen, etc. 3. If multiple cell lines were used and their organs are different, divide the value using ';' and write the name of the organ followed by its cell type within parentheses. ex) Lung (A549); Fibroblast (L929).")
    cell_assay: str = Field(description="Which cell viability assays or cytotoxicity assay were conducted in this paper? 1. Do not use a full sentence. 2. Only reference the following cell viability assays, and if none, assign as 'None'. 3. Please refer to the following form. ex) CCK-8, MTT, MTS, WST, Alamar blue, CellTiter-Glo, Neutral Red, NRU, Trypan blue, XTT, Calcein-AM, BrdU, EdU, Propidium iodide, Hoechst33342 assays. 4. If multiple cell viability assays were used, divide the value using ';'. ex) MTT; MTS; CCK-8.")
    cell_classification: str = Field(description="Please determine the cell type: whether it is a normal cell or a cancer cell? 1. Do not use a full sentence. 2. Just answer, 'Normal' or 'Cancer'.")

    def to_dict(self):
        """Return all five cytotoxicity fields as a plain dict keyed by field name."""
        return {"cell_type": self.cell_type, "cell_species": self.cell_species, "cell_organ": self.cell_organ, "cell_assay": self.cell_assay,"cell_classification": self.cell_classification}

# 4. Connect LangChain with Zotero
- Zotero collection configuration
- Set directory of output and zotero pdf download
- Extract data from Zotero
- Get PDF file list and IDs

In [None]:
##Zotero collection configuration

import os
from os.path import join, basename, splitext
import subprocess
from glob import glob
from shutil import copy
from random import shuffle, seed

from pyzotero import zotero

# Connect to the Zotero library. Replace the placeholders with real values:
# zotero.Zotero(library_id, library_type, zotero_api_key)
zot = zotero.Zotero(library_id, 'library_type', 'zotero_api_key')

# Collection display name -> full collection record as returned by the API.
collections = {}
for c in zot.collections():
    collections[c['data']['name']] = c

# Names in the order the API returned them; the bare expression below displays the list.
collection_names = list(collections)

collection_names

['remove123456',
 'remove12345',
 'remove123',
 'PchemScore_1',
 'oxide af filter5',
 'oxide af filter3',
 'oxide af filter2',
 'pmfilter',
 'oxide af a',
 'oxide af filter1',
 'oxide af filter',
 'WOS 8',
 'WOS 7',
 'WOS 6',
 'WOS 5',
 'WOS 4',
 'WOS 3',
 'WOS 1',
 'WOS 2',
 'WOS 0',
 'PUBMED 9',
 'PUBMED 8',
 'PUBMED 7',
 'PUBMED 6',
 'PUBMED 2',
 'PUBMED 1',
 'PUBMED 5',
 'PUBMED 4',
 'PUBMED 3',
 'PUBMED 0',
 'testa',
 'new pdf',
 'Oxide af 2018 WOS',
 'Oxide af 2018 PUBMED',
 'Ha 1a-1',
 'Metal oxide after 2018 wos',
 'Metal oxide after 2018 pubmed',
 'test1',
 'Metal External dataset',
 'ddd',
 'Trinh_test',
 'External dataset',
 'Ha 1d',
 'Ha 1c',
 'Ha 1b',
 'Ha 1a',
 'Ha 3b',
 '2025_DB',
 'TrinhB',
 'Ha 3a',
 'Ha 2',
 'TrinhA',
 'training set oxide 3',
 'training set metal 2',
 'training set oxide 2',
 'training set metal',
 'test set for chatGPT',
 'traning set for chatGPT',
 'Ha IIIa',
 'Ha II',
 'Ha I',
 'Xiao Metal',
 'Ha Oxide 2018',
 'Xiao Oxide',
 'Trinh Metal 2018',
 'k

In [None]:
# Name of the Zotero collection to process; it must match one of the collection names exactly.
selected_collection = "enter_collection_name" # copy and paste collection name
# Download folder for this collection's PDFs (spaces in the name become underscores).
# NOTE(review): absolute path under /home — adjust if that location is not writable.
pdf_folder = "/home/pdf_" + selected_collection.replace(" ", "_")
print("PDF files are stored at "+ pdf_folder + ".")

PDF 파일들은 /home/pdf_remove123456에 저장됩니다.


In [None]:
## Set up the PDF folder and the output folder

import os

def create_folder(folder_path):
    """Create ``folder_path`` (with any missing parents) unless it already exists.

    Prints which of the two cases occurred. Returns None.
    """
    if os.path.exists(folder_path):
        print(f"Folder already exists at {folder_path}")
        return
    os.makedirs(folder_path)
    print(f"Folder created at {folder_path}")

# Results go to "output_<user_name>" (relative to the working directory); PDFs go to pdf_folder.
output_folder = "output_" + user_name
create_folder(output_folder)
create_folder(pdf_folder)

Folder already exists at output_HEY20241019-Eunyong Ha-Claude 3.5 sonnet-Ha1
Folder already exists at /home/pdf_remove123456


In [None]:
## extract collection data from zotero

import os
from os.path import join, basename, splitext
import subprocess
from glob import glob
from shutil import copy
from random import shuffle, seed

from pyzotero import zotero
# Connect to Zotero; fill in zotero.Zotero(library_id, library_type, zotero_api_key).
zot = zotero.Zotero(library_id, 'library_type', 'zotero_api_key')

# Look up the selected collection by its display name.
collections = {c['data']['name']: c for c in zot.collections()}
collection = collections[selected_collection]
key = collection['key']

items = list(zot.everything(zot.collection_items(key)))

# Zotero item key -> item title; later cells use it to fill the 'ref' column.
# Items without a 'title' field are reported and left out instead of raising KeyError.
pdf_dict = {}
for item in items:
    item_data = item['data']
    if 'title' in item_data:
        pdf_dict[item_data['key']] = item_data['title']
    else:
        print(f"Item {item_data['key']} does not have a title.")

# Download the single PDF attached to each non-attachment item.
for item in items:
    title = item['data'].get('title', 'No Title')
    print(f"\nProcessing item: {title}")

    # Attachment items are skipped here; PDFs are looked up among the children
    # of the non-attachment items instead.
    if item['data'].get('itemType') == 'attachment':
        print(f"Skipped non-PDF entry: {title}")
        continue

    try:
        children = list(zot.children(item['key']))
        print(f"Found {len(children)} children for item '{title}'")

        # Keep only the children that are PDF attachments.
        pdfs = [c for c in children if c['data'].get('contentType') == 'application/pdf']

        if not children:
            print(f"Missing documents for: {title}")
        elif not pdfs:
            print(f"No PDFs found for: {title}")
        elif len(pdfs) > 1:
            # Ambiguous: nothing is downloaded when several PDFs are attached.
            print(f"Too many PDFs for: {title}")
        else:
            doc = pdfs[0]
            print(f"PDF found: {doc['data']['filename']}")

            # Files are stored under the attachment key, not the original file name.
            pdf_file_name = f"{doc['key']}.pdf"
            pdf_file_path = os.path.join(pdf_folder, pdf_file_name)

            # Download only when the file is not on disk yet.
            if not os.path.exists(pdf_file_path):
                # Pass the bare file name: dump() combines `path` and `filename`
                # itself, so a full path here only works when pdf_folder is absolute.
                zot.dump(doc['key'], pdf_file_name, pdf_folder)
                print(f"{pdf_file_path} is downloaded.")
            else:
                print(f"{pdf_file_path} already exists.")

    except Exception as e:
        # Best effort: report the failure and continue with the next item.
        print(f"Error processing item {title}: {e}")


Processing item: 19922763
Skipped non-PDF entry: 19922763

Processing item: Oxidative stress and apoptosis induced by nanosized titanium dioxide in PC12 cells
Found 1 children for item 'Oxidative stress and apoptosis induced by nanosized titanium dioxide in PC12 cells'
PDF found: 19922763.pdf
/home/pdf_remove123456/WZ62GXAT.pdf already exists.

Processing item: Use of metal oxide nanoparticle band gap to develop a predictive paradigm for oxidative stress and acute pulmonary inflammation
Found 1 children for item 'Use of metal oxide nanoparticle band gap to develop a predictive paradigm for oxidative stress and acute pulmonary inflammation'
PDF found: 22502734.pdf
/home/pdf_remove123456/DUR86VJI.pdf already exists.

Processing item: Effects of iron oxide nanoparticle labeling on human endothelial cells
Found 1 children for item 'Effects of iron oxide nanoparticle labeling on human endothelial cells'
PDF found: 22776829.pdf
/home/pdf_remove123456/U5TWM8XC.pdf already exists.

Processing

In [None]:
## Get PDF file list and IDs

# Collect the PDFs in pdf_folder. Only regular files ending in ".pdf" are kept so that
# stray files (hidden files, partial downloads) do not become bogus document ids, and
# the list is sorted because os.listdir() returns entries in arbitrary order.
pdf_files = sorted(
    os.path.join(pdf_folder, file_name)
    for file_name in os.listdir(pdf_folder)
    if file_name.lower().endswith(".pdf")
    and os.path.isfile(os.path.join(pdf_folder, file_name))
)

print(pdf_files)

# Document id = file name without directory and without the .pdf extension.
pdf_ids = [os.path.splitext(os.path.basename(file_path))[0] for file_path in pdf_files]
print(pdf_ids)

['/home/pdf_remove123456/WZ62GXAT.pdf', '/home/pdf_remove123456/UK8SV8XM.pdf', '/home/pdf_remove123456/JST2FHKJ.pdf', '/home/pdf_remove123456/KT5U49ZB.pdf', '/home/pdf_remove123456/95KXTZVR.pdf', '/home/pdf_remove123456/8A3RUDZD.pdf', '/home/pdf_remove123456/DUR86VJI.pdf', '/home/pdf_remove123456/U5TWM8XC.pdf', '/home/pdf_remove123456/I3MS5H2K.pdf', '/home/pdf_remove123456/9843ENP3.pdf', '/home/pdf_remove123456/W5DHTC2Z.pdf']
['WZ62GXAT', 'UK8SV8XM', 'JST2FHKJ', 'KT5U49ZB', '95KXTZVR', '8A3RUDZD', 'DUR86VJI', 'U5TWM8XC', 'I3MS5H2K', '9843ENP3', 'W5DHTC2Z']


# 5. Claude 3.5 Sonnet with LangChain

In [None]:
from langchain.embeddings import OpenAIEmbeddings
# Placeholder — replace with a real OpenAI API key before running.
OPENAI_API_KEY="openai_api_key"
# Embedding model used both to build the FAISS indexes and to load them for retrieval.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model ="text-embedding-3-large")

  warn_deprecated(


In [None]:
## Generate embeddings & save as files

In [None]:

from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFium2Loader
import os


error_files = []

def get_pdf_text(file_pafth):
    """Build and save a FAISS index for one PDF unless it is already on disk.

    The index folder is "/home/claude_em24080199/<file stem>_claude". Any exception is
    printed and the file path is appended to the module-level ``error_files`` list, so
    one bad PDF does not stop the batch.

    Args:
        file_pafth: Path of the PDF file to index.

    Returns:
        None.
    """
    try:
        # File stem (no directory, no extension) names the index folder.
        base_name = os.path.splitext(os.path.basename(file_pafth))[0]
        print(base_name)

        em_path = "/home/claude_em24080199/" + base_name + "_claude"  # where the index is saved
        if os.path.exists(em_path):
            print(f"{em_path} already exists")
            return

        # Load the PDF and cut it into overlapping chunks.
        pages = PyPDFium2Loader(file_pafth).load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
        chunks = splitter.split_documents(pages)

        # Embed the chunks and persist the resulting index.
        FAISS.from_documents(chunks, embeddings).save_local(em_path)
        print(f"{em_path} is generated.")
    except Exception as e:
        # Report the problem, remember the file, and let the caller continue.
        print(f"Error processing {file_pafth}: {e}")
        error_files.append(file_pafth)

# Run get_pdf_text for all pdf_files
for file_path in pdf_files:
    get_pdf_text(file_path)  # failures are collected in error_files rather than raised

WZ62GXAT




/home/claude_em24080199/WZ62GXAT_claude is generated.
UK8SV8XM
/home/claude_em24080199/UK8SV8XM_claude is generated.
JST2FHKJ
/home/claude_em24080199/JST2FHKJ_claude is generated.
KT5U49ZB
/home/claude_em24080199/KT5U49ZB_claude is generated.
95KXTZVR
/home/claude_em24080199/95KXTZVR_claude is generated.
8A3RUDZD
/home/claude_em24080199/8A3RUDZD_claude is generated.
DUR86VJI
/home/claude_em24080199/DUR86VJI_claude is generated.
U5TWM8XC
/home/claude_em24080199/U5TWM8XC_claude is generated.
I3MS5H2K
/home/claude_em24080199/I3MS5H2K_claude is generated.
9843ENP3
/home/claude_em24080199/9843ENP3_claude is generated.
W5DHTC2Z
/home/claude_em24080199/W5DHTC2Z_claude is generated.


In [None]:

# Summarise the files whose processing failed, if any.
if not error_files:
    print("No errors encountered.")
else:
    print("The following files encountered errors:")
    for failed_path in error_files:
        print(failed_path)

No errors encountered.


In [None]:
# Added for the last cell: report the wall-clock time elapsed since notebook_start_time was set.
# NOTE(review): further cells follow this one, so their run time is not included — confirm intended.
notebook_end_time = time.time()
print(f"total notebook running time : {notebook_end_time - notebook_start_time}sec")

노트북 전체 실행 시간: 55.63834095001221초


In [None]:
from langchain_anthropic.chat_models import ChatAnthropic
from langchain.vectorstores import FAISS
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough


# Chat model used for every question; temperature=0 is passed to the model.
# NOTE(review): the api_key placeholder is named "VOYAGE_API_KEY" although the client is
# ChatAnthropic — presumably an Anthropic API key belongs here; confirm before running.
chat = ChatAnthropic(model="claude-3-5-sonnet-20240620", api_key="VOYAGE_API_KEY",
                     temperature=0)

In [None]:
from langchain.callbacks import get_openai_callback

# Callback objects captured per call, keyed by an Asia/Seoul timestamp string.
openai_cb = {}

def get_answer(doc_id, q, parser):
    """Answer one question about one document with retrieval-augmented QA.

    Loads the FAISS index saved for ``doc_id``, retrieves 10 chunks, and runs a
    'stuff' RetrievalQA chain with the shared ``template`` prompt, whose
    ``format_instructions`` slot is filled from ``parser``.

    Args:
        doc_id: Document id; the index is read from
            "/home/claude_em24080199/<doc_id>_claude".
        q: Question text sent as the chain's 'query'.
        parser: Output parser providing ``get_format_instructions()``.

    Returns:
        The chain output; callers read its 'result' entry.
    """
    index_dir = "/home/claude_em24080199/" + doc_id + "_claude"
    vector_store = FAISS.load_local(index_dir, embeddings, allow_dangerous_deserialization=True)

    qa_prompt = PromptTemplate(
        template=template,
        input_variables=['context', 'question'],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=chat,
        chain_type='stuff',
        retriever=vector_store.as_retriever(search_kwargs={'k': 10}),
        return_source_documents=True,
        chain_type_kwargs={'prompt': qa_prompt},
    )

    with get_openai_callback() as cb:
        output = qa_chain({'query': q})
        # Second-resolution timestamp identifies this call in openai_cb.
        stamp = datetime.datetime.now(timezone('Asia/Seoul')).strftime("%Y%m%d_%H%M%S")
        openai_cb[stamp] = cb
    return output

In [None]:
# Ask the material-identification question (q_mat) once per PDF id and keep the raw answer text.
output_parser = CommaSeparatedListOutputParser()
mats_all_paper = {pdf_id: get_answer(pdf_id, q_mat, output_parser)['result'] for pdf_id in pdf_ids}

  warn_deprecated(


In [None]:
# Display the raw answer text per PDF id.
mats_all_paper

{'WZ62GXAT': '{"response": "nano-TiO2 (TiO2)"}',
 'UK8SV8XM': '{\n  "nanoparticles": "Citrate coated iron oxide (Fe3O4), PEG400DMA coated iron oxide (Fe3O4)"\n}',
 'JST2FHKJ': 'IONPs (Fe3O4)',
 'KT5U49ZB': 'Ferumoxtran-10 (Fe3O4)',
 '95KXTZVR': 'MgNPs-Fe3O4 (Fe3O4)',
 '8A3RUDZD': 'Fe3O4-PEI-FI-HA6K (Fe3O4), Fe3O4-PEI-FI-HA31K (Fe3O4)',
 'DUR86VJI': '{\n  "nanoparticles": "Al2O3 (Al2O3), CuO (CuO), CeO2 (CeO2), Co3O4 (Co3O4), CoO (CoO), Cr2O3 (Cr2O3), Fe2O3 (Fe2O3), Fe3O4 (Fe3O4), Gd2O3 (Gd2O3), HfO2 (HfO2), In2O3 (In2O3), Mn2O3 (Mn2O3), Ni2O3 (Ni2O3), NiO (NiO), SiO2 (SiO2), SnO2 (SnO2), TiO2 (TiO2), Y2O3 (Y2O3), Yb2O3 (Yb2O3), ZnO (ZnO), ZrO2 (ZrO2)"\n}',
 'U5TWM8XC': '{\n  "nanoparticles": "INOPS (Fe2O3), INOPS-PLL (Fe2O3)"\n}',
 'I3MS5H2K': '{\n  "nanoparticles": "MNP core (Fe3O4), MNP@PEG POSS (Fe3O4), MNP@OctaTMA POSS (Fe3O4)"\n}',
 '9843ENP3': '{\n  "nanoparticles": "SPIO (Fe3O4), SPIO@SiO2 (Fe3O4), SPIO@SiO2-NH2 (Fe3O4), SPIO@dextran (Fe3O4)"\n}',
 'W5DHTC2Z': 'SPIO nanocomposit

# material data extracted

In [None]:
# OutputFixingParser

from langchain.output_parsers import OutputFixingParser
def unwrap_json_answer(raw):
    """Return the comma-separated text hidden inside a JSON-wrapped answer.

    Some answers arrive as a JSON object such as ``{"nanoparticles": "A (X), B (X)"}``
    instead of a plain list. Splitting that text on commas leaves braces, keys and
    quotes glued to the first and last material names, so the JSON wrapper is removed
    first. Text that is not valid JSON is returned unchanged.

    Args:
        raw: Answer text returned by the model.

    Returns:
        A plain comma-separated string.
    """
    import json  # local import: this cell runs before the notebook's own `import json`

    try:
        payload = json.loads(raw.strip())
    except (AttributeError, TypeError, ValueError):
        return raw

    if isinstance(payload, str):
        return payload
    if isinstance(payload, dict):
        parts = list(payload.values())
    elif isinstance(payload, list):
        parts = payload
    else:
        # A bare number / boolean / null carries no material names to unwrap.
        return raw

    flat = []
    for part in parts:
        if isinstance(part, (list, tuple)):
            flat.extend(str(p) for p in part)
        else:
            flat.append(str(part))
    return ", ".join(flat)


fixing_parser = OutputFixingParser.from_llm(parser = output_parser, llm = chat)

# PDF id -> list of material names, parsed from the unwrapped answer text.
mats_all_paper_fix = {
    pdf_id: fixing_parser.parse(unwrap_json_answer(answer))
    for pdf_id, answer in mats_all_paper.items()
}

mats_all_paper_fix

{'WZ62GXAT': ['{"response": "nano-TiO2 (TiO2)"}'],
 'UK8SV8XM': ['{\n  "nanoparticles": "Citrate coated iron oxide (Fe3O4)',
  'PEG400DMA coated iron oxide (Fe3O4)"\n}'],
 'JST2FHKJ': ['IONPs (Fe3O4)'],
 'KT5U49ZB': ['Ferumoxtran-10 (Fe3O4)'],
 '95KXTZVR': ['MgNPs-Fe3O4 (Fe3O4)'],
 '8A3RUDZD': ['Fe3O4-PEI-FI-HA6K (Fe3O4)', 'Fe3O4-PEI-FI-HA31K (Fe3O4)'],
 'DUR86VJI': ['{\n  "nanoparticles": "Al2O3 (Al2O3)',
  'CuO (CuO)',
  'CeO2 (CeO2)',
  'Co3O4 (Co3O4)',
  'CoO (CoO)',
  'Cr2O3 (Cr2O3)',
  'Fe2O3 (Fe2O3)',
  'Fe3O4 (Fe3O4)',
  'Gd2O3 (Gd2O3)',
  'HfO2 (HfO2)',
  'In2O3 (In2O3)',
  'Mn2O3 (Mn2O3)',
  'Ni2O3 (Ni2O3)',
  'NiO (NiO)',
  'SiO2 (SiO2)',
  'SnO2 (SnO2)',
  'TiO2 (TiO2)',
  'Y2O3 (Y2O3)',
  'Yb2O3 (Yb2O3)',
  'ZnO (ZnO)',
  'ZrO2 (ZrO2)"\n}'],
 'U5TWM8XC': ['{\n  "nanoparticles": "INOPS (Fe2O3)', 'INOPS-PLL (Fe2O3)"\n}'],
 'I3MS5H2K': ['{\n  "nanoparticles": "MNP core (Fe3O4)',
  'MNP@PEG POSS (Fe3O4)',
  'MNP@OctaTMA POSS (Fe3O4)"\n}'],
 '9843ENP3': ['{\n  "nanoparticles": 

In [None]:
# Pchem info. extraction

In [None]:
from dataclasses import dataclass

@dataclass
class gpt_responses:
    """Container for the per-question answers of one material in one PDF.

    NOTE(review): the only use visible in this notebook is a commented-out line in the
    Tox extraction cell — confirm whether the class is still needed.
    """
    # Presumably the Zotero attachment key of the PDF — TODO confirm.
    pdf_id: str
    # Presumably the material label the questions were asked about — TODO confirm.
    mat_name: str
    # One field per physicochemical question; each shares its name with an answer schema class.
    pchem_product_info: str
    pchem_mat_synthesis: str
    pchem_core_size: str
    pchem_hydrodynamic_size: str
    pchem_surface_charge: str
    pchem_surface_area: str

In [None]:
import json
import textwrap
import langchain_core.output_parsers


def get_sub_answers(mat, q, key, data_class):
    """Ask one structured question about a document and return the answer as a one-row frame.

    Args:
        mat: Material label. Accepted for call compatibility; it is not used in the
            function body (callers embed the material in ``q`` themselves).
        q: Question text passed to ``get_answer``.
        key: Document id; selects the index to query and fills the 'key' column.
        data_class: Answer schema class; it is given to ``PydanticOutputParser`` and its
            instances must provide ``to_dict()``.

    Returns:
        A one-row DataFrame (index "0") with a leading 'key' column followed by the
        fields returned by ``to_dict()``.
    """
    output_parser = PydanticOutputParser(pydantic_object=data_class)
    ans = get_answer(key, q, output_parser)

    # The fixing parser wraps the schema parser and has the chat model available for repairs.
    fixing_parser = OutputFixingParser.from_llm(parser = output_parser, llm = chat)
    py = fixing_parser.parse(ans['result'])

    pchem_df = pd.DataFrame.from_records(py.to_dict(), index = ["0"])
    pchem_df.insert(0, 'key', key)
    print(pchem_df)

    return pchem_df

#i = "TiO2 P25–70 nano-TiO2"
#q = "please pull out material information of " + i + " in the document."
#tmp = get_sub_answers(i, q, "95ASP9SZ", pchem_product_info)

In [None]:
# (topic phrase used in the question, answer schema class) for each physicochemical query.
PCHEM_QUERIES = [
    ("material", pchem_product_info),
    ("material synthesis", pchem_mat_synthesis),
    ("core size", pchem_core_size),
    ("hydrodynamic size", pchem_hydrodynamic_size),
    ("surface charge", pchem_surface_charge),
    ("surface area", pchem_surface_area),
]

all_pchem_dfs = []
all_gpt_responses = []
error_files = []

for key, value in mats_all_paper_fix.items():
    try:
        # One output row per material named in the paper.
        for i in value:
            print(key + ": " + i)

            pchem_df = None
            for topic, data_class in PCHEM_QUERIES:
                q = "please pull out " + topic + " information of " + i + " in the document."
                pchem_df_add = get_sub_answers(i, q, key, data_class)
                # Every answer frame has exactly one row sharing the same 'key' value.
                if pchem_df is None:
                    pchem_df = pchem_df_add
                else:
                    pchem_df = pd.merge(pchem_df, pchem_df_add, on='key')

            # .get(): a missing title must not discard the answers just collected.
            pchem_df.insert(0, 'ref', pdf_dict.get(key))

            all_pchem_dfs.append(pchem_df)

    except Exception as e:
        # Record the failing document and move on to the next one.
        print(f"Error processing {key}: {e}")
        error_files.append(key)

# Combine the per-material rows; pd.concat raises on an empty list, so fall back to an
# empty frame when no material could be processed.
if all_pchem_dfs:
    all_pchem_df = pd.concat(all_pchem_dfs, ignore_index=True)
else:
    all_pchem_df = pd.DataFrame(columns=['ref', 'key'])


WZ62GXAT: {"response": "nano-TiO2 (TiO2)"}
        key          mat_name
0  WZ62GXAT  nano-TiO2 (TiO2)
        key           mat_synthesis
0  WZ62GXAT  Commercially available
        key mat_core_size
0  WZ62GXAT         20-50
        key mat_hydrodynamic_size
0  WZ62GXAT                 368.1
        key mat_surface_charge
0  WZ62GXAT               None
        key mat_surface_area
0  WZ62GXAT             None
UK8SV8XM: {
  "nanoparticles": "Citrate coated iron oxide (Fe3O4)
        key                           mat_name
0  UK8SV8XM  Citrate coated iron oxide (Fe3O4)
        key mat_synthesis
0  UK8SV8XM   Synthesized
        key mat_core_size
0  UK8SV8XM            10
        key mat_hydrodynamic_size
0  UK8SV8XM                    62
        key mat_surface_charge
0  UK8SV8XM               None
        key mat_surface_area
0  UK8SV8XM             None
UK8SV8XM: PEG400DMA coated iron oxide (Fe3O4)"
}
        key                             mat_name
0  UK8SV8XM  PEG400DMA coated iron 

In [None]:
# Print the documents for which an error occurred.
if not error_files:
    print("No errors encountered.")
else:
    print("The following files encountered errors:")
    for failed_key in error_files:
        print(failed_key)

No errors encountered.


# Pchem data extraction result

In [None]:
# Display the combined physicochemical table.
all_pchem_df

Unnamed: 0,ref,key,mat_name,mat_synthesis,mat_core_size,mat_hydrodynamic_size,mat_surface_charge,mat_surface_area
0,19922763,WZ62GXAT,nano-TiO2 (TiO2),Commercially available,20-50,368.1,,
1,24094173.pdf,UK8SV8XM,Citrate coated iron oxide (Fe3O4),Synthesized,10,62,,
2,24094173.pdf,UK8SV8XM,PEG400DMA coated iron oxide (Fe3O4),Synthesized,,198,,
3,24788586.pdf,JST2FHKJ,IONPs (Fe3O4),Synthesized,7-9,86,40 (pH: 2); -25 (pH: 10); 0 (pH: 6.8),
4,17178155.pdf,KT5U49ZB,Ferumoxtran-10 (Fe3O4),"Commercially available (Sinerem®, Guerbet, Fra...",5,,,
5,23892599.pdf,95KXTZVR,MgNPs-Fe3O4 (Fe3O4),Commercially available,10,60-100 (Solvent: suspension); 100-120 (Concent...,-30--40 (pH: 10),100-120
6,24462358.pdf,8A3RUDZD,Fe3O4-PEI-FI-HA6K (Fe3O4),Synthesized,15.6±3.4,190.5 (Solvent: water),16.3,
7,24462358.pdf,8A3RUDZD,Fe3O4-PEI-FI-HA31K (Fe3O4),Synthesized,16.1±2.9,217.1,21.9,
8,22502734.pdf,DUR86VJI,Al2O3 (Al2O3),"Commercially available (Al2O3 NPs, 10 nm)",14.7±5.2,260.4±16.9 (Solvent: BEGM); 230.5±6.6 (Solvent...,7.4 (pH: PZZP),
9,22502734.pdf,DUR86VJI,CuO (CuO),Synthesized,12.8±3.4,305.3±5.6 (Solvent: BEGM); 313.8±4.9 (Solvent:...,7.9,


# save extracted Pchem data

In [None]:
import datetime
from pytz import timezone

# Timestamp in Asia/Seoul local time, embedded in the export file name.
now = datetime.datetime.now(timezone('Asia/Seoul')).strftime("%Y%m%d_%H%M%S")

# Write the Pchem table to an Excel file inside the output folder.
o = os.path.join(output_folder, "".join(("pchem_gtp_output_", now, ".xlsx")))
all_pchem_df.to_excel(o)

# The message is Korean for "saved as <path>".
print(o, "로 저장되었습니다.", sep="")

output_HEY20241019-Eunyong Ha-Claude 3.5 sonnet-Ha1/pchem_gtp_output_20241020_005245.xlsx로 저장되었습니다.


# Tox data extraction

In [None]:
all_tox_dfs = []
all_gpt_responses = []  # nothing is appended in this cell; kept so the name stays defined

# The same question is asked for every material, so build it once.
q = "please pull out cytotoxicity information in the document."

# Presumably mats_all_paper_fix maps a paper key to the materials found in
# that paper (verify against the cell that builds it). One cytotoxicity
# query is issued per (paper key, material) pair.
for key, materials in mats_all_paper_fix.items():
    for material in materials:
        print(key)  # progress marker: one line per query

        tox_df = get_sub_answers(material, q, key, tox_info)

        # Collect the per-query tox DataFrame for the final concat.
        all_tox_dfs.append(tox_df)

# Combine all tox DataFrames into a single DataFrame.
if all_tox_dfs:
    all_tox_df = pd.concat(all_tox_dfs, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # table so the following cells still run when nothing was extracted.
    print("No cytotoxicity results were extracted.")
    all_tox_df = pd.DataFrame(columns=['key'])

# Attach the source reference of each paper key (None when the key is unknown).
all_tox_df['ref'] = all_tox_df['key'].map(pdf_dict.get)

# NOTE(review): the recorded output has no material column, so a paper with
# several materials yields rows that cannot be told apart. Consider adding a
# material identifier to each tox_df before appending.

WZ62GXAT
        key cell_assay cell_classification     cell_organ cell_species  \
0  WZ62GXAT        MTT              Cancer  Adrenal gland          Rat   

  cell_type  
0      PC12  
UK8SV8XM
        key                 cell_assay cell_classification  \
0  UK8SV8XM  Live/Dead Viability Assay      Normal; Cancer   

                          cell_organ                   cell_species  \
0  Fibroblast (NIH 3T3); Lung (A549)  Mouse (NIH 3T3); Human (A549)   

       cell_type  
0  NIH 3T3; A549  
UK8SV8XM
        key                 cell_assay cell_classification  \
0  UK8SV8XM  Live/Dead Viability Assay      Normal; Cancer   

                          cell_organ                   cell_species  \
0  Fibroblast (NIH 3T3); Lung (A549)  Mouse (NIH 3T3); Human (A549)   

       cell_type  
0  NIH 3T3; A549  
JST2FHKJ
        key cell_assay cell_classification  cell_organ cell_species cell_type
0  JST2FHKJ   MTT; LDH              Normal  Fibroblast        Mouse   NIH 3T3
KT5U49ZB
        ke

In [None]:
# Display the combined cytotoxicity (tox) extraction table.
all_tox_df

Unnamed: 0,key,cell_assay,cell_classification,cell_organ,cell_species,cell_type,ref
0,WZ62GXAT,MTT,Cancer,Adrenal gland,Rat,PC12,19922763
1,UK8SV8XM,Live/Dead Viability Assay,Normal; Cancer,Fibroblast (NIH 3T3); Lung (A549),Mouse (NIH 3T3); Human (A549),NIH 3T3; A549,24094173.pdf
2,UK8SV8XM,Live/Dead Viability Assay,Normal; Cancer,Fibroblast (NIH 3T3); Lung (A549),Mouse (NIH 3T3); Human (A549),NIH 3T3; A549,24094173.pdf
3,JST2FHKJ,MTT; LDH,Normal,Fibroblast,Mouse,NIH 3T3,24788586.pdf
4,KT5U49ZB,Neutral Red; MTT,Normal,Monocyte-macrophage,Human,HMM,17178155.pdf
5,95KXTZVR,LDH,Cancer,Lung,Human,A549,23892599.pdf
6,8A3RUDZD,MTT,Cancer,Brain (U87MG); Cervix (Hela),Human,U87MG; Hela,24462358.pdf
7,8A3RUDZD,MTT,Cancer,Brain (U87MG); Cervix (Hela),Human,U87MG; Hela,24462358.pdf
8,DUR86VJI,MTS; ATP; LDH,Normal,Bronchial tube (BEAS-2B); Myeloid (RAW 264.7),Human (BEAS-2B); Mouse (RAW 264.7),BEAS-2B; RAW 264.7,22502734.pdf
9,DUR86VJI,MTS; ATP; LDH,Normal,Bronchial tube (BEAS-2B); Myeloid (RAW 264.7),Human (BEAS-2B); Mouse (RAW 264.7),BEAS-2B; RAW 264.7,22502734.pdf


# Save extracted Tox data

In [None]:
import datetime
from pytz import timezone

# Timestamp in Asia/Seoul local time, embedded in the export file name.
now = datetime.datetime.now(timezone('Asia/Seoul')).strftime("%Y%m%d_%H%M%S")

# Write the tox table to an Excel file inside the output folder and report the path.
o = os.path.join(output_folder, "".join(("tox_gtp_output_", now, ".xlsx")))
all_tox_df.to_excel(o)
print("saved as ", o, ".", sep="")

output_HEY20241019-Eunyong Ha-Claude 3.5 sonnet-Ha1/tox_gtp_output_20241020_005451.xlsx로 저장되었습니다.


In [None]:
# Report the wall-clock time elapsed since notebook_start_time was recorded.
# (Original note: "add to the last cell".)
notebook_end_time = time.time()
elapsed_seconds = notebook_end_time - notebook_start_time
print(f"total running time: {elapsed_seconds}sec")

노트북 전체 실행 시간: 676.0091426372528초


# Tokens and cost

In [None]:
# One row per entry of openai_cb: its key goes into "Date Time" and the
# value's total_tokens / total_cost attributes into the numeric columns.
cost_columns = ["Date Time", "Total tokens", "Total cost ($)"]
cost_list = [
    {
        "Date Time": key,
        "Total tokens": value.total_tokens,
        "Total cost ($)": value.total_cost,
    }
    for key, value in openai_cb.items()
]

# Passing the columns explicitly keeps the frame well-formed even when
# openai_cb is empty (otherwise the totals lookup below raises KeyError).
cost_df = pd.DataFrame(cost_list, columns=cost_columns)

# Sum each column on its own: summing both numeric columns in one call
# returns a single float Series, which turns the token total (and, after
# the later concat, the whole token column) into floats.
sums = {
    "Total tokens": cost_df["Total tokens"].sum(),
    "Total cost ($)": cost_df["Total cost ($)"].sum(),
}
sums_dict = {"Date Time": "Total", "Total tokens": sums["Total tokens"], "Total cost ($)": sums["Total cost ($)"]}

In [None]:
# Append the summary row. Any summary row left by a previous run of this cell
# is dropped first, so re-running the cell does not stack duplicate totals.
cost_rows = cost_df[cost_df["Date Time"] != sums_dict["Date Time"]]

# ignore_index gives the summary row its own index label instead of reusing 0.
cost_df = pd.concat([cost_rows, pd.DataFrame([sums_dict])], ignore_index=True)
cost_df

Unnamed: 0,Date Time,Total tokens,Total cost ($)
0,20241020_004434,0.0,0.0
1,20241020_004436,0.0,0.0
2,20241020_004437,0.0,0.0
3,20241020_004439,0.0,0.0
4,20241020_004441,0.0,0.0
...,...,...,...
280,20241020_005440,0.0,0.0
281,20241020_005443,0.0,0.0
282,20241020_005446,0.0,0.0
283,20241020_005451,0.0,0.0


In [None]:
import datetime
from pytz import timezone

# Timestamp in Asia/Seoul local time, embedded in the export file name.
now = datetime.datetime.now(timezone('Asia/Seoul')).strftime("%Y%m%d_%H%M%S")

# Write the token/cost table to an Excel file inside the output folder and report the path.
o = os.path.join(output_folder, "".join(("token_and_cost_", now, ".xlsx")))
cost_df.to_excel(o)
print("saved as ", o, ".", sep="")

output_HEY20241019-Eunyong Ha-Claude 3.5 sonnet-Ha1/token_and_cost_20241020_005451.xlsx로 저장되었습니다.


In [None]:
# Report the wall-clock time elapsed since notebook_start_time was recorded.
# (Original note: "add to the last cell".)
notebook_end_time = time.time()
elapsed_seconds = notebook_end_time - notebook_start_time
print(f"total running time: {elapsed_seconds}sec")

노트북 전체 실행 시간: 676.5923361778259초
