# 0. Enter username (an output folder will be created under this name)

In [None]:
import time
# Timestamp (seconds since the epoch) taken when the notebook starts running.
notebook_start_time = time.time()

In [None]:
# Placeholder to be replaced by the user; the output folder is named "output_" + user_name.
user_name = "user_name"

# 1. Basic prompt template of LangChain environment

In [None]:
# Base prompt text for the extraction queries. It carries three placeholders:
# {context}, {question} and {format_instructions}.
template = """
You are a data analyst specializing in nano-toxicity.
Your task is extracting data about the physicochemical and cytotoxic properties of nanomaterials used in the research paper.
Use the following pieces of instruction and format to answer the user's questions.
Don't be lazy.
Be very strict and answer the question very accurate and scientific manner.
If data is not specified in the paper, just answer 'None'.
Do not use a full sentence.
Context: {context}
Question: {question}
Format instruction: {format_instructions}
"""

# 2. Material data extraction prompt

In [None]:
# Question text asking which nanoparticles were characterized in a paper.
# The answer is requested as 'Nanomaterial name (Nanomaterial type)' entries,
# with worked examples of the expected format.
q_mat = """
Just answer with following format: 'Nanomaterial name (Nanomaterial type)'. Do not use a full sentence.
Question: What nanoparticles were used in the characterization experiments in the research paper? Refer to the following descriptions.
Description 1. Provide the nanoparticles that were used for core size(i.e. primary size, nominal size) or hydrodynamic size(i.e. Z-average size) or surface charge(i.e. zeta potential) or surface area in the research paper.
Description 2. Provide information on all the nanoparticles actually used in the authors' experiments, Do not include nanoparticles used in other papers (i.e., mentioned in the reference) in your answer.
Description 3. If there were multiple nanoparticles of the same type, the author would have named them differently. What did the author name them?
Description 4. Write the name of the nanoparticle followed by its type in chemical formula within parentheses.
Description 5. Several examples of the format.
-Written in the document: Al2O3, ZnO, SiO2, Fe2O3 (normal form), Response: Al2O3 (Al2O3), ZnO (ZnO), SiO2 (SiO2), Fe2O3 (Fe2O3).
-Written in the document: T10, T100 (labeled differently according to 'size'), Response: T10 (TiO2), T100 (TiO2).
-Written in the document: ZnAc, ZnChl (ZnO-Acetate, ZnO-Chloride; labeled differently according to 'chemical'), Response: ZnAc (ZnO), ZnChl (ZnO).
-Written in the document: TiO2-PVP, TiO2-Citrate (labeled differently according to 'coating'), Response: TiO2-PVP (TiO2), TiO2-Citrate (TiO2).
-Written in the document: P25, ST-21 (labeled differently according to 'manufacturer'), Response: P25 (TiO2), ST-21 (TiO2).
-Written in the document: CuO-USA, CuO-UK (labeled differently according to 'location'), Response: CuO-USA (CuO), CuO-UK (CuO).
Description 6. Do not omit the information.
Description 7. Do not write 'NP' or 'nanoparticles' followed by nanoparticles'name
"""

# 3. PChem/Tox data extraction prompt
- Pchem
- Tox

In [None]:
import pprint
from typing import Any, Dict
import datetime
from pytz import timezone

import pandas as pd
from langchain.output_parsers import PydanticOutputParser
#from langchain.output_parsers import PandasDataFrameParser

from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator

# Answer model for the material-name query: a single string field whose
# description carries the full instructions for naming the nanoparticles.
class pchem_product_info(BaseModel):
    mat_name: str = Field(description="Question: What nanoparticles were used in the characterization experiments in the research paper? Refer to the following descriptions. Description 0. Do not include metal nanoparticles such as Silver (Ag) or Gold (Au). 1. Provide the nanoparticles that were used for core size(i.e. primary size, nominal size) or hydrodynamic size(i.e. Z-average size) or surface charge(i.e. zeta potential) or surface area in the research paper. Description 2. Provide information on all the nanoparticles actually used in the authors' experiments. Do not include nanoparticles used in other papers (i.e., mentioned in the reference) in your answer. Description 3. If there were multiple nanoparticles of the same type, the author would have named them differently. What did the author name them? Description 4. Write the name of the nanoparticle followed by its type in chemical formula within parentheses.   Description 5. Several examples of the format. -Written in the document: Al2O3 NPs, ZnO, SiO2, Fe2O3 (normal form), Response: Al2O3 (Al2O3), ZnO (ZnO), SiO2 (SiO2), Fe2O3 (Fe2O3). -Written in the document: T10, T100 (labeled differently according to 'size'), Response: T10 (TiO2), T100 (TiO2). -Written in the document: ZnAc, ZnChl (ZnO-Acetate, ZnO-Chloride; labeled differently according to 'chemical'), Response: ZnAc (ZnO), ZnChl (ZnO). -Written in the document: TiO2-PVP, TiO2-Citrate (labeled differently according to 'coating'), Response: TiO2-PVP (TiO2), TiO2-Citrate (TiO2). -Written in the document: P25, Nanofilament (labeled differently according to 'manufacturer'), Response: P25 (TiO2), Nanofilament (TiO2). -Written in the document: CuO-USA, CuO-UK (labeled differently according to 'location'), Response: CuO-USA (CuO), CuO-UK (CuO). Description 6. Do not omit the information. Description 7. Do not write 'NP' or 'nanoparticles' followed by nanoparticles'name")

    def to_dict(self):
        """Return the single field as a plain dict keyed by its field name."""
        return dict(mat_name=self.mat_name)

# Answer model for the synthesis query: whether the material was synthesized
# by the researchers or bought commercially.
class pchem_mat_synthesis(BaseModel):
    mat_synthesis: str = Field(description="The following nanoparticles were synthesized by researcher? or commercially available? If nanoparticles were synthesized, just answer 'Synthesized'. If nanoparticles were commercially available, just answer 'Commercially available (with cat# or product # in parentheses).'")

    def to_dict(self):
        """Return the single field as a plain dict keyed by its field name."""
        return dict(mat_synthesis=self.mat_synthesis)

# Answer model for the core-size query: the value, the measurement method and
# where the value came from.
class pchem_core_size(BaseModel):
    mat_core_size: str = Field(description="What is the value of core size or core size distribution (i.e. primary size, nominal size) of each material? Refer to the following format to answer. 0. TEM, SEM, AFM size. 1. Do not include unit. 2. Do not use a full sentence. 3. If there is no value, assign 'None'. 4. Do not include calculated size. 5. If the values are represented as a range, they are represented in the following format: value1-value2. ex) 50-100. 6. If the values are represented with an error rate '±', they are represented in the following format: value±error rate. ex) 35±10.")
    mat_core_size_measurement_method: str = Field(description="What measurement method is used for measure core size (i.e., primary size)? 1. transmission electron microscopy (TEM) should be selected first, followed by scanning electron microscopy (SEM), atomic force microscopy (AFM), and X-ray diffraction (XRD) in cases where multiple methods are available. 2. If the measurement mehod is not present type 'None'. 3. please use abbreviation form such as TEM, SEM, XRD.")
    mat_core_size_source: str = Field(description="Is the core size data sourced from the author's own experiment, manufacturer specifications, a reference paper, or is it not provided? Just answer four types. Format: Experiment, Manufacturer, Reference, Not specified")

    def to_dict(self):
        """Return the three core-size fields as a plain dict, in declaration order."""
        names = (
            "mat_core_size",
            "mat_core_size_measurement_method",
            "mat_core_size_source",
        )
        return {name: getattr(self, name) for name in names}

# Answer model for the hydrodynamic-size query: the value(s), the measurement
# method and where the value came from.
# The prompt text previously contained the fused "pH.7." and a dangling
# "If t " fragment between items 7 and 8; both are repaired below.
class pchem_hydrodynamic_size(BaseModel):
    mat_hydrodynamic_size: str = Field(description="What is the value of hydrodynamic size (i.e., Z-average size, size in media) of each material? Please provide details on the sizes under various conditions or in different media. 0. DLS size. 1. Do not include unit. 2. Do not include an explanation about hydrodynamic size. Just give me the value. 3. Do not use a full sentence. 4. If there is no value, assign 'None'. 5. If multiple values exist for each material, divide the value using ';', add parentheses after the value, and write the conditions in the parentheses. 6. Please refer to the following format when you write down the conditions in parentheses. Format: [Classification: detailed conditions]. ex) 50 (Solvent: water); 100 (Solvent: medium), 30 (Time: 2 h); 50 (Time: 24 h). Kind of Classification: Solvent, Time, Concentration, pH. 7. If there are both water and culture media conditions, only the culture media condition is selected. However, if there are only conditions taken in water, select conditions taken in water. 8. If the values are represented as a range, they are represented in the following format: value1-value2. ex) 50-100. 9. If the values are represented with an error rate '±', they are represented in the following format: value±error rate. ex) 35±10.")
    mat_hydrodynamic_size_measurement_method: str = Field(description="What measurement method is used for measure hydrodynamic size (i.e., aggregates size, Z-average size)?  1. Dynamic light scattering (DLS) or Nanoparticle tracking analysis (NTA) should be selected first, followed by FCS in cases where multiple methods are available. 2. If the measurement mehod is not present type 'None'. 3. please use abbreviation form such as DLS, NTA.")
    mat_hydrodynamic_size_source: str = Field(description="Is the hydrodynamic size data sourced from the author's own experiment, manufacturer specifications, a reference paper, or is it not provided? Just answer four types. Format: Experiment, Manufacturer, Reference, Not specified")

    def to_dict(self):
        """Return the three hydrodynamic-size fields as a plain dict."""
        return {
            "mat_hydrodynamic_size": self.mat_hydrodynamic_size,
            "mat_hydrodynamic_size_measurement_method": self.mat_hydrodynamic_size_measurement_method,
            "mat_hydrodynamic_size_source": self.mat_hydrodynamic_size_source,
        }

# Answer model for the surface-charge (zeta potential) query: the value(s),
# the measurement method and where the value came from.
# The prompts were copied from the hydrodynamic-size model and still referred
# to "sizes" / "hydrodynamic size"; they now name surface charge consistently.
class pchem_surface_charge(BaseModel):
    mat_surface_charge: str = Field(description="What is the value of surface charge (i.e., Zeta potential) of each material? Please provide details on the surface charge under various conditions or in different media. 1. Do not include unit. 2. Do not include an explanation about surface charge. Just give me the value. 3. Do not use a full sentence. 4. If there is no value, assign 'None'. 5. If multiple values exist for each material, divide the value using ';', add parentheses after the value, and write the conditions in the parentheses. 6. Please refer to the following format when you write down the conditions in parentheses. Format: [Classification: detailed conditions]. ex) -10 (Solvent: water); 21 (Solvent: medium), 30 (Time: 2 h); 50 (Time: 24 h). Kind of Classification: Solvent, Time, Concentration, pH. 7. If the values are represented as a range, they are represented in the following format: value1-value2. ex) 50-100. 8. If the values are represented with an error rate '±', they are represented in the following format: value±error rate. ex) 35±10. 9. If there are both water and culture media conditions, only the culture media condition is selected. However, if there are only conditions taken in water, select conditions taken in water.")
    mat_surface_charge_measurement_method: str = Field(description="What measurement method is used for measure surface charge (i.e., Zeta potential)?  1. IF surface charge data exist, just answer Zeta potential. 2. Format to anwser: Zeta potential.")
    mat_surface_charge_source: str = Field(description="Is the surface charge data sourced from the author's own experiment, manufacturer specifications, a reference paper, or is it not provided? Just answer four types. Format: Experiment, Manufacturer, Reference, Not specified")

    def to_dict(self):
        """Return the three surface-charge fields as a plain dict."""
        return {
            "mat_surface_charge": self.mat_surface_charge,
            "mat_surface_charge_measurement_method": self.mat_surface_charge_measurement_method,
            "mat_surface_charge_source": self.mat_surface_charge_source,
        }

# Answer model for the surface-area query: the value, the measurement method
# and where the value came from.
# The source prompt was copied from the hydrodynamic-size model and asked
# about "hydrodynamic size data"; it now asks about surface area data.
class pchem_surface_area(BaseModel):
    mat_surface_area: str = Field(description="What is the value of surface area of each material? Refer to the following format to answer. 1. Do not include unit. 2. Do not use a full sentence. 3. If there is no value, assign 'None'. 4. Do not include calculated size. 5. If the values are represented as a range, they are represented in the following format: value1-value2. ex) 50-100. 6. If the values are represented with an error rate '±', they are represented in the following format: value±error rate. ex) 35±10.")
    mat_surface_area_measurement_method: str = Field(description="What measurement method is used for measure surface area (i.e., SSA)?  1. Brunauer-Emmett-Teller (BET) method is most common measurement method. 3. please use abbreviation form such as BET.")
    mat_surface_area_source: str = Field(description="Is the surface area data sourced from the author's own experiment, manufacturer specifications, a reference paper, or is it not provided? Just answer four types. Format: Experiment, Manufacturer, Reference, Not specified")

    def to_dict(self):
        """Return the three surface-area fields as a plain dict."""
        return {
            "mat_surface_area": self.mat_surface_area,
            "mat_surface_area_measurement_method": self.mat_surface_area_measurement_method,
            "mat_surface_area_source": self.mat_surface_area_source,
        }

In [None]:
# Answer model for the cytotoxicity query: cell line, species, organ of
# origin, viability assay and normal/cancer classification.
# The cell_organ prompt was copied from cell_species and asked for "the name
# of the cell species"; it now asks for the organ, matching its own example.
class tox_info(BaseModel):
    cell_type: str = Field(description="What cell lines were used in the cell viability assay? 1. Do not use a full sentence. 2. Please provide the abbreviation form of the cell name. ex) A549, THP-1, MRC-5, EA.hy926, BEAS-2B, HaCaT, L929, U87, etc. 3. If multiple cell lines were used, divide the value using ';'. ex) A549; THP-1; MRC-5.")
    cell_species: str = Field(description="What species the cell line originated from? 1. Do not use a full sentence. 2. Please refer to the following form. ex) Human, Rabbit, Mouse, Pig, etc. 3. If multiple cell lines were used and their species are different,  divide the value using ';' and write the name of the cell species followed by its cell type within parentheses. ex) Human (A549); Mouse (L929).")
    cell_organ: str = Field(description="What organ the cell line originated from? 1. Do not use a full sentence. 2. Please refer to the following form. ex) Lung, Breast, Kidney, Brain, Liver, Bronchial tube, Prostate, Spleen, etc. 3. If multiple cell lines were used and their organs are different, divide the value using ';' and write the name of the organ followed by its cell type within parentheses. ex) Lung (A549); Fibroblast (L929).")
    cell_assay: str = Field(description="Which cell viability assays or cytotoxicity assay were conducted in this paper? 1. Do not use a full sentence. 2. Only reference the following cell viability assays, and if none, assign as 'None'. 2. Please refer to the following form. ex) CCK-8, MTT, MTS, WST, Alamar blue, CellTiter-Glo, Neutral Red, NRU, Trypan blue, XTT, Calcein-AM, BrdU, EdU, Propidium iodide, Hoechst33342 assays. 3. If multiple cell viability assays were used, divide the value using ';'. ex) MTT; MTS; CCK-8.")
    cell_classification: str = Field(description="Please determine the cell type: whether it is a normal cell or a cancer cell? 1. Do not use a full sentence. 2. Just answer, 'Normal' or 'Cancer'.")

    def to_dict(self):
        """Return the five cell-related fields as a plain dict."""
        return {
            "cell_type": self.cell_type,
            "cell_species": self.cell_species,
            "cell_organ": self.cell_organ,
            "cell_assay": self.cell_assay,
            "cell_classification": self.cell_classification,
        }

# 4. Connect LangChain with Zotero
- Zotero collection configuration
- Set directory of output and zotero pdf download
- Extract data from Zotero
- Get PDF file list and IDs

In [None]:
import time
# Timestamp (seconds since the epoch) taken just before the Zotero and embedding cells run.
embedding_start_time = time.time()

In [None]:
##Zotero collection configuration

import os
from os.path import join, basename, splitext
import subprocess
from glob import glob
from shutil import copy
from random import shuffle, seed

from pyzotero import zotero

# Connect to the Zotero library. Replace the placeholders with real values:
# zotero.Zotero(library_id, library_type, zotero_api_key)
zot = zotero.Zotero(library_id, 'library_type', 'zotero_api_key')

# Map each collection name to the collection record returned by the API.
collections = {c['data']['name']: c for c in zot.collections()}

# Only the names are needed here, in the order the dict holds them.
collection_names = list(collections)

collection_names

['remove123456',
 'remove12345',
 'remove123',
 'PchemScore_1',
 'oxide af filter5',
 'oxide af filter3',
 'oxide af filter2',
 'pmfilter',
 'oxide af a',
 'oxide af filter1',
 'oxide af filter',
 'WOS 8',
 'WOS 7',
 'WOS 6',
 'WOS 5',
 'WOS 4',
 'WOS 3',
 'WOS 1',
 'WOS 2',
 'WOS 0',
 'PUBMED 9',
 'PUBMED 8',
 'PUBMED 7',
 'PUBMED 6',
 'PUBMED 2',
 'PUBMED 1',
 'PUBMED 5',
 'PUBMED 4',
 'PUBMED 3',
 'PUBMED 0',
 'testa',
 'new pdf',
 'Oxide af 2018 WOS',
 'Oxide af 2018 PUBMED',
 'Ha 1a-1',
 'Metal oxide after 2018 wos',
 'Metal oxide after 2018 pubmed',
 'test1',
 'Metal External dataset',
 'ddd',
 'Trinh_test',
 'External dataset',
 'Ha 1d',
 'Ha 1c',
 'Ha 1b',
 'Ha 1a',
 'Ha 3b',
 '2025_DB',
 'TrinhB',
 'Ha 3a',
 'Ha 2',
 'TrinhA',
 'training set oxide 3',
 'training set metal 2',
 'training set oxide 2',
 'training set metal',
 'test set for chatGPT',
 'traning set for chatGPT',
 'Ha IIIa',
 'Ha II',
 'Ha I',
 'Xiao Metal',
 'Ha Oxide 2018',
 'Xiao Oxide',
 'Trinh Metal 2018',
 'k

In [None]:
# Name of the Zotero collection to process (copy and paste it from the list
# of collection names).
selected_collection = "enter_collection_name"

# PDFs are stored in a folder whose name ends with the collection name, with
# spaces replaced by underscores.
pdf_folder = "/home/pdf_pdf{}".format(selected_collection.replace(" ", "_"))

# The message is Korean for "PDF files will be saved to <pdf_folder>."
print(f"PDF 파일들은 {pdf_folder}에 저장됩니다.")

PDF 파일들은 /home/pdf_pdfHa_3b에 저장됩니다.


In [None]:
## Set up the PDF folder and the output folder

import os

def create_folder(folder_path):
    """Create ``folder_path`` (and any missing parents) unless the path exists.

    Prints a message telling whether the folder was created or was already
    there. Returns None.
    """
    if os.path.exists(folder_path):
        print(f"Folder already exists at {folder_path}")
        return
    os.makedirs(folder_path)
    print(f"Folder created at {folder_path}")

# The output folder is named after the user; make sure both it and the PDF
# folder exist before anything is written to them.
output_folder = f"output_{user_name}"
for folder in (output_folder, pdf_folder):
    create_folder(folder)

Folder already exists at output_HEY20241118-Eunyong Ha-reproducibility
Folder already exists at /home/pdf_pdfHa_3b


In [None]:
## extract collection data from zotero

import os
from os.path import join, basename, splitext
import subprocess
from glob import glob
from shutil import copy
from random import shuffle, seed

from pyzotero import zotero
# Connect to the Zotero library. Replace the placeholders with real values:
# zotero.Zotero(library_id, library_type, zotero_api_key)
zot = zotero.Zotero(library_id, 'library_type', 'zotero_api_key')

# Look up the selected collection by name and fetch every item in it.
collections = {c['data']['name']: c for c in zot.collections()}
collection = collections[selected_collection]
key = collection['key']

items = list(zot.everything(zot.collection_items(key)))

# Map item key -> title. Items without a 'title' field get a placeholder
# instead of raising KeyError.
pdf_dict = {
    item['data']['key']: item['data'].get('title', 'No Title')
    for item in items
}

# Download the single PDF attached to each non-attachment item.
for item in items:
    title = item['data'].get('title', 'No Title')
    print(f"\nProcessing item: {title}")

    # Attachment entries are not processed themselves; only the children of
    # non-attachment items are inspected.
    if item['data'].get('itemType') == 'attachment':
        print(f"Skipped non-PDF entry: {title}")
        continue

    try:
        children = list(zot.children(item['key']))
        print(f"Found {len(children)} children for item '{title}'")

        # Check for PDFs among the children
        pdfs = [c for c in children if c['data'].get('contentType') == 'application/pdf']

        if not children:
            print(f"Missing documents for: {title}")
        elif not pdfs:
            print(f"No PDFs found for: {title}")
        elif len(pdfs) > 1:
            print(f"Too many PDFs for: {title}")
        else:
            doc = pdfs[0]
            print(f"PDF found: {doc['data']['filename']}")

            # The file is stored under the attachment key, not its original name.
            pdf_name = f"{doc['key']}.pdf"
            pdf_file_path = os.path.join(pdf_folder, pdf_name)

            # Check if the PDF already exists, and download if it doesn't
            if not os.path.exists(pdf_file_path):
                # dump() takes the file name and the target folder separately;
                # passing the full path as the name only worked while
                # pdf_folder was an absolute path.
                zot.dump(doc['key'], pdf_name, pdf_folder)
                print(f"{pdf_file_path} is downloaded.")
            else:
                print(f"{pdf_file_path} already exists.")

    except Exception as e:
        # One failing item must not stop the remaining downloads.
        print(f"Error processing item {title}: {e}")


Processing item: Michael Berg 등 - 2013 - Comparative cytological responses of lung epitheli.pdf
Skipped non-PDF entry: Michael Berg 등 - 2013 - Comparative cytological responses of lung epitheli.pdf

Processing item: Hsiao 그리고 Huang - 2011 - Effects of various physicochemical characteristics.pdf
Skipped non-PDF entry: Hsiao 그리고 Huang - 2011 - Effects of various physicochemical characteristics.pdf

Processing item: Reactive oxygen species mediated DNA damage in human lung alveolar epithelial (A549) cells from exposure to non-cytotoxic MFI-type zeolite nanoparticles
Found 1 children for item 'Reactive oxygen species mediated DNA damage in human lung alveolar epithelial (A549) cells from exposure to non-cytotoxic MFI-type zeolite nanoparticles'
PDF found: 23103338.pdf
/home/pdf_pdfHa_3b/EXQ8I7IY.pdf already exists.

Processing item: 23103338
Skipped non-PDF entry: 23103338

Processing item: Comparative cytological responses of lung epithelial and pleural mesothelial cells following in vit

In [None]:
## Get PDF file list and IDs

# Keep only regular files whose name ends in ".pdf"; any other file in the
# folder would otherwise be handed on as if it were a PDF.
pdf_files = [
    os.path.join(pdf_folder, file_name)
    for file_name in os.listdir(pdf_folder)
    if file_name.lower().endswith(".pdf")
    and os.path.isfile(os.path.join(pdf_folder, file_name))
]

print(pdf_files)

# The PDF id is the file name without its directory and extension.
pdf_ids = [os.path.splitext(os.path.basename(file_path))[0] for file_path in pdf_files]
print(pdf_ids)

['/home/pdf_pdfHa_3b/6Z5K86RQ.pdf', '/home/pdf_pdfHa_3b/T43GGRMX.pdf', '/home/pdf_pdfHa_3b/HVPYYSLW.pdf', '/home/pdf_pdfHa_3b/5I96BQWG.pdf', '/home/pdf_pdfHa_3b/YNQITCKF.pdf', '/home/pdf_pdfHa_3b/FFWAKGE3.pdf', '/home/pdf_pdfHa_3b/HEJLYCVK.pdf', '/home/pdf_pdfHa_3b/Q4GSNA3A.pdf', '/home/pdf_pdfHa_3b/USNLP6MR.pdf', '/home/pdf_pdfHa_3b/5BHP8KES.pdf', '/home/pdf_pdfHa_3b/95ASP9SZ.pdf', '/home/pdf_pdfHa_3b/SKCFUKGT.pdf', '/home/pdf_pdfHa_3b/ASCVGIJU.pdf', '/home/pdf_pdfHa_3b/U27TLPF7.pdf', '/home/pdf_pdfHa_3b/HAKJAMZY.pdf', '/home/pdf_pdfHa_3b/EXQ8I7IY.pdf']
['6Z5K86RQ', 'T43GGRMX', 'HVPYYSLW', '5I96BQWG', 'YNQITCKF', 'FFWAKGE3', 'HEJLYCVK', 'Q4GSNA3A', 'USNLP6MR', '5BHP8KES', '95ASP9SZ', 'SKCFUKGT', 'ASCVGIJU', 'U27TLPF7', 'HAKJAMZY', 'EXQ8I7IY']


In [None]:
# Show the ids one per line.
for pdf_id in pdf_ids:
    print(pdf_id)

6Z5K86RQ
T43GGRMX
HVPYYSLW
5I96BQWG
YNQITCKF
FFWAKGE3
HEJLYCVK
Q4GSNA3A
USNLP6MR
5BHP8KES
95ASP9SZ
SKCFUKGT
ASCVGIJU
U27TLPF7
HAKJAMZY
EXQ8I7IY


# 5. ChatGPT with LangChain

In [None]:
from langchain.embeddings import OpenAIEmbeddings
# Placeholder to be replaced with a real OpenAI API key.
# NOTE(review): consider loading the key from an environment variable so it is
# not stored in the notebook.
OPENAI_API_KEY="open_ai_api_key"
# Embedding client configured with the "text-embedding-3-large" model.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model="text-embedding-3-large")

  warn_deprecated(


In [None]:
## Generate embeddings & save as files

In [None]:

from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFium2Loader
import os


# Paths of the PDFs whose embedding step raised an exception.
error_files = []

def get_pdf_text(file_pafth, embed_dir="/home/workspace/embed/openai_em1118-2/"):
    """Build and save a FAISS index for one PDF unless it already exists.

    The index is saved under ``embed_dir`` in a folder named
    ``<PDF base name>_openai``. When that path already exists nothing is
    loaded or embedded again.

    Args:
        file_pafth: Path of the PDF file to embed.
        embed_dir: Directory that holds the saved indexes. The default is the
            location that used to be hard-coded here.

    Returns:
        None. Any exception is printed and ``file_pafth`` is appended to the
        module-level ``error_files`` list so the caller can carry on with the
        next file.
    """
    try:
        # Base name of the file without directory and extension.
        base_name = os.path.splitext(os.path.basename(file_pafth))[0]
        print(base_name)

        em_path = os.path.join(embed_dir, base_name + "_openai")
        if os.path.exists(em_path):
            print(f"{em_path} already exists")
            return

        # Load the PDF document
        documents = PyPDFium2Loader(file_pafth).load()

        # Split text into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
        chunks = text_splitter.split_documents(documents)

        # Embed the chunks into a FAISS index and save it locally
        db = FAISS.from_documents(chunks, embeddings)
        db.save_local(em_path)
        print(f"{em_path} is generated.")
    except Exception as e:
        # Report the failure and remember the file so processing can continue.
        print(f"Error processing {file_pafth}: {e}")
        error_files.append(file_pafth)

# Build (or reuse) the saved index of every PDF in pdf_files.
for pdf_path in pdf_files:
    get_pdf_text(pdf_path)

6Z5K86RQ




/home/workspace/embed/openai_em1118-2/6Z5K86RQ_openai is generated.
T43GGRMX
/home/workspace/embed/openai_em1118-2/T43GGRMX_openai is generated.
HVPYYSLW
/home/workspace/embed/openai_em1118-2/HVPYYSLW_openai is generated.
5I96BQWG
/home/workspace/embed/openai_em1118-2/5I96BQWG_openai is generated.
YNQITCKF
/home/workspace/embed/openai_em1118-2/YNQITCKF_openai is generated.
FFWAKGE3
/home/workspace/embed/openai_em1118-2/FFWAKGE3_openai is generated.
HEJLYCVK
/home/workspace/embed/openai_em1118-2/HEJLYCVK_openai is generated.
Q4GSNA3A
/home/workspace/embed/openai_em1118-2/Q4GSNA3A_openai is generated.
USNLP6MR
/home/workspace/embed/openai_em1118-2/USNLP6MR_openai is generated.
5BHP8KES
/home/workspace/embed/openai_em1118-2/5BHP8KES_openai is generated.
95ASP9SZ
/home/workspace/embed/openai_em1118-2/95ASP9SZ_openai is generated.
SKCFUKGT
/home/workspace/embed/openai_em1118-2/SKCFUKGT_openai is generated.
ASCVGIJU
/home/workspace/embed/openai_em1118-2/ASCVGIJU_openai is generated.
U27TLPF7

In [None]:

# Summarize the embedding run: list the failed files, or say there were none.
if not error_files:
    print("No errors encountered.")
else:
    print("The following files encountered errors:")
    for failed_path in error_files:
        print(failed_path)

No errors encountered.


In [None]:
# Report how long the stage that started at embedding_start_time took.
# (The previous version printed notebook_end_time, which is never defined.)
embedding_end_time = time.time()
print(f"embedding running time : {embedding_end_time - embedding_start_time}sec")

노트북 전체 실행 시간: 92.12493801116943초


In [None]:
# Prepare LLM (ChatGPT)

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
# Model names noted by the author:
##gpt-3.5-turbo-0125
##gpt-4o
# Chat model client, configured here with "gpt-4o".
chat = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-4o")

  warn_deprecated(


In [None]:
from langchain.callbacks import get_openai_callback

# OpenAI callback objects of the finished queries, keyed by an Asia/Seoul
# timestamp ("YYYYmmdd_HHMMSS", with a numeric suffix when two queries finish
# within the same second).
openai_cb = {}

def get_answer(doc_id, q, parser, embed_dir="/home/workspace/embed/openai_em1118-2/"):
    """Run one retrieval-QA query against the saved FAISS index of a document.

    Args:
        doc_id: Document id; the index is loaded from
            ``<embed_dir>/<doc_id>_openai``.
        q: Question text passed to the chain as the query.
        parser: Output parser whose ``get_format_instructions()`` text fills
            the ``{format_instructions}`` placeholder of the prompt.
        embed_dir: Directory that holds the saved indexes. The default is the
            location that used to be hard-coded here.

    Returns:
        The chain output; callers read the answer text from its 'result' key.
        Source documents are requested via ``return_source_documents=True``.

    Side effects:
        Stores the OpenAI callback object of the call in ``openai_cb``.
    """
    # NOTE(review): allow_dangerous_deserialization=True is acceptable only for
    # index files this notebook wrote itself; do not point embed_dir at
    # untrusted data.
    db = FAISS.load_local(
        os.path.join(embed_dir, doc_id + "_openai"),
        embeddings,
        allow_dangerous_deserialization=True,
    )
    prompt = PromptTemplate(
        template=template,
        input_variables=['context', 'question'],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )
    retriever = db.as_retriever(search_kwargs={'k': 10})
    qa_llm = RetrievalQA.from_chain_type(
        llm=chat,
        chain_type='stuff',
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={'prompt': prompt},
    )
    with get_openai_callback() as cb:
        # invoke() replaces the deprecated direct call on the chain object.
        output = qa_llm.invoke({'query': q})

        stamp = datetime.datetime.now(timezone('Asia/Seoul')).strftime("%Y%m%d_%H%M%S")
        # Two queries finishing within the same second used to overwrite each
        # other's record; add a suffix only when the key is already taken.
        cb_key = stamp
        suffix = 1
        while cb_key in openai_cb:
            cb_key = f"{stamp}_{suffix}"
            suffix += 1
        openai_cb[cb_key] = cb
    return output

In [None]:
output_parser = CommaSeparatedListOutputParser()

# Ask the material question once per paper. Each paper is handled on its own
# so that one failing query does not discard the answers already collected;
# papers that fail are reported and left out of the dict.
mats_all_paper = {}
for pdf_id in pdf_ids:
    try:
        mats_all_paper[pdf_id] = get_answer(pdf_id, q_mat, output_parser)['result']
    except Exception as e:
        print(f"Error processing {pdf_id}: {e}")


  warn_deprecated(


In [None]:
mats_all_paper

{'6Z5K86RQ': 'SiO2 (SiO2)',
 'T43GGRMX': 'TiO2 (TiO2), CeO2 (CeO2), ZnO (ZnO), SiO2 (SiO2)',
 'HVPYYSLW': 'MTI5 (TiO2), P25 (TiO2), Nanofilament (TiO2), Hombitan (TiO2), Vive Nano Titania (TiO2)',
 '5I96BQWG': 'ZnO nanorods (ZnO)',
 'YNQITCKF': 'P60 (PLGA), P100 (PLGA), P200 (PLGA), T10 (TiO2), T20 (TiO2), T100 (TiO2)',
 'FFWAKGE3': 'Fe2O3 (Fe2O3), In2O3 (In2O3), ZnO (ZnO), ZrO2 (ZrO2), Al2O3 (Al2O3)',
 'HEJLYCVK': '10SiO2 (SiO2), 150SiO2 (SiO2), 500SiO2 (SiO2)',
 'Q4GSNA3A': 'Al2O3 (Al2O3), CeO2 (CeO2), CoO (CoO), Cr2O3 (Cr2O3), Fe2O3 (Fe2O3), Gd2O3 (Gd2O3), HfO2 (HfO2), Ni2O3 (Ni2O3), SiO2 (SiO2), SnO2 (SnO2), Y2O3 (Y2O3), Yb2O3 (Yb2O3), ZrO2 (ZrO2), CuO (CuO), Co3O4 (Co3O4), Fe3O4 (Fe3O4), Sb2O3 (Sb2O3), TiO2 (TiO2), WO3 (WO3), ZnO (ZnO)',
 'USNLP6MR': 'Si50 (SiO2), Si500 (SiO2)',
 '5BHP8KES': 'Bulk ZnO (ZnO), Non Ionic ZnO (ZnO), Cationic ZnO (ZnO), Anionic ZnO (ZnO)',
 '95ASP9SZ': 'P25 (TiO2), TiO2P25–300 (TiO2), TiO2P25–130 (TiO2), TiO2P25–70 (TiO2)',
 'SKCFUKGT': 'ZnO (ZnO), Fe2

# material data extracted

In [None]:
# OutputFixingParser

from langchain.output_parsers import OutputFixingParser
# OutputFixingParser built from the comma-separated list parser and the chat model.
fixing_parser = OutputFixingParser.from_llm(parser = output_parser, llm = chat)

# Parse every paper's raw answer string with the fixing parser.
mats_all_paper_fix = {
    pdf_id: fixing_parser.parse(raw_answer)
    for pdf_id, raw_answer in mats_all_paper.items()
}

mats_all_paper_fix

{'6Z5K86RQ': ['SiO2 (SiO2)'],
 'T43GGRMX': ['TiO2 (TiO2)', 'CeO2 (CeO2)', 'ZnO (ZnO)', 'SiO2 (SiO2)'],
 'HVPYYSLW': ['MTI5 (TiO2)',
  'P25 (TiO2)',
  'Nanofilament (TiO2)',
  'Hombitan (TiO2)',
  'Vive Nano Titania (TiO2)'],
 '5I96BQWG': ['ZnO nanorods (ZnO)'],
 'YNQITCKF': ['P60 (PLGA)',
  'P100 (PLGA)',
  'P200 (PLGA)',
  'T10 (TiO2)',
  'T20 (TiO2)',
  'T100 (TiO2)'],
 'FFWAKGE3': ['Fe2O3 (Fe2O3)',
  'In2O3 (In2O3)',
  'ZnO (ZnO)',
  'ZrO2 (ZrO2)',
  'Al2O3 (Al2O3)'],
 'HEJLYCVK': ['10SiO2 (SiO2)', '150SiO2 (SiO2)', '500SiO2 (SiO2)'],
 'Q4GSNA3A': ['Al2O3 (Al2O3)',
  'CeO2 (CeO2)',
  'CoO (CoO)',
  'Cr2O3 (Cr2O3)',
  'Fe2O3 (Fe2O3)',
  'Gd2O3 (Gd2O3)',
  'HfO2 (HfO2)',
  'Ni2O3 (Ni2O3)',
  'SiO2 (SiO2)',
  'SnO2 (SnO2)',
  'Y2O3 (Y2O3)',
  'Yb2O3 (Yb2O3)',
  'ZrO2 (ZrO2)',
  'CuO (CuO)',
  'Co3O4 (Co3O4)',
  'Fe3O4 (Fe3O4)',
  'Sb2O3 (Sb2O3)',
  'TiO2 (TiO2)',
  'WO3 (WO3)',
  'ZnO (ZnO)'],
 'USNLP6MR': ['Si50 (SiO2)', 'Si500 (SiO2)'],
 '5BHP8KES': ['Bulk ZnO (ZnO)',
  'Non Ionic Zn

In [None]:
# Pchem info. extraction

In [None]:
from dataclasses import dataclass

@dataclass
class gpt_responses:
    """Record with one string slot per extraction query for a material.

    NOTE(review): no instance of this class is created in the visible part of
    the notebook; confirm whether it is still needed.
    """
    # Document id and material label the responses belong to.
    pdf_id: str
    mat_name: str
    # One field per pchem_* answer model of the same name.
    pchem_product_info: str
    pchem_mat_synthesis: str
    pchem_core_size: str
    pchem_hydrodynamic_size: str
    pchem_surface_charge: str
    pchem_surface_area: str


In [None]:
import json
import textwrap
import langchain_core.output_parsers


def get_sub_answers(mat, q, key, data_class):
    """Ask one question about a document and return the answer as a one-row DataFrame.

    Args:
        mat: Material label the question is about. It is not used inside the
            function and is kept only so existing calls keep working.
        q: Question text passed on to ``get_answer``.
        key: Document id; passed to ``get_answer`` and stored in the 'key'
            column of the result.
        data_class: Pydantic model class describing the expected answer. Its
            instances must provide ``to_dict()``.

    Returns:
        pandas.DataFrame with a single row (index "0"): a leading 'key'
        column followed by the entries of ``to_dict()``.
    """
    output_parser = PydanticOutputParser(pydantic_object=data_class)
    ans = get_answer(key, q, output_parser)

    # Parse the answer text through an OutputFixingParser built from the same
    # pydantic parser and the chat model.
    fixing_parser = OutputFixingParser.from_llm(parser=output_parser, llm=chat)
    parsed = fixing_parser.parse(ans['result'])

    pchem_df = pd.DataFrame.from_records(parsed.to_dict(), index=["0"])
    pchem_df.insert(0, 'key', key)
    print(pchem_df)

    return pchem_df

#i = "TiO2 P25–70 nano-TiO2"
#q = "please pull out material information of " + i + " in the document."
#tmp = get_sub_answers(i, q, "95ASP9SZ", pchem_product_info)

In [None]:
all_pchem_dfs = []
all_gpt_responses = []
error_files = []

# (question topic, answer model) pairs asked for every material, in this order.
# The order also fixes the column order of the merged per-material row.
pchem_queries = [
    ("material", pchem_product_info),
    ("material synthesis", pchem_mat_synthesis),
    ("core size", pchem_core_size),
    ("hydrodynamic size", pchem_hydrodynamic_size),
    ("surface charge", pchem_surface_charge),
    ("surface area", pchem_surface_area),
]

for key, value in mats_all_paper_fix.items():
    try:
        # One merged row per material listed for this paper.
        for i in value:
            print(key + ": " + i)

            pchem_df = None
            for topic, data_class in pchem_queries:
                q = f"please pull out {topic} information of {i} in the document."
                pchem_df_add = get_sub_answers(i, q, key, data_class)
                # All partial frames carry the same 'key' value, so merging on
                # 'key' simply appends the new answer columns.
                if pchem_df is None:
                    pchem_df = pchem_df_add
                else:
                    pchem_df = pd.merge(pchem_df, pchem_df_add, on='key')
            pchem_df.insert(0, 'ref', pdf_dict[key])

            all_pchem_dfs.append(pchem_df)

    except Exception as e:
        # Record the paper that failed and continue with the next one; any
        # materials of this paper that were not processed yet are skipped.
        print(f"Error processing {key}: {e}")
        error_files.append(key)

# Combine all per-material rows into a single DataFrame.
if all_pchem_dfs:
    all_pchem_df = pd.concat(all_pchem_dfs, ignore_index=True)
    all_pchem_df['ref'] = all_pchem_df['key'].map(lambda x: pdf_dict[x] if x in pdf_dict else None)
else:
    # pd.concat raises ValueError on an empty list (e.g. every paper failed).
    print("No pchem data was extracted.")
    all_pchem_df = pd.DataFrame()


6Z5K86RQ: SiO2 (SiO2)
        key     mat_name
0  6Z5K86RQ  SiO2 (SiO2)
        key           mat_synthesis
0  6Z5K86RQ  Commercially available
        key mat_core_size mat_core_size_measurement_method  \
0  6Z5K86RQ     33.5±7.73                              TEM   

  mat_core_size_source  
0           Experiment  
        key       mat_hydrodynamic_size  \
0  6Z5K86RQ  120±23.8 (Solvent: medium)   

  mat_hydrodynamic_size_measurement_method mat_hydrodynamic_size_source  
0                                      DLS                   Experiment  
        key mat_surface_charge mat_surface_charge_measurement_method  \
0  6Z5K86RQ               None                                  None   

  mat_surface_charge_source  
0                      None  
        key mat_surface_area mat_surface_area_measurement_method  \
0  6Z5K86RQ      576.23±7.87                                 BET   

  mat_surface_area_source  
0              Experiment  
T43GGRMX: TiO2 (TiO2)
        key     mat_name
0

In [None]:
# Print the files that raised an error during extraction.
if not error_files:
    print("No errors encountered.")
else:
    print("The following files encountered errors:")
    print(*error_files, sep="\n")

No errors encountered.


# Pchem data extraction result

In [None]:
# Bare expression as the last line of the cell so the notebook displays the table.
all_pchem_df

Unnamed: 0,ref,key,mat_name,mat_synthesis,mat_core_size,mat_core_size_measurement_method,mat_core_size_source,mat_hydrodynamic_size,mat_hydrodynamic_size_measurement_method,mat_hydrodynamic_size_source,mat_surface_charge,mat_surface_charge_measurement_method,mat_surface_charge_source,mat_surface_area,mat_surface_area_measurement_method,mat_surface_area_source
0,Michael Berg 등 - 2013 - Comparative cytologica...,6Z5K86RQ,SiO2 (SiO2),Commercially available,33.5±7.73,TEM,Experiment,120±23.8 (Solvent: medium),DLS,Experiment,,,,576.23±7.87,BET,Experiment
1,22303956,T43GGRMX,TiO2 (TiO2),Commercially available (Aeroxide P25),29.2±10,TEM,Experiment,1.38±0.03 (Solvent: culture media); 2.3±0.2 (S...,DLS,Experiment,1.38±0.03 (pH: 7.4); 2.3±0.2 (pH: 7.4),Zeta potential,Not specified,,,Not specified
2,22303956,T43GGRMX,CeO2 (CeO2),,23.4±4,TEM,Experiment,40±3 (Solvent: medium); 1.4±0.02 (Solvent: med...,DLS,Experiment,40±3 (Solvent: medium),Zeta potential,Experiment,,,Not specified
3,22303956,T43GGRMX,ZnO (ZnO),,15.5±4,TEM,Experiment,28±4 (Solvent: medium),DLS,Experiment,28±4,Zeta potential,Not specified,,,Not specified
4,22303956,T43GGRMX,SiO2 (SiO2),Commercially available,12.0±2,TEM,Experiment,30 (Solvent: culture media); 34.9 (Solvent: wa...,DLS,Experiment,30 (pH: 7.4),Zeta potential,Not specified,,,Not specified
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,Hsiao 그리고 Huang - 2011 - Effects of various ph...,U27TLPF7,TiO2-A (TiO2),Synthesized,90-160,,Experiment,661 (Solvent: water); 1054 (Solvent: DMEM),DLS,Experiment,-40.8 (Solvent: water),Zeta potential,Not specified,38,BET,Experiment
68,24078789,HAKJAMZY,Nano-ZnO (ZnO),Commercially available,50-70,TEM,Manufacturer,936 (Solvent: serum-free DMEM); 279 (Solvent: ...,DLS,Experiment,-2.4 (Solvent: DMEM); -7.5 (Solvent: DMEM/5% F...,Zeta potential,Experiment,7.34,BET,Experiment
69,24078789,HAKJAMZY,Micro-ZnO (ZnO),Commercially available,,TEM,Manufacturer,"530 (Solvent: DMEM); 1107 (Solvent: DMEM, Seru...",DLS,Experiment,-4.0 (Solvent: DMEM); -8.2 (Solvent: DMEM/5% F...,Zeta potential,Experiment,1.81,BET,Experiment
70,23103338,EXQ8I7IY,MFI-50 (SiO2),Synthesized,50,SEM,Experiment,,,Not specified,-13.2±0.7 (Solvent: water),Zeta potential,Experiment,108.3±0.4,,Not specified


# Save extracted Pchem data

In [None]:
import datetime
from pytz import timezone

# Timestamp in Asia/Seoul time (second resolution) used in the export file name.
now = datetime.datetime.now(timezone('Asia/Seoul')).strftime("%Y%m%d_%H%M%S")
o = os.path.join(output_folder, f"pchem_gtp_output_{now}.xlsx")
all_pchem_df.to_excel(o)
print(f"{o}로 저장되었습니다.")

output_HEY20241118-Eunyong Ha-reproducibility/pchem_gtp_output_20241118_170418.xlsx로 저장되었습니다.


# Tox data extraction

In [None]:
# Show the paper-key -> material-list mapping that the extraction loops iterate over.
print(mats_all_paper_fix)

{'6Z5K86RQ': ['SiO2 (SiO2)'], 'T43GGRMX': ['TiO2 (TiO2)', 'CeO2 (CeO2)', 'ZnO (ZnO)', 'SiO2 (SiO2)'], 'HVPYYSLW': ['MTI5 (TiO2)', 'P25 (TiO2)', 'Nanofilament (TiO2)', 'Hombitan (TiO2)', 'Vive Nano Titania (TiO2)'], '5I96BQWG': ['ZnO nanorods (ZnO)'], 'YNQITCKF': ['P60 (PLGA)', 'P100 (PLGA)', 'P200 (PLGA)', 'T10 (TiO2)', 'T20 (TiO2)', 'T100 (TiO2)'], 'FFWAKGE3': ['Fe2O3 (Fe2O3)', 'In2O3 (In2O3)', 'ZnO (ZnO)', 'ZrO2 (ZrO2)', 'Al2O3 (Al2O3)'], 'HEJLYCVK': ['10SiO2 (SiO2)', '150SiO2 (SiO2)', '500SiO2 (SiO2)'], 'Q4GSNA3A': ['Al2O3 (Al2O3)', 'CeO2 (CeO2)', 'CoO (CoO)', 'Cr2O3 (Cr2O3)', 'Fe2O3 (Fe2O3)', 'Gd2O3 (Gd2O3)', 'HfO2 (HfO2)', 'Ni2O3 (Ni2O3)', 'SiO2 (SiO2)', 'SnO2 (SnO2)', 'Y2O3 (Y2O3)', 'Yb2O3 (Yb2O3)', 'ZrO2 (ZrO2)', 'CuO (CuO)', 'Co3O4 (Co3O4)', 'Fe3O4 (Fe3O4)', 'Sb2O3 (Sb2O3)', 'TiO2 (TiO2)', 'WO3 (WO3)', 'ZnO (ZnO)'], 'USNLP6MR': ['Si50 (SiO2)', 'Si500 (SiO2)'], '5BHP8KES': ['Bulk ZnO (ZnO)', 'Non Ionic ZnO (ZnO)', 'Cationic ZnO (ZnO)', 'Anionic ZnO (ZnO)'], '95ASP9SZ': ['P25 (Ti

In [None]:
all_tox_dfs = []
all_gpt_responses = []

# The question text is the same for every request, so build it once.
# NOTE(review): the question does not mention the material, so every material
# of a paper triggers the same paper-level question -- confirm this is intended.
q = "please pull out cytotoxicity information in the document."

for key, value in mats_all_paper_fix.items():
    # One request (and one result row) per material listed for this paper.
    for i in value:
        print(key)

        tox_df = get_sub_answers(i, q, key, tox_info)
        all_tox_dfs.append(tox_df)

# Combine all tox rows into a single DataFrame.
if all_tox_dfs:
    all_tox_df = pd.concat(all_tox_dfs, ignore_index=True)
    all_tox_df['ref'] = all_tox_df['key'].map(lambda x: pdf_dict[x] if x in pdf_dict else None)
else:
    # pd.concat raises ValueError on an empty list (nothing to process).
    print("No tox data was extracted.")
    all_tox_df = pd.DataFrame()

6Z5K86RQ
        key cell_assay cell_classification                  cell_organ  \
0  6Z5K86RQ       None      Normal; Cancer  Lung (Met-5A); Lung (A549)   

                   cell_species     cell_type  
0  Human (Met-5A); Human (A549)  Met-5A; A549  
T43GGRMX
        key cell_assay cell_classification  \
0  T43GGRMX        MTT              Cancer   

                                      cell_organ  \
0  Breast (MCF-7); Lung (BEAS-2B); Cervix (HeLa)   

                                   cell_species             cell_type  
0  Human (MCF-7); Human (BEAS-2B); Human (HeLa)  MCF-7; BEAS-2B; HeLa  
T43GGRMX
        key cell_assay cell_classification  \
0  T43GGRMX   MTT; LDH              Cancer   

                                          cell_organ  \
0  Breast (MCF-7); Cervix (HeLa); Bronchial tube ...   

                                        cell_species  \
0  Human (MCF-7); Human (HeLa); Human (BEAS-2B); ...   

                    cell_type  
0  MCF-7; HeLa; BEAS-2B; PC12  
T43

In [None]:
# Bare expression as the last line of the cell so the notebook displays the table.
all_tox_df

Unnamed: 0,key,cell_assay,cell_classification,cell_organ,cell_species,cell_type,ref
0,6Z5K86RQ,,Normal; Cancer,Lung (Met-5A); Lung (A549),Human (Met-5A); Human (A549),Met-5A; A549,Michael Berg 등 - 2013 - Comparative cytologica...
1,T43GGRMX,MTT,Cancer,Breast (MCF-7); Lung (BEAS-2B); Cervix (HeLa),Human (MCF-7); Human (BEAS-2B); Human (HeLa),MCF-7; BEAS-2B; HeLa,22303956
2,T43GGRMX,MTT; LDH,Cancer,Breast (MCF-7); Cervix (HeLa); Bronchial tube ...,Human (MCF-7); Human (HeLa); Human (BEAS-2B); ...,MCF-7; HeLa; BEAS-2B; PC12,22303956
3,T43GGRMX,MTT,Cancer; Normal,Breast (MCF-7); Bronchial tube (BEAS-2B),Human (MCF-7); Human (BEAS-2B),MCF-7; BEAS-2B,22303956
4,T43GGRMX,MTT,Cancer,Breast (MCF-7); Lung (BEAS-2B); Brain (PC12); ...,Human (MCF-7); Human (BEAS-2B); Rat (PC12); Hu...,MCF-7; BEAS-2B; PC12; HeLa,22303956
...,...,...,...,...,...,...,...
67,U27TLPF7,MTT,Cancer,Lung,Human,A549,Hsiao 그리고 Huang - 2011 - Effects of various ph...
68,HAKJAMZY,MTT; IL-8,Cancer,Monoblastoid,Human,THP-1,24078789
69,HAKJAMZY,MTT; IL-8,,,,,24078789
70,EXQ8I7IY,,,,,,23103338


# Tox data extraction results

In [None]:
import datetime
from pytz import timezone

# Timestamp in Asia/Seoul time (second resolution) used in the export file name.
now = datetime.datetime.now(timezone('Asia/Seoul')).strftime("%Y%m%d_%H%M%S")
o = os.path.join(output_folder, f"tox_gtp_output_{now}.xlsx")
all_tox_df.to_excel(o)
print(f"{o}로 저장되었습니다.")

output_HEY20241118-Eunyong Ha-reproducibility/tox_gtp_output_20241118_170742.xlsx로 저장되었습니다.


# Tokens and cost

In [None]:
# One record per entry of openai_cb: its key plus the token and cost totals
# read from the stored value.
cost_list = []
for key, value in openai_cb.items():
    cost_list.append(
        {
            "Date Time": key,
            "Total tokens": value.total_tokens,
            "Total cost ($)": value.total_cost,
        }
    )

cost_df = pd.DataFrame(cost_list)

# Column-wise sums over the numeric columns, used to build the "Total" row.
sums = cost_df.select_dtypes(include='number').sum()
sums_dict = {
    "Date Time": "Total",
    "Total tokens": sums["Total tokens"],
    "Total cost ($)": sums["Total cost ($)"],
}

In [None]:
# Append the "Total" row below the per-call rows, then display the table.
# Re-running this cell appends another "Total" row each time.
total_row = pd.DataFrame([sums_dict])
cost_df = pd.concat([cost_df, total_row])
cost_df

Unnamed: 0,Date Time,Total tokens,Total cost ($)
0,20241118_164948,1374.0,0.006950
1,20241118_164952,1497.0,0.007785
2,20241118_164954,1477.0,0.007785
3,20241118_164956,1425.0,0.007215
4,20241118_164958,1467.0,0.007725
...,...,...,...
516,20241118_170735,1664.0,0.008910
517,20241118_170737,1659.0,0.008835
518,20241118_170739,1438.0,0.007680
519,20241118_170741,1438.0,0.007680


In [None]:
import datetime
from pytz import timezone

# Timestamp in Asia/Seoul time (second resolution) used in the export file name.
now = datetime.datetime.now(timezone('Asia/Seoul')).strftime("%Y%m%d_%H%M%S")
o = os.path.join(output_folder, f"token_and_cost_{now}.xlsx")
cost_df.to_excel(o)
print(f"saved as {o}.")

output_HEY20241118-Eunyong Ha-reproducibility/token_and_cost_20241118_170742.xlsx로 저장되었습니다.


In [None]:
# Added to the last cell: report the time elapsed since notebook_start_time was taken.
notebook_end_time = time.time()
elapsed_seconds = notebook_end_time - notebook_start_time
print(f"total running time: {elapsed_seconds}sec")

노트북 전체 실행 시간: 1173.8525257110596초
