## Use the ChatGPT (gpt-3.5-turbo) API to perform Named Entity Recognition (NER) in a chemistry context

### Import Libraries and keys

In [4]:
import os, sys
import openai
import re
import PyPDF2
import json
import string
from dotenv import load_dotenv, find_dotenv


# Read the .env file located by find_dotenv(); the return value is discarded.
_ = load_dotenv(find_dotenv()) # read local .env file

# Optional setting: os.getenv returns None when HTTP_PROXY is not defined.
openai.proxy = os.getenv("HTTP_PROXY") # set proxy if needed
# Required setting: os.environ[...] raises KeyError when OPENAI_API_KEY is not defined.
openai.api_key  = os.environ['OPENAI_API_KEY']

### Get ready for API call

In [9]:
def get_completion_from_messages(messages, model="gpt-3.5-turbo", temperature=0, max_tokens=1024):
    """
    Send a chat conversation to the OpenAI ChatCompletion endpoint.

    input:
        messages: the list of chat messages passed through as `messages`
        model: name of the chat model to query
        temperature: sampling temperature forwarded to the API
        max_tokens: upper bound on generated tokens forwarded to the API
    output: the "content" of the message of the first returned choice
    """
    request_options = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    completion = openai.ChatCompletion.create(**request_options)
    # Only the first choice is used.
    first_choice = completion.choices[0]
    return first_choice.message["content"]

### Extract from PDF

In [5]:
# Add the project directory to the module search path at position 1.
# caution: path[0] is reserved for script path (or '' in REPL)
# NOTE(review): machine-specific absolute path — confirm it exists where this runs.
sys.path.insert(1, 'E:\\wymApp\\flask-llm-pdf-analyzer')

def extract_pages(file_path, skip_first=1, skip_last=2):
    """
    Extract the text of the inner pages of a PDF file.

    With the default arguments the first page and the last two pages are
    left out, exactly as the previously hard-coded page range did. A
    document too short to have any page left yields an empty list.

    input:
        file_path: path of the PDF file to read
        skip_first: number of leading pages to leave out (default 1)
        skip_last: number of trailing pages to leave out (default 2)
    output: a list with the extracted text of each kept page, in page order
    raises: ValueError if skip_first or skip_last is negative
    """
    # Negative counts would make range() start at a negative index and
    # silently read pages from the end of the document.
    if skip_first < 0 or skip_last < 0:
        raise ValueError("skip_first and skip_last must be non-negative")

    # Open the PDF file in binary mode
    with open(file_path, 'rb') as pdf_file:
        # Create a PDF reader object
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_count = len(pdf_reader.pages)
        # Text must be extracted while the file is still open.
        return [
            pdf_reader.pages[page_num].extract_text()
            for page_num in range(skip_first, page_count - skip_last)
        ]

#### Read PDF

In [6]:
# Path of the PDF to analyze. The backslash at the end of the first source
# line is a line continuation inside the string literal, so the file name
# continues directly with "- 2022 - He - ..." without a newline.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
pdf_path = "c:\\Users\\ywanglu\\Desktop\\Advanced Materials \
- 2022 - He - Cooperative Coupling of H2O2 Production and Organic Synthesis over a Floatable.pdf"
# List of page texts returned by extract_pages.
pages = extract_pages(pdf_path)

#### Prompt to extract chemical names from the PDF text

In [18]:
# Instructions sent to the model together with each page's text.
# The JSON template is kept strictly valid (no trailing commas) and uses the
# key "chemical_description", so that replies following it can be parsed by
# json.loads and carry consistently named keys.
prompt = """Please extract the names of chemical used in the text delimited by triple backticks.\
        Summarize each chemical in one JSON object. Do not output anything if there is no chemical name.\
        The information of the chemical can be obtained from Sigma-Aldrich\
        here is the json structure:\
        {
            "chemical_name": "the name of a single chemical",
            "chemical_description": "the description of chemical",
            "chemical_safety": "the safety information of the chemical"
        }
        """

In [19]:
# Query the model once per page and collect every JSON object it returns.
responses = []

for page_index, page_text in enumerate(pages):
    # Page text first, wrapped in triple backticks, followed by the instructions.
    user_message = f"""```{page_text}```
                    {prompt}"""
    messages = [
        {'role': 'user', 'content': user_message},
    ]
    response = get_completion_from_messages(messages)

    # Insert the comma missing between consecutive objects so that the
    # whole reply can be parsed as the elements of one JSON array.
    response = response.replace("""}\n{""" , """},\n{""")
    response = response.replace("""} \n{""" , """},\n{""")

    try:
        data = json.loads("[" + response + "]") # add [] to accord to json syntax
    except json.decoder.JSONDecodeError:
        # Best effort: a reply that is not valid JSON is skipped, but the
        # skip is reported so that missing pages do not go unnoticed.
        print(f"Skipping page index {page_index}: reply is not valid JSON",
              file=sys.stderr)
        continue

    # combine the json output
    responses.extend(data)
print(responses)

[{'chemical_name': 'titanium isopropoxide', 'chemical_description': 'Titanium isopropoxide is a chemical compound with the formula Ti(OCH(CH3)2)4. It is a colorless liquid that is used as a precursor for the synthesis of titanium dioxide nanoparticles.', 'chemical_safety': 'Titanium isopropoxide is flammable and can cause skin and eye irritation. It should be handled with care and stored in a cool, well-ventilated area.'}, {'chemical_name': 'hexadecylamine', 'chemical_description': 'Hexadecylamine is a long-chain primary amine with the formula CH3(CH2)15NH2. It is a white solid that is used as a surfactant and emulsifier in various industrial applications.', 'chemical_safety': 'Hexadecylamine is harmful if swallowed or inhaled. It can cause skin and eye irritation. It should be handled with care and stored in a cool, dry place.'}, {'chemical_name': 'ammonium hydroxide', 'chemical_description': 'Ammonium hydroxide is a solution of ammonia in water. It is a colorless liquid that is used 

### Clean the extracted data

In [16]:
def contains_punctuation(s):
    """
    Tell whether at least one character of s is listed in string.punctuation.

    input: a string
    output: True as soon as one such character is found, False otherwise
            (an empty string gives False)
    """
    return any(char in string.punctuation for char in s)
    
def remove_duplicate(responses):
    """
    Drop repeated and unusable entries from a list of chemical objects.

    An entry is kept only the first time its 'chemical_name' is seen, and
    only if that name contains no punctuation. Entries that are not dicts,
    or whose 'chemical_name' is missing or not a string, are skipped instead
    of raising KeyError/TypeError.

    input: a list of json objects
    output: a new list of json objects without duplicate chemical names,
            in their original order; the input list is not modified
    """
    seen_names = set()
    unique_entries = []
    for obj in responses:
        name = obj.get('chemical_name') if isinstance(obj, dict) else None
        if not isinstance(name, str):
            # Malformed entry: nothing to deduplicate on.
            continue
        # NOTE(review): the punctuation filter also rejects names containing
        # hyphens or commas (e.g. "2-propanol") — confirm this is intended.
        if name in seen_names or contains_punctuation(name):
            continue
        seen_names.add(name)
        unique_entries.append(obj)
    return unique_entries

### Check the results

In [17]:
# Filter the collected entries with remove_duplicate and print them as
# JSON indented by 4 spaces.
result = remove_duplicate(responses)
print(json.dumps(result, indent=4))

[
    {
        "chemical_name": "titanium isopropoxide",
        "chemical_description": "Titanium isopropoxide is a chemical compound with the formula Ti(OCH(CH3)2)4. It is a colorless liquid that is used as a precursor for the synthesis of titanium dioxide nanoparticles.",
        "chemical_safety": "Titanium isopropoxide is flammable and can cause skin and eye irritation. It should be handled with care and stored in a cool, well-ventilated area."
    },
    {
        "chemical_name": "hexadecylamine",
        "chemical_description": "Hexadecylamine is a long-chain primary amine with the formula CH3(CH2)15NH2. It is a white solid that is used as a surfactant and emulsifier in various industrial applications.",
        "chemical_safety": "Hexadecylamine is harmful if swallowed or inhaled. It can cause skin and eye irritation. It should be handled with care and stored in a cool, dry place."
    },
    {
        "chemical_name": "ammonium hydroxide",
        "chemical_description": "Am