In [1]:
from os import listdir, environ
from sys import path

from science_parse_api.api import parse_pdf
from pathlib import Path

import pandas as pd

import json
import ast
import nltk

import openai
from openai.error import RateLimitError, InvalidRequestError
import backoff

In [2]:
# Read OpenAI credentials from the environment (never hard-code them).
# Fall back to whatever the openai module already holds, so an unset
# OPEN_AI_* variable does not overwrite a value the library loaded itself
# with None.
openai.organization = environ.get('OPEN_AI_ORG', openai.organization)
openai.api_key = environ.get('OPEN_AI_API_KEY', openai.api_key)

# Fail loudly here instead of with an opaque authentication error deep
# inside the first API call.
if not openai.api_key:
    print('Warning: no OpenAI API key found; set OPEN_AI_API_KEY before calling the API.')

In [3]:
def parse_paragraphs(directory = './SeNp_research_articles/'):
    """Parse every PDF in `directory` and return one row per non-empty section.

    Each PDF is sent to a Science Parse server at http://127.0.0.1:8080 via
    `parse_pdf`. The parsed documents are flattened so that every section
    with non-empty text becomes its own row.

    Parameters
    ----------
    directory : str
        Folder containing the PDF files. A trailing slash is optional.

    Returns
    -------
    pandas.DataFrame
        Columns: 'file', 'id', 'title', 'year', 'authors', 'sections'.
        'sections' holds one section dict per row. The index is reset to
        0..n-1.

    Notes
    -----
    Files without a `.pdf` extension are ignored. Files for which
    `parse_pdf` does not return a dict are skipped with a warning
    (assumes the client signals failure that way -- TODO confirm).
    """
    # Local import: the notebook only imports `listdir` and `environ` from os.
    from os.path import join

    # sci parse host
    host = 'http://127.0.0.1'
    port = '8080'

    # `join` tolerates a missing trailing slash; sorting makes the row order
    # reproducible because `listdir` order is arbitrary.
    pdf_files = sorted(
        join(directory, name)
        for name in listdir(directory)
        if name.lower().endswith('.pdf')
    )

    parsed_by_file = {}
    for fl in pdf_files:
        parsed = parse_pdf(host, Path(fl).resolve(), port=port)
        if not isinstance(parsed, dict):
            print(f"Warning: no parse result for {fl}; skipping")
            continue
        parsed_by_file[fl] = parsed

    df = pd.DataFrame({
        'file': list(parsed_by_file.keys()),
        'content': list(parsed_by_file.values()),
    })
    for field in ('id', 'title', 'year', 'authors', 'sections'):
        # Bind `field` as a default so each lambda reads its own key.
        df[field] = df['content'].map(lambda doc, field=field: doc.get(field))

    output_columns = ['file', 'id', 'title', 'year', 'authors', 'sections']
    df_paragraphs = df[output_columns].explode('sections').reset_index(drop=True)

    # Documents with missing/empty `sections` explode to NaN, and a section
    # may lack text; keep only real section dicts with non-empty text.
    has_text = df_paragraphs['sections'].map(
        lambda section: isinstance(section, dict) and bool(section.get('text'))
    ).astype(bool)

    df_paragraphs = df_paragraphs[has_text].reset_index(drop=True)

    return df_paragraphs

In [4]:
@backoff.on_exception(backoff.expo, RateLimitError)
def text_cleaning(text, model="gpt-4-turbo-preview"):
    """Ask an OpenAI chat model to remove parsing artefacts from a text section.

    Parameters
    ----------
    text : str
        Section text as extracted from a parsed article.
    model : str
        Chat model name passed to `openai.ChatCompletion.create`.

    Returns
    -------
    str
        The model's reply. If the API rejects the request with
        `InvalidRequestError`, the error is printed and `text` is returned
        unchanged so a long batch run is not aborted by one bad section.

    Notes
    -----
    `RateLimitError` is retried with exponential backoff by the decorator.
    Prints '*' after each successful call as a progress marker.
    """
    messages = []
    
    messages.append({"role": "system", "content":'''# who you are: you are helpful assistant, expert in biochemistry.
    # your task: remove artefacts from not so good parsed scientific text sections.
    # you will take input in format: section text
    # you will respond: section text cleaned from artefacts'''})

    messages.append({"role": "user", "content": text})

    try:
        response = openai.ChatCompletion.create(model=model, messages=messages, max_tokens=4095, temperature=1)
    except InvalidRequestError as e:
        print(f"Error: {e}")
        # There is no response to read in this branch; continuing would
        # raise UnboundLocalError. Fall back to the uncleaned text instead.
        return text

    chat_message = response['choices'][0]['message']['content']
    print('*', end = "")
    return chat_message

In [5]:
@backoff.on_exception(backoff.expo, RateLimitError)
def label_paragraph_fourth_prompt(paragraph, model="gpt-4-turbo-preview", temperature=1):
    """Label a paragraph "YES"/"NO": does it give an exact Se nanoparticle synthesis recipe?

    Parameters
    ----------
    paragraph : str
        Paragraph text; it is wrapped in '#####' delimiters before sending.
    model : str
        Chat model name passed to `openai.ChatCompletion.create`.
    temperature : float
        Sampling temperature for the request. The default of 1 keeps the
        previous behaviour; a lower value gives more repeatable labels.

    Returns
    -------
    str
        The model's reply with surrounding whitespace removed and converted
        to upper case, so that " Yes\\n" and "YES" compare equal downstream.

    Notes
    -----
    `RateLimitError` is retried with exponential backoff by the decorator.
    Prints each label without a newline as a progress trace.
    """
    messages = []
    
    messages.append({"role": "system", "content":'''# who you are: you are helpful assistant, expert in chemistry.
    # your task: label paragraph that describes exact synthesis recipe for synthesis of Se nanoparticles with labels "YES" or "NO".
    # hint: paragraph usually contains parameters such as mass of substances used, concentrations of solutions, reaction temperatures, etc. 
    # you will take input in format: #####<paragraph>#####
    # you will respond: <label>'''})

    messages.append({"role": "user", "content": '#####'+paragraph+'#####'})
    
    response = openai.ChatCompletion.create(model=model,
                                            messages=messages, max_tokens=4095, temperature=temperature)

    # Normalize the label: the reply is free text, so stray whitespace or
    # different casing would otherwise break exact comparisons with 'YES'.
    chat_message = response['choices'][0]['message']['content'].strip().upper()
    print(chat_message, end = "")
    return chat_message

In [6]:
# Parse all article PDFs into one row per non-empty section.
df_paragraphs = parse_paragraphs('./SeNp_research_articles/')

In [7]:
# Keep only sections whose text has more than 30 space-separated tokens.
df_paragraphs = df_paragraphs.loc[
    lambda frame: frame['sections'].map(
        lambda section: len(section.get('text').split(' '))
    ) > 30
].copy()

In [8]:
# Clean every section's text with gpt-3.5-turbo; one API call per row.
df_paragraphs['section_text_cleaned'] = (
    df_paragraphs['sections']
    .map(lambda section: section.get('text'))
    .map(lambda raw_text: text_cleaning(text=raw_text, model='gpt-3.5-turbo'))
)

*****************************************************************************************************************************

In [10]:
# Inspect the cleaned section texts.
df_paragraphs.loc[:, 'section_text_cleaned']

0      Colloids and Surfaces B: Biointerfaces 132 (20...
2      Department of Biotechnology, University of Ver...
3      Biogenic metal/metalloid nanoparticles of micr...
4      Selenium nanoparticles (SeNPs) of 10–400 nm in...
5      Biosynthesis of SeNPs by bacterial strains\nFi...
                             ...                        
135    The release of versatile drugs from PCL micros...
136    The release of SeNP from PCL was measured in i...
137    Although selenium nanoparticles (SeNPs) are no...
138    From the application aspect, degradation of bi...
139    This study was supported by the Ministry of Ed...
Name: section_text_cleaned, Length: 125, dtype: object

In [11]:
# Label each section using its raw (uncleaned) text.
df_paragraphs['label_raw'] = df_paragraphs['sections'].map(
    lambda section: label_paragraph_fourth_prompt(section.get('text'))
)

NONONONONONONOYESNONONONONONONOYESNONONONONONOYESNONOYESNONONONONONONOYESNONONONONONONONONONONONONONONONONONONONONONONONONONOYESYESNONONONONOYESNONONONONONONONONONONONONOYESNONONOYESNONONONONOYESNOYESNONONONONONOYESNONONONONONONONONONONOYESNONONONONONONONONONONONO

In [13]:
# Label each section again, this time using the cleaned text.
df_paragraphs['label_cleaned'] = (
    df_paragraphs['section_text_cleaned'].map(label_paragraph_fourth_prompt)
)

YESNONONONONONOYESNONONONONONONOYESNONONONONONOYESNONOYESNONONONONONONOYESNONONONONONONONONONONONONONONONONONONONONONONONONONOYESYESNONONONONOYESNONONONONONONONONONONONONOYESNONONOYESNONONONONONONOYESNONONONONONOYESNONONONONONONONONONONONONONONONONONONONONONONONO

In [22]:
# Check labeled paragraphs

# for i in df_paragraphs[df_paragraphs['label_raw'] == 'YES'].sample(frac=.1)\
# ['sections'].map(lambda x: x.get('text')).tolist():
#     print(i)
#     print('**********************\n')

In [30]:
# Persist the labeled paragraphs for later analysis.
df_paragraphs.to_pickle('SeNp_synth_paragraphs_labeled.pkl')