In [1]:
from os import listdir, environ
from sys import path

import pandas as pd

import openai
from openai.error import RateLimitError, InvalidRequestError
import backoff

import json
import ast

In [2]:
openai.organization = environ.get('OPEN_AI_ORG')
openai.api_key = environ.get('OPEN_AI_API_KEY')

In [3]:
@backoff.on_exception(backoff.expo, RateLimitError)
def extract_paragraph_info_json_three(paragraph):
    messages = []
    
    messages.append({"role": "system", "content":'''# who you are: you are helpful assistant, expert in chemistry.
    # your task: extract synthesis parameters from scientific text.
    # you will take input in format: #####<paragraph>#####
    # you will respond in json format: 
    {"synthesis_approach": [<value/values (separated by comma)> or '' (empty string)], "optimization_method": [<value/values> or ''], "formulations": [<value/values> or ''], "precursor": [<value/values> or ''], "concentration_&_volume_of_precursor_solutions": [<value/values> or ''], "reduction_agents": [<value/values> or ''], "stabilizing_agent": [<value/values> or ''], "stabilizing_agent_type": [<value/values> or ''], "stabilizing_agent_concentration": [<value/values> or ''], "mixing_&_reaction_conditions": [<value/values> or ''], "mass_ratio": [<value/values> or ''], "precautions": [<value/values> or ''], "storage": [<value/values> or ''], "post-preparation_analysis": [<value/values> or '']}'''})
    
    messages.append({"role": "user", "content": '#####'+paragraph+'#####'})
    
    response = openai.ChatCompletion.create(model="gpt-4-turbo-preview", 
                                            messages=messages, max_tokens=4095, temperature=1)

    # Print the response and add it to the messages list
    chat_message = response['choices'][0]['message']['content']
    # print(f"Bot: {chat_message}")
    # messages.append({"role": "assistant", "content": chat_message})
    return(chat_message)

In [4]:
df_paragraphs = pd.read_pickle('SeNp_synth_paragraphs_labeled.pkl')

In [5]:
df_paragraphs_synth = \
df_paragraphs[df_paragraphs['label_cleaned_splited'] == 'YES']

In [6]:
df_paragraphs_synth['synth_prams_json_three'] = \
df_paragraphs_synth['section_text_cleaned_splited']\
.map(lambda x: extract_paragraph_info_json_three(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_paragraphs_synth['synth_prams_json_three'] = \


In [7]:
df_paragraphs_synth['synth_prams_json_three']

0      ```json\n{\n  "synthesis_approach": ["foam rep...
17     {"synthesis_approach": ["mixing aqueous soluti...
17     ```json\n{\n  "synthesis_approach": ["Spray dr...
17     ```json\n{\n  "synthesis_approach": [""],\n  "...
25     ```json\n{\n  "synthesis_approach": ["Bio-synt...
25     ```json\n{\n  "synthesis_approach": ["Jain et ...
29     ```json\n{\n  "synthesis_approach": ["Reddish-...
40     ```json\n{\n  "synthesis_approach": ["chemical...
70     ```json\n{\n  "synthesis_approach": ["Mixing, ...
70     ```json\n{\n  "synthesis_approach": ["reaction...
71     ```json\n{\n  "synthesis_approach": ["radiatio...
78     ```json\n{\n  "synthesis_approach": ["Wet chem...
78     ```json\n{\n    "synthesis_approach": ["dissol...
86     {"synthesis_approach": ["function method"], "o...
92     ```json\n{\n  "synthesis_approach": ["green sy...
99     ```json\n{\n  "synthesis_approach": ["reductio...
108    ```json\n{\n  "synthesis_approach": ["reductio...
108    ```json\n{\n    "synthes

In [8]:
pd.DataFrame(df_paragraphs_synth['synth_prams_json_three']\
.map(lambda x: json.loads(x.replace('json', '').replace('```', ''))).tolist())

Unnamed: 0,synthesis_approach,optimization_method,formulations,precursor,concentration_&_volume_of_precursor_solutions,reduction_agents,stabilizing_agent,stabilizing_agent_type,stabilizing_agent_concentration,mixing_&_reaction_conditions,mass_ratio,precautions,storage,post-preparation_analysis
0,"[foam replica method, physicochemical solvent/...",[],"[45S5Bioglass® powder (45% SiO2, 24.5% CaO, 24...","[ascorbic acid, sodium selenite, bovine serum ...",[],[ascorbic acid],[bovine serum albumin (BSA)],[],[],"[dissolving ascorbic acid in distilled water, ...",[],[],[],[]
1,[mixing aqueous solutions],[],"[HAuCl4 (3 mM), NaSeSO4 (2 mM)]","[HAuCl4, NaSeSO4]","[HAuCl4 (3 mM), NaSeSO4 (2 mM); volume ratio o...",[],[],[],[],"[stirred at room temperature, monitored every ...",[],"[precipitated via centrifugation, washed two t...",[],[UV/Vis spectrophotometry]
2,[Spray drying],,[BSA nanospheres with Au2Se/Au nanoparticles],"[BSA powder, Au2Se/Au nanoparticles]",[0.25 g of BSA in 25 mL of distilled water],,[BSA],[Protein],,"[Heating to 150 °C, Maintained at 150 °C for 3 h]","[BSA to Au2Se/Au: 50:1, BSA to Au2Se/Au: 50:3,...",,,
3,[],[],[(Au2Se/Au and ZnPc)-loaded BSA nanospheres (2...,"[HNO3, HClO4]","[HNO3 (5 mL), HClO4 (1 mL)]",[],[BSA],[],[],"[HNO3 treated at 100 °C for 2 h, HClO4 treated...",[],[],[],[inductively coupled plasma–atomic emission sp...
4,[Bio-synthesis],,[Bio-synthesized elemental selenium nanopartic...,[Sodium selenite],"[10 mM, 100 mL Luria-Bertani (LB) broth]",[Comamonas testosteroni S44],,,,"[Incubation at 28 °C, shaking at 150 rpm for 3...",,"[Centrifugation for 10 min at 8,000 rpm, Sonic...",[Temporarily preserving at -20 °C],[]
5,[Jain et al. protocol with minor modifications],,"[sodium selenite, L-reduced glutathione (GSH)]",[sodium selenite],[200 µL 1 M sodium selenite],[L-reduced glutathione (GSH)],,,,"[room temperature, French pressure for lysis]",[0.25 g GSH],[Cells collected by centrifugation and washed ...,,
6,[Reddish-orange homogeneous colloid],[],[Cts-Se colloid],"[low molecular weight Cts, ascorbic acid, sodi...","[0.15 g in 50 ml double distilled water, 25 ml...",[ascorbic acid],"[low molecular weight Cts, acetic acid]","[polymer, acid]",[some drops of 1.0% acetic acid solution],[Stirring at room temperature],[],[],[],"[UV-vis spectrophotometry, Particle size analy..."
7,[chemical reduction],[DOE methodology],"[SeNPs-BSA, SeNPs-Chit, SeNPs-Gluc]",[Sodium selenite],[0.02 M (12.5 mL) for SeNPs-BSA and SeNPs-Chit...,"[Ascorbic acid for SeNPs-BSA and SeNPs-Chit, G...","[BSA, Chitosan]","[Protein for BSA, Polysaccharide for Chitosan]",[0.87% w/w (5 mL)],[Ascorbic acid + stabilizer solutions mix then...,[1:1 between sodium selenite and chitosan or BSA],[Reaction vessels covered with aluminum foil t...,[4–8◦C],"[FTIR, XRD]"
8,"[Mixing, Refluxing]",[],"[Sodium selenosulphate solution, PVA stock sol...","[Selenium powder, Na2SO3]",[Selenium powder (2 g) and Na2SO3 (20 g) in 10...,[],[PVA],[Polymer],[1%],"[Refluxed at 70 °C for 7 h, Stirring at 80 °C ...","[Selenium powder:Na2SO3 = 2g:20g, PVA:Water = ...",[Solution kept in dark to prevent photo-oxidat...,[],[]
9,[reaction of sodium selenosulphate with differ...,"[studying the reaction kinetics, spectrophotom...",[PVA-stabilized Se nanoparticles],[sodium selenosulphate],[5x10^-4 to 1.5x10^-3 mol dm^-3],[],[PVA],[polymer],[0.05–0.15%],"[aqueous medium, different time periods depend...",[],[separation of selenium nanoparticles using hi...,[],"[spectrophotometric method, acid test for conf..."


In [9]:
df_paragraphs_synth['synth_prams_json_three_from_raw'] = \
df_paragraphs_synth['sections']\
.map(lambda x: extract_paragraph_info_json_three(x.get('text')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_paragraphs_synth['synth_prams_json_three_from_raw'] = \


In [10]:
pd.DataFrame(df_paragraphs_synth['synth_prams_json_three_from_raw']\
.map(lambda x: json.loads(x.replace('json', '').replace('```', ''))).tolist())

Unnamed: 0,synthesis_approach,optimization_method,formulations,precursor,concentration_&_volume_of_precursor_solutions,reduction_agents,stabilizing_agent,stabilizing_agent_type,stabilizing_agent_concentration,mixing_&_reaction_conditions,mass_ratio,precautions,storage,post-preparation_analysis
0,[foam replica method],,[45S5Bioglass],"[45 SiO2, 24.5 CaO, 24.5 Na2O, 6 P2O5, sodium ...","[0.02 M sodium selenite: 12.5 mL, ascorbic aci...",[ascorbic acid],[bovine serum albumin (BSA)],[protein],,[sodium selenite and ascorbic acid solution mi...,"[scaffold coating with SeNp: 2.5 wt%, scaffold...",[slow burning out of as-coated foams to minimi...,,"[Thermogravimetric analysis (TGA), X-ray diffr..."
1,"[Spray drying, Ultrasonic atomization]","[UV/Vis spectrophotometry monitoring, Centrifu...","[Au2Se/Au Core–Shell Nanoparticles, Au2Se/Au-L...","[Gold chloride tetrahydrate (HAuCl4), NaSeSO4]","[HAuCl4 (3 mM), NaSeSO4 (2 mM), volume ratio o...",[],[BSA],[Protein],[],"[Stirring at room temperature, Monitoring abso...","[BSA to Au2Se/Au are 50:1, 50:3, 50:5]",[Stirred in a dark room to prevent ZnPc degrad...,[],[High-resolution transmission electron microsc...
2,[Spray drying],[UV/Vis spectrophotometry for monitoring solut...,"[Au2Se/Au core-shell nanoparticles, Au2Se/Au-l...","[Gold chloride tetrahydrate (HAuCl4), NaSeSO4]","[HAuCl4 (3 mM), NaSeSO4 (2 mM), volume ratio o...",[],[BSA],[Protein],[0.25 g in 25 mL of distilled water],"[Stirring at room temperature, centrifugation,...","[BSA to Au2Se/Au: 50:1, 50:3, 50:5]",[Stirring in a dark room to prevent degradation],[],[High-resolution transmission electron microsc...
3,[Spray drying],[],"[Au2Se/Au core–shell nanoparticles, Au2Se/Au-l...","[Gold chloride tetrahydrate (HAuCl4), NaSeSO4,...","[HAuCl4 (3 mM), NaSeSO4 (2 mM), volume ratio o...",[],[BSA],[Protein],[0.25 g in 25 mL of distilled water],"[Mixed and stirred at room temperature, Monito...","[BSA to Au2Se/Au set at 50:1, 50:3, and 50:5]","[Stirring in a dark room overnight, Precipitat...",[],[Highresolution transmission electron microsco...
4,"[bio-synthesized, chemical and bio-synthesized...",[],"[Luria-Bertani (LB) broth, sodium selenite, gl...",[sodium selenite],[10 mM sodium selenite added to 100 mL LB brot...,"[Comamonas testosteroni S44, glutathione (GSH)]",[],[],[],[Incubated in 250 mL Erlenmeyer flasks at 28 °...,[1:100 for sample with KBr for FT-IR analysis],[],[Temporarily preserved in −20 °C],"[Transmission Electron Microscopy (TEM), Elect..."
5,"[bio-synthesis, chemical synthesis]","[shaking at 150 rpm, incubation at 28 °C, soni...","[BioSeNPs, CheBioSeNPs]",[sodium selenite],"[10 mM sodium selenite for BioSeNPs, 200 µL 1 ...","[selenite reductase, glutathione]","[proteins enriched in charged amino acids, car...","[charged amino acids, carbohydrates, and lipids]",[],[incubation at 28 °C for 3 days with shaking a...,[1:100 of sample to KBr for FT-IR spectroscopy],"[cells were washed twice with ddH2O, pellets w...",[temporarily preserving in −20 °C],"[TEM, EDX, DLS, zeta potential, FT-IR spectros..."
6,[colloidal synthesis],[],"[0.15 g Cts in 50 ml double distilled water, 2...","[Cts (low molecular weight), ascorbic acid, so...","[Cts: 0.15 g in 50 ml water, Ascorbic acid: 0....",[ascorbic acid],[Cts (low molecular weight)],[Polymer],[],[Stirring at room temperature],[],[addition of a few drops of acetic acid (1.0%)...,[],[UV-vis spectrophotometry covering 200 to 700 ...
7,[chemical reduction],[DOE methodology],"[SeNPs-BSA, SeNPs-Chit, SeNPs-Gluc]",[Sodium selenite],"[0.02M (12.5mL) for SeNPs-BSA and SeNPs-Chit, ...","[Ascorbic acid, Glucose]","[BSA, Chitosan]","[Protein-based for BSA, Polysaccharide for Chi...",[0.87% w/w (5mL)],"[Magnetic stirring at 1,500 rpm until color ch...",[1:1 between sodium selenite and chitosan or BSA],[Reaction vessels covered with aluminum foil t...,[4-8 degrees Celsius],"[FTIR, XRD]"
8,[PVA-stabilized Se nanoparticles synthesized b...,"[spectrophotometric method, acid test]","[sodium selenosulphate, PVA, sodium acrylate, ...","[sodium selenosulphate, selenium powder]",[selenium powder (2 g) and Na2SO3 (20 g) in 10...,[],[PVA],[polyvinyl alcohol],[0.05-0.15%],"[refluxed at 70 8C for about 7 hours, dilution...",[selenium powder:Na2SO3 = 2g:20g],[keeping sodium selenosulphate solution in dar...,[],"[UV–vis optical absorption spectra, XRD patter..."
9,[PVA-stabilized Se nanoparticles synthesis in ...,,"[sodium selenosulphate, polyvinyl alcohol (PVA...",[sodium selenosulphate],"[0.25 M sodium selenosulphate, 1% PVA stock so...",[Na2SO3],[PVA],[polymer],[0.05–0.15%],"[temperature 70 8C, refluxed for about 7 hours...","[selenium powder: 2 g, Na2SO3 solution: 20 g i...",[keep sodium selenosulphate solution in dark t...,[dried at room temperature],"[UV–vis optical absorption spectra, XRD patter..."


In [11]:
df_paragraphs_synth['sections'].map(lambda x: x.get('text')).iloc[4]

'Production and purification of bio-synthesized elemental selenium nanoparticles (BioSeNPs). A 1% Comamonas testosteroni S44 inoculum was incubated in 250 mL Erlenmeyer flasks containing 100 mL Luria-Bertani (LB, Difco) broth and cultured for 12 h (up to the middle of exponential growth), then 10 mM sodium selenite was added and incubation continued at 28 °C with shaking at 150 rpm for 3 days. The production of elemental selenium was confirmed by the appearance of red color. The culture was collected by centrifugation (Eppendorf 12492) for 10 min at 8,000 rpm. Cells were lysed by sonication after washing twice by double distilled water (ddH2O, 18.25 MΩ·cm). Following centrifugation (Eppendorf 5415D) at 12,000 rpm for 5 min and removing the supernatant, the pellets were then resuspended in ddH2O and centrifuged twice with 80% (w/v) sucrose for 30 min to remove biomass (Fig.\xa0S1). The pure BioSeNPs on the bottom were collected after washing twice by ddH2O and temporarily preserving in 