In [1]:
from os import listdir, environ
from sys import path

import pandas as pd

import openai
from openai.error import RateLimitError, InvalidRequestError
import backoff

import json
import ast

In [2]:
openai.organization = environ.get('OPEN_AI_ORG')
openai.api_key = environ.get('OPEN_AI_API_KEY')

In [3]:
@backoff.on_exception(backoff.expo, RateLimitError)
def extract_paragraph_info_json_three(paragraph):
    messages = []
    
    messages.append({"role": "system", "content":'''# who you are: you are helpful assistant, expert in colloidal chemistry.
    # your task: extract synthesis parameters from scientific text.
    # you will take input in format: #####<paragraph>#####
    # you will respond in json format: 
    {"synthesis_approach": [<value/values (separated by comma)> or '' (empty string)], "optimization_method": [<value/values> or ''], "formulations": [<value/values> or ''], "precursor": [<value/values> or ''], "concentration_&_volume_of_precursor_solutions": [<value/values> or ''], "reduction_agents": [<value/values> or ''], "stabilizing_agent": [<value/values> or ''], "stabilizing_agent_type": [<value/values> or ''], "stabilizing_agent_concentration": [<value/values> or ''], "mixing_&_reaction_conditions": [<value/values> or ''], "mass_ratio": [<value/values> or ''], "precautions": [<value/values> or ''], "storage": [<value/values> or ''], "post-preparation_analysis": [<value/values> or '']}'''})
    
    messages.append({"role": "user", "content": '#####'+paragraph+'#####'})
    
    response = openai.ChatCompletion.create(model="gpt-4-turbo-preview", 
                                            messages=messages, max_tokens=4095, temperature=1)

    # Print the response and add it to the messages list
    chat_message = response['choices'][0]['message']['content']
    # print(f"Bot: {chat_message}")
    # messages.append({"role": "assistant", "content": chat_message})
    return(chat_message)

In [4]:
df_paragraphs = pd.read_pickle('SeNp_synth_paragraphs_labeled.pkl')

In [5]:
df_paragraphs_synth = \
df_paragraphs[df_paragraphs['label_cleaned_splited'] == 'YES']

In [6]:
df_paragraphs_synth['synth_prams_json_three'] = \
df_paragraphs_synth['section_text_cleaned_splited']\
.map(lambda x: extract_paragraph_info_json_three(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_paragraphs_synth['synth_prams_json_three'] = \


In [7]:
df_paragraphs_synth['synth_prams_json_three']

0      ```json\n{\n  "synthesis_approach": ["foam rep...
17     ```json\n{\n  "synthesis_approach": ["HAuCl4 a...
17     {"synthesis_approach": ["spray drying"], "opti...
17     ```json\n{\n  "synthesis_approach": [""],\n  "...
25     ```json\n{\n  "synthesis_approach": ["Biologic...
25     ```json\n{\n  "synthesis_approach": ["biologic...
29     ```json\n{\n  "synthesis_approach": ["Cts-Se c...
40     ```json\n{\n  "synthesis_approach": ["chemical...
70     ```json\n{\n  "synthesis_approach": ["refluxin...
70     {\n"synthesis_approach": ["reaction of sodium ...
71     ```json\n{\n  "synthesis_approach": ["radiatio...
78     ```json\n{\n  "synthesis_approach": ["chemical...
78     ```json\n{\n  "synthesis_approach": ["dialysis...
86     {"synthesis_approach": [""], "optimization_met...
92     ```json\n{\n  "synthesis_approach": ["Green sy...
99     {\n"synthesis_approach": ["reduction"],\n"opti...
108    ```json\n{\n  "synthesis_approach": ["reductio...
108    ```json\n{\n  "synthesis

In [8]:
pd.DataFrame(df_paragraphs_synth['synth_prams_json_three']\
.map(lambda x: json.loads(x.replace('json', '').replace('```', ''))).tolist())

Unnamed: 0,synthesis_approach,optimization_method,formulations,precursor,concentration_&_volume_of_precursor_solutions,reduction_agents,stabilizing_agent,stabilizing_agent_type,stabilizing_agent_concentration,mixing_&_reaction_conditions,mass_ratio,precautions,storage,post-preparation_analysis
0,"[foam replica method, physicochemical solvent/...",[],"[45% SiO2, 24.5% CaO, 24.5% Na2O, 6% P2O5, pol...","[45S5Bioglass® powder, ascorbic acid, sodium s...",[],[ascorbic acid],[bovine serum albumin (BSA)],[],[],"[dissolving ascorbic acid in distilled water, ...",[],[],[],[]
1,[HAuCl4 and NaSeSO4 aqueous solutions mixing],[],"[HAuCl4, NaSeSO4]","[HAuCl4, NaSeSO4]","[HAuCl4 (3 mM), NaSeSO4 (2 mM), Volume ratio N...",[],[],[],[],"[Mixed and stirred at room temperature, Soluti...",[],"[Precipitation via centrifugation, Washed two ...",[],[Monitoring solution absorption spectrum shift...
2,[spray drying],[],[],"[BSA powder, Au2Se/Au nanoparticles]",[0.25 g of BSA in 25 mL of distilled water],[],[BSA],[protein],[],"[heated to 150 °C, maintained for 3 h]","[BSA to Au2Se/Au: 50:1, 50:3, 50:5]",[],[],[]
3,[],[],[(Au2Se/Au and ZnPc)-loaded BSA nanospheres],"[HNO3, HClO4]","[1 mL of nanospheres (2 mg/mL), 5 mL of HNO3, ...",[],[BSA],[],[2 mg/mL],"[treatment with HNO3 at 100 °C for 2 h, HClO4 ...",[],[],[],[Inductively coupled plasma–atomic emission sp...
4,[Biological synthesis],,[Luria-Bertani (LB) broth],[Sodium selenite],"[10 mM, 100 mL]",[Comamonas testosteroni S44],,,,"[Incubation: 28 °C, 150 rpm, 3 days, Initial c...",,[Centrifugation steps for purification and rem...,[-20 °C],"[Centrifugation, Sonication, Sucrose density g..."
5,[biological],[protocol developed by Jain et al. with minor ...,[CheBioSeNPs],[sodium selenite],[200 µL 1 M sodium selenite],[L-reduced glutathione (GSH)],[],[],[],[at room temperature],[0.25 g GSH to SLCF volume],"[Cells collected by centrifugation, washed twi...",[],[centrifugation]
6,[Cts-Se colloid formation],[],"[Cts solution, ascorbic acid solution, sodium ...","[low molecular weight Cts, ascorbic acid, sodi...","[Cts: 0.15 g in 50 ml double distilled water, ...",[ascorbic acid],[acetic acid],[acid],[1.0% (some drops)],"[stirring at room temperature, sequential addi...",[],[],[],"[UV-vis spectrophotometry, particle size analy..."
7,[chemical reduction],[DOE methodology],"[SeNPs-BSA, SeNPs-Chit, SeNPs-Gluc]",[Sodium selenite],[0.02 M (12.5 mL) for SeNPs-BSA and SeNPs-Chit...,"[Ascorbic acid for SeNPs-BSA and SeNPs-Chit, G...","[BSA for SeNPs-BSA, Chitosan for SeNPs-Chit, N...","[Protein for BSA, Polysaccharide for Chitosan]",[0.87% w/w (5 mL)],"[Mixing at 1,500 rpm on a magnetic stirrer unt...",[1:1 between sodium selenite and chitosan/BSA],[Covering reaction vessels with aluminum foil ...,[4–8◦C],"[FTIR, XRD]"
8,[refluxing],,"[sodium selenosulphate solution, PVA stock sol...","[selenium powder, Na2SO3]","[selenium powder (2 g), Na2SO3 (20 g) in 100 m...",,[PVA],,[1%],"[Temperature: 70 °C for selenium mixture, 80 °...",,[kept in dark to prevent photo-oxidation],,
9,[reaction of sodium selenosulphate with differ...,"[study of the reaction kinetics, spectrophotom...",[PVA-stabilized Se nanoparticles],[sodium selenosulphate],[5x10^-4 to 1.5x10^-3 mol dm^-3],[],[PVA],[],[0.05–0.15%],[different time periods depending on the natur...,[],"[spectrophotometric method, addition of dilute...",[],"[spectrophotometric method, acid test for the ..."


In [9]:
df_paragraphs_synth['synth_prams_json_three_from_raw'] = \
df_paragraphs_synth['sections']\
.map(lambda x: extract_paragraph_info_json_three(x.get('text')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_paragraphs_synth['synth_prams_json_three_from_raw'] = \


In [10]:
pd.DataFrame(df_paragraphs_synth['synth_prams_json_three_from_raw']\
.map(lambda x: json.loads(x.replace('json', '').replace('```', ''))).tolist())

Unnamed: 0,synthesis_approach,optimization_method,formulations,precursor,concentration_&_volume_of_precursor_solutions,reduction_agents,stabilizing_agent,stabilizing_agent_type,stabilizing_agent_concentration,mixing_&_reaction_conditions,mass_ratio,precautions,storage,post-preparation_analysis
0,[foam replica method],,[45S5Bioglass],"[sodium selenite (Na2SeO3), bovine serum album...","[0.02 M sodium selenite: 12.5 mL, BSA: 5 mL, a...",[ascorbic acid],"[BSA, PVP]","[protein, polymer]",[PVP: 0.5% w/w],"[Homogenization at 21,000 rpm for 15 min, Slow...","[Coating of SeNp on BG scaffolds: 2.5 wt%, Coa...",[Slow burn out of coated foams to minimize dam...,,"[X-ray diffraction (XRD), Fourier transform in..."
1,[Centrifugation],[UV/Vis spectrophotometry],"[Au2Se/Au-Loaded BSA Nanospheres, Au2Se/Au Cor...","[HAuCl4 (3 mM), NaSeSO4 (2 mM)]","[HAuCl4 (3 mM), NaSeSO4 (2 mM); Volume ratio o...",[],[BSA],[Protein],[0.25 g in 25 mL of distilled water],[Solution mixed and stirred at room temperatur...,"[BSA to Au2Se/Au set at 50:1, 50:3, and 50:5]","[Solution stirred in a dark room overnight, So...",[],[High-resolution transmission electron microsc...
2,[wet chemistry],,"[Au2Se/Au Core–Shell Nanoparticles, Au2Se/Au-L...","[HAuCl4, NaSeSO4, BSA, ZnPc]","[HAuCl4 (3 mM), NaSeSO4 (2 mM), volume ratio o...",,[BSA],[protein],[0.25 g in 25 mL distilled water],[mix and stir HAuCl4 and NaSeSO4 at room tempe...,"[BSA to Au2Se/Au: 50:1, 50:3, 50:5]",[stirring in a dark room for ZnPc incorporatio...,,"[HRTEM for nanoparticle morphology, SEM for na..."
3,"[Spray drying, Centrifugation, Stirring]","[UV/Vis spectrophotometry, SEM analysis, HRTEM...","[Au2Se/Au core–shell nanoparticles, Au2Se/Au-l...","[HAuCl4 (Gold chloride tetrahydrate), NaSeSO4,...","[HAuCl4 (3 mM), NaSeSO4 (2 mM), Volume ratio o...",[],[BSA],[Protein],[BSA mass to Au2Se/Au nanoparticle mass ratios...,"[Room temperature stirring, Stirring in dark r...","[BSA to Au2Se/Au ratios: 50:1, 50:3, 50:5]","[Stirring in dark room for ZnPc incorporation,...",[],[HRTEM and EDS for nanoparticle morphology and...
4,"[Bio-synthesized, Chemical reduction with bio-...",[],"[BioSeNPs, CheBioSeNPs]",[Sodium selenite],"[10 mM sodium selenite for BioSeNPs, 200 µL 1 ...","[Comamonas testosteroni S44 for BioSeNPs, L-re...","[Proteins, carbohydrates, lipids]",[Biological],[],"[Incubation at 28 °C, shaking at 150 rpm for 3...",[0.25 g GSH in SLCF for CheBioSeNPs],[Cells must be lysed after washing for BioSeNP...,[Temporarily preserving in -20 °C for BioSeNPs],"[TEM, EDX, DLS, zeta potential, FT-IR spectros..."
5,"[Bio-synthesis, Chemical synthesis]",,"[Luria-Bertani (LB) broth, sodium selenite, L-...",[sodium selenite],"[10 mM sodium selenite in 100 mL LB broth, 200...","[selenite reductase, glutathione]","[proteins enriched in charged amino acids, car...",,,"[28 °C, 150 rpm shaking for 3 days, room tempe...",[1:100 (sample:KBr) for FT-IR spectroscopy],[centrifugation at specific speeds and tempera...,[temporarily preserving in −20 °C],"[Transmission Electron Microscopy (TEM), Elect..."
6,[chemical reduction],,"[0.15 g Cts in 50 ml double distilled water, 2...",[sodium selenite],"[50 ml double distilled water, 25 ml of ascorb...",[ascorbic acid],[Cts (low molecular weight)],[polymeric],[0.15 g in 50 ml water],[adding acetic acid (1.0%) at room temperature...,,[addition of some drops of acetic acid (1.0%)],,"[UV-Vis spectrophotometry covering 200-700 nm,..."
7,[chemical reduction],[DOE methodology],"[SeNPs-BSA, SeNPs-Chit, SeNPs-Gluc]",[Sodium selenite],"[0.02M (12.5mL) for SeNPs-BSA and SeNPs-Chit, ...","[Ascorbic acid, Glucose]","[BSA, Chitosan]","[Protein (BSA), Polysaccharide (Chitosan)]",[0.87% w/w],"[Magnetic stirring at 1500 rpm, color change t...",[1:1 (sodium selenite:chitosan or BSA)],[Covered with aluminum foil to prevent photo-o...,[4–8°C],"[FTIR, XRD]"
8,[PVA-stabilized selenium nanoparticles synthesis],"[spectrophotometric method, dilute nitric acid...","[sodium selenosulphate, PVA, vinyl monomers]","[sodium selenosulphate, selenium powder]","[sodium selenosulphate (0.25 M), PVA (1%), sod...",[],[polyvinyl alcohol (PVA)],[polymer],[0.05–0.15%],[selenium powder and Na2SO3 solution refluxed ...,[selenium powder (2 g) to Na2SO3 solution (20 ...,[solution kept in dark to prevent photo-oxidat...,[],"[UV–vis optical absorption spectra, XRD patter..."
9,[PVA-stabilized Se nanoparticles],"[spectrophotometric method, acid test, UV–vis ...",[sodium selenosulphate with different vinyl mo...,"[sodium selenosulphate, selenium powder, Na2SO...",[selenium powder (2 g) and Na2SO3 (20 g) in 10...,[],[PVA],[polymer],[0.05–0.15%],[reaction of sodium selenosulphate with vinyl ...,"[selenium powder (2 g) to Na2SO3 (20 g), PVA (...",[keep sodium selenosulphate solution in dark t...,[],"[UV–vis optical absorption spectra, XRD patter..."


In [13]:
df_paragraphs_synth['sections'].map(lambda x: x.get('text')).iloc[4]

'Production and purification of bio-synthesized elemental selenium nanoparticles (BioSeNPs). A 1% Comamonas testosteroni S44 inoculum was incubated in 250 mL Erlenmeyer flasks containing 100 mL Luria-Bertani (LB, Difco) broth and cultured for 12 h (up to the middle of exponential growth), then 10 mM sodium selenite was added and incubation continued at 28 °C with shaking at 150 rpm for 3 days. The production of elemental selenium was confirmed by the appearance of red color. The culture was collected by centrifugation (Eppendorf 12492) for 10 min at 8,000 rpm. Cells were lysed by sonication after washing twice by double distilled water (ddH2O, 18.25 MΩ·cm). Following centrifugation (Eppendorf 5415D) at 12,000 rpm for 5 min and removing the supernatant, the pellets were then resuspended in ddH2O and centrifuged twice with 80% (w/v) sucrose for 30 min to remove biomass (Fig.\xa0S1). The pure BioSeNPs on the bottom were collected after washing twice by ddH2O and temporarily preserving in 