# This notebook is a demo version.
# Therefore, the output results in the cells below may not be reproducible.

In [1]:
# Imports
from pdf2image import convert_from_path
from pdf2image.exceptions import ( PDFInfoNotInstalledError,  PDFPageCountError,  PDFSyntaxError)
from pdfminer.high_level import extract_text
import base64
import io
import os
import concurrent.futures
from tqdm import tqdm
# from openai import OpenAI
import re
import pandas as pd 
from sklearn.metrics.pairwise import cosine_similarity
import json
import numpy as np
from rich import print as richprint
from ast import literal_eval
import unicodedata
import difflib

In [2]:
from pdfminer.high_level import extract_text
from tqdm import tqdm
import re


def make_buffer_page(page):
    new_page = page[:]
    for i in range(len(page)):
        if not page[i]:
            left = page[i - 1] if i > 0 else False
            right = page[i + 1] if i < len(page) - 1 else False
            if left or right:
                new_page[i] = True
    return new_page



def pdf_Rf_page_img(pdf_list):
    Rf_in_multi_paper = []
    for i, pdf in enumerate(pdf_list):
        print(f'pdf {i}')
        text = extract_text(pdf)
        if '\x0c' in text:

            rf_pages_bool = [('Rf' in  txt_page or 'RF' in txt_page) for txt_page in text.split('\x0c')]
            rf_pages_bool_buffer = make_buffer_page(rf_pages_bool)
            rf_pages_num = np.arange(0,len(rf_pages_bool),1)[rf_pages_bool_buffer]

            pages_text = [txt_page for txt_page in text.split('\x0c')]
            rf_pages_text = [pages_text[i] for i in rf_pages_num]
            rf_pages_united_string = '\x0c'.join(rf_pages_text)

            sen_rf_mnch_in_pdf=[]
            print("Checking each page...")
            for txt in tqdm(rf_pages_text):
                sen_list = txt.split('\n')
                sen_list = [sen for sen in sen_list if not re.fullmatch(r'\s*', sen)]
                sen_idx = [[idx-1, idx, idx+1, idx+2] for idx, sen in enumerate(sen_list) if ('Rf' in  sen or 'RF' in sen)]
                sen_rf_mnch_in_page=[]
                for mnch in sen_idx:
                    sen_rf_mnch_in_page.append([sen_list[i] for i in mnch if (i>=0)&(i<len(sen_list))])
                sen_rf_mnch_in_pdf.append(sen_rf_mnch_in_page)
                
            
        else:
            print(f'{pdf} : 페이지 표시자(\x0c)가 없습니다!!')
            continue

        imgs = convert_from_path( pdf, poppler_path = '/home/ufslab223/anaconda3/envs/ksw/bin')
        rf_page_imgs = [imgs[i] for i in rf_pages_num]
        Rf_in_multi_paper.append(rf_page_imgs)
    return Rf_in_multi_paper, rf_pages_united_string, rf_pages_text, sen_rf_mnch_in_pdf



# Converting images to base64 encoded images in a data URI format to use with the ChatCompletions API
# 이미지 이진 데이터-->텍스트 문자열 (base64)
# URI : 파일 내용을 문자열로 표현한 URL 형식

def get_img_uri(img):
    png_buffer = io.BytesIO()
    img.save(png_buffer, format="PNG")
    png_buffer.seek(0)

    base64_png = base64.b64encode(png_buffer.read()).decode('utf-8')

    data_uri = f"data:image/png;base64,{base64_png}"
    return data_uri

def get_img_markdown(img):
    
    uri = get_img_uri(img)
    markdown = f'![image](data:image/png;base64,{uri})\n\nDescribe this image.'

# GPT-API (one page each)

### opne ai api key 발급 및 결제 필요

In [None]:
print('This section is not included here because the research has not been finalized yet.\nMore details will be shared after publication.\nFor inquiries, please contact the author.')

#GPT-API
import openai
from openai import OpenAI
import time

def safe_chat(client, **kwargs):
    # 분당 토큰 처리 일정 이상 넘어가면 에러 뜨는데, 몇초 기다리면 제대로 됨. 
    # 중간에 안터지게 몇초 기다리고 재시도 하게 하는 함수
    for _ in range(5): # 5변 시도
        try:
            return client.chat.completions.create(**kwargs)
        except openai.RateLimitError as e:
            print(" 분당 토큰 수 제한 때문에... 잠깐만 기달려봐...")
            time.sleep(1.5) # 1.5초 기다렸다가 재시도
    raise RuntimeError("분당 토큰 수 제한이 계속 걸리네... 나중에 다시 시도해봐")


def analyze_one_page_GPT(one_page_text):
    # Open AI api key를 발급받아서 입력하세요.
    client = OpenAI(api_key='')

    json_style_example='''
    [
      {
        "compound name": "26b",
        "compound IUPAC name": "the given IUPAC name",
        "eluent1": "PE",
        "eluent2": "EA",
        "ratio": "20/1",
        "Rf": "0.6"
      },
      {
        "compound name": "26c",
        "compound IUPAC name": "another IUPAC name",
        "eluent1": "CH2Cl2",
        "eluent2": "np.nan",
        "ratio": "1",
        "Rf": "0.4"
      }
    ]
    '''
    print('GPT started to analyze pdf')
    answer=[]
    for i in tqdm(range(len(one_page_text))):
        text_input = [{"type": "text", "text": one_page_text[i]}]
        #image_input = [{"type": "image_url", "image_url": {"url": one_page_image[i]}}]

        if i==0:
            supporting_page_text = "This is the first page, so there is no previous page."
        else:
            supporting_page_text = one_page_text[i-1]

        if len(answer)>0:
            clean_answer = answer[-1].strip().removeprefix("```json").removesuffix("```").strip()
            previous_answer = json.loads(clean_answer)
        else:
            previous_answer=[]

        system_prompt = f'''

        This section is not included here because the research has not been finalized yet.
        More details will be shared after publication.
        For inquiries, please contact the author.
        
        '''
        
        response = safe_chat(client,
                             model="gpt-4.1-mini",
                             messages=[
                                 {"role": "system", "content": system_prompt},
                                 {"role": "user", "content": text_input}# + image_input},
                             ],
                             temperature=0,
                             top_p=0.1,
                             #reasoning_effort='high'
                            )
        answer.append(response.choices[0].message.content)
        
    return answer


# collect Rf pages

In [4]:
def collect_rf_pages(pdf_file):
    print('preprocessing pdf')
    # pdf_file example : 'ol4c04571_si_001.pdf'

    # extract
    path = './SI/'
    pdf_list= [path+pdf_file]
    rf_multi_paper, rf_multi_string, short_text, sen_rf_mnch_in_pdf = pdf_Rf_page_img(pdf_list)

    #image --> url
    rf_url_multi_paper = []
    for rf_one_paper in rf_multi_paper:
        rf_page_url = [get_img_uri(rf_page) for rf_page in rf_one_paper]
        rf_url_multi_paper.append(rf_page_url)
        
    return rf_multi_string, short_text,  sen_rf_mnch_in_pdf, rf_url_multi_paper

# extract data (GPT)

In [5]:
def normalize_ligatures(s):
    return unicodedata.normalize('NFKC', s)

def most_similar_substring(text, target, mode = 'IUPAC_string', starting_index = 0):
    text =  normalize_ligatures(re.sub(r'\s+','',text))
    best_match = ""
    highest_ratio = 0
    best_index = -1
    if type(target)==str:
        target_len = len(target)
        for i in range(starting_index, len(text) - target_len + 1):
            if (i + target_len)>len(text):
                print('check IUPAC. similar IUPAC is out of range')
                best_index = len(text)
                break
            chunk = text[i:i+target_len]
            ratio = difflib.SequenceMatcher(None, chunk, target).ratio()
            if ratio > highest_ratio:
                highest_ratio = ratio
                best_match = chunk
                best_index = i
    if mode == 'IUPAC_string':            
        return best_match.strip()
    elif mode == 'IUPAC_index':
        return best_index

def most_similar_substring_limited_idx(text, target, ending_index, mode = 'IUPAC_string',  starting_index = 0):
    text =  normalize_ligatures(re.sub(r'\s+','',text))
    best_match = ""
    highest_ratio = 0
    if type(target)==str:
        target_len = len(target)
        for i in range(starting_index, ending_index):
            if (i + target_len)>len(text):
                print('check IUPAC. similar IUPAC is out of range')
                best_index = len(text)
                break
            chunk = text[i:i+target_len]
            ratio = difflib.SequenceMatcher(None, chunk, target).ratio()
            if ratio > highest_ratio:
                highest_ratio = ratio
                best_match = chunk
                best_index = i
    if mode == 'IUPAC_string':            
        return best_match.strip()
    elif mode == 'IUPAC_index':
        return best_index


def false_iupac_compare(short_text, data_final, false_page_idx):
    page_idx_list = list(false_page_idx.items())
    for pi_pair in page_idx_list:
        page_text = short_text[pi_pair[0]]
        if pi_pair[0] > 0:
            pre_page_text = short_text[pi_pair[0]-1]
        iupac_list = data_final.loc[pi_pair[1], 'compound IUPAC name'].tolist()
        for i, iupac in enumerate(iupac_list):
            iupac_in_paper = most_similar_substring(page_text, iupac)
            richprint(f'[bold red]index___{pi_pair[1][i]}[/bold red]')
            richprint(f'[bold red]name: {data_final.loc[pi_pair[1][i], 'compound name']}[/bold red]')
            if type(iupac)==str:
                print('LLM_output >>>>  ', re.sub(r'\s+','',iupac))
            else:
                print('LLM_output >>>>  ', iupac)
            print('TXT_current >>>  ', iupac_in_paper)
            if pi_pair[0] > 0:
                iupac_in_paper_pre = most_similar_substring(pre_page_text, iupac)
                print('TXT_previous >>  ', iupac_in_paper_pre,'\n TXT_current가 전혀 비슷하지 않을때 참고')
            

def extract_data_GPT(pdf_file):
    import json
    whole_string, short_text, sen_rf_mnch_in_pdf, image_url_list = collect_rf_pages(pdf_file)
    image = image_url_list[0]
    if len(short_text)<=1:
        print('Cannot detect Rf pages!!!')
        return None
    extract_output1 = analyze_one_page_GPT(one_page_text = short_text)
    false_page_idx = {}
    whole_data=[]
    df_idx_start=0
    for idx, data in enumerate(extract_output1):
        clean_data = data.strip().removeprefix("```json").removesuffix("```").strip()
        dict_data = json.loads(clean_data)

        # data double check
        
        # --> Rf data check
        
        if len(dict_data)==0:
            print('No Rf data in this page')
        else:
            df_data = pd.DataFrame(dict_data)
            richprint(f'[bold bright_red]Page ---#################################---{idx}[/bold bright_red]')
            df = df_data.loc[:,['compound name','eluent1','eluent2','ratio','Rf']]
            num = len(df)
            df.index = np.arange(df_idx_start, df_idx_start + num, 1)
            df_idx_start += num
            display(pd.DataFrame(df).loc[:,['compound name','eluent1','eluent2','ratio','Rf']])


            # --> iupac name & matching with Rf data check

            short_text_flat = normalize_ligatures(re.sub(r'\s+','',short_text[idx]).lower())
            page_iupac_index = [short_text_flat.find(re.sub(r'\s+','',normalize_ligatures(str(iupac))).lower()) for iupac in df_data['compound IUPAC name']]
            
            for_check = page_iupac_index.copy()
            if (for_check[0] == -1)&(idx>0):
                previous_short_text_flat = normalize_ligatures(re.sub(r'\s+','',short_text[idx-1]).lower())
                pre_page_iupac_index = [previous_short_text_flat.find(re.sub(r'\s+','',normalize_ligatures(str(iupac))).lower()) for iupac in df_data['compound IUPAC name']]
                where = np.where(np.array(pre_page_iupac_index)!= -1)[0]
                for i in where:
                    max_idx = len(previous_short_text_flat)
                    minus_index = pre_page_iupac_index[i]-(max_idx+100)
                    for_check[i] = minus_index
                    page_iupac_index[i] = minus_index
                
            if -1 in for_check:
                minus_one_idx = np.where(np.array(for_check)==-1)[0]
                false_index = minus_one_idx + (df_idx_start-num)
                print(f'wrong iupac in this page:{false_index}')
                #print(num)
                false_page_idx[idx] = false_index

                for i in minus_one_idx:
                    query_iupac = df_data.loc[i,'compound IUPAC name']

                    left = 0
                    right = len(short_text_flat) - len(str(query_iupac)) + 1

                    left_candidates = [idx_val for idx_val in for_check[:i] if idx_val != -1]
                    if left_candidates:
                        left = max(left_candidates)

                    right_candidates = [idx_val for idx_val in for_check[i+1:] if idx_val != -1]
                    if right_candidates:
                        right = min(right_candidates) - len(str(query_iupac))

                    left = max(left, 0)
                    right = max(right, left + 1)

                    iupac_similar_index = most_similar_substring_limited_idx( short_text_flat,query_iupac,ending_index=right,mode='IUPAC_index',starting_index=left)

                    for_check[i] = iupac_similar_index



             ######################################## 여기서 부터 본격적으로 IUPAC~Rf 매칭 검증 
            # 위에서 page_iupac_index : 이전 페이지 등장하는 IUPAC 고려한 인덱스
            # 위에서 for_check : 이전 페이지 등장 IUPAC 고려 + 틀리게 뽑은 IUPAC은 가장 유사한 string 인덱스로 데체

            # Rf indexing
            short_text_flat_2=  normalize_ligatures(re.sub(r'\s+','',short_text[idx])) # same 'short_text_flat' but no 'lower()'  
            previous_short_text_flat_2 = normalize_ligatures(re.sub(r'\s+','',short_text[idx-1])) # same 'previous short_text_flat' but no 'lower()'  
            pre_full_len = len(previous_short_text_flat_2)
            Rf_index = np.array([i for i in range(len(short_text_flat_2) - 1) if (short_text_flat_2[i:i+2] == 'Rf') or (short_text_flat_2[i:i+2] == 'RF')])

            # IUPAC , Rf 등장 순서 재현하여 출력
            sen_rf_mnchs_list = sen_rf_mnch_in_pdf[idx]
            # 페이지에 데이터 1개일때
            if len(for_check) == 1:
                iupac_idx = for_check[0]
                Rf_location_up = Rf_index[Rf_index < iupac_idx]
                Rf_location_down = Rf_index[Rf_index > iupac_idx]

                if len(Rf_location_up) > 0:
                    richprint(f'[bold green1]***Rf_position: {Rf_location_up}***[/bold green1]')
                    mask = np.isin(Rf_index, Rf_location_up)
                    sen_rf_mnchs = [m for n, m in zip(mask, sen_rf_mnchs_list) if n]
                    for mnch in sen_rf_mnchs:
                        mnch_txt = '\n'.join(mnch)
                        print(mnch_txt + '\n')

                
                iupac_length = len(df_data.loc[0, 'compound IUPAC name']) if pd.notna(df_data.loc[0,'compound IUPAC name']) else 0
                if iupac_idx < 0:
                    org_idx = iupac_idx + (pre_full_len + 100)
                    iupac_label = previous_short_text_flat_2[org_idx:(org_idx+iupac_length)]
                else :
                    iupac_label = short_text_flat_2[iupac_idx:(iupac_idx+iupac_length)]
                richprint(f'[bold orange1]IUPAC position: {iupac_idx} --> {iupac_label}[/bold orange1]')

                if len(Rf_location_down) > 0:
                    richprint(f'[bold green1]***Rf_position: {Rf_location_down}***[/bold green1]')
                    mask = np.isin(Rf_index, Rf_location_down)
                    sen_rf_mnchs = [m for n, m in zip(mask, sen_rf_mnchs_list) if n]
                    for mnch in sen_rf_mnchs:
                        mnch_txt = '\n'.join(mnch)
                        print(mnch_txt + '\n')


            else:
                # 페이지에 데이터 2개 이상
                for i, iupac_idx in enumerate(for_check):
                    if i ==0:
                        Rf_location_up = Rf_index[Rf_index < iupac_idx]
                        Rf_location_down = Rf_index[(Rf_index > iupac_idx) & (Rf_index < for_check[1])]
                        if len(Rf_location_up)>0:
                            richprint(f'[bold green1]***Rf_position: {Rf_location_up}***[/bold green1]')
                            mask = np.isin(Rf_index, Rf_location_up)
                            sen_rf_mnchs = [m for n,m in zip(mask, sen_rf_mnchs_list) if n]
                            for mnch in sen_rf_mnchs:
                                mnch_txt = '\n'.join(mnch)
                                print(mnch_txt+'\n')

                        iupac_length = len(df_data.loc[i, 'compound IUPAC name']) if pd.notna(df_data.loc[i,'compound IUPAC name']) else 0
                        if iupac_idx < 0:
                            org_idx = iupac_idx + (pre_full_len + 100)
                            iupac_label = previous_short_text_flat_2[org_idx:(org_idx+iupac_length)]
                        else :
                            iupac_label = short_text_flat_2[iupac_idx:(iupac_idx+iupac_length)]
                        richprint(f'[bold orange1]IUPAC position: {iupac_idx} --> {iupac_label}[/bold orange1]')

                        if len(Rf_location_down)>0:
                            richprint(f'[bold green1]***Rf_position: {Rf_location_down}***[/bold green1]')
                            mask = np.isin(Rf_index, Rf_location_down)
                            sen_rf_mnchs = [m for n,m in zip(mask, sen_rf_mnchs_list) if n]
                            for mnch in sen_rf_mnchs:
                                mnch_txt = '\n'.join(mnch)
                                print(mnch_txt+'\n')
                    elif i == len(for_check)-1 :
                        Rf_location_down = Rf_index[Rf_index > iupac_idx]

                        iupac_length = len(df_data.loc[i, 'compound IUPAC name']) if pd.notna(df_data.loc[i,'compound IUPAC name']) else 0
                        if iupac_idx < 0:
                            org_idx = iupac_idx + (pre_full_len + 100)
                            iupac_label = previous_short_text_flat_2[org_idx:(org_idx+iupac_length)]
                        else :
                            iupac_label = short_text_flat_2[iupac_idx:(iupac_idx+iupac_length)]
                        richprint(f'[bold orange1]IUPAC position: {iupac_idx} --> {iupac_label}[/bold orange1]')
                            
                        if len(Rf_location_down)>0:
                            richprint(f'[bold green1]***Rf_position: {Rf_location_down}***[/bold green1]')
                            mask = np.isin(Rf_index, Rf_location_down)
                            sen_rf_mnchs = [m for n,m in zip(mask, sen_rf_mnchs_list) if n]
                            for mnch in sen_rf_mnchs:
                                mnch_txt = '\n'.join(mnch)
                                print(mnch_txt+'\n')
                    else:
                        Rf_location_down = Rf_index[(Rf_index > iupac_idx) & (Rf_index < for_check[i+1])]

                        iupac_length = len(df_data.loc[i, 'compound IUPAC name']) if pd.notna(df_data.loc[i,'compound IUPAC name']) else 0
                        if iupac_idx < 0:
                            org_idx = iupac_idx + (pre_full_len + 100)
                            iupac_label = previous_short_text_flat_2[org_idx:(org_idx+iupac_length)]
                        else :
                            iupac_label = short_text_flat_2[iupac_idx:(iupac_idx+iupac_length)]
                        richprint(f'[bold orange1]IUPAC position: {iupac_idx} --> {iupac_label}[/bold orange1]')
                        
                        if len(Rf_location_down)>0:
                            richprint(f'[bold green1]***Rf_position: {Rf_location_down}***[/bold green1]')
                            mask = np.isin(Rf_index, Rf_location_down)
                            sen_rf_mnchs = [m for n,m in zip(mask, sen_rf_mnchs_list) if n]
                            for mnch in sen_rf_mnchs:
                                mnch_txt = '\n'.join(mnch)
                                print(mnch_txt+'\n')
                                
            ########################################## 여기까지가 IUPAC, RF 매칭 검증 (등장 순서 재현)
        
        whole_data += dict_data
        
    for datapoint in whole_data:
        for key, value in datapoint.items():
            if value == "np.nan":
                datapoint[key] = np.nan
    
    data_final = pd.DataFrame(whole_data)

    if len(data_final)==0:
        print("No data")
        return None
    else:
        # iupac name check

        if len(false_page_idx)==0:
            print('Nice')
            print('Every IUPAC is correct')
            
        else:
            richprint('Wrong IUPAC names')
            richprint(false_page_idx)

            false_iupac_compare(short_text, data_final, false_page_idx)

            

        return data_final, false_page_idx

# one touch extract

In [None]:
# start!

# 이번주 는 datatable_123 부터 카운트 

title='dmmy title'
doi='dmmy doi'
publish='YYYY.vol.issue'

In [None]:
data_final, false_page_idx = extract_data_GPT('1.pdf')

preprocessing pdf
pdf 0
Checking each page...


100%|██████████| 24/24 [00:00<00:00, 16808.03it/s]


GPT started to analyze pdf


100%|██████████| 24/24 [01:12<00:00,  3.02s/it]

No Rf data in this page





Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
0,1c-A,EtOAc,Hex,0.5/9.5,0.62
1,1c-B,EtOAc,Hex,2/8,0.32
2,1c-C,EtOAc,Hex,2/8,0.45


chromatography eluting with a gradient of 0-10% ethyl acetate/hexanes to give compound (1c-A) as 
a clear oil (8.98 g, 58.2 mmol, 98%); Rf = 0.62 (EtOAc/Hex 0.5: 9.5). 1H NMR (400 MHz, CDCl3): δ 5.72 
(s, 1H), 4.16 – 3.87 (m, 2H), 2.69 (t, J = 6.0 Hz, 2H), 2.36 (t, J = 6 Hz, 2H), 1.66 (q, J = 6 Hz, 2H), 1.58 (q, 
J = 6 Hz, 2H), 1.27 – 1.13 (m, 3H). 13C NMR (125 MHz, CDCl3): δ 168.9, 166.8, 111.6, 59.3, 35.9, 35.8, 



were removed under reduced pressure. The crude alcohol was purified by silica gel chromatography 
(0-20% ethyl acetate/hexanes) to give product (1c-B) as a clear oil (2.26 g, 20.5 mmol, 86%); Rf = 0.32 
(EtOAc/Hex 2:8). 1H NMR (400 MHz, CDCl3): δ 5.50 – 5.48 (m, 1H), 4.11 (d, J = 7.2 Hz, 2H), 2.28 – 2.22 
(m, 4H), 1.69 – 1.59 (m, 4H). 13C NMR (125 MHz, CDCl3): δ 147.9, 119.0, 61.0, 33.7, 28.6, 26.3, 26.0. 



vacuo.  The  crude  mixture  was  then  used  in  the  next  step  without  further  purification.  The  crude 
product was a pale-yellow oil. Rf = 0.45 (EtOAc/Hex 2:8). 
Allylidenecyclopentane (1c-D): To freshly azeotroped methyltriphenylphosphonium bromide (6.98 g, 
19.5 mmol, 1.5 equiv.) in THF (55 mL) at 0 °C was added nBuLi (1.6 M in hexane, 12.8 mL, 20.4 mmol, 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
3,1c-D,pentane,np.nan,1,0.7
4,1c,Ether,Pentane,0.5/9.5,0.59
5,1d-A,EtOAc,Hex,0.5/9.5,0.34


solvent was removed. This crude mixture was then used in the next step without further purification. 
Rf = 0.70 (pentane). 
2-Vinyl-1-oxaspiro[2.4]heptane (1c): m-Chloroperbenzoic acid (75%, 1.28 g, 7.39 mmol, 1 equiv.) was 
added portion wise to a stirred slurry of the allylidenecyclopentane (1c-D) (0.80 g, 7.39 mmol, 1 equiv.) 



(1c) as an oil, which was further purified by column chromatography using neutral alumina eluting 
with  5%  Et2O/pentane  to  afford  the  pure  product  (0.46  g,  3.70  mmol,  yield  =  50%);  Rf  =  0.59 
(Ether/Pentane 0.5:9.5). 1H NMR (300 MHz, CDCl3): δ 5.63 – 5.51 (m, 1H), 5.45 – 5.35 (m, 1H), 5.26 
(ddd, J = 10.2, 1.8, 0.6 Hz, 1H), 3.36 (d, J = 7.5 Hz, 1H), 1.91 – 1.71 (m, 4H), 1.67 – 1.52 (m, 4H). 13C 



chromatography eluting with a gradient of 0-10% ethyl acetate/hexanes to give compound (1d-A) as 
a clear oil (3.25 g, 19.38 mmol, 95%); Rf = 0.34 (EtOAc/Hex 0.5: 9.5).  1H NMR (400 MHz, CDCl3): δ 5.63 
– 5.54 (m, 1H), 4.12 (q, J = 7.2 Hz, 2H), 2.90 – 2.70 (m, 2H), 2.21 – 2.13 (m, 2H), 1.63 – 1.58 (m, 6H), 
1.27 – 1.23 (m, 3H). 13C NMR (100 MHz, CDCl3): δ 166.9, 163.5, 113.0, 59.5, 38.0, 29.8, 28.6, 27.8, 26.3, 

reduced pressure. The crude alcohol was purified by silica gel flash chromatography (gradient of 0:100 
to 20:80 EtOAc/ hexanes) to afford  the product (1d-B) as a clear oil (1.4 g, 11.09 mmol, 93%); Rf = 
S4 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
6,1d-B,EtOAc,Hex,1/9,0.087
7,1d-C,EtOAc,Hex,2/8,0.7
8,1d-D,Hex,np.nan,1,0.86
9,(±)-1d,EtOAc,Hex,0.5/9.5,0.45


Na2SO4 and concentrated in vacuo. The crude mixture (1d-C) was then used in the next step without 
further purification. Rf = 0.7 (EtOAc/ Hex 2:8). 
Allylidenecyclohexane (1d-D): To freshly azeotroped methyltriphenylphosphonium bromide (12.89 g, 
36 mmol, 1.5 equiv.) in THF (90 mL) at 0 °C was added n-BuLi (2.5 M in hexane, 15 mL, 37.68 mmol, 



was removed under reduced pressure.  This crude mixture was  then  used in the next step without 
further purification. Rf = 0.86 (Hex). 
2-vinyl-1-oxaspiro[2.5]octane ((±)-1d): m-Chloroperbenzoic acid (mCPBA, 75%) (1.92 g, 11.13 mmol, 
1  equiv.)  (used  after  purification  following  the  procedure  described  in  the  preparation  method  of 



oxaspiro[2.5]octane  ((±)-1d),  which  was  further  purified  by  flash  chromatography  using  neutral 
alumina eluting with 5% Et2O/pentane to afford the pure product (0.94 g, 6.78 mmol, 61% yield); Rf = 
0.45 (EtOAc/Hex 0.5:9.5). 1H NMR (400 MHz, CDCl3): δ 5.75 – 5.69 (m, 1H), 5.43 – 5.36 (m, 1H), 5.29 – 
5.25 (m, 1H), 3.14 (d, J = 0.8 Hz, 1H), 1.76 – 1.60 (m, 2H), 1.55 – 1.39 (m, 8H).  13C NMR (100 MHz, 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
10,Ethyl 3-butylhept-2-enoate (1e-A),EtOAc,Hex,0.5/9.5,0.62
11,"2,3-butylhept-2-en-1-ol (1e-B)",EtOAc,Hex,1/9,0.25
12,3-butylhept-2-enal (1e-C),EtOAc,Hex,2/8,0.72
13,5-allylidenenonane (1e-D),Hex,np.nan,1,0.9


gradient of 0-10% ethyl acetate/hexane to give the product (1e-A) as a clear oil (6.30 g, 29.64 mmol, 
97%); Rf = 0.62 (EtOAc/Hex 0.5: 9.5). 1H NMR (400 MHz, CDCl3): δ 5.58 (s, 1H), 4.10 (q, J = 7.2 Hz, 2H), 
2.59 – 2.53 (m, 2H), 2.13 – 2.07 (m, 2H), 1.46 – 1.19 (m, 11H), 0.90 – 0.85 (m, 6H). 13C NMR (100 MHz, 
CDCl3): δ 166.6, 164.8, 115.1, 59.3, 38.1, 31.9, 30.8, 29.8, 23.0, 22.4, 14.3, 13.9, 13.9. The analytical 



pressure. The crude alcohol was purified by silica gel chromatography (0-20% ethyl acetate/hexanes) 
to give product (1e-B) as a clear oil (2.67 g, 15.67 mmol, 96%); Rf = 0.25 (EtOAc/Hex 1:9). 1H NMR (400 
MHz, CDCl3): δ 5.35 (t, J = 7.2 Hz, 1H), 4.11 (d, J = 6.4 Hz, 2H), 2.06 – 1.92 (m, 4H), 1.40 – 1.19 (m, 8H), 
0.87 (t, J = 7.1 Hz, 6H). 13C NMR (100 MHz, CDCl3): δ 144.4, 123.3, 59.2, 36.5, 31.1, 30.2, 30.1, 22.8, 



vacuo.  The  crude  mixture  was  then  used  in  the  next  step  without  further  purification.  The  crude 
product was a pale-yellow oil. Rf = 0.72 (EtOAc/Hex 2:8). 
5-allylidenenonane  (1e-D):  To  freshly  azeotroped  methyltriphenylphosphonium  bromide  (6.98  g, 
19.50 mmol, 1.5 equiv.) in THF (55 mL) at 0 °C was added nBuLi (1.6 M in hexane, 12.75 mL, 20.41 



the solid. The mixture was filtered through celite and the remaining solvent was removed. This crude 
mixture was then used in the next step without further purification. Rf = 0.90 (Hex). 
2,2-dibutyl-3-vinyloxirane ((±)-1e): m-Chloroperbenzoic acid (75%, 1.45 g, 8.42 mmol, 1 equiv.) (used 
after purification following the procedure described in the preparation method of compound 1c) was 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
14,"2,2-dibutyl-3-vinyloxirane ((±)-1e)",EtOAc,Hex,0.5/9.5,0.42
15,1-phenylprop-2-en-1-one (1f-B),EtOAc,Hex,1/9,0.6
16,"2,3-Epoxy-1-(phenyl)-1-propanone (1f-C)",EtOAc,Hex,1/9,0.2


1e) as an oil, which was further purified by column chromatography using neutral alumina eluting with 
5% Et2O/pentane to afford the pure product (0.65 g, 3.53 mmol, yield = 42%); Rf = 0.42 (EtOAc/Hex 
0.5:9.5). 1H NMR (400 MHz, CDCl3): δ 5.73 (ddd, J = 17.2, 10.4, 7.2 Hz, 1H), 5.39 (ddd, J = 17.2, 1.6, 0.8 
Hz, 1H), 5.28 (ddd, J = 10.8, 1.6, 0.8 Hz, 1H), 3.15 (d, J = 7.2 Hz, 1H), 1.62 – 1.20 (m, 12H), 0.87 (td, J = 



chromatography (5% EtOAc/hexane), product 1f-B was obtained as a slightly green oil (6.05 g, 45.72 
mmol, 98% yield). Rf = 0.6 (EtOAc/Hex 1:9). 1H NMR (400 MHz, CDCl3): δ 7.90 (dd, J = 6.4, 0.8 Hz, 2H), 
7.53 – 7.49 (m, 1H), 7.44 – 7.40 (m, 2H), 7.24 – 7.05 (m, 1H), 6.39 (ddd, J = 16.8, 2.4, 1.2 Hz, 1H), 5.89 
–  5.85  (m,  1H).  13C  NMR  (100  MHz,  CDCl3):  δ  191.0,  137.3,  133.0,  132.4,  130.1,  128.7,  128.6.  The 



chromatography (10% EtOAc/hexane), the product (1f-C) was obtained as a white solid (5.97 g, 40.28 
mmol, 88% yield); Rf = 0.2 (EtOAc: Hex 1:9).   1H NMR (500 MHz, CDCl3): δ 8.03 – 8.00 (m, 2H), 7.62 – 
7.59 (m, 1H), 7.48 (m, 2H), 4.21 (dd, J = 4.5, 2.5 Hz, 1H), 3.10 (dd, J = 6.5, 4.5 Hz, 1H), 2.95 (ddd, J = 6.5, 
2.5,  1.5  Hz,  1H).  13C  NMR  (125  MHz,  CDCl3):  δ  194.7,  135.5,  133.9,  128.8,  128.4,  51.1,  47.5.  The 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
17,(±)-1f,EtOAc,Hex,1/9,0.8


solid.  The  mixture  was  filtered  and  the  remaining  solvent  was  removed.  The  product  ((±)-1f)  was 
obtained as a clear oil (2.95 g, 20.14 mmol, 50% yield); Rf = 0.8 (EtOAc : Hex 1:9) after purification by 
chromatography with hexane using neutral alumina. 1H NMR (400 MHz, CDCl3): δ 7.45 – 7.43 (m, 2H), 
7.37 – 7.30 (m, 3H), 5.43 (d, J = 1.2 Hz, 1H), 5.36 (t, J = 0.8 Hz, 1H), 3.67 (ddd, J = 4, 2.8, 1.2 Hz, 1H), 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
18,1g-A,EtOAc,Hex,1/9,0.7
19,1g-B,EtOAc,Hex,2/8,0.41
20,1g-C,EtOAc,Hex,1/9,0.5
21,1g-D,EtOAc,Hex,1/9,0.95


residue  was  purified  by  flash  column  chromatography  eluting  with  a  gradient  of  0-10%  ethyl 
acetate/hexanes to give compound (1g-A) as a clear oil (3.2 g, 14.7 mmol, 91%); Rf = 0.70 (EtOAc/Hex 
1: 9). 1H NMR (400 MHz, CDCl3): δ 5.55 (s, 1H), 4.10 (q, J = 0.4 Hz, 2H), 3.84 (ddd, J = 8.4, 5.6, 3.2 Hz, 
1H), 2.27 (dd, J = 13.2, 2.0 Hz, 1H), 2.11 (td, J = 13.2, 4.0 Hz, 1H), 1.92 – 1.88 (m, 2H), 1.79 (td, J = 13.6, 



chromatography (0-20% ethyl acetate/hexanes) to give product (1g-B)  as  a  clear  oil  (2.58  g,  14.11 
mmol, yield = 99%); Rf = 0.41 (EtOAc/Hex 2:8). 1H NMR (400 MHz, CDCl3): δ 5.31 (t, J = 7.2 Hz, 1H), 4.09 
(d, J = 7.2 Hz, 2H), 2.67 – 2.63 (m, 1H), 2.22 (dd, J = 13.2, 2.0 Hz, 1H), 2.00 (td, J = 13.2, 3.2 Hz, 1H), 
1.85 – 1.80 (m, 2H), 1.70 (td, J = 13.2, 3.2 Hz, 1H), 1.18 – 0.9 (m, 4H), 0.81 (s, 9H). 13C NMR (125 MHz, 



Na2SO4, filtered and concentrated in vacuo. The crude mixture was then used in the next step without 
further purification. The crude product was a pale-yellow oil. Rf = 0.50 (EtOAc/Hex 1:9). 
1-Allylidene-4-(tert-butyl)cyclohexane (1g-D): To freshly azeotroped methyltriphenyl-phosphonium 
bromide (6.2 g, 17.3 mmol, 1.5 equiv.) in THF (40 mL) at 0 °C was added nBuLi (1.6 M in hexane, 12.2 



remaining solvent was removed. This crude mixture was then used in the next step without further 
purification. Rf = 0.95 (EtOAc/Hex 1:9). 
S9 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
22,1g,EtOAc,Hex,0.5/9.5,0.4
23,1h-A,EtOAc,Hex,0.5/9.5,0.66
24,1h-B,EtOAc,Hex,2/8,0.43


purified by column chromatography using neutral alumina eluting with 5% Et2O/pentane to afford the 
pure product (0.98 g, 5.05 mmol, yield = 57%); Rf = 0.4 (EtOAc/Hex 0.5:9.5). 1H NMR (300 MHz, CDCl3): 
δ 5.76 – 5.64 (m, 1H), 5.41 – 5.33 (m, 1H), 5.28 – 5.22 (m, 1H), 3.13 (dd, J = 9.6, 7.2 Hz, 1H), 1.85 – 1.47 
(m, 5H), 1.34 – 1.19 (m, 3H), 1.10 – 0.94 (m, 1H), 0.80 (d, J = 6.0 Hz, 9H). 13C NMR (125 MHz, CDCl3): δ 



residue  was  purified  by  flash  column  chromatography  eluting  with  a  gradient  of  0-10%  ethyl 
acetate/hexanes to give compound (1h-A) as a clear oil (5.4 g, 29.64 mmol, 95%); Rf = 0.66 (EtOAc/Hex 
0.5: 9.5). 1H NMR (300 MHz, CDCl3): δ δ5.57 (s, 1H), 4.10 (q, J = 7.2 Hz, 2H), 3.70 (dt, J = 4.8, 3.6 Hz, 
1H), 2.25 – 2.14 (m, 2H), 1.95 – 1.73z (m, 3H), 1.66 –1.56 (m, 1H), 1.23 (t, J = 7.2 Hz, 3H), 1.16 – 0.96 



chromatography  (0-20%  ethyl  acetate/hexanes)  to  give  product  (1h-B)  as  a  clear  oil  (3.13  g,  20.5 
mmol, 89%); Rf = 0.43 (EtOAc/Hex 2:8). 1H NMR (400 MHz, CDCl3): δ 5.34 (t, J = 7.2 Hz, 1H), 4.10 (d, J 
= 7.2 Hz, 2H), 2.62 – 2.52 (m, 1H), 2.18 – 2.15 (m, 1H), 2.09 – 1.96 (m, 1H), 1.86 – 1.69 (m, 3H), 1.57 – 
1.52 (m, 1H), 1.37 (s, 1H), 1.01 – 0.93 (m, 2H), 0.87 (d, J = 6.4 Hz, 3H). 13C NMR (125 MHz, CDCl3): δ 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
25,1h-C,EtOAc,Hex,2/8,0.62
26,1h-D,EtOAc,Hex,0.5/9.5,np.nan
27,1h,Ether,Pentane,0.5/9.5,0.65
28,3t,EtOAc,Hex,1/1,0.48


with brine, dried over Na2SO4, filtered and concentrated in vacuo. The crude mixture was then used 
in  the  next  step  without  further  purification.  The  crude  product  was  a  pale-yellow  oil.  Rf  =  0.62 
(EtOAc/Hex 2:8). 
stirred 



remaining solvent was removed. This crude mixture was then used in the next step without further 
purification. Rf = (EtOAc/Hex 0.5: 9.5) 
6-Methyl-2-vinyl-1-oxaspiro[2.5]octane  (1h):  m-Chloroperbenzoic  acid  (75%,  1.45  g,  8.41  mmol,  1 
equiv.) was added portion wise to a stirred slurry of the 1-allylidene-4-methylcyclohexane (1h-D) (1.5 



chromatography using neutral alumina eluting with 5% Et2O/pentane to afford the pure product (0.80 
g, 5.30 mmol, yield = 63%); Rf = 0.65 (Ether/Pentane 0.5:9.5). 1H NMR (400 MHz, CDCl3): δ 5.79 – 5.66 
(m, 1H), 5.44 – 5.37 (m, 1H), 5.32 – 5.23 (m, 1H), 3.25 – 3.10 (m, 1H), 1.98 – 1.22 (m, 9H), 1.05 – 0.79 
(m, 3H). 13C NMR (125 MHz, CDCl3): δ 133.1, 119.8, 64.2, 63.9, 34.2, 34.0, 32.3, 31.8, 28.4, 22.0. The 



pure product  4-((tert-butoxycarbonyl)amino)benzoic acid (3t)    as  an  amorphous  solid  (0.11  g,  0.48 
mmol, 80% yield); Rf = 0.48 (EtOAc/Hex 1:1). 1H NMR (400 MHz, MeOD): δ 7.79 (d, J = 8.8 Hz, 1H), 7.37 
(d, J = 8.8 Hz, 1H), 1.37 (s, 9H). 13C NMR (100 MHz, MeOD): δ 168.4, 153.4, 143.9, 130.5, 124.0, 117.3, 
S11 

No Rf data in this page
No Rf data in this page


Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
29,4’jb,DCM,np.nan,1,0.35


evaporated to give desired product (48.0 mg, 0.19 mmol, 96% yield) in pure form. No further column 
purification was necessary. Rf = 0.35 (DCM). 1H NMR (500 MHz, CDCl3): δ 7.16 (d, J = 8 Hz, 1H), 6.84 (d, 
J = 8.5 Hz, 1H), 5.90 (dd, J = 17.5, 11 Hz, 1H), 5.16 – 5.13 (m, 2H), 3.77 (s, 1H), 3.66 – 3.54 (m, 4H), 1.48 
(s, 3H). 13C NMR (125 MHz, CDCl3): δ 171.7, 158.7, 138.3, 130.2, 126.0, 115.3, 114.1, 84.8, 68.6, 55.3, 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
30,4’jc,DCM,np.nan,1,0.3


was  purified  by  silica  gel  column  chromatography  (0  to  10%  EtOAc/Hexane)  to  afford  the  desired 
product (4’jc) as a colorless oil (56.0 mg, 0.19 mmol, 96% yield). Rf = 0.30 (DCM). 1H NMR (400 MHz, 
CDCl3): δ 7.17 (d, J = 8.8 Hz, 2H), 6.83 (d, J = 8.8 Hz, 2H), 5.85 (ddd, J = 16.8, 10.8, 6.8 Hz, 1H), 5.26 – 
5.19 (m, 2H), 5.19 – 5.13 (m, 1H), 3.75 (s, 3H), 3.57 (s, 2H), 1.79 – 1.72 (m, 3H), 1.58 – 1.50 (m, 5H). 13C 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
31,4ad,DCM,np.nan,1,0.4


mg, 0.19 mmol, 95% yield) in 91% ee. The same reaction was performed with [Ir(COD)Cl]2 and (R)-L  
following the same procedure and the product (ent-4ad) was obtained in 93% yield and in 94% ee. Rf 
= 0.4 (DCM). 1H NMR (400 MHz, CDCl3): δ 8.10 – 8.01 (m, 2H), 7.60 – 7.52 (m, 1H), 7.46 – 7.42 (m, 2H), 
6.03 – 5.91 (m, 1H), 5.42 – 5.28 (m, 3H), 1.70 – 1.44 (m, 10H). 13C NMR (100 MHz, CDCl3): δ 165.6, 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
32,4ae,DCM,np.nan,1,0.56
33,4af,DCM,np.nan,1,0.38


chromatography (0 to 10% EtOAc/Hexane) to afford the desired product (4ae) as a colorless oil (59.0 
mg, 0.19 mmol, 97% yield). Rf = 0.56 (DCM). 1H NMR (400 MHz, CDCl3): δ 8.07 – 8.02 (m, 2H), 7.58 – 
7.54 (m, 1H), 7.46 – 7.43 (m, 2H), 5.99 (ddd, J = 17.2, 10.4, 6.8 Hz, 1H), 5.48 – 5.30 (m, 3H), 1.64 (s, 
1H), 1.59 – 1.52 (m, 4H), 1.33 – 1.24 (m, 8H), 0.89 – 0.82 (m, 6H). 13C NMR (100 MHz, CDCl3): δ 161.7, 



>99% ee. Under the same reaction conditions using (R)-L, the product (ent-4af) was obtained in 83% 
yield and in >99% ee. Rf = 0.38 (DCM). 1H NMR (400 MHz, CDCl3): δ 8.13 – 8.10 (m, 2H), 7.58 (ddd, J = 
8.4, 2.4, 1.2 Hz, 1H), 7.48 (m, 4H), 7.38 – 7.29 (m, 3H), 6.05 – 6.02 (m, 1H), 5.43 (d, J = 4.0 Hz, 2H), 3.87 
– 3.78 (m, 2H), 2.03 (s, 1H).  13C NMR (100 MHz, CDCl3): δ 166.1, 145.1, 138.9, 133.3, 129.8, 128.6, 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
34,4’jg,DCM,np.nan,1,0.3


crude mixture was purified by silica gel column chromatography (0 to 10% EtOAc/Hexane) to afford 
the desired product (4’jg) as a colorless oil (67.8 mg, 0.19 mmol, 94% yield). Rf = 0.30 (DCM). 1H NMR 
(400 MHz, CDCl3): δ 7.17 (d, J = 8.5 Hz, 2H), 6.83 (d, J = 8.6 Hz, 2H), 5.93 – 5.75 (m, 1H), 5.22 (dd, J = 
13.8, 8.5 Hz, 2H), 4.99 (d, J = 7.0 Hz, 1H), 3.76 (s, 3H), 3.58 (s, 2H), 1.61 – 1.46 (m, 4H), 1.31 – 1.20 (m, 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
35,4’jh,DCM,np.nan,1,0.3
36,(S)-8,Hex,EtOAc,1/1,0.27


mixture  was  purified  by  silica  gel  column  chromatography  (0  to  10%  EtOAc/Hexane)  to  afford  the 
desired product (4’jh) as a colorless oil (54.1 mg, 0.17 mmol, 85% yield). Rf = 0.30 (DCM). 1H NMR (400 
MHz, CDCl3): δ 7.17 (d, J = 7.2 Hz, 2H), 6.83 (d, J = 8.4 Hz, 2H), 6.03 – 5.69 (m, 1H), 5.31 – 5.16 (m, 2H), 
5.00 (d, J = 6.8 Hz, 1H), 3.76 (s, 3H), 3.58 (s, 2H), 1.71 – 1.36 (m, 5H), 1.23 (d, J = 4.8 Hz, 4H), 0.87 (s, 



DCM and then with EtOAc to give product ((S)-8) (0.063 g, 0.723 mmol, 86% yield) as a colorless dense 
oil. Rf = 0.27 (Hex/EtOAc 1:1). 1H NMR (400 MHz, CDCl3): δ 5.81 (ddd, J = 17.2, 10.8, 5.6 Hz, 1H), 5.32 
(dt, J = 17.2, 1.6 Hz, 1H), 5.19 (dt, J = 10.4, 1.6 Hz, 1H), 4.22 – 4.20 (m, 1H), 3.63 (dd, J = 11.2, 3.2 Hz, 
S34 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
37,"(S)-but-3-ene-1,2-diol",Hex,EtOAc,1/1,0.27
38,9,EtOAc,Hex,0.5/9.5,0.3
39,10,EtOAc,Hex,1/9,0.68
40,11,EtOAc,Hex,1/9,0.28


chromatography (0 to 10% EtOAc/Hexane) to give compound (9) as a colorless oil (0.56 g, 2.75 mmol, 
97% yield); Rf = 0.3 (EtOAc/Hex 0.5:9.5). 1H NMR (500 MHz, CDCl3): δ 5.78 (ddd, J = 16.5, 11.0, 6.0 Hz, 
1H), 5.31 (d, J = 17.0 Hz, 1H), 5.15 (d, J = 11.0 Hz, 1H), 4.13 (s, 1H), 3.63 (dd, J = 10.0, 4.0 Hz, 1H), 3.42 
(dd, J = 10.0, 8.0 Hz, 1H), 2.57 (s, 1H), 0.88 (s, 9H), 0.05 (s, 6H). 13C NMR (125 MHz, CDCl3): δ 136.7, 



chromatography (0 to 3% EtOAc/Hexane) to give the product (10) as a colorless oil (0.06 g, 0.22 mmol, 
78% yield); Rf = 0.68 (EtOAc/Hex 1:9). 1H NMR (400 MHz, CDCl3): δ 5.94 – 5.75 (m, 2H), 5.34 – 5.27 (m, 
2H), 5.22 (dt, J = 18.4, 1.2 Hz, 1H), 5.18 – 5.10 (m, 2H), 3.65 (d, J = 5.6 Hz, 2H), 3.09 (dt, J = 6.8, 1.2 Hz, 
2H), 0.85 (s, 9H), 0.02 (s, 6H). 13C NMR (100 MHz, CDCl3): δ 170.6, 133.4, 130.3, 118.5, 117.8, 75.3, 



was concentrated  to dryness and  the crude product  was purified by column chromatography (0 to 
10% EtOAc/Hexane) to give product (11) as a colorless oil (0.12 g, 0.49 mmol, 88% yield); Rf = 0.28 
(EtOAc/Hex 1:9). 1H NMR (400 MHz, CDCl3): δ 5.91 – 5.77 (m, 2H), 4.94 – 4.87 (m, 1H), 3.82 (dd, J = 
10.8, 4.4 Hz, 1H), 3.71 (dd, J = 10.8, 3.2 Hz, 1H), 3.04 – 2.98 (m, 2H), 0.83 (s, 9H), 0.01 (d, J = 2.4 Hz, 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
41,12,DCM,Hex,1/1,0.4
42,13,EtOAc,Hex,1/9,0.44
43,14,EtOAc,Hex,1/9,0.11


concentrated  to  dryness.  The  crude  product  was  purified  by  column  chromatography  (0  to  10% 
EtOAc/Hexanes) to yield compound (12) as a colorless oil (0.086 g, 0.35 mmol, 86% yield); Rf = 0.4 
(DCM/Hex 1:1). 1H NMR (400 MHz, CDCl3): δ 6.86 (ddd, J = 9.6, 5.6, 2.8 Hz, 1H), 5.98 – 5.92 (m, 1H), 
4.43 (ddd, J = 10.8, 10.0, 4.8 Hz, 1H), 3.77 (dd, J = 4.8, 2.0 Hz, 2H), 2.53 – 2.32 (m, 2H), 0.85 (s, 9H), 



gel chromatography (0 to 5% EtOAc/Hexane) to give the product (13) as a colorless oil (0.116 g, 0.34 
mmol, 71% yield); Rf = 0.44 (EtOAc/Hex 1:9). 1H NMR (400 MHz, CDCl3): δ 7.70 (d, J = 16.0 Hz, 1H), 7.52 
– 7.50 (m, 2H), 7.38 – 7.36 (m, 3H), 6.45 (d, J = 16.0 Hz, 1H), 5.88 (ddd, J = 17.2, 10.4, 6.0 Hz, 1H), 5.46 
– 5.41 (m, 1H), 5.34 (dt, J = 17.2, 1.2 Hz, 1H), 5.24 (dt, J = 10.4, 1.2 Hz, 1H), 3.76 – 3.74 (m, 2H), 0.87 



and  the  crude  product  was  purified  by  column  chromatography  (0-10%  EtOAc/Hexanes)  to  yield 
product (14) as a colorless oil (0.053 g, 0.23 mmol, 81% yield); Rf = 0.11 (EtOAc/Hex 1:9). 1H NMR (400 
MHz, CDCl3): δ 7.47 (dd, J = 6.0, 1.6 Hz, 1H), 6.13 (dd, J = 6.0, 2.0 Hz, 1H), 5.04 – 5.01 (m, 1H), 3.90 (dd, 
S36 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
44,15,EtOAc,Hex,1/9,0.52
45,16,EtOAc,Hex,2/8,0.21


purified by column chromatography (0 to 5% EtOAc/Hexane) to yield product (15) as a colorless oil 
(0.31 g, 0.967 mmol, 97% yield); Rf = 0.52 (EtOAc/Hex 1:9). 1H NMR (400 MHz, CDCl3): δ 7.31 – 7.24 
(m, 5H), 5.78 (ddd, J = 17.2, 10.8, 6.0 Hz, 1H), 5.34 – 5.29 (m, 1H), 5.18 (ddd, J = 12.0, 11.2, 1.2 Hz, 2H), 
3.67 – 3.64 (m, 4H), 0.86 (s, 9H), 0.02 (s, 6H). 13C NMR (100 MHz, CDCl3): δ 170.7, 134.0, 133.3, 129.3, 



The  residue  was  purified  by  silica  gel  chromatography  eluting  with  a  gradient  of  0  to  20%  ethyl 
acetate/hexane to give product (16) as a clear oil (0.087 g, 0.27 mmol, 58% yield); Rf = 0.21 (EtOAc/Hex 
2:8). 1H NMR (500 MHz, CDCl3): δ 7.27 (d, J = 24.0 Hz, 5H), 5.57 – 5.53 (m, 2H), 4.05 (d, J = 3.0 Hz, 2H), 
3.60 (t, J = 7.0 Hz, 1H), 2.81 - 2.76 (m, 1H), 2.51 – 2.48 (m, 1H), 0.84 (s, 9H), –0.01 (s, 6H). 13C NMR (125 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
46,17,EtOAc,Hex,1/9,0.625


was  purified  by  silica  gel  chromatography  (0  to  5%  EtOAc/Hexane)  to  give  the  product  (17)  as  a 
colorless oil (0.48 g, 1.44 mmol, 85% yield); Rf = 0.625 (EtOAc/Hex 1:9). 1H NMR (400 MHz, CDCl3): δ 
7.19 – 7.17 (m, 2H), 6.84 – 6.82 (m, 2H), 5.79 – 5.69 (m, 2H), 5.50 – 5.46 (m, 1H), 5.29-5.21 (m, 2H), 
5.03 – 4.96 (m, 2H), 4.22 (dd, J = 12, 4 Hz, 1H), 4.07 (dd, J = 12, 7.6 Hz, 1H), 3.77 (s, 3H), 3.56 (s, 2H), 



Unnamed: 0,compound name,eluent1,eluent2,ratio,Rf
47,18,EtOAc,Hex,2/8,0.1
48,5aa,EtOAc,Hex,1/9,0.2


(10  to  20%  EtOAc/Hexane)  to  give  9-membered  lactone  compound  (18)  as  a  white  solid  after 
recrystallization from n-hexane/DCM (3:1) (0.10 g, 0.33 mmol, 55% yield); Rf = 0.1 (EtOAc/Hex 2:8). M. 
Pt. 135 °C.  1H NMR (400 MHz, CDCl3): δ 7.16 (d, J = 8.4 Hz, 2H), 6.83 (d, J = 8.8 Hz, 2H), 5.61 (dt, J = 
14.0, 6.8 Hz, 1H), 5.41 – 5.36 (m, 2H), 4.21 (dt, J = 11.2, 5.6 Hz, 1H), 4.14 – 4.08 (m, 1H), 3.77 (s, 3H), 



product was isolated by preparative TLC (10% EtOAc/Hex, 3 times run) to give the desired product 2-
hydroxybut-3-en-1-yl benzoate 5aa (32.6 mg, 85% yield) as transparent oil. Rf = 0.2 (EtOAc/Hex 1:9). 
1H NMR (500 MHz, CDCl3) δ 8.09 – 7.96 (m, 2H), 7.58 – 7.48 (m, 1H), 7.41 (dd, J = 14.0, 4.8 Hz, 2H), 
5.92 (ddd, J = 16.5, 10.5, 5.5 Hz, 1H), 5.42 (dt, J = 17.5, 1.5 Hz, 1H), 5.25 (dt, J = 11.0, 1.5 Hz, 1H), 4.55 

No Rf data in this page
Nice
Every IUPAC is correct


In [186]:
print(f'총 개수: {len(data_final)}개')

총 개수: 49개


In [187]:
# delete wrong

del_list=[]
for arr in list(false_page_idx.values()):
    del_list += list(arr)
del_list

del_method = input("Do you want to use list? (y/n): ")
if del_method == 'y':
    user_input = input("삭제 리스트에서 제외할 옳은 인덱스들 입력 (띄어쓰기로 구분): ")
    if user_input == "":
        data_final.drop(del_list,axis=0, inplace=True)
    else:
        exe = [float(x) for x in user_input.split()]
        for i in exe:
            del_list.remove(i)
            data_final.drop(del_list,axis=0, inplace=True)
    print(f'{del_list} is deleted: length = {len(data_final)}')
elif del_method == 'n': 
    user_input = input("틀린 인덱스 입력 (띄어쓰기로 구분): ")
    numbers = [float(x) for x in user_input.split()]
    data_final.drop(numbers, axis=0, inplace=True)
    print(f'{numbers} is in delete list: length = {len(data_final)}')



# data_final.index = range(len(data_final))

[3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 18.0, 19.0, 20.0, 21.0, 37.0, 38.0, 39.0, 40.0] is in delete list: length = 34


In [188]:
data_final

Unnamed: 0,compound name,compound IUPAC name,eluent1,eluent2,ratio,Rf
0,1c-A,Ethyl 2-cyclopentylideneacetate,EtOAc,Hex,0.5/9.5,0.62
1,1c-B,2-Cyclopentylideneethan-1-ol,EtOAc,Hex,2/8,0.32
2,1c-C,2-cyclopentylideneacetaldehyde,EtOAc,Hex,2/8,0.45
10,Ethyl 3-butylhept-2-enoate (1e-A),Ethyl 3-butylhept-2-enoate,EtOAc,Hex,0.5/9.5,0.62
11,"2,3-butylhept-2-en-1-ol (1e-B)","2,3-butylhept-2-en-1-ol",EtOAc,Hex,1/9,0.25
12,3-butylhept-2-enal (1e-C),3-butylhept-2-enal,EtOAc,Hex,2/8,0.72
13,5-allylidenenonane (1e-D),5-allylidenenonane,Hex,,1,0.9
14,"2,2-dibutyl-3-vinyloxirane ((±)-1e)","2,2-dibutyl-3-vinyloxirane",EtOAc,Hex,0.5/9.5,0.42
15,1-phenylprop-2-en-1-one (1f-B),1-phenylprop-2-en-1-one,EtOAc,Hex,1/9,0.6
16,"2,3-Epoxy-1-(phenyl)-1-propanone (1f-C)","2,3-Epoxy-1-(phenyl)-1-propanone",EtOAc,Hex,1/9,0.2


In [189]:
# delete nan

data_final=data_final[np.all(~pd.isna(data_final[['compound IUPAC name','eluent1','ratio','Rf']]), axis=1)]
data_final.index = range(len(data_final))
len(data_final)

33

In [190]:
# 중복제거
data_id = ['_'.join(map(str,data_final.iloc[i,[1,2,3,4,5]]))for i in range(len(data_final))]
data_final.loc[:,'data_ID'] = data_id
data_final = data_final.groupby(by = 'data_ID', as_index = False).agg({'compound name':'first',
                                                                      'compound IUPAC name' : 'first',
                                                                      'eluent1' : 'first',
                                                                      'eluent2' : 'first',
                                                                      'ratio' : 'first',
                                                                      'Rf' : 'first'})
data_final = data_final.reset_index(drop=True)
len(data_final)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_final.loc[:,'data_ID'] = data_id


33

In [191]:
data_final

Unnamed: 0,data_ID,compound name,compound IUPAC name,eluent1,eluent2,ratio,Rf
0,(6S)-6-(((tert-butyldimethylsilyl)oxy)methyl)-...,12,(6S)-6-(((tert-butyldimethylsilyl)oxy)methyl)-...,DCM,Hex,1/1,0.4
1,"(R,E)-6-((tert-butyldimethylsilyl)oxy)-2-pheny...",16,"(R,E)-6-((tert-butyldimethylsilyl)oxy)-2-pheny...",EtOAc,Hex,2/8,0.21
2,(S)-1-((tert-butyldimethylsilyl)oxy)but-3-en-2...,15,(S)-1-((tert-butyldimethylsilyl)oxy)but-3-en-2...,EtOAc,Hex,1/9,0.52
3,(S)-1-((tert-butyldimethylsilyl)oxy)but-3-en-2...,13,(S)-1-((tert-butyldimethylsilyl)oxy)but-3-en-2...,EtOAc,Hex,1/9,0.44
4,(S)-1-(1-hydroxy-4-methylcyclohexyl)allyl 2-(4...,4’jh,(S)-1-(1-hydroxy-4-methylcyclohexyl)allyl 2-(4...,DCM,,1,0.3
5,(S)-1-(1-hydroxycyclohexyl)allyl benzoate_DCM_...,4ad,(S)-1-(1-hydroxycyclohexyl)allyl benzoate,DCM,,1,0.4
6,(S)-1-(1-hydroxycyclopentyl)allyl 2-(4-methoxy...,4’jc,(S)-1-(1-hydroxycyclopentyl)allyl 2-(4-methoxy...,DCM,,1,0.3
7,(S)-1-(4-(tert-butyl)-1-hydroxycyclohexyl)ally...,4’jg,(S)-1-(4-(tert-butyl)-1-hydroxycyclohexyl)ally...,DCM,,1,0.3
8,(S)-1-hydroxy-2-methylbut-3-en-2-yl 2-(4-metho...,4’jb,(S)-1-hydroxy-2-methylbut-3-en-2-yl 2-(4-metho...,DCM,,1,0.35
9,(S)-1-hydroxy-3-phenylbut-3-en-2-yl benzoate_D...,4af,(S)-1-hydroxy-3-phenylbut-3-en-2-yl benzoate,DCM,,1,0.38


# process

In [193]:
#much_faster

import subprocess

def iupac_to_smiles_local_opsin(iupac_name):
    jar_path = 'opsin-cli-2.8.0-jar-with-dependencies.jar'  # 본인 경로
    command = ['java', '-jar', jar_path]
    
    result = subprocess.run(command, input = iupac_name, capture_output=True, text=True)

    if result.returncode == 0:
        smiles = result.stdout.strip()
        if smiles:
            return smiles
        else:
            return None
    else:
        raise RuntimeError(f"OPSIN failed: {result.stderr}")


In [194]:
# compound: IUPAC --> smiles

smiles_list = []
for iupac in tqdm(data_final['compound IUPAC name']):
    if iupac:
        smiles_list.append(iupac_to_smiles_local_opsin(iupac))
    else:
        smiles_list.append(np.nan)
        
data_final2 = data_final.copy(deep=True)
data_final2['compound IUPAC name'] = smiles_list
data_final2.rename(columns={'compound IUPAC name': 'Smiles'}, inplace=True)
data_final2=data_final2[pd.notna(data_final2['Smiles'])]
data_final2.reset_index(drop=True, inplace=True)
data_final2

100%|██████████| 33/33 [00:16<00:00,  1.95it/s]


Unnamed: 0,data_ID,compound name,Smiles,eluent1,eluent2,ratio,Rf
0,(6S)-6-(((tert-butyldimethylsilyl)oxy)methyl)-...,12,[Si](C)(C)(C(C)(C)C)OC[C@@H]1CC=CC(O1)=O,DCM,Hex,1/1,0.4
1,"(R,E)-6-((tert-butyldimethylsilyl)oxy)-2-pheny...",16,[Si](C)(C)(C(C)(C)C)OC/C=C/C[C@@H](C(=O)O)C1=C...,EtOAc,Hex,2/8,0.21
2,(S)-1-((tert-butyldimethylsilyl)oxy)but-3-en-2...,15,C1(=CC=CC=C1)CC(=O)O[C@H](CO[Si](C)(C)C(C)(C)C...,EtOAc,Hex,1/9,0.52
3,(S)-1-((tert-butyldimethylsilyl)oxy)but-3-en-2...,13,C(C=CC1=CC=CC=C1)(=O)O[C@H](CO[Si](C)(C)C(C)(C...,EtOAc,Hex,1/9,0.44
4,(S)-1-(1-hydroxy-4-methylcyclohexyl)allyl 2-(4...,4’jh,COC1=CC=C(C=C1)CC(=O)O[C@@H](C=C)C1(CCC(CC1)C)O,DCM,,1,0.3
5,(S)-1-(1-hydroxycyclohexyl)allyl benzoate_DCM_...,4ad,C(C1=CC=CC=C1)(=O)O[C@@H](C=C)C1(CCCCC1)O,DCM,,1,0.4
6,(S)-1-(1-hydroxycyclopentyl)allyl 2-(4-methoxy...,4’jc,COC1=CC=C(C=C1)CC(=O)O[C@@H](C=C)C1(CCCC1)O,DCM,,1,0.3
7,(S)-1-(4-(tert-butyl)-1-hydroxycyclohexyl)ally...,4’jg,COC1=CC=C(C=C1)CC(=O)O[C@@H](C=C)C1(CCC(CC1)C(...,DCM,,1,0.3
8,(S)-1-hydroxy-3-phenylbut-3-en-2-yl benzoate_D...,4af,C(C1=CC=CC=C1)(=O)O[C@H](CO)C(=C)C1=CC=CC=C1,DCM,,1,0.38
9,(S)-2-(2-(4-methoxyphenyl)acetoxy)but-3-en-1-y...,17,C(CC=C)(=O)OC[C@H](C=C)OC(CC1=CC=C(C=C1)OC)=O,EtOAc,Hex,1/9,0.625


# save

In [195]:
import os

folder_path = "data_extracted_gpt"
file_count = len([
    f for f in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, f))
])

richprint(f"📁 '{folder_path}' 폴더에 있는 파일 개수: {file_count}")

# gpt -data 개수 확인
data_num=[]
for i in np.arange(1,file_count+1,1):
    num = len(pd.read_csv(f'./data_extracted_gpt/datatable{i}.csv'))
    data_num.append(num)
richprint(f'논문 별 평균 데이터 수: {np.mean(data_num)}')
richprint(f'gpt가 모은 데이터 수: {np.sum(data_num)}')

In [196]:
if os.path.isfile(f'data_extracted_gpt/datatable{file_count}.csv'):
    last_file = pd.read_csv(f'data_extracted_gpt/datatable{file_count}.csv')
    last_title = last_file['title'][0]
    last_doi = last_file['doi'][0]
else:
    last_doi = 'none'

In [197]:
print(last_title)
print(last_doi)

Palladium-Catalyzed and Photoinduced Benzylic C–H Carbonylation/Annulation under Mild Conditions
https://pubs.acs.org/doi/10.1021/acs.orglett.2c02877


In [198]:
print(title)
print(doi)

Iridium-Catalyzed Enantioselective Ring Opening of Alkenyl Oxiranes by Unactivated Carboxylic Acids
https://pubs.acs.org/doi/10.1021/acs.orglett.2c02919


In [199]:
if doi ==last_doi:
    print('Check doi')
else:
    data_final3= data_final2[~pd.isna(data_final2['Smiles'])].copy()
    data_final3['title'] = [title for i in range(len(data_final3))]
    data_final3['doi'] = [doi for i in range(len(data_final3))]
    data_final3['publish'] = [publish for i in range(len(data_final3))]
    data_final3.index = range(len(data_final3))
    data_final3
data_final3

Unnamed: 0,data_ID,compound name,Smiles,eluent1,eluent2,ratio,Rf,title,doi,publish
0,(6S)-6-(((tert-butyldimethylsilyl)oxy)methyl)-...,12,[Si](C)(C)(C(C)(C)C)OC[C@@H]1CC=CC(O1)=O,DCM,Hex,1/1,0.4,Iridium-Catalyzed Enantioselective Ring Openin...,https://pubs.acs.org/doi/10.1021/acs.orglett.2...,2022.24.40
1,"(R,E)-6-((tert-butyldimethylsilyl)oxy)-2-pheny...",16,[Si](C)(C)(C(C)(C)C)OC/C=C/C[C@@H](C(=O)O)C1=C...,EtOAc,Hex,2/8,0.21,Iridium-Catalyzed Enantioselective Ring Openin...,https://pubs.acs.org/doi/10.1021/acs.orglett.2...,2022.24.40
2,(S)-1-((tert-butyldimethylsilyl)oxy)but-3-en-2...,15,C1(=CC=CC=C1)CC(=O)O[C@H](CO[Si](C)(C)C(C)(C)C...,EtOAc,Hex,1/9,0.52,Iridium-Catalyzed Enantioselective Ring Openin...,https://pubs.acs.org/doi/10.1021/acs.orglett.2...,2022.24.40
3,(S)-1-((tert-butyldimethylsilyl)oxy)but-3-en-2...,13,C(C=CC1=CC=CC=C1)(=O)O[C@H](CO[Si](C)(C)C(C)(C...,EtOAc,Hex,1/9,0.44,Iridium-Catalyzed Enantioselective Ring Openin...,https://pubs.acs.org/doi/10.1021/acs.orglett.2...,2022.24.40
4,(S)-1-(1-hydroxy-4-methylcyclohexyl)allyl 2-(4...,4’jh,COC1=CC=C(C=C1)CC(=O)O[C@@H](C=C)C1(CCC(CC1)C)O,DCM,,1,0.3,Iridium-Catalyzed Enantioselective Ring Openin...,https://pubs.acs.org/doi/10.1021/acs.orglett.2...,2022.24.40
5,(S)-1-(1-hydroxycyclohexyl)allyl benzoate_DCM_...,4ad,C(C1=CC=CC=C1)(=O)O[C@@H](C=C)C1(CCCCC1)O,DCM,,1,0.4,Iridium-Catalyzed Enantioselective Ring Openin...,https://pubs.acs.org/doi/10.1021/acs.orglett.2...,2022.24.40
6,(S)-1-(1-hydroxycyclopentyl)allyl 2-(4-methoxy...,4’jc,COC1=CC=C(C=C1)CC(=O)O[C@@H](C=C)C1(CCCC1)O,DCM,,1,0.3,Iridium-Catalyzed Enantioselective Ring Openin...,https://pubs.acs.org/doi/10.1021/acs.orglett.2...,2022.24.40
7,(S)-1-(4-(tert-butyl)-1-hydroxycyclohexyl)ally...,4’jg,COC1=CC=C(C=C1)CC(=O)O[C@@H](C=C)C1(CCC(CC1)C(...,DCM,,1,0.3,Iridium-Catalyzed Enantioselective Ring Openin...,https://pubs.acs.org/doi/10.1021/acs.orglett.2...,2022.24.40
8,(S)-1-hydroxy-3-phenylbut-3-en-2-yl benzoate_D...,4af,C(C1=CC=CC=C1)(=O)O[C@H](CO)C(=C)C1=CC=CC=C1,DCM,,1,0.38,Iridium-Catalyzed Enantioselective Ring Openin...,https://pubs.acs.org/doi/10.1021/acs.orglett.2...,2022.24.40
9,(S)-2-(2-(4-methoxyphenyl)acetoxy)but-3-en-1-y...,17,C(CC=C)(=O)OC[C@H](C=C)OC(CC1=CC=C(C=C1)OC)=O,EtOAc,Hex,1/9,0.625,Iridium-Catalyzed Enantioselective Ring Openin...,https://pubs.acs.org/doi/10.1021/acs.orglett.2...,2022.24.40


In [200]:
# 최종 저장
data_final3.to_csv(f'data_extracted_gpt/datatable{file_count+1}.csv', index=False)