In [None]:
# Install the third-party packages this notebook imports:
# the OpenAI client (openai) plus PyMuPDF and pdf2image for the PDF handling.
!pip install openai
!pip install PyMuPDF pdf2image

In [1]:
#set up libraries
import json
import os
import pymupdf
from PIL import Image
from pdf2image import convert_from_path
import base64
import pandas as pd
import numpy as np
import re
import shutil


from openai import OpenAI

#import output structure for auto-recognition
from output_structure import tools 

In [3]:
# Pre-process the PDFs: merge two-page cards into one side-by-side page
# and render every card to a JPEG image.

def _save_pdf_and_image(pdf_doc, file_name, output_dir):
    """Save ``pdf_doc`` as ``<file_name>.pdf`` in ``output_dir`` and render its first page to ``<file_name>.jpg``."""
    output_path = os.path.join(output_dir, f"{file_name}.pdf")
    image_path = os.path.join(output_dir, f"{file_name}.jpg")
    pdf_doc.save(output_path)

    # Only the first rendered page is kept as the card image.
    images = convert_from_path(output_path)
    images[0].save(image_path)


def combine_two_page(input_path, output_dir=None):
    """Write a single-page PDF and a JPEG for the card stored at ``input_path``.

    A PDF with exactly two pages is merged into one page that shows both pages
    side by side (each slot uses the size of the first page). Any other PDF is
    copied unchanged and only its first page is rendered to the JPEG.

    Args:
        input_path: Path of the source PDF.
        output_dir: Directory that receives ``<name>.pdf`` and ``<name>.jpg``.
            When omitted, the notebook-level ``output_dir`` variable is used,
            which is how this function located its target before the
            parameter existed.

    Raises:
        ValueError: If no output directory is given and none is defined globally.
    """
    if output_dir is None:
        # Backward compatibility: earlier callers relied on a global variable.
        output_dir = globals().get("output_dir")
        if output_dir is None:
            raise ValueError("output_dir must be passed or defined at notebook level")

    file_name = os.path.splitext(os.path.basename(input_path))[0]

    # The context managers make sure both documents are closed again.
    with pymupdf.open(input_path) as input_pdf:
        if len(input_pdf) != 2:
            print(f"Transfering {input_path}, not two-page")
            _save_pdf_and_image(input_pdf, file_name, output_dir)
            return

        page_rect = input_pdf[0].rect
        width, height = page_rect.width, page_rect.height

        with pymupdf.open() as output_pdf:
            # One page, twice as wide, holding page 1 on the left and page 2 on the right.
            new_page = output_pdf.new_page(width=2 * width, height=height)
            new_page.show_pdf_page(pymupdf.Rect(0, 0, width, height), input_pdf, 0)
            new_page.show_pdf_page(pymupdf.Rect(width, 0, 2 * width, height), input_pdf, 1)
            _save_pdf_and_image(output_pdf, file_name, output_dir)

    print(f"{file_name} combined and saved")


def process_files_in_folder(folder_path, output_dir):
    """Run ``combine_two_page`` on every ``.pdf`` file in ``folder_path``.

    Args:
        folder_path: Directory that contains the source PDFs.
        output_dir: Directory for the converted files; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    for file in os.listdir(folder_path):
        if file.endswith(".pdf"):
            input_path = os.path.join(folder_path, file)
            # Pass the directory explicitly instead of relying on a global.
            combine_two_page(input_path, output_dir)


In [4]:
# Source PDFs are read from `sample`; `cards_in_image` is a temporary directory
# that receives the converted cards.
folder_path, output_dir = 'sample', 'cards_in_image'

process_files_in_folder(folder_path=folder_path, output_dir=output_dir)

300063 combined and saved
160049 combined and saved
300060 combined and saved
372024 combined and saved
330015 combined and saved
160088 combined and saved
348019 combined and saved
120127 combined and saved
348009 combined and saved
160073 combined and saved
348021 combined and saved
315092 combined and saved
372035 combined and saved
330004 combined and saved
300058 combined and saved
120132 combined and saved
348036 combined and saved
120246 combined and saved
315053 combined and saved
160065 combined and saved
348078 combined and saved
226010 combined and saved
315021 combined and saved
315035 combined and saved
120022 combined and saved
Transfering sample/361004.pdf, not two-page
190086 combined and saved
315020 combined and saved
120037 combined and saved
348051 combined and saved
160015 combined and saved
Transfering sample/359016.pdf, not two-page
190090 combined and saved
Transfering sample/372046.pdf, not two-page
Transfering sample/361013.pdf, not two-page
Transfering sample

In [5]:
# Turn an image file into a base64 string that can be used as model input
def encode_image(image_path):
    """Return the bytes of the file at ``image_path`` as a base64 string."""
    with open(image_path, 'rb') as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [6]:
# Store the base64 text of every JPEG in a .txt file next to the image
for entry in os.listdir(output_dir):
    if not entry.endswith(".jpg"):
        continue
    jpg_path = os.path.join(output_dir, entry)
    stem = os.path.splitext(os.path.basename(jpg_path))[0]
    txt_path = os.path.join(output_dir, f"{stem}.txt")
    encoded = encode_image(jpg_path)
    with open(txt_path, "w") as txt_handle:
        txt_handle.write(encoded)

In [7]:
#load the api
# The key is taken from the OPENAI_API_KEY environment variable so the real
# secret never has to be written into the notebook; the placeholder is only
# used when that variable is not set.
from openai import OpenAI
api_key = os.environ.get('OPENAI_API_KEY', 'YOUR_API_KEY') #YOUR_API_KEY
client = OpenAI(api_key = api_key)

In [8]:
#create batch requests for recognition
#the structure of the expected output is described by `tools` (imported from output_structure)
request_list = []

def create_request(image_path, model="gpt-4o", tools=tools,
                   prompt='extract all data from the patent card', max_tokens=1000):
    """Build one batch request for a cached card image and register it.

    Args:
        image_path: Path of the text file holding the base64 text of the card
            image. Its base name without extension becomes the ``custom_id``.
        model: Model name placed in the request body.
        tools: Tool definitions placed in the request body.
        prompt: Text instruction sent together with the image.
        max_tokens: Value of ``max_tokens`` in the request body.

    Returns:
        The request dictionary. It is also appended to the module-level
        ``request_list``, which is what the JSONL writer iterates over.
    """
    custom_id = os.path.splitext(os.path.basename(image_path))[0]
    with open(image_path, 'r') as image:
        # strip() keeps a stray trailing newline out of the data URL
        base64_image = image.read().strip()
    data = {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            "max_tokens": max_tokens,
            "tools": tools,
            # Force the model to answer through the extraction function
            "tool_choice": {"type": "function", "function": {"name": "extract_patent_data"}}
        }
    }
    request_list.append(data)
    return data

In [9]:
# Build one request per cached base64 file, then dump all requests as JSON Lines
image_dir = output_dir

txt_names = (name for name in os.listdir(image_dir) if name.endswith(".txt"))
for name in txt_names:
    image_path = os.path.join(image_dir, name)
    print(image_path)
    create_request(image_path)

# The converted cards are no longer needed once the requests exist
shutil.rmtree(output_dir)

jsonl_name = 'request_list.jsonl'
with open(jsonl_name, 'w', encoding='utf-8') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(request, ensure_ascii=False) + '\n' for request in request_list
    )

cards_in_image/160015.txt
cards_in_image/359016.txt
cards_in_image/372046.txt
cards_in_image/190090.txt
cards_in_image/349006.txt
cards_in_image/361013.txt
cards_in_image/359003.txt
cards_in_image/330089.txt
cards_in_image/226010.txt
cards_in_image/348078.txt
cards_in_image/315021.txt
cards_in_image/120022.txt
cards_in_image/315035.txt
cards_in_image/190086.txt
cards_in_image/120037.txt
cards_in_image/315020.txt
cards_in_image/361004.txt
cards_in_image/348051.txt
cards_in_image/300011.txt
cards_in_image/359004.txt
cards_in_image/359038.txt
cards_in_image/315030.txt
cards_in_image/349028.txt
cards_in_image/190097.txt
cards_in_image/315019.txt
cards_in_image/349014.txt
cards_in_image/226000.txt
cards_in_image/120187.txt
cards_in_image/361003.txt
cards_in_image/120024.txt
cards_in_image/361016.txt
cards_in_image/300060.txt
cards_in_image/372024.txt
cards_in_image/330015.txt
cards_in_image/348019.txt
cards_in_image/160088.txt
cards_in_image/300063.txt
cards_in_image/160049.txt
cards_in_ima

In [10]:
#send request
# Upload inside a `with` block so the JSONL file handle is closed after the upload
with open(jsonl_name, 'rb') as jsonl_handle:
    batch_input_file = client.files.create(
        file = jsonl_handle,
        purpose='batch'
    )
batch_input_file_id = batch_input_file.id
batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
    "description": "patent card extraction batch test on GPT-4o"
    }
)

In [12]:
#check batch status; each batch will be processed within 24 hours
# `batch` is only the snapshot returned when the job was created, so fetch the
# current state before reporting it.
batch = client.batches.retrieve(batch.id)
print(f"The task is {batch.status} now.")

The task is validating now.


In [12]:
#when the task is completed, get the result
#convert to dataframe
batch_state = client.batches.retrieve(batch.id)
output_file_id = batch_state.output_file_id
if output_file_id is None:
    # Without this guard the download below would be attempted with no file id
    raise RuntimeError(
        f"Batch {batch.id} has no output file yet (status: {batch_state.status})."
    )
file_response = client.files.content(output_file_id)

# Parse the JSON Lines output directly. Unlike pd.read_json this performs no
# type inference, so custom_id stays the exact document name (a string).
batch_records = [
    json.loads(line) for line in file_response.text.split('\n') if line.strip()
]

extracted_rows = []
extracted_ids = []
failed_ids = []
for record in batch_records:
    try:
        message = record['response']['body']['choices'][0]['message']
        arguments = json.loads(message['tool_calls'][0]['function']['arguments'])
    except (KeyError, IndexError, TypeError, json.JSONDecodeError):
        # Failed request or unusable (e.g. cut-off) tool call: report it
        # instead of letting one card abort the whole conversion.
        failed_ids.append(record.get('custom_id'))
        continue
    extracted_rows.append(arguments)
    extracted_ids.append(record['custom_id'])

if failed_ids:
    print(f"No usable tool call for {len(failed_ids)} request(s): {failed_ids}")

rec_result = pd.json_normalize(extracted_rows)
rec_result['patentnummer'] = extracted_ids #replace the patent number with document name
rec_result.head()



Unnamed: 0,patentnummer,klass,IPC,patenthavare_antal,patenthavare1,patenthavare1_adress,patenthavare1_stad,patenthavare1_land,patenthavare_typ,patenthavare_typ_av_skrivet,...,patenthavare3_land,uppfinnare4,uppfinnare5,tilläggspatentnummer,patentmeddelatden,patenttid_fr_2,patenttid_till_2,tidigare_patenthavare_adress,licensinnehavare,licensdatum
0,160015,,,1,"N.V. Philips’ Gloeilampfabrieken, Eindhoven",Eindhoven,Eindhoven,Nederländerna,company,typed,...,,,,,,,,,,
1,359016,A 01 g 23/08,A 01 g 23/08,1,Brundell och Jonsson AB,"Gävle, SW",Gävle,SW,company,typed,...,,,,,,,,,,
2,372046,E 01 c 7/32,E 01 c 7/32,1,E.I. du de Nemours and Co.,"Wilmington, Del.",Wilmington,US,company,typed,...,,,,,,,,,,
3,190090,56a,B66C1/62,1,Fabrikör Karl Erik Nilssen,Lidingö,Lidingö,Sverige,individual,typed,...,,,,,,,,,,
4,349006,B65g1/02,B65 g 1/02,1,Palmer-Shile Company,"Detroit, Mich.",Detroit,USA,company,typed,...,,,,,,,,,,


In [13]:
#basic post processing
def output_clean(df):
    """Return a cleaned copy of the recognition result.

    Steps:
      * fill ``patenthavare1_stad`` from the text after the first comma of
        ``patenthavare1`` when the city is empty or missing,
      * cut ``patenthavare1`` at the first comma to drop the location part,
      * convert ``utgångsår`` and ``sista_aviserat_år`` to numbers
        (unparseable values become NaN),
      * add ``potential_wrong_expiration_date``: 1 when both years are present,
        they are more than one year apart and ``utgångsskäl`` equals
        ``'Lack of payment of fees'``, otherwise 0.

    Args:
        df: DataFrame with the columns named above. It is not modified.

    Returns:
        A new DataFrame with the cleaned columns and the flag column.
    """
    df = df.copy()  # keep the caller's frame untouched so re-runs stay safe

    def _city_from_holder(holder):
        # Text between the first and second comma, or None if there is none
        if isinstance(holder, str) and ',' in holder:
            return holder.split(',')[1].strip()
        return None

    def _name_without_location(holder):
        # Missing / non-text values are passed through unchanged
        return holder.split(',')[0] if isinstance(holder, str) else holder

    #try to fill in the city manually if it is not recognized
    city = df['patenthavare1_stad'].astype(object)
    fallback_city = df['patenthavare1'].map(_city_from_holder)
    needs_fill = (city.isna() | city.eq('')) & fallback_city.notna()
    df['patenthavare1_stad'] = city.mask(needs_fill, fallback_city)

    #separate the location information
    df['patenthavare1'] = df['patenthavare1'].map(_name_without_location)

    #to flag the potential wrong expiration date
    df['utgångsår'] = pd.to_numeric(df['utgångsår'], errors='coerce')
    df['sista_aviserat_år'] = pd.to_numeric(df['sista_aviserat_år'], errors='coerce')

    # np.trunc mirrors the int() conversion of the row-wise version; a NaN year
    # makes the comparison False, so rows with a missing year are never flagged.
    year_gap = np.trunc(df['utgångsår']) - np.trunc(df['sista_aviserat_år'])
    unpaid = df['utgångsskäl'] == 'Lack of payment of fees'
    df['potential_wrong_expiration_date'] = ((year_gap > 1) & unpaid).astype(int)
    return df

In [14]:
# Apply the post-processing, then write the flagged table to a CSV file
rec_result = output_clean(rec_result)
rec_result.to_csv(path_or_buf="rec_result_with_flag.csv", index=False)