In [1]:
import os
import re
import json
import time

import tiktoken
import pandas as pd
import numpy as np
from tqdm import tqdm
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
# from langchain.chat_models import ChatOpenAI
# from langchain.memory import ConversationBufferMemory
# from langchain.chains import ConversationalRetrievalChain
# from langchain.llms import HuggingFaceHub
from transformers import AutoTokenizer
from dotenv import load_dotenv

from textutils import extract_ref_from_text, get_number2drawing_dict, convert_refs_to_drawing_num

pd.set_option("display.max_columns", 999)

# Project root that the relative paths used by later cells (data_preprocess/...) resolve against.
# Override it by setting PATENT_SEARCH_ROOT in the process environment before the kernel starts
# (the .env file cannot supply it, because load_dotenv() only runs after the chdir below).
# When the variable is unset, the original checkout location is used.
PROJECT_ROOT = os.environ.get(
    "PATENT_SEARCH_ROOT", '/Users/hayley/Documents/p4ds/patent_search'
)
os.chdir(PROJECT_ROOT)

# Kept as the last expression so the cell still displays load_dotenv()'s return value.
load_dotenv()


True

In [None]:
# Re-save the results produced by running main.py
# sample_data = pd.read_csv('data_preprocess/data.csv')
# sample_data.to_excel("data_preprocess/data.xlsx")

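# Some PDFs were not processed by the code because of corner cases, so I downloaded the Excel file and filled those in by hand -> as a result some \n characters were left in, so they are removed here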
# sample_data = pd.read_excel('data_preprocess/data.xlsx')
# for col in sample_data.columns:
#     sample_data[col] = sample_data[col].str.replace('\n', '')
# sample_data.to_excel("data_preprocess/data.xlsx")

In [None]:
# ## negative sample data 추가
# sample_data = pd.read_excel("data_preprocess/sample_data_old.xlsx", index_col=0)
# negative_samples = pd.read_csv('pdf_process/data.csv')
# negative_samples['id'] = negative_samples['id'].astype(str)
# negative_samples = negative_samples.loc[negative_samples['id']!= '1020160014413 (1)']
    
# negative_samples = negative_samples.loc[~negative_samples['id'].isin(sample_data['id'])]
# sample_data = pd.concat([sample_data, negative_samples], axis=0)

# # add labels column
# sample_data['labels'] = ""
# sample_data.loc[sample_data['id']=='1020180014052', 'labels'] = 'source'
# sample_data.loc[sample_data['id'].isin(['1020050097605','1020177009557', '1020120156759']), 'labels'] = 'target'
# sample_data['labels'] = sample_data['labels'].fillna("negative")
# sample_data.to_excel("data_preprocess/sample_data.xlsx")

In [2]:
# Load the sample patents, using the workbook's first column as the row index.
# This matches how the same file is read in the preprocessing cell further down;
# the workbook was presumably saved with its index (see the commented-out to_excel
# call above), which would otherwise come back as an extra unnamed column.
sample_data = pd.read_excel("data_preprocess/sample_data.xlsx", index_col=0)

# Preprocessing

**logs**

23.11.15

- 1) Problem with extracting reference codes (numbers) from the text: some reference codes are not purely numeric. Some are a number followed by a letter (e.g. 202a, 202b), and some are only uppercase letters (e.g. T, SR~SZ).
- 2) So I first tried to extract the reference codes from the "부호에 대한 설명" column, because I could then search for those specific codes inside the texts. But extracting the codes from "부호에 대한 설명" was difficult: currently all \<code\>: \<description\> string pairs are concatenated without \n, and parsing the reference codes by relying on regular expressions alone was almost impossible. (Some reference codes are uppercase letters, but allowing uppercase letters as codes caused problems.) So I asked Mooho whether he could keep the \n characters in the "부호에 대한 설명" section. He said yes, and I paused further development of the regex rules.  

23.11.16
- I proceeded, accepting some noise in the reference extraction.
- I also added some negative samples (3 random samples that are not prior art for the source patent).

In [None]:
# Reload the sample patents, using the workbook's first column as the row index.
sample_data = pd.read_excel('data_preprocess/sample_data.xlsx', index_col=0)

## image to numbers, numbers to image
# The path is relative to the project root, like every other data path in this notebook;
# the setup cell's os.chdir() makes that root the working directory.
# The result is later indexed as num2drawing_dicts[<application number>]['num2drawing'].
num2drawing_dicts = get_number2drawing_dict('data_preprocess/mock_data.json')

In [None]:
# c.f. hupd에서 쓴 컬럼들 
#     "abstract": "...", # 요약 -> 우리나라 특허에는 abstract가 따로 있는거 같진 않고 요약 1개 섹션임
#     "claims": "...", # 청구범위
#     "background": "...", # 기술분야 + 배경기술
#     "summary": "...", # 요약? 
#     "full_description": "..." # 해결하려는과제 + 과제의해결수단 + 발명의효과 + 발명을실시하기위한구체적인내용???

In [None]:
sample_data.columns 
# Columns targeted for text embedding:
# '요약', '청구범위', '기술분야', '배경기술', '해결하려는과제', '과제의해결수단',
#        '발명의효과', '도면의간단한설명', '발명을실시하기위한구체적인내용', '부호의설명'

# Open question: is it better to embed each section separately, or to merge some sections first? -> Moving ahead quickly for now!

In [None]:
# Section columns of sample_data that the chunking loop iterates over.
text_columns = [
    '요약',                        # summary / abstract
    '청구범위',                     # claims
    '기술분야',                     # technical field
    '배경기술',                     # background art
    '해결하려는과제',                # problem to be solved
    '과제의해결수단',                # means for solving the problem
    '발명의효과',                   # effects of the invention
    '도면의간단한설명',              # brief description of the drawings
    '발명을실시하기위한구체적인내용',   # detailed description for carrying out the invention
    '부호의설명',                   # description of reference numerals
]

In [None]:
# # c.f. 컬럼별 길이 분포 
# all_data = pd.read_excel('/Users/hayley/Documents/p4ds/patent_search/pdf_process/data_large.xlsx', index_col=0)
# for col in all_data.columns:
#     try:
#         if "extracted_numbers" not in col:
#             lens = all_data[col].apply(lambda x: len(str(x)))
#             print(col, lens.mean().round(2))
#     except:
#         continue
# # id 13.04
# # 요약 301.69
# # 대표도 395.03
# # 청구범위 3565.65
# # 기술분야 150.39
# # 배경기술 1713.51
# # 해결하려는과제 210.42
# # 과제의해결수단 1653.4
# # 발명의효과 209.81
# # 도면의간단한설명 1206.95
# # 발명을실시하기위한구체적인내용 17958.91
# # 부호의설명 188.81

In [None]:
# Build one record per patent: its application number plus the text chunks produced
# from its section columns.
# NOTE(review): `text_splitter` is not created in any cell visible here; it has to be
# defined before this cell runs, otherwise a fresh-kernel run stops with a NameError.
patent_chunk_dicts = []

for i, row in sample_data.iterrows():
    application_number = str(row.id)
    patent_dict = dict(
        # patent identifiers
        application_number = application_number,  # application (filing) number
        publication_number = '',  # publication number (left empty here)
        patent_number = '',  # registration number (left empty here)
        chunks = [],
        # chunks_wo_drawing_and_numbers_desc = []
    )
    print(f"patent {row.id} 's chunk sizes")

    # Walk the text columns and chunk each section.
    chunks = []
    drawing_nums_list = []
    refs_list = []
    # chunks_wo_drawing_desc = []  # would exclude the drawing / reference-numeral descriptions; could that cause false positives?
    for col in text_columns:
        section_value = row[col]
        # Skip the drawing / reference-numeral description sections and empty cells.
        # pd.isna also covers None and pd.NA, which the former str(...) == 'nan' test
        # would have let through as the literal text "None" / "<NA>".
        if col in ('도면의간단한설명', '부호의설명') or pd.isna(section_value):
            continue
        curr_section_chunks = text_splitter.create_documents(
            [str(section_value)], [{"application_number": application_number}]
        )
        chunks.extend(curr_section_chunks)

        # chunks_wo_drawing_desc.extend(curr_section_chunks)
        print(col, len(curr_section_chunks), end=' | ')

        # For every chunk, find the reference codes that occur in it.
        for chnk in curr_section_chunks:
            # NOTE(review): chnk is the object returned by create_documents, not a plain
            # string -- confirm extract_ref_from_text accepts it (vs. its text content).
            refs = extract_ref_from_text(chnk)
            refs_list.append(refs)

            # When references were found, map them to drawing numbers as well.
            # The lookup stays inside this branch so a patent without an entry in
            # num2drawing_dicts only fails when it actually has references.
            if len(refs) > 0:
                drawing_nums = convert_refs_to_drawing_num(
                    refs, num2drawing_dicts[application_number]['num2drawing']
                )
            else:
                drawing_nums = []
            drawing_nums_list.append(drawing_nums)

    # refs_list / drawing_nums_list are collected per patent but currently not stored:
    # patent_dict['chunks'] = list(zip(chunks, zip(refs_list, drawing_nums_list)))
    patent_dict['chunks'] = chunks
    # patent_dict['chunks_wo_drawing_and_numbers_desc'] = chunks_wo_drawing_desc
    print()
    patent_chunk_dicts.append(patent_dict)
    

In [None]:
def _chunk_to_jsonable(obj):
    """Fallback encoder for json.dump: turn a chunk object into a plain dict.

    The chunks stored in patent_chunk_dicts come from text_splitter.create_documents;
    assuming these are langchain Document objects (page_content / metadata attributes),
    the json module cannot encode them on its own -- TODO confirm against the splitter in use.

    Raises:
        TypeError: for any other object json cannot encode (same as json's default behaviour).
    """
    if hasattr(obj, "page_content"):
        return {"page_content": obj.page_content, "metadata": getattr(obj, "metadata", {})}
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# ensure_ascii=False writes the Korean text as-is, so the file encoding is pinned to UTF-8;
# the with-block guarantees the file is flushed and closed even if encoding fails.
with open('data_preprocess/sample_chunk_data.json', 'w', encoding='utf-8') as chunk_file:
    json.dump(patent_chunk_dicts, chunk_file, ensure_ascii=False, default=_chunk_to_jsonable)

In [None]:
# for col in select_columns:
#     sample_data[f"{col}_ref"] = sample_data[col].apply(extract_ref_from_text)

# sample_data['도면의간단한설명_dict'] = sample_data['도면의간단한설명'].apply(extract_description_for_image)

# sample_data.loc[:,'부호의설명_dict'] = sample_data['부호의설명'].apply(extract_description_for_code)

# for col in select_columns:
#     sample_data[f"{col}_ref_drawing"] = sample_data.apply(convert_ref_to_drawing_num, args=(col,image_df), axis=1)

# OCR

- Tesseract : 잘 못함
- 다른 오픈소스 : pyocr -> tesseract랑 똑같음 / calamari-ocr -> tensorflow 설치해야 되고 등등 maintain이 잘 안되는 패키지 인듯
- GCP vision api : 일단 ui 에서 테스트 해보고 copy json output 버튼 눌러가지고 json output 복사해와서 json output에서 텍스트만 파싱하는 것만 짬

In [None]:
import shutil

import pytesseract
from PIL import Image

## Tesseract must be installed separately; pytesseract needs the path to its executable.
## Prefer whatever `tesseract` is on PATH, and fall back to the original Homebrew location
## (a version-specific Cellar path, which stops existing after a Homebrew upgrade).
pytesseract.pytesseract.tesseract_cmd = (
    shutil.which('tesseract')
    or r'/opt/homebrew/Cellar/tesseract/5.3.3/bin/tesseract'
)

In [None]:
# Run Tesseract OCR (English model) on each PNG under data_preprocess/image/<subdir>/
# and print the file path followed by the recognised text.
from glob import glob

# glob() returns paths in arbitrary order; sort so the printed output is stable between runs.
image_paths = sorted(glob('data_preprocess/image/*/*.png'))

for img in image_paths:
    print(img)
    # Open through a context manager so each image file is closed before the next one.
    with Image.open(img) as page_image:
        print(pytesseract.image_to_string(page_image, lang='eng'))

In [None]:
# Print every word on the first page of the OCR response, one per line.
# NOTE(review): json_output is not defined in the cells visible here; presumably it is the
# parsed JSON copied from the Vision API UI mentioned in the notes above -- confirm.
first_page_blocks = json_output['fullTextAnnotation']['pages'][0]['blocks']
for block in first_page_blocks:
    for paragraph in block['paragraphs']:
        for word in paragraph['words']:
            # A word is stored as a list of symbols; glue their texts back together.
            word_text = ''.join(symbol['text'] for symbol in word['symbols'])
            # Drop a single leading '-' if present.
            print(word_text[1:] if word_text.startswith('-') else word_text)

In [None]:
# The first textAnnotations entry's description is split into its individual lines.
first_annotation = json_output['textAnnotations'][0]
annotations = first_annotation['description'].split('\n')
print(annotations)

In [None]:
# Same computation as the cell directly above: split the first annotation's
# description on newlines and show the resulting list.
description_text = json_output['textAnnotations'][0]['description']
annotations = description_text.split('\n')
print(annotations)