## 为多对多关系中的实体先分配ID

实体主要来自专利记录的以下字段（冒号后为用于去重的字段组合）：
- `inventor_harmonized`：name + country_code
- `assignee_harmonized`：name + country_code
- `examiner`：name + department + level

In [None]:
# Directory whose entries are the raw input files (each is read line by line as JSON).
RAW_DATA_PATH = r'/data/dataset/google_patents'

# Single place to retarget every generated CSV file.
OUTPUT_DIR = r'/data/users/liangzhentao/Projects/google-patent-parser/data/output'

# One output CSV per entity type.
INVENTOR_OUTPUT_PATH = f'{OUTPUT_DIR}/inventor_harmonized.csv'
ASSIGNEE_OUTPUT_PATH = f'{OUTPUT_DIR}/assignee_harmonized.csv'
EXAMINER_OUTPUT_PATH = f'{OUTPUT_DIR}/examiner.csv'

# CSV column names: the generated ID first, then the fields that identify an entity.
INVENTOR_HEADERS = ['inventor_id', 'name', 'country_code']
ASSIGNEE_HEADERS = ['assignee_id', 'name', 'country_code']
EXAMINER_HEADERS = ['examiner_id', 'name', 'department', 'level']

In [2]:
import orjson
import csv
from tqdm import tqdm
from glob import glob

准备输入文件

In [3]:
# Collect the direct children of RAW_DATA_PATH in a stable (sorted) order.
# No `recursive=True`: it only affects patterns containing '**', which this one does not.
raw_jsonl_file_list = sorted(glob(RAW_DATA_PATH + '/*'))
print(f'共有 {len(raw_jsonl_file_list)} 个文件需要处理')

# Fail with a descriptive message instead of an IndexError on [0] when nothing matched.
if not raw_jsonl_file_list:
    raise FileNotFoundError(f'No input files found under {RAW_DATA_PATH}')

print(f'首个文件名称 {raw_jsonl_file_list[0]}')
print(f'最后文件名称 {raw_jsonl_file_list[-1]}')

共有 13110 个文件需要处理
首个文件名称 /data/dataset/google_patents/patents-000000000000
最后文件名称 /data/dataset/google_patents/patents-000000013109


### 开始处理

In [4]:
# Collect the distinct inventors, assignees and examiners found in the input files.
# Each entity is stored as one string: its identifying fields joined with '@@'
# (the save step splits on the same separator).
count = 0
# Number of files to process in a trial run; set to None to process every file.
test_threshold = 10

inventor_harmonized_set = set()
assignee_harmonized_set = set()
examiner_set = set()

for file in tqdm(raw_jsonl_file_list, desc='Processing files'):
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a stray trailing newline); they are not valid JSON documents.
            if not line.strip():
                continue
            patent = orjson.loads(line)

            # `or []` covers both a missing key and a key that is present but null.
            # inventor_harmonized: name + country_code
            inventor_harmonized_set.update(
                f"{inv['name']}@@{inv['country_code']}"
                for inv in patent.get('inventor_harmonized') or []
            )

            # assignee_harmonized: name + country_code
            assignee_harmonized_set.update(
                f"{ass['name']}@@{ass['country_code']}"
                for ass in patent.get('assignee_harmonized') or []
            )

            # examiner: name + department + level
            examiner_set.update(
                f"{exam['name']}@@{exam['department']}@@{exam['level']}"
                for exam in patent.get('examiner') or []
            )

    # Stop early once the trial-run file limit is reached.
    count += 1
    if test_threshold is not None and count >= test_threshold:
        break


Processing files:   0%|          | 9/13110 [00:03<1:23:18,  2.62it/s]


### 将结果排序后赋予ID并保存成CSV格式

In [7]:
def save_set_to_csv(data_set, output_filename, header):
    """Sort the entity keys, number them from 1, and write them to a CSV file.

    Args:
        data_set: Iterable of strings whose fields are joined with '@@'.
        output_filename: Path of the CSV file to write (opened in 'w' mode).
        header: Column names; the first is the ID column, the rest name the fields.

    IDs follow the sorted order of the keys. Every value is quoted
    (csv.QUOTE_ALL). A key is split into at most ``len(header) - 1`` fields,
    working from the right, so a '@@' that occurs inside the first field stays
    in that field instead of producing extra columns.
    """
    print(f"Processing and saving data to {output_filename}...")
    sorted_data = sorted(data_set)

    # One ID column plus N field columns means at most N - 1 real separators.
    max_splits = max(len(header) - 2, 0)

    with open(output_filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        writer.writerow(header)

        for entity_id, combined_string in enumerate(sorted_data, start=1):
            # Split from the right: the trailing fields are taken first, the remainder is the first field.
            parts = combined_string.rsplit('@@', max_splits)
            writer.writerow([entity_id] + parts)

    print(f"Successfully saved {len(sorted_data)} records to {output_filename}.")

In [8]:
# Write one CSV per entity type: (collected keys, destination path, column names).
for _entities, _path, _columns in (
    (inventor_harmonized_set, INVENTOR_OUTPUT_PATH, INVENTOR_HEADERS),
    (assignee_harmonized_set, ASSIGNEE_OUTPUT_PATH, ASSIGNEE_HEADERS),
    (examiner_set, EXAMINER_OUTPUT_PATH, EXAMINER_HEADERS),
):
    save_set_to_csv(_entities, _path, _columns)

Processing and saving data to /data/users/liangzhentao/Projects/google-patent-parser/data/output/inventor_harmonized_ids.csv...
Successfully saved 55296 records to /data/users/liangzhentao/Projects/google-patent-parser/data/output/inventor_harmonized_ids.csv.
Processing and saving data to /data/users/liangzhentao/Projects/google-patent-parser/data/output/assignee_harmonized_ids.csv...
Successfully saved 18705 records to /data/users/liangzhentao/Projects/google-patent-parser/data/output/assignee_harmonized_ids.csv.
Processing and saving data to /data/users/liangzhentao/Projects/google-patent-parser/data/output/examiner_ids.csv...
Successfully saved 5596 records to /data/users/liangzhentao/Projects/google-patent-parser/data/output/examiner_ids.csv.
