In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
import json
import os
from shapely import wkt

from his_geo import extractor
from his_geo import geocoder

from geopy.distance import geodesic

from ckip_transformers.nlp import CkipNerChunker
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

from tqdm import tqdm
tqdm.pandas()

# Define Functions

In [2]:
def closeness(x, n):
    """Score how close a distance is to zero with a raised-cosine falloff.

    Args:
        x: Distance to score (same unit as ``n``).
        n: Cut-off distance.

    Returns:
        ``(cos(pi * x / n) + 1) / 2`` when ``x <= n`` -- 1 at ``x == 0`` and
        0 at ``x == n`` -- otherwise the integer ``0``.
    """
    # The test is kept as ``x <= n`` (not ``x > n``) so that values that
    # compare false either way, such as NaN, still score 0.
    return (np.cos(x * (np.pi / n)) + 1) / 2 if x <= n else 0
    
def shapely_point_to_geopy(point_shapely):
    """Return the point's coordinates as a ``(y, x)`` tuple.

    Args:
        point_shapely: Any object exposing ``x`` and ``y`` attributes
            (a shapely point, judging by the name).

    Returns:
        ``(point_shapely.y, point_shapely.x)`` -- the order is swapped
        relative to ``(x, y)``; presumably latitude-first for geopy.
    """
    y_coord, x_coord = point_shapely.y, point_shapely.x
    return (y_coord, x_coord)


def calculate_geo_closeness(point1, point2, maximum_error_distance):
    """Closeness score between two points based on their geodesic distance.

    Args:
        point1: First point (object with ``x``/``y`` attributes).
        point2: Second point (object with ``x``/``y`` attributes).
        maximum_error_distance: Cut-off passed to ``closeness``; compared
            against the distance in kilometres.

    Returns:
        The value of ``closeness(distance_km, maximum_error_distance)``.
    """
    # Convert both points to (y, x) tuples before handing them to geopy.
    origin = shapely_point_to_geopy(point1)
    target = shapely_point_to_geopy(point2)
    distance_km = geodesic(origin, target).kilometers
    # The maximum error distance acts as the cut-off of the closeness curve.
    return closeness(distance_km, maximum_error_distance)


def _normalize_toponym(toponym):
    """Drop the characters "县" and "国" from a toponym; non-strings become ""."""
    if not isinstance(toponym, str):
        # e.g. NaN produced when an empty toponym is round-tripped through CSV
        return ""
    return toponym.replace("县", "").replace("国", "")


def calculate_geo_closeness_for_all_text(gdf_result, gdf_benchmark):
    """Score every extracted toponym against the benchmark of its text.

    For each ``id`` present in ``gdf_benchmark``, every result row with that
    ``id`` is compared with the benchmark rows of the same ``id``. Two
    toponyms match when, after removing "县"/"国", one contains the other.

    Args:
        gdf_result: Frame with ``id``, ``toponym`` and ``geometry`` columns.
            It is modified in place: ``closeness`` and ``target_toponym``
            columns are (re)created.
        gdf_benchmark: Frame with ``id``, ``toponym``, ``geometry`` and
            ``Maximum Error Distance`` columns.

    Returns:
        ``gdf_result`` with
        * ``target_toponym`` -- normalized benchmark toponym of the last
          match examined (``None`` when nothing matched);
        * ``closeness`` -- score of the first match for which both geometries
          are present and non-empty, otherwise 0.
    """
    gdf_result['closeness'] = 0.0
    gdf_result['target_toponym'] = None

    # Iterate through each text (section)
    for section_id in gdf_benchmark['id'].unique():
        gdf_section_benchmark = gdf_benchmark[gdf_benchmark['id'] == section_id]
        gdf_section_result = gdf_result[gdf_result['id'] == section_id]

        # Iterate through each extracted toponym
        for index_result, row_result in gdf_section_result.iterrows():
            result_toponym = _normalize_toponym(row_result['toponym'])
            result_geometry = row_result['geometry']
            closeness_value = 0

            # An empty name would be a substring of every benchmark toponym,
            # so it must never be allowed to match.
            if result_toponym:
                for _, row_benchmark in gdf_section_benchmark.iterrows():
                    benchmark_toponym = _normalize_toponym(row_benchmark['toponym'])
                    if not benchmark_toponym:
                        continue
                    if not (benchmark_toponym in result_toponym or result_toponym in benchmark_toponym):
                        continue

                    gdf_result.loc[index_result, 'target_toponym'] = benchmark_toponym

                    if result_geometry is None or result_geometry.is_empty:
                        continue
                    benchmark_geometry = row_benchmark['geometry']
                    if benchmark_geometry is None or benchmark_geometry.is_empty:
                        # Change in the future
                        continue

                    closeness_value = calculate_geo_closeness(
                        result_geometry,
                        benchmark_geometry,
                        row_benchmark['Maximum Error Distance'],
                    )
                    # The first scorable match wins.
                    break

            # Add the closeness score to the result dataframe
            gdf_result.loc[index_result, 'closeness'] = closeness_value

    return gdf_result


def calculate_precision(gdf_result):
    """Macro-averaged precision: mean over ``id`` groups of the mean ``closeness``.

    Args:
        gdf_result: Frame with ``id`` and ``closeness`` columns.

    Returns:
        The mean of the per-``id`` mean closeness values.
    """
    return gdf_result.groupby('id')['closeness'].mean().mean()


def calculate_scores(gdf_result, gdf_benchmark):
    """Compute macro-averaged precision, recall and F1 over the result sections.

    A section is the set of rows sharing one ``id``. Only ids that occur in
    ``gdf_result`` are scored.
    NOTE(review): benchmark ids with no result rows at all are therefore not
    part of the averages -- confirm this is the intended metric definition.

    Args:
        gdf_result: Frame with ``id``, ``closeness`` and ``target_toponym``
            columns.
        gdf_benchmark: Frame with ``id`` and ``toponym`` columns.

    Returns:
        ``(precision, recall, f1)``, each averaged over the result sections.
        ``(0, 0, 0)`` when ``gdf_result`` has no rows.
    """
    section_ids = gdf_result['id'].unique()
    if len(section_ids) == 0:
        # Nothing to score; avoids dividing by zero below.
        return 0, 0, 0

    total_precision = 0
    total_recall = 0
    total_f1 = 0

    for section_id in section_ids:
        gdf_section_benchmark = gdf_benchmark[gdf_benchmark['id'] == section_id]
        gdf_section_result = gdf_result[gdf_result['id'] == section_id]

        # calculate precision for each section (text)
        section_precision = gdf_section_result['closeness'].mean()
        total_precision += section_precision

        # Best closeness achieved for every matched benchmark toponym;
        # rows without a target are left out by groupby.
        best_by_target = (
            gdf_section_result.groupby('target_toponym')['closeness'].max().to_dict()
        )

        # calculate recall for each section (text)
        section_recall = 0
        for toponym in gdf_section_benchmark['toponym']:
            if not isinstance(toponym, str):
                # Missing benchmark name: cannot be matched, contributes 0.
                continue
            benchmark_toponym = toponym.replace("县", "").replace("国", "")
            section_recall += best_by_target.get(benchmark_toponym, 0)
        if len(gdf_section_benchmark) > 0:
            section_recall = section_recall / len(gdf_section_benchmark)
        else:
            # Result id unknown to the benchmark: nothing could be recalled.
            section_recall = 0
        total_recall += section_recall

        # calculate f1 score for each section (text)
        if section_precision == 0 and section_recall == 0:
            section_f1 = 0
        else:
            section_f1 = 2 * section_precision * section_recall / (section_precision + section_recall)
        total_f1 += section_f1

    section_count = len(section_ids)
    return (
        total_precision / section_count,
        total_recall / section_count,
        total_f1 / section_count,
    )

In [3]:
def json_to_dataframe(json_file):
    """Flatten a JSON file of ``{key: multi-line string}`` into a DataFrame.

    Every non-empty line of every value becomes one row. A line is split on
    ``,``: the first field is the toponym and the second the location (both
    stripped). A line without a comma is kept whole as the toponym with an
    empty location.

    Args:
        json_file: Path of the JSON file (read as ``utf-8-sig``).

    Returns:
        DataFrame with columns ``id`` (the JSON key), ``toponym`` and
        ``location``.
    """
    with open(json_file, 'r', encoding="utf-8-sig") as f:
        data = json.load(f)

    # Collect plain records and build the frame once instead of growing it
    # row by row.
    records = []
    for key, value in data.items():
        for item in value.split('\n'):
            if item == '':
                continue
            fields = item.split(',')
            if len(fields) > 1:
                records.append({'id': key, 'toponym': fields[0].strip(), 'location': fields[1].strip()})
            else:
                # No comma on this line: keep the raw text, location unknown.
                records.append({'id': key, 'toponym': item, 'location': ''})
    return pd.DataFrame(records, columns=['id', 'toponym', 'location'])


def match_ids(df_result, df_original):
    """Replace the ``id`` of each result row by the ``id`` of the matching original row.

    The values in ``df_result['id']`` are cast to ``int64`` and looked up in
    the index of ``df_original``; the ``id`` column found there becomes the
    new ``id`` (left join, so unmatched rows get a missing id).

    Unlike an in-place ``set_index``, this leaves the caller's ``df_result``
    untouched, so the call can be repeated on the same frame.

    Args:
        df_result: Frame with ``id``, ``toponym`` and ``location`` columns.
        df_original: Frame with an ``id`` column.

    Returns:
        New frame with columns ``id``, ``toponym``, ``location``.
    """
    df_ids = df_original[['id']].copy()
    df_ids.index = df_ids.index.astype('int64')

    # set_index returns a new frame; the input keeps its 'id' column.
    df_keyed = df_result.set_index('id')
    df_keyed.index = df_keyed.index.astype('int64')
    # Clear the index name so it does not clash with the merged-in 'id' column.
    df_keyed.index.name = None

    df_matched = df_keyed.merge(df_ids, left_index=True, right_index=True, how='left')
    return df_matched[['id', 'toponym', 'location']]

In [4]:
def extract_gpe_entities(ner_tokens):
    """Collect GPE entities, gluing together tokens that touch each other.

    Args:
        ner_tokens: Iterable of tokens exposing ``ner`` (label), ``word``
            (text) and ``idx`` (``(start, end)`` offsets).

    Returns:
        List of entity strings ordered by start offset. Consecutive GPE
        tokens whose start equals the previous token's end are concatenated.
    """
    ordered = sorted(
        (tok for tok in ner_tokens if tok.ner == 'GPE'),
        key=lambda tok: tok.idx[0],
    )
    if not ordered:
        return []

    head, *rest = ordered
    # Each span is [text so far, end offset of its last token].
    spans = [[head.word, head.idx[1]]]
    for tok in rest:
        begin, finish = tok.idx
        if begin == spans[-1][1]:
            # Touches the previous token: extend the open span.
            spans[-1][0] += tok.word
            spans[-1][1] = finish
        else:
            # Gap before this token: open a new span.
            spans.append([tok.word, finish])
    return [text for text, _ in spans]


# One CkipNerChunker per model name, created on first use. Building the
# driver inside every call would re-create it for each text.
_CKIP_NER_DRIVERS = {}


def ner_by_ckip(text, model):
    """Extract the distinct GPE entities of ``text`` with a CKIP NER model.

    Args:
        text: Text to tag.
        model: Model name passed to ``CkipNerChunker(model=...)``.

    Returns:
        List of unique merged GPE entity strings, in order of first
        appearance.
    """
    ner_driver = _CKIP_NER_DRIVERS.get(model)
    if ner_driver is None:
        ner_driver = _CKIP_NER_DRIVERS[model] = CkipNerChunker(model=model)

    result = ner_driver([text])[0]

    # dict.fromkeys removes duplicates while keeping a reproducible order
    # (a set would make the order vary between runs).
    addresses = list(dict.fromkeys(extract_gpe_entities(result)))

    return addresses


def ner_by_cluener(text, model, tokenizer):
    """Extract the distinct "address" entities of ``text`` with a token classifier.

    Args:
        text: Text to tag.
        model: Token-classification model handed to ``pipeline('ner', ...)``.
        tokenizer: Tokenizer handed to the same pipeline.

    Returns:
        List of unique address strings (order not defined, built from a set).
    """
    tagger = pipeline('ner', model=model, tokenizer=tokenizer)
    tag_type = "address"
    begin_label = f"B-{tag_type}"
    inside_label = f"I-{tag_type}"

    buffer = ""      # text of the entity currently being assembled
    collected = []
    for piece in tagger(text):
        label = piece['entity']
        label_type = label.split("-")[1]
        if label == begin_label or label == inside_label:
            if buffer == "":
                # Start a new entity
                buffer = piece['word']
            elif label == inside_label:
                # Concatenate the entity
                buffer += piece['word']
            else:
                # B-tag type follows another B-tag type
                collected.append(buffer)
                buffer = piece['word']
        elif buffer != "" and tag_type != label_type:
            # Open entity ended by a token of another type
            collected.append(buffer)
            buffer = ""

    # The last captured entity
    if buffer != "":
        collected.append(buffer)
    name_list = list(set(collected))
    return name_list

# Extraction

## Prompting

### Settings

In [3]:
# Prompt handed to the extractor. It asks the model to act as geographer and
# historian, to return each historical toponym together with the location the
# author of the text accepts, to skip toponyms without a location, and to
# answer in Chinese with one `"Toponym", "Location"` pair per line.
# The string is runtime input for the model: do not reformat it casually.
prompt = """
I would like you to take on the roles of both a Geographer and a Historian. 
You possess extensive knowledge in Chinese geography and history, with a particular expertise in historical toponymy. 
Your task is to extract precise location references of historical toponyms from texts.
When I provide a scholarly text analyzing the location of one or several historical toponyms, please identify and extract both the toponyms and their corresponding location references from the text. 
Keep the following in mind:
1. If the text presents differing opinions of the same historical toponym's location from various scholars, only extract the most correct location reference that the author of the text acknowledges or agrees with. Do not include location references that the author disputes.
2. If a toponym is mentioned in the text but no location is provided, please skip this toponym.
3. Present the extracted information always in Chinese and strictly adhere to the following format:
   "Toponym 1", "Location 1"
   "Toponym 2", "Location 2"
   Please do not include any explanation, verb or extraneous information.

The text is as follows:

         """

In [4]:
# Set API key
# Read the key from the environment so a real secret never has to be written
# into (and committed with) the notebook. The original placeholder remains
# the fallback, so behaviour is unchanged when the variable is not set.
# NOTE(review): the variable name OPENAI_API_KEY is a suggestion -- adjust it
# to whatever your environment uses.
api_key = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")

# Set models
models = ["chatgpt"]
# Model versions to run the extraction with, one pass per entry.
chatgpt_model_versions = [
                          "gpt-3.5-turbo-0125",
                          "gpt-4-turbo-2024-04-09",
                          "gpt-4o-2024-08-06",
                          ]

In [4]:
# Load the raw evaluation texts; `texts` is the plain list of the 'text' column
# that is later passed to the extractor.
df = pd.read_csv('../data/evaluation/raw_text.csv')
texts = df['text'].to_list()

### Create extractor and run

In [7]:
# Run the extraction once per configured ChatGPT model version.
# `results` only keeps the return value of the last version.
for model in chatgpt_model_versions:
    llm_extractor = extractor.Extractor(
        prompt,
        output_dir="../data/evaluation/",
        model="chatgpt",
        model_version=model,
        api_key=api_key,
    )
    results = llm_extractor.extract_texts(texts)

Extracting text 0 to ./evaluation/extracted_results_chatgpt_gpt-4-turbo-2024-04-09.json
Extracting text 1 to ./evaluation/extracted_results_chatgpt_gpt-4-turbo-2024-04-09.json
Extracting text 2 to ./evaluation/extracted_results_chatgpt_gpt-4-turbo-2024-04-09.json
Extracting text 3 to ./evaluation/extracted_results_chatgpt_gpt-4-turbo-2024-04-09.json
Extracting text 4 to ./evaluation/extracted_results_chatgpt_gpt-4-turbo-2024-04-09.json
Extracting text 5 to ./evaluation/extracted_results_chatgpt_gpt-4-turbo-2024-04-09.json
Extracting text 6 to ./evaluation/extracted_results_chatgpt_gpt-4-turbo-2024-04-09.json
Extracting text 7 to ./evaluation/extracted_results_chatgpt_gpt-4-turbo-2024-04-09.json
Extracting text 8 to ./evaluation/extracted_results_chatgpt_gpt-4-turbo-2024-04-09.json
Extracting text 9 to ./evaluation/extracted_results_chatgpt_gpt-4-turbo-2024-04-09.json
Extracting text 10 to ./evaluation/extracted_results_chatgpt_gpt-4-turbo-2024-04-09.json
Extracting text 11 to ./evaluat

### Post-processing

In [5]:
# Turn every raw JSON extraction result into a cleaned CSV next to it
df_original = pd.read_csv('../data/evaluation/raw_text.csv')

json_dir = '../data/evaluation/'
# sorted() makes the processing order independent of the file system
json_files = sorted(i for i in os.listdir(json_dir) if i.endswith('.json'))

# Phrases with which the model states that a location is unknown or not given
# in the text. Rows whose location contains one of them are dropped, as are
# locations containing Latin letters.
unresolved_markers = ['不详', '未明确', '未提供', '不明确', '不提供', '未提及', '未详', '无法确定']
unresolved_pattern = '|'.join(unresolved_markers + ['[A-Za-z]'])

for json_file in json_files:
    df_result = json_to_dataframe(json_dir + json_file)
    df_result = match_ids(df_result, df_original)
    # Rebind instead of mutating in place: the frame is derived from another one
    df_result = df_result.dropna(subset=['location'])
    # One regex pass replaces the nine separate filters
    df_result = df_result[~df_result['location'].str.contains(unresolved_pattern)]
    df_result.to_csv(json_dir + json_file[:-5] + '.csv', index=False, encoding='utf-8-sig')

## NER

### Bert / Albert

In [6]:
# Run the CKIP NER models over every raw text and store one CSV per model.
df_original = pd.read_csv('../data/evaluation/raw_text.csv')
models = ["bert-base", 'albert-base']

for model in models:
    # List of extracted entities per text
    df_original['location'] = df_original['text'].progress_apply(
        lambda text: ner_by_ckip(text, model)
    )

    # One row per entity; reset_index exposes the source row label as 'index'
    df_extracted = df_original['location'].explode().reset_index()
    # Attach the text id through that row label and keep only id + location
    merged_df = df_extracted.merge(df_original[['id']], left_on='index', right_index=True)
    merged_df = merged_df[['id', 'location']]
    merged_df.to_csv(
        '../data/evaluation/' + 'extracted_results_' + model + '.csv',
        index=False,
        encoding='utf-8-sig',
    )

Tokenization: 100%|██████████| 1/1 [00:00<00:00, 1001.51it/s]
Inference: 100%|██████████| 1/1 [00:02<00:00,  2.36s/it]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 1000.31it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  1.89it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 998.64it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  2.31it/s]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 499.86it/s]
Inference: 100%|██████████| 1/1 [00:02<00:00,  2.03s/it]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 1001.03it/s]
Inference: 100%|██████████| 1/1 [00:01<00:00,  1.98s/it]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 499.74it/s]
Inference: 100%|██████████| 1/1 [00:01<00:00,  1.85s/it]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 997.46it/s]
Inference: 100%|██████████| 1/1 [00:01<00:00,  1.97s/it]
Tokenization: 100%|██████████| 1/1 [00:00<00:00, 998.88it/s]
Inference: 100%|██████████| 1/1 [00:00<00:00,  1.20it/s]
Tokenization: 100%|██████████| 1/1 [00:00<?, ?it/s]
I

### Roberta

In [7]:
# Run the RoBERTa CLUENER model over every raw text and store the result as CSV.
model = AutoModelForTokenClassification.from_pretrained('uer/roberta-base-finetuned-cluener2020-chinese')

tokenizer = AutoTokenizer.from_pretrained('uer/roberta-base-finetuned-cluener2020-chinese', model_max_length=512)

df_original = pd.read_csv('../data/evaluation/raw_text.csv')
# List of extracted address entities per text
df_original['location'] = df_original['text'].progress_apply(
    lambda text: ner_by_cluener(text, model, tokenizer)
)

# One row per entity; reset_index exposes the source row label as 'index'
df_extracted = df_original['location'].explode().reset_index()
# Attach the text id through that row label and keep only id + location
merged_df = df_extracted.merge(df_original[['id']], left_on='index', right_index=True)
merged_df = merged_df[['id', 'location']]
merged_df.to_csv(
    '../data/evaluation/' + 'extracted_results_roberta-cluener.csv',
    index=False,
    encoding='utf-8-sig',
)

100%|██████████| 259/259 [02:18<00:00,  1.87it/s]


# Geocoding

In [None]:
files_dir = '../data/evaluation/'
# Only geocode the extraction outputs themselves. Files written by this cell
# (*_geocoded.csv) and by the evaluation (*_evaluated.csv) also contain
# "extracted_results" in their name; without this exclusion a re-run would
# geocode them again and create *_geocoded_geocoded.csv files.
derived_suffixes = ('_geocoded.csv', '_evaluated.csv')
files = sorted(
    i for i in os.listdir(files_dir)
    if "extracted_results" in i and i.endswith('.csv') and not i.endswith(derived_suffixes)
)
for file in files:
    print(file)
    df = pd.read_csv(files_dir + file, encoding='utf-8-sig')
    addresses = df['location'].tolist()
    geocoder_evaluate = geocoder.Geocoder(addresses,
                                          lang="ch",
                                          preferences=['modern', 'historic'],
                                          geographic_crs="EPSG:4326",
                                          if_certainty=True)
    geocoder_evaluate.detect_direction()
    geocoder_evaluate.match_address()
    geocoder_evaluate.calculate_point()
    # The geocoder's own 'id' column (present after reset_index) is dropped so
    # it cannot clash with the 'id' of the extraction results.
    df_geocoded = geocoder_evaluate.data.reset_index().copy().drop(columns=['id'])
    df = df.reset_index().copy()
    # Both frames now carry a fresh positional index; join them row by row.
    # NOTE(review): assumes the geocoder keeps the order of `addresses` -- confirm.
    df = df.merge(df_geocoded, left_index=True, right_index=True, how='left')
    df.to_csv(files_dir + file[:-4] + '_geocoded.csv', index=False, encoding='utf-8-sig')

# Evaluation

In [9]:
# Score every geocoded result file against the benchmark and store the
# per-row closeness next to it as *_evaluated.csv.
file_dir = '../data/evaluation/'
files = [
    name for name in os.listdir(file_dir)
    if 'extracted_results' in name and name.endswith('geocoded.csv')
]

# Benchmark points are built from the X / Y columns.
df_benchmark = pd.read_csv('../data/evaluation/benchmark.csv', encoding='utf-8-sig')
gdf_benchmark = gpd.GeoDataFrame(
    df_benchmark,
    geometry=gpd.points_from_xy(df_benchmark.X, df_benchmark.Y),
)

for file in files:
    print(file)
    # Rows without a geometry cannot be scored; the rest is parsed from WKT.
    df_result = pd.read_csv(file_dir + file, encoding='utf-8-sig').dropna(subset=['geometry'])
    df_result['geometry'] = df_result['geometry'].astype(str).apply(wkt.loads)

    gdf_result = gpd.GeoDataFrame(df_result, geometry='geometry')
    if "toponym" not in gdf_result.columns:
        # Files without a toponym column get the benchmark toponyms of their id.
        gdf_result = gdf_result.merge(gdf_benchmark[['toponym', 'id']], on="id", how="left")
    gdf_result = calculate_geo_closeness_for_all_text(gdf_result, gdf_benchmark)
    gdf_result.to_csv(file_dir + file[:-4] + '_evaluated.csv', encoding='utf-8-sig')

extracted_results_albert-base_geocoded.csv
extracted_results_bert-base_geocoded.csv
extracted_results_chatgpt_gpt-3.5-turbo-0125_geocoded.csv
extracted_results_chatgpt_gpt-4-turbo-2024-04-09_geocoded.csv
extracted_results_chatgpt_gpt-4o-2024-08-06_geocoded.csv
extracted_results_roberta-cluener_geocoded.csv


In [10]:
# Print precision / recall / F1 for every evaluated result file.
# Relies on `gdf_benchmark` built in the previous evaluation cell.
file_dir = '../data/evaluation/'
files = [
    name for name in os.listdir(file_dir)
    if 'extracted_results' in name and name.endswith('evaluated.csv')
]
for file in files:
    df = pd.read_csv(file_dir + file, encoding='utf-8-sig')

    precision, recall, f1 = calculate_scores(df, gdf_benchmark)

    print(file)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', f1)

extracted_results_albert-base_geocoded_evaluated.csv
Precision: 0.3992463007425532
Recall: 0.7319299157136846
F1: 0.48620597072648497
extracted_results_bert-base_geocoded_evaluated.csv
Precision: 0.40934062335900817
Recall: 0.7447511615696952
F1: 0.4938614115857279
extracted_results_chatgpt_gpt-3.5-turbo-0125_geocoded_evaluated.csv
Precision: 0.6836116620288818
Recall: 0.7846726882937356
F1: 0.708676986462774
extracted_results_chatgpt_gpt-4-turbo-2024-04-09_geocoded_evaluated.csv
Precision: 0.7334322675403984
Recall: 0.8106123398741139
F1: 0.7559727840874089
extracted_results_chatgpt_gpt-4o-2024-08-06_geocoded_evaluated.csv
Precision: 0.8294277564378774
Recall: 0.8478268665902011
F1: 0.8305851796084152
extracted_results_roberta-cluener_geocoded_evaluated.csv
Precision: 0.5478507527556477
Recall: 0.9143821019914481
F1: 0.6441279041830332
