In [363]:
import pandas as pd
import numpy as np
import re
import uuid

For Lee:
* `bpid`: unique id for each person
* `lee_uuid`: unique id for each instance of taking a test
* `repeated_testtaker`: True if the person has taken the test multiple times in the SAME YEAR, False otherwise


For WS:
* `ws_uuid`: unique id for each instance of taking a test. Directly tied to `id1`
* `repeated_testtaker`: True if the person has taken the test multiple times in the SAME YEAR, False otherwise
* Currently no unique id for each person


Goals:
* Need to detect and merge the same person from both datafiles
* Each instance of taking a test has a unique ID in each datafile

Edge Cases:
* One person takes exam multiple times in the same year. More instances in Lee than in WS --> Copy each person in WS to each instance in Lee
* One person takes exam multiple times in the same year. More instances in WS than in Lee --> Add extra rows to merged file, copying each instance in Lee to match each instance in WS

## Lee Munkwa Version: `data_lee`


In [604]:
bangmok = pd.read_csv('../../data/preprocessing/bangmok/bangmok.csv')
career = pd.read_csv('../../data/preprocessing/bangmok/bangmok_career.csv')
person = pd.read_csv('../../data/preprocessing/bangmok/bangmok_person.csv')

In [365]:
len(bangmok), len(career), len(person)

(15151, 15151, 14638)

In [366]:
# merge bangmok and career
bangmok_career = pd.merge(bangmok,career, on='bid', how='outer')

In [367]:
eq = bangmok_career['bpid_x'] == bangmok_career['bpid_y']
[i for i, x in enumerate(eq) if not x]

# This means we can merge these two columns
bangmok_career = bangmok_career.rename(columns={"bpid_x":"bpid"}).drop(columns=["bpid_y"])

In [368]:
# merge bankmok, career, person
bangmok_career_person = pd.merge(bangmok_career,person,on='bpid',how='left')

In [369]:
data_lee = bangmok_career_person
len(data_lee), len(data_lee['bid'].unique())

(15151, 15151)

Create UUID for each unique BID. This UUID is based off of a hash from the BID, so it will be the same for each BID.

In [370]:
data_lee['uuid'] = [uuid.uuid5(uuid.NAMESPACE_DNS,x) for x in data_lee['bid']]
data_lee = data_lee.set_index('uuid')
data_lee['lee_uuid'] = data_lee.index

Data exploration: Those who have repeated taking the test multiple times in one year

In [371]:
# These are the people who took the same example multiple times in the SAME YEAR 
# i.e. they are the duplicates of the korean name/chinese name/pass year. 
# data_lee[data_lee[['korname', 'chnname', 'pass_year']].duplicated(keep=False)][['korname', 'chnname', 'pass_year']].sort_values(by=['pass_year', 'korname'])
                                                                                                                                    

In [372]:
# These are the people who took the test multiple times in any given year
# data_lee[data_lee[['korname', 'chnname']].duplicated(keep=False)][['korname', 'chnname', 'pass_year']].sort_values(by=['korname', 'pass_year'])


In [373]:
# Marking those who took the test more than once in the same year
data_lee['repeated_testtaker_lee'] = data_lee[['korname', 'chnname','pass_year']].duplicated(keep=False)

In [374]:
len(data_lee)

15151

Drop the columns we don't need for the final table comparison

In [375]:
data_lee = data_lee[['lee_uuid', 'pass_year', 'korname', 'chnname', 'repeated_testtaker_lee']].copy()

In [376]:
data_lee.head(3).T

uuid,4b854af7-3d4b-51c9-802e-117ff4825291,8cee2162-e5a0-5ebf-b4ab-65cc828c8f87,8a1e8a21-4d10-5ae3-b92d-51acfbb31ba6
lee_uuid,4b854af7-3d4b-51c9-802e-117ff4825291,8cee2162-e5a0-5ebf-b4ab-65cc828c8f87,8a1e8a21-4d10-5ae3-b92d-51acfbb31ba6
pass_year,1753,1757,1784
korname,이수득,유언술,신기현
chnname,李秀得,兪彦述,申驥顯
repeated_testtaker_lee,False,False,False


## WS Munkwa Version: `data_ws`

In [605]:
data_ws = pd.read_csv('../../data/raw/WS_Munkwa.csv')

# Ensure there are no duplicate IDs
len(data_ws), len(data_ws['id1'].unique()), len(data_ws['source'].unique())

(14607, 14607, 10591)

Create UUID for each unique BID. This UUID is based off converting the id to a string because that is the only unique value we have in this df. Since this isn't the most secure, we could change this to be random. But I don't think it matters much

In [378]:
data_ws['uuid'] = [uuid.uuid5(uuid.NAMESPACE_DNS,str(x)) for x in data_ws['id1']]
data_ws = data_ws.set_index('uuid');
data_ws['ws_uuid'] = data_ws.index

In [379]:
# data_ws.head().T

In [380]:
# Verify there are no duplicates
data_ws[data_ws.duplicated()]

Unnamed: 0_level_0,id1,namehg1,namehj1,year,prevdegreehg,prevdegreehj,source,courtesynamehg,courtesynamehj,ancestralseathg,...,ancestralseathj,choronymhj,addresshg,addresshj,pennamehg,pennamehj,posthumoustitlehg,posthumoustitlehj,error,ws_uuid
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [381]:
# These are the people who took the same example multiple times in the SAME YEAR 
# i.e. they are the duplicates of the korean name/chinese name/pass year. 
data_ws[data_ws[['namehj1', 'namehg1', 'year']].duplicated(keep=False)][['namehg1', 'namehj1', 'year']].sort_values(by=['year', 'namehg1'])


Unnamed: 0_level_0,namehg1,namehj1,year
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a28023af-8791-5c39-9f49-2856c57cf410,이목,李,1612.0
1da79682-6fb5-5029-bca5-9969dd572b9b,이목,李,1612.0
4f879759-7396-5b4f-8d6c-2a7c57ba1754,김익진,金益振,1651.0
a29d8bb7-b3e1-5a90-b9db-29c779ae4513,김익진,金益振,1651.0
4e6fe550-8d80-5c69-89a4-5071bf539533,이육,李堉,1740.0
cea144cd-a435-5bdd-a69c-7f7cc1a37de7,이육,李堉,1740.0
4bd1bb73-339b-58bb-921a-ae2e33f6bfb4,홍종협,洪鍾協,1880.0
c0b57392-53c2-54a4-b0e4-e58e0f5e9e14,홍종협,洪鍾協,1880.0
26044751-d6bd-534a-ae6b-965fc8389ed4,이윤재,李允在,1887.0
5af472e3-2855-5288-851d-1b8698aae187,이윤재,李允在,1887.0


In [382]:
# Marking those who took the test more than once as such in the SAME YEAR
data_ws['repeated_testtaker_ws'] = data_ws[['namehj1', 'namehg1', 'year']].duplicated(keep=False)

Drop the columns so we only have the ones we need

In [383]:
data_ws = data_ws[['ws_uuid', 'year', 'namehg1', 'namehj1', 'repeated_testtaker_ws']]

In [384]:
data_ws.head().T

uuid,b04965e6-a9bb-591f-8f8a-1adcb2c8dc39,4b166dbe-d99d-5091-abdd-95b83330ed3a,98123fde-012f-5ff3-8b50-881449dac91a,6ed955c6-506a-5343-9be4-2c0afae02eef,c8691da2-158a-5ed6-8537-0e6f140801f2
ws_uuid,b04965e6-a9bb-591f-8f8a-1adcb2c8dc39,4b166dbe-d99d-5091-abdd-95b83330ed3a,98123fde-012f-5ff3-8b50-881449dac91a,6ed955c6-506a-5343-9be4-2c0afae02eef,c8691da2-158a-5ed6-8537-0e6f140801f2
year,1393,1393,1393,1393,1393
namehg1,송개신,김효원,이담,탁함,윤정
namehj1,宋介臣,金孝源,李擔,卓,尹定
repeated_testtaker_ws,False,False,False,False,False


## Clean up encodings

In [385]:
len(data_lee), len(data_ws)

(15151, 14607)

A special Hanja character case: Looks the same but they're NOT

In [386]:
lee_character = '金'
ws_character = '金'

data_ws = data_ws.replace(to_replace=r'金', value=ws_character, regex=True)

These are dueum (두음) - Korean syllables that can be pronounced in different ways. Using this dictionary, I am keeping these consistent across versions

In [387]:
dueum = {
    '라': '나', '락': '낙', '란': '난', '랄': '날', '람': '남', '랍': '납', '랑': '낭', '래': '내', '랭': '냉', '냑': '약', '략': '약', '냥': '양', '량': '양', '녀': '여', '려': '여', '녁': '역', '력': '역', '년': '연', '련': '연', '녈': '열', '렬': '열', '념': '염', '렴': '염', '렵': '엽', '녕': '영', '령': '영', '녜': '예', '례': '예', '로': '노', '록': '녹', '론': '논', '롱': '농', '뢰': '뇌', '뇨': '요', '료': '요', '룡': '용', '루': '누', '뉴': '유', '류': '유', '뉵': '육', '륙': '육', '륜': '윤', '률': '율', '륭': '융', '륵': '늑', '름': '늠', '릉': '능', '니': '이', '리': '이', '린': '인', '림': '임', '립': '입'
}

In [388]:
# Replace duem in both versions and replace the csv in preprocessing file
for k, v in dueum.items():
    data_ws = data_ws.replace(to_replace=k, value=v, regex=True)
    data_lee = data_lee.replace(to_replace=k, value=v, regex=True)

In [389]:
# Little endian thing /ufeff
data_ws['namehj1'] = data_ws['namehj1'].str.replace('\ufeff','')
data_lee['chnname'] = data_lee['chnname'].str.replace('\ufeff','')

## `merge1` - Merge the Overlapping Areas of the Two Versions

We want to create a master table, where we merge the WS into the Lee version

We want to:
* Make a list of different name spellings, if they exist
* Indicate if a person is in Lee but not in WS
* Indicate if a person is in WS but not Lee

In [390]:
len(data_lee), len(data_lee['lee_uuid'].unique())

(15151, 15151)

In [391]:
# First, inner merge on the Korean name, Chinese name, and exam year
master = pd.merge(data_lee, data_ws,  indicator=True, how='inner', left_on=['korname','chnname','pass_year'], right_on = ['namehg1','namehj1','year'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master = master.drop(columns=['namehg1', 'namehj1', 'year'])
master = master.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

len(master), len(master['lee_uuid'].unique()), len(master['ws_uuid'].unique())

(13208, 13201, 13195)

Remove duplicates only if `repeated_testtaker_lee` and `repeated_testtaker_ws` are BOTH `True`. 

(For other cases, it means that we WANT to keep the repeats)

In the case when both are True, currently we are getting double the amount of repeats. For example, if person A in Lee takes test twice in the same year, and person A in WS takes the test twice in the same year, then we end up with 4 entries, when we want 2. We need to be careful about how we drop the duplicates.

In [392]:
# Duplicates of the lee_uuid
lee_dup = master[master.duplicated('lee_uuid', False)]

In [393]:
lee_dup[['kor_name', 'pass_year', 'lee_uuid', 'ws_uuid', 'repeated_testtaker_lee', 'repeated_testtaker_ws']]


Unnamed: 0,kor_name,pass_year,lee_uuid,ws_uuid,repeated_testtaker_lee,repeated_testtaker_ws
7892,이윤재,1887,383bd960-4527-540f-bb30-ba9643645ce7,26044751-d6bd-534a-ae6b-965fc8389ed4,False,True
7893,이윤재,1887,383bd960-4527-540f-bb30-ba9643645ce7,5af472e3-2855-5288-851d-1b8698aae187,False,True
8237,홍종협,1880,76caa537-e6b1-5c89-a94b-847732b7bd4a,4bd1bb73-339b-58bb-921a-ae2e33f6bfb4,True,True
8238,홍종협,1880,76caa537-e6b1-5c89-a94b-847732b7bd4a,c0b57392-53c2-54a4-b0e4-e58e0f5e9e14,True,True
8239,홍종협,1880,4663c45d-a6f2-5c33-b405-baed07cac13f,4bd1bb73-339b-58bb-921a-ae2e33f6bfb4,True,True
8240,홍종협,1880,4663c45d-a6f2-5c33-b405-baed07cac13f,c0b57392-53c2-54a4-b0e4-e58e0f5e9e14,True,True
8321,이육,1740,92542b7a-6e09-5d1e-b277-15670311f752,4e6fe550-8d80-5c69-89a4-5071bf539533,True,True
8322,이육,1740,92542b7a-6e09-5d1e-b277-15670311f752,cea144cd-a435-5bdd-a69c-7f7cc1a37de7,True,True
8323,이육,1740,7ba41301-6da0-541d-9c48-3eec7cf1a2a1,4e6fe550-8d80-5c69-89a4-5071bf539533,True,True
8324,이육,1740,7ba41301-6da0-541d-9c48-3eec7cf1a2a1,cea144cd-a435-5bdd-a69c-7f7cc1a37de7,True,True


In [394]:
# Manually remove the people repeated in both Lee and in WS so there is 1-1 mapping
# Deduplicate if the person is marked as repeated in both datasets

# Test to make sure this is true. Once we have manually validated, then drop from master table
lee_dup = lee_dup.drop([8237, 8240, 8321, 8324, 9959, 9962])
lee_dup[['kor_name', 'pass_year', 'lee_uuid', 'ws_uuid', 'repeated_testtaker_lee', 'repeated_testtaker_ws']]

Unnamed: 0,kor_name,pass_year,lee_uuid,ws_uuid,repeated_testtaker_lee,repeated_testtaker_ws
7892,이윤재,1887,383bd960-4527-540f-bb30-ba9643645ce7,26044751-d6bd-534a-ae6b-965fc8389ed4,False,True
7893,이윤재,1887,383bd960-4527-540f-bb30-ba9643645ce7,5af472e3-2855-5288-851d-1b8698aae187,False,True
8238,홍종협,1880,76caa537-e6b1-5c89-a94b-847732b7bd4a,c0b57392-53c2-54a4-b0e4-e58e0f5e9e14,True,True
8239,홍종협,1880,4663c45d-a6f2-5c33-b405-baed07cac13f,4bd1bb73-339b-58bb-921a-ae2e33f6bfb4,True,True
8322,이육,1740,92542b7a-6e09-5d1e-b277-15670311f752,cea144cd-a435-5bdd-a69c-7f7cc1a37de7,True,True
8323,이육,1740,7ba41301-6da0-541d-9c48-3eec7cf1a2a1,4e6fe550-8d80-5c69-89a4-5071bf539533,True,True
9960,김익진,1651,a53a1875-d574-52a1-9ba1-c39620eced18,a29d8bb7-b3e1-5a90-b9db-29c779ae4513,True,True
9961,김익진,1651,8ccdafa7-8b43-581e-9fff-d5c55316f7ff,4f879759-7396-5b4f-8d6c-2a7c57ba1754,True,True


In [395]:
master = master.drop([8237, 8240, 8321, 8324, 9959, 9962])

In [396]:
len(master), len(master['lee_uuid'].unique()), len(master['ws_uuid'].unique())

(13202, 13201, 13195)

In [397]:
# master.head().T

In [398]:
merge1 = master.copy()

## `merge2` - Merge Similar Korean Names

Next, do a manual merge of the ones that were not merged

* Find those in data_lee not in master
* Find those in data_ws not in master
* See if any in data_ws meets 2/3 criteria for those in data_lee, and if so, add them to master

In [399]:
master = merge1

In [400]:
len(master[master.duplicated('lee_uuid')])

1

In [401]:
# Unique IDs in data_lee NOT in master
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]

# Unique IDs in data_ws NOT in master
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]

len(lee_unmerged), len(ws_unmerged)

(1950, 1412)

In [402]:
# Sanity Check
# 1950 + 13202 = 15151 + 1 (repeat)
repeated = len(master[master.duplicated('lee_uuid')])
assert (len(lee_unmerged) + len(master) == len(data_lee) + repeated)

In [403]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

These are the Korean names in WS that ALMOST match the names in Lee but are SUPER close but are not the same name. Can manually check the following list to make sure I'm right but most of them look like they're the same person.

If their Chinese names and exam year are the same, but the Korean names are more than 50 percent similar, add to the Master table.


In [404]:
# Make sure number of times one person retook a test in the same year is the same across both datasets
# for index, row in lee_unmerged.iterrows():
#     exam_year = row['pass_year']
#     chi_name = row['chnname']
#     kor_name = row['korname']
#     repeated = row['repeated_testtaker']
    
#     # only matching Chinese name and exam year
#     ws_person = ws_unmerged[(ws_unmerged['namehj1'] == chi_name) & (ws_unmerged['year'] == exam_year)]
#     if (len(ws_person) > 0):
#         if (repeated != ws_person['repeated_testtaker'][0]):
#             print(kor_name, exam_year, repeated, ws_person['repeated_testtaker'][0])
    

In [405]:
df_similar_kor_name_lee = pd.DataFrame([])
df_similar_kor_name_ws = pd.DataFrame([])

# Iterate through each person in lee_unmerged
# Find that person in ws_person based on Chinese name and exam year
# Determine if if the person has similar Korean names, they are one person
# Look at the repeated_testtaker. If same numbers, fine
# If different numbers, take the larger of the values
# I guess for now, add repeat
for index, row in lee_unmerged.iterrows():
    exam_year = row['pass_year']
    chi_name = row['chnname']
    kor_name = row['korname']
    repeated = row['repeated_testtaker_lee']
    
    # only matching Chinese name and exam year
    ws_person = ws_unmerged[(ws_unmerged['namehj1'] == chi_name) & (ws_unmerged['year'] == exam_year)]
    for person_index in range(len(ws_person)):
#         print(ws_person["namehg1"].iloc[person_index])
        if (similar(ws_person["namehg1"].iloc[person_index].replace('구', '귀'), kor_name) > 0.5):
            print(kor_name, ws_person["namehg1"].iloc[person_index], similar(ws_person["namehg1"].iloc[person_index], kor_name), ws_person["ws_uuid"].iloc[person_index])
            row_copy = row.copy()
            row_copy['other_kor_name_in_WS_version'] = ws_person["namehg1"].iloc[person_index]
            row_copy['other_kor_name_in_WS_version_uuid'] = ws_person["ws_uuid"].iloc[person_index]
            df_similar_kor_name_lee = df_similar_kor_name_lee.append(row_copy)    
            df_similar_kor_name_ws = df_similar_kor_name_ws.append(ws_person)
        
       

민재문 민ެ문 0.6666666666666666 ccbafb31-0648-5e8b-9930-3ac6eef5d57c
윤기번 윤기반 0.6666666666666666 e0736d9f-2e42-5d27-9a7f-d2fb716bfe78
정석부 정석빙 0.6666666666666666 78bf3f0e-4efc-5f66-8ae4-5d57d1346cc7


In [406]:
(len(df_similar_kor_name_lee),len(df_similar_kor_name_ws))

(3, 3)

In [407]:
assert(len(df_similar_kor_name_lee) == len(df_similar_kor_name_ws))

In [408]:
# MERGING only on Chinese name and year
master2 = pd.merge(df_similar_kor_name_lee, df_similar_kor_name_ws, indicator=True, how='inner', left_on=['chnname','pass_year'], right_on = ['namehj1','year'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master2 = master2.drop(columns=['namehg1', 'namehj1', 'year'])
master2 = master2.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

len(master2), len(master2['lee_uuid'].unique()), len(master2['ws_uuid'].unique())

(3, 3, 3)

In [409]:
# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge2 = master.copy()

In [410]:
len(master[~master['other_kor_name_in_WS_version'].isnull()])

3

In [411]:
# Update these values
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]
len(ws_unmerged),len(lee_unmerged)

(1409, 1947)

In [412]:
# We still have 1409 in data_ws that cannot be matched with data_lee
len(data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())])

1409

## `merge3` - Merge Similar Chinese Names


These are the CHINESE names in WS that ALMOST match the names in Lee but are SUPER close but are not the same name. Can manually check the following list to make sure I'm right but most of them look like they're the same person

<b>Extra thing to check</b>: If the WS name is shorter than ("contains") Lee name, then we can merge them because the names may just be truncated in the Lee version

In [413]:
master = merge2

In [414]:
df_similar_chi_name_lee = pd.DataFrame([])
df_similar_chi_name_ws = pd.DataFrame([])

In [415]:
# Make sure number of times one person retook a test in the same year is the same across both datasets
# Same as above but for Chinese names
# for index, row in lee_unmerged.iterrows():
#     exam_year = row['pass_year']
#     chi_name = row['chnname']
#     kor_name = row['korname']
#     repeated = row['repeated_testtaker_lee']
    
#     # only matching Chinese name and exam year
#     ws_person = ws_unmerged[(ws_unmerged['namehg1'] == kor_name) & (ws_unmerged['year'] == exam_year)]
#     if (len(ws_person) > 0):
#         if (repeated != ws_person['repeated_testtaker_ws'][0]):
#             print(kor_name, chi_name, exam_year, repeated, ws_person['repeated_testtaker_ws'][0])
    

In [416]:
# ws_unmerged[(ws_unmerged['year'] == 1612)][["namehj1", "namehg1", "year"]].sort_values(['namehg1'])

In [417]:
# TODO what to do if there are more than one people with similar names?

df_similar_chi_name_lee = pd.DataFrame([])
df_similar_chi_name_ws = pd.DataFrame([])

for index, row in lee_unmerged.iterrows():
    exam_year = row['pass_year']
    chi_name = row['chnname']
    kor_name = row['korname']
    
    # only matching Chinese name and exam year
    ws_person = ws_unmerged[(ws_unmerged['namehg1'] == kor_name) & (ws_unmerged['year'] == exam_year)]
    
    for person_index in range(len(ws_person)):
        if (similar(ws_person["namehj1"].iloc[person_index], chi_name) > 0.5):
#             if (re.search(re.escape(ws_person["namehj1"][person_index]), chi_name)):
            print(chi_name, ws_person["namehj1"].iloc[person_index], similar(ws_person["namehj1"].iloc[person_index], chi_name), ws_person['namehg1'].iloc[person_index])
            row_copy = row.copy()
            row_copy['other_chi_name_in_WS_version'] = ws_person.iloc[person_index]["namehj1"]
            row_copy['other_chi_name_in_WS_version_uuid'] = ws_person.iloc[person_index]["ws_uuid"]

            df_similar_chi_name_lee = df_similar_chi_name_lee.append(row_copy)
            df_similar_chi_name_ws = df_similar_chi_name_ws.append(ws_person.iloc[person_index])


李廷熽 李廷 0.8 이정소
鄭墀 鄭 0.6666666666666666 정지
李章垕 李章 0.8 이장후
許禝 許 0.6666666666666666 허직
金伈 金 0.6666666666666666 김심
朴頎 朴 0.6666666666666666 박기
田馪 田 0.6666666666666666 전빈
李敉 李 0.6666666666666666 이미
朴耋 朴 0.6666666666666666 박질
林薈 林 0.6666666666666666 임회
金鑌 金 0.6666666666666666 김빈
崔瀹之 崔之 0.8 최약지
金貔 金 0.6666666666666666 김비
金鏗 金 0.6666666666666666 김갱
朴叙 朴 0.6666666666666666 박서
李士侗 李士 0.8 이사동
金㽔 金 0.6666666666666666 김유
趙忭 趙 0.6666666666666666 조변
裵爚 裵 0.6666666666666666 배약
姜仡 姜 0.6666666666666666 강흘
金膩 金 0.6666666666666666 김이
洪叙疇 洪疇 0.8 홍서주
金時霔 金時 0.8 김시주
金煒 金 0.6666666666666666 김위
孔頎 孔 0.6666666666666666 공기
曺好智 曹好智 0.6666666666666666 조호지
金軺 金 0.6666666666666666 김초
呂箎 呂 0.6666666666666666 여호
崔涐 崔 0.6666666666666666 최아
孫叙倫 孫倫 0.8 손서윤
張鄰臣 張隣臣 0.6666666666666666 장인신
金晉錫 金晋錫 0.6666666666666666 김진석
崔湑 崔 0.6666666666666666 최서
金鏗壽 金壽 0.8 김갱수
黃㻶 黃 0.6666666666666666 황필
李世蓁 李世 0.8 이세진
朴元秢 朴元 0.8 박원영
金瑊 金 0.6666666666666666 김감
金鏐 金 0.6666666666666666 김유
許穧 許 0.6666666666666666 허제
南趎 南 0.6666666666666666 남주
姜

權侹 權 0.6666666666666666 권정
許葟 許 0.6666666666666666 허황
趙鑌 趙 0.6666666666666666 조빈
金克忸 金克 0.8 김극유
鄭晣 鄭 0.6666666666666666 정절
李垾 李 0.6666666666666666 이한
安晉生 安晋生 0.6666666666666666 안진생
金訢 金 0.6666666666666666 김흔
奇襸 奇 0.6666666666666666 기찬
辛季琚 辛季 0.8 신계거
李坫 李 0.6666666666666666 이점
姜景叙 姜景 0.8 강경서
宋軼 宋 0.6666666666666666 송일
金硉 金 0.6666666666666666 김율
李諿 李 0.6666666666666666 이집
朴訒 朴 0.6666666666666666 박인
李琚 李 0.6666666666666666 이거
李浤 李 0.6666666666666666 이굉
權璸 權 0.6666666666666666 권빈
金崶 金 0.6666666666666666 김봉
奇褚 奇 0.6666666666666666 기저
閔頤 閔 0.6666666666666666 민이
金磶 金 0.6666666666666666 김석
李瑺 李 0.6666666666666666 이상
李黿 李 0.6666666666666666 이원
李㙉 李 0.6666666666666666 이전
姜諿 姜 0.6666666666666666 강집
姜澂 姜 0.6666666666666666 강징
安覯 安 0.6666666666666666 안구
金璫 金 0.6666666666666666 김당
金熠 金 0.6666666666666666 김습
李云秠 李云 0.8 이운비
閔㥳 閔 0.6666666666666666 민원
李堣 李 0.6666666666666666 이우
金克愊 金克 0.8 김극핍
李頫 李 0.6666666666666666 이부
權橃 權 0.6666666666666666 권벌
李蘋 李 0.6666666666666666 이빈
李膂 李 0.6666666666666666 이여
金磧 

李𨯶永 李永 0.8 이헌영
李景稙 李庚稙 0.6666666666666666 이경직
金睟 金 0.6666666666666666 김수
金濰 金 0.6666666666666666 김유
金㻶 金 0.6666666666666666 김필
金礩 金 0.6666666666666666 김질
朴叔蓁 朴叔 0.8 박숙진
鄭忱 鄭 0.6666666666666666 정침
金嶙 金 0.6666666666666666 김인
金碏 金 0.6666666666666666 김작
安瑭 安 0.6666666666666666 안당
金巓 金 0.6666666666666666 김전
朴承爚 朴承 0.8 박승약
金孝侃 金效侃 0.6666666666666666 김효간
李芑 李 0.6666666666666666 이기
李耔 李 0.6666666666666666 이자
金硡 金 0.6666666666666666 김굉
金銛 金 0.6666666666666666 김섬
慶俶 慶 0.6666666666666666 경숙
李𡊉 李 0.6666666666666666 이말
李簡 李 0.6666666666666666 이간
閔樑 閔 0.6666666666666666 민양
蔡忱 蔡 0.6666666666666666 채침
朴稑 朴 0.6666666666666666 박육
鄭譍 鄭 0.6666666666666666 정응
朴稹 朴 0.6666666666666666 박진
尹行晉 尹行晋 0.6666666666666666 윤행진
權磌 權 0.6666666666666666 권전
安處諴 安處 0.8 안처함
金釴 金 0.6666666666666666 김익
宋㻩 宋 0.6666666666666666 송겸
李薿 李 0.6666666666666666 이의
金亹 金 0.6666666666666666 김미
尹忭 尹 0.6666666666666666 윤변
李頤 李 0.6666666666666666 이이
朴世蓊 朴世 0.8 박세옹
尹世忱 尹世 0.8 윤세침
李賢讜 李賢 0.8 이현당
李秫 李 0.6666666666666666 이출
李無彊 李無疆 0.666666666

許霮 許 0.6666666666666666 허담
韓晩𥙿 韓晩裕 0.6666666666666666 한만유
申曮 申 0.6666666666666666 신엄
韓光綮 韓光 0.8 한광계
崔守忱 崔守 0.8 최수침
安筞 安 0.6666666666666666 안책
權儐 權 0.6666666666666666 권빈
李翼晉 李翼晋 0.6666666666666666 이익진
沈晉賢 沈晋賢 0.6666666666666666 심진현
鄭㝡成 鄭成 0.8 정최성
鄭㝡行 鄭行 0.8 정최행
申漵 申 0.6666666666666666 신서
李垿 李 0.6666666666666666 이서
兪漢㝢 兪漢 0.8 유한우
徐有榘 徐有 0.8 서유구
姜儐 姜 0.6666666666666666 강빈
洪大恊 洪大協 0.6666666666666666 홍대협
鄭東榦 鄭東 0.8 정동간
尹愭 尹 0.6666666666666666 윤기
韓耆𥙿 韓耆裕 0.6666666666666666 한기유
尹瀗 尹 0.6666666666666666 윤헌
尹穳 尹 0.6666666666666666 윤찬
金𨩿 金 0.6666666666666666 김후
趙晉和 趙晋和 0.6666666666666666 조진화
柳訸 柳 0.6666666666666666 유화
沈鈁 沈 0.6666666666666666 심방
金鐮 金 0.6666666666666666 김염
金基叙 金基 0.8 김기서
權傛 權 0.6666666666666666 권용
李晉淵 李晋淵 0.6666666666666666 이진연
金逌根 金根 0.8 김유근
韓晉赫 韓晋赫 0.6666666666666666 한진혁
金鏴 金 0.6666666666666666 김노
金䥧 金 0.6666666666666666 김선
金鍏 金 0.6666666666666666 김위
李圭祊 李圭 0.8 이규팽
韓鎭㦿 韓鎭 0.8 한진호
黃𥞵 黃 0.6666666666666666 황겹
權筴 權 0.6666666666666666 권협
權溭 權 0.6666666666666666 권직
李鼎叙 李鼎 0.8 이정서
李寅臯 李

In [418]:
# THESE NEED TO BE THE SAME
assert(len(df_similar_chi_name_lee) == len(df_similar_chi_name_ws))
print(len(df_similar_chi_name_lee), len(df_similar_chi_name_ws))

1120 1120


In [419]:
# There are repeats. 
len(df_similar_chi_name_lee['lee_uuid'].unique()),len(df_similar_chi_name_ws['ws_uuid'].unique())

(1118, 1116)

In [420]:
# Duplicates
df_similar_chi_name_lee = df_similar_chi_name_lee.reset_index()
lee_dup = df_similar_chi_name_lee[df_similar_chi_name_lee['lee_uuid'].duplicated(False)]
lee_dup[['lee_uuid','korname', 'chnname', 'other_chi_name_in_WS_version', 'other_chi_name_in_WS_version_uuid']]

Unnamed: 0,lee_uuid,korname,chnname,other_chi_name_in_WS_version,other_chi_name_in_WS_version_uuid
816,c5b8a0c3-fe87-5004-b7bb-cd6d2c7da9be,이목,李楘,李,a28023af-8791-5c39-9f49-2856c57cf410
817,c5b8a0c3-fe87-5004-b7bb-cd6d2c7da9be,이목,李楘,李,1da79682-6fb5-5029-bca5-9969dd572b9b
818,a6382748-0883-5c51-afe7-1bebc60152f7,이목,李莯,李,a28023af-8791-5c39-9f49-2856c57cf410
819,a6382748-0883-5c51-afe7-1bebc60152f7,이목,李莯,李,1da79682-6fb5-5029-bca5-9969dd572b9b


In [421]:
# Since there are so few I will just manually delete them
lee_dup = lee_dup.drop([816, 819])
lee_dup[['lee_uuid','korname', 'chnname', 'other_chi_name_in_WS_version', 'other_chi_name_in_WS_version_uuid']]

Unnamed: 0,lee_uuid,korname,chnname,other_chi_name_in_WS_version,other_chi_name_in_WS_version_uuid
817,c5b8a0c3-fe87-5004-b7bb-cd6d2c7da9be,이목,李楘,李,1da79682-6fb5-5029-bca5-9969dd572b9b
818,a6382748-0883-5c51-afe7-1bebc60152f7,이목,李莯,李,a28023af-8791-5c39-9f49-2856c57cf410


In [422]:
# MERGING only on Korean name and year
master2 = pd.DataFrame([])
master2 = pd.merge(df_similar_chi_name_lee, df_similar_chi_name_ws, indicator=True, how='left', left_on=['korname','pass_year'], right_on = ['namehg1','year'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master2 = master2.drop(columns=['namehg1', 'namehj1', 'year'])
master2 = master2.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

# These numbers are unequal bc of all of the extra duplicates, etc
len(master2), len(master2['lee_uuid'].unique()), len(master2['ws_uuid'].unique())

(1136, 1118, 1116)

In [423]:
# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge3 = master.copy()

In [424]:
len(master[~master['other_chi_name_in_WS_version'].isnull()])

1136

In [425]:
# Update these values
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]

len(ws_unmerged),len(lee_unmerged)

(293, 829)

In [426]:
# We still have 293 in data_ws that cannot be matched with data_lee
len(data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())])

293

## `merge4` - Merge Similar Korean Names - 50%

In [427]:
master = merge3

In [428]:
df_similar_kor_name_lee = pd.DataFrame([])
df_similar_kor_name_ws = pd.DataFrame([])

# Iterate through each person in lee_unmerged
# Find that person in ws_person based on Chinese name and exam year
# Determine if if the person has similar Korean names, they are one person
# Look at the repeated_testtaker. If same numbers, fine
# If different numbers, take the larger of the values
# I guess for now, add repeat
for index, row in lee_unmerged.iterrows():
    exam_year = row['pass_year']
    chi_name = row['chnname']
    kor_name = row['korname']
    repeated = row['repeated_testtaker_lee']
    
    # only matching Chinese name and exam year
    ws_person = ws_unmerged[(ws_unmerged['namehj1'] == chi_name) & (ws_unmerged['year'] == exam_year)]
    
    for person_index in range(len(ws_person)):
#         print(ws_person["namehg1"][person_index], kor_name, ws_person["namehj1"][person_index], similar(ws_person["namehg1"][person_index], kor_name))
        if (similar(ws_person["namehg1"].iloc[person_index], kor_name) >= 0.5):
            print(ws_person["namehg1"].iloc[person_index], kor_name, ws_person["namehj1"].iloc[person_index])
            row_copy = row.copy()
            row_copy['other_kor_name_in_WS_version'] = ws_person["namehg1"].iloc[person_index]
            row_copy['other_kor_name_in_WS_version_uuid'] = ws_person["ws_uuid"].iloc[person_index]
            df_similar_kor_name_lee = df_similar_kor_name_lee.append(row_copy)    
            df_similar_kor_name_ws = df_similar_kor_name_ws.append(ws_person)


표윤 표빈 表贇
정윤 정빈 鄭贇
유송 유책 柳
윤윤 윤빈 尹贇
이윤 이빈 李贇
이옥 이욱 李稶
조윤 조빈 趙贇
임박 임부 林
최윤 최빈 崔贇


I understand these names are very different but for now I will just assume they are the same because they look similar enough...?

In [429]:
assert(len(df_similar_kor_name_lee) == len(df_similar_kor_name_ws))
(len(df_similar_kor_name_lee),len(df_similar_kor_name_ws))

(9, 9)

In [430]:
# MERGING only on Chinese name and year
master2 = pd.merge(df_similar_kor_name_lee, df_similar_kor_name_ws, indicator=True, how='inner', left_on=['chnname','pass_year'], right_on = ['namehj1','year'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master2 = master2.drop(columns=['namehg1', 'namehj1', 'year'])
master2 = master2.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

len(master2), len(master2['lee_uuid'].unique()), len(master2['ws_uuid'].unique())

(9, 9, 9)

In [431]:
# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge4 = master.copy()

In [432]:
# Update these values. There are 284 left in WS we need to merge in
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]
len(ws_unmerged),len(lee_unmerged)

(284, 820)

## `merge5` - Merge Similar Chinese Names - 50%

In [433]:
master = merge4

In [434]:
# If Chinese names are the same and Korean names are FIFTY PERCENT (this is less than before)
# Use eyes to make sure these are right
# Manually choose the people who are the same

df_similar_chi_name_lee = pd.DataFrame([])
df_similar_chi_name_ws = pd.DataFrame([])

for index, row in lee_unmerged.iterrows():
    exam_year = row['pass_year']
    chi_name = row['chnname']
    kor_name = row['korname']
    
    # only matching Chinese name and exam year
    ws_person = ws_unmerged[(ws_unmerged['namehg1'] == kor_name) & (ws_unmerged['year'] == exam_year)]
    if (len(ws_person) > 0) :
        for person_index in range(len(ws_person)):
            if (similar(ws_person["namehj1"].iloc[person_index], chi_name) >= 0.5):
                print(chi_name, ws_person["namehj1"].iloc[person_index], similar(ws_person["namehj1"].iloc[person_index], chi_name), ws_person['namehg1'].iloc[person_index], row['korname'])
                row_copy = row.copy()
                row_copy['other_chi_name_in_WS_version'] = ws_person.iloc[person_index]["namehj1"]
                row_copy['other_chi_name_in_WS_version_uuid'] = ws_person.iloc[person_index]["ws_uuid"]

                df_similar_chi_name_lee = df_similar_chi_name_lee.append(row_copy)
                df_similar_chi_name_ws = df_similar_chi_name_ws.append(ws_person.iloc[person_index])
                

鄭鄰 鄭隣 0.5 정인 정인
康福 康輻 0.5 강복 강복
安憙 安熹 0.5 안희 안희
張晉 張晋 0.5 장진 장진
李晉 李晋 0.5 이진 이진
李𡎘 李瑛 0.5 이영 이영
徐崗 徐岡 0.5 서강 서강
郭恂 郭珣 0.5 곽순 곽순
徐晉 徐晋 0.5 서진 서진
李蹈 李韜 0.5 이도 이도
韓淃 韓卷 0.5 한권 한권
康晉 康晋 0.5 강진 강진
成晉 成晋 0.5 성진 성진
孫㴻 孫澍 0.5 손주 손주
金冲 金沖 0.5 김충 김충
許晉 許晋 0.5 허진 허진
李冲 李沖 0.5 이충 이충
尹晉 尹晋 0.5 윤진 윤진
沈枰 沈坪 0.5 심평 심평
鄭恊 鄭協 0.5 정협 정협
鄭晉 鄭晋 0.5 정진 정진
李惕然 李 0.5 이척연 이척연
許頴 許潁 0.5 허영 허영
洪墪 洪墩 0.5 홍돈 홍돈
權𠌶 權華 0.5 권화 권화


In [435]:
# THESE NEED TO BE THE SAME
assert(len(df_similar_chi_name_lee) == len(df_similar_chi_name_ws))
print(len(df_similar_chi_name_lee), len(df_similar_chi_name_ws))

25 25


In [436]:
# MERGING only on Korean name and year
master2 = pd.DataFrame([])
master2 = pd.merge(df_similar_chi_name_lee, df_similar_chi_name_ws, indicator=True, how='left', left_on=['korname','pass_year'], right_on = ['namehg1','year'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master2 = master2.drop(columns=['namehg1', 'namehj1', 'year'])
master2 = master2.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

# These numbers are unequal bc of all of the extra duplicates, etc
len(master2), len(master2['lee_uuid'].unique()), len(master2['ws_uuid'].unique())

# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

master = master.reset_index(drop=True)
merge5 = master.copy()

In [437]:
# Update these values. 259 more to match from WS
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]

len(ws_unmerged),len(lee_unmerged)

(259, 795)

## `merge6` - Merge on year,  1-1 Mapping Between both versions for same year, Korean and Chinese names at least 50% similar
For this one I will ONLY be checking if for the given year, the Korean and Chinese names are more than 50 percent similar

In [438]:
# Fix a few characters manually

# ws_unmerged = ws_unmerged.replace(to_replace=r'葉', value='葉', regex=True)
# ws_unmerged = ws_unmerged.replace(to_replace=r'省', value='省', regex=True)
# ws_unmerged = ws_unmerged.replace(to_replace=r'龜', value='龜', regex=True)
# # ws_unmerged = ws_unmerged.replace(to_replace=r'葉', value='葉', regex=True)
# # ws_unmerged = ws_unmerged.replace(to_replace=r'葉', value='葉', regex=True)



In [439]:
master = merge5

There are now 259 left from WS that I need to merge

In [440]:
len(ws_unmerged),len(lee_unmerged), len(master)

(259, 795, 14375)

In [441]:
ws_unique_years_remaining = ws_unmerged['year'].unique()
len(ws_unique_years_remaining)

179

Quick check: In the remaining `ws_unmerged`, there are no more repeated testtakers in the same year, while in the remaining `lee_unmerged`, there are still quite a few repeated testtakers in the same year.

In [442]:
ws_unmerged[ws_unmerged['repeated_testtaker_ws'] == True]

Unnamed: 0_level_0,ws_uuid,year,namehg1,namehj1,repeated_testtaker_ws
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [443]:
lee_unmerged[lee_unmerged['repeated_testtaker_lee'] == True]

Unnamed: 0_level_0,lee_uuid,pass_year,korname,chnname,repeated_testtaker_lee
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13047f7a-b949-5459-88fc-5b6d4492faf1,13047f7a-b949-5459-88fc-5b6d4492faf1,1466,김염,金𥖝,True
b241eaf2-0615-58c0-b079-335335674107,b241eaf2-0615-58c0-b079-335335674107,1466,김극검,金克儉,True
adffe1c5-3c4f-54fd-92cc-76e555096767,adffe1c5-3c4f-54fd-92cc-76e555096767,1466,김염,金𥖝,True
7828bff3-2486-5330-8071-8bec0838a9ce,7828bff3-2486-5330-8071-8bec0838a9ce,1466,김극검,金克儉,True
25549b5f-1a17-58eb-8968-f8502dab6361,25549b5f-1a17-58eb-8968-f8502dab6361,1466,이경동,李瓊仝,True
eaf75b13-772d-50dc-a565-4165db7d0e55,eaf75b13-772d-50dc-a565-4165db7d0e55,1466,윤자영,尹子濚,True
91c2e162-7274-57eb-b7d9-52f1cea28344,91c2e162-7274-57eb-b7d9-52f1cea28344,1466,이길보,李吉甫,True
f418d62a-e3e0-5178-a602-8dab89509cb1,f418d62a-e3e0-5178-a602-8dab89509cb1,1466,유순,柳洵,True
033c8a4a-c74c-540b-a50b-ccd8bed59184,033c8a4a-c74c-540b-a50b-ccd8bed59184,1466,정난종,鄭蘭宗,True
68257fc7-b55f-54df-8c36-f2511835384f,68257fc7-b55f-54df-8c36-f2511835384f,1466,노사신,盧思愼,True


In [444]:
ws_unmerged[ws_unmerged['year'] == 1486][['namehg1', 'namehj1', 'repeated_testtaker_ws']]

Unnamed: 0_level_0,namehg1,namehj1,repeated_testtaker_ws
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
26d85c87-fcde-51e7-a35c-3efd1480cc86,변희계,邊希季,False
102cce1f-c7d9-5f39-a419-405f6c67fbd0,정이득,鄭而得,False


In [445]:
lee_unmerged[lee_unmerged['pass_year'] == 1486][['korname', 'chnname', 'repeated_testtaker_lee']]

Unnamed: 0_level_0,korname,chnname,repeated_testtaker_lee
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
931545e1-d757-50b2-bbb6-56adec213f40,최부,崔溥,False
93f965e8-88f3-5035-b6e0-9cf94d7eefe6,이대형,李大亨,False
1afe871c-e5d1-553e-b307-21c65dac4535,김준손,金駿孫,False
279c4447-3259-5650-bfa7-04e6653a8f0e,정이교,鄭以僑,False
adfd0902-7579-5e49-9054-5b57818193eb,변희효,邊希孝,False
7b72667a-352b-5634-99fd-1d938a34a2dc,정이심,鄭以諶,False
d257f663-556f-56e8-9a2f-b7d6d8223fcb,신종호,申從濩,False
be8a662d-7991-5b00-a788-d89cd6a1dc0a,박증영,朴增榮,False
36ee22ae-c035-59cb-a1ec-29ee6a584898,민보익,閔輔翼,False
4580d311-f8d1-5cbb-9083-a52517db197d,표연말,表沿沫,False


In [446]:
df_similar_lee = pd.DataFrame([])
df_similar_ws = pd.DataFrame([])

# Find 1-1 mappings between the two 
# All of the 1-1 Mappings are the same person except for the year 1935. 

for year in ws_unique_years_remaining:

    ws_person = ws_unmerged[(ws_unmerged['year'] == year)]
    lee_person = lee_unmerged[(lee_unmerged['pass_year'] == year)]

    # Only if there is a 1-1 mapping between the two
    if (len(ws_person) == 1 and len(lee_person) == 1):
        ws_person_name = ws_person['namehg1'].iloc[0].replace('구', '귀')
        lee_person_name = lee_person['korname'].iloc[0]
        ws_person_name_ch = ws_person['namehj1'].iloc[0]
        lee_person_name_ch = lee_person['chnname'].iloc[0]
        
        if (similar(ws_person_name, lee_person_name) >= 0.5) or (similar(ws_person_name_ch, lee_person_name_ch) >= 0.5):
#             print(year, ws_person_name, lee_person_name, ws_person_name_ch, lee_person_name_ch)
            row_copy = lee_person.copy()
            row_copy['other_kor_name_in_WS_version'] = ws_person['namehg1'].iloc[0]
            row_copy['other_chi_name_in_WS_version'] = ws_person_name_ch
            row_copy['other_kor_name_in_WS_version_uuid'] = ws_person["ws_uuid"].iloc[0]
            row_copy['other_chi_name_in_WS_version_uuid'] = ws_person["ws_uuid"].iloc[0]
            
            df_similar_lee = df_similar_lee.append(row_copy)    
            df_similar_ws = df_similar_ws.append(ws_person)    
            
        else:
            print(year, ws_person_name, lee_person_name, ws_person_name_ch, lee_person_name_ch)


1435.0 노진해 노선경 盧晋諧 盧善卿


In [447]:
len(df_similar_lee)

101

In [448]:
# MERGING only on Chinese name and year
master2 = pd.merge(df_similar_lee, df_similar_ws, indicator=True, how='inner', left_on=['pass_year'], right_on = ['year'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master2 = master2.drop(columns=['namehg1', 'namehj1', 'year'])
master2 = master2.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

len(master2), len(master2['lee_uuid'].unique()), len(master2['ws_uuid'].unique())

(101, 101, 101)

In [449]:
# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge6 = master.copy()

In [450]:
# Update these values: 158 Left
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]
len(ws_unmerged),len(lee_unmerged)

(158, 694)

## `merge7` Merge on year, 1-many mapping from WS-Lee for the same year

In [451]:
master = merge6

In [452]:
# 158 more to merge from WS
len(ws_unmerged),len(lee_unmerged), len(master)

(158, 694, 14476)

In [453]:
ws_unique_years_remaining = ws_unmerged['year'].unique()
len(ws_unique_years_remaining)

78

In [454]:
ws_unmerged[ws_unmerged['year'] == 1636][['namehg1', 'namehj1', 'repeated_testtaker_ws']]

Unnamed: 0_level_0,namehg1,namehj1,repeated_testtaker_ws
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
d20527f9-6166-5eab-9d67-61b2f56d8f70,남선,南,False


In [455]:
lee_unmerged[lee_unmerged['pass_year'] == 1636][['korname', 'chnname', 'repeated_testtaker_lee']]

Unnamed: 0_level_0,korname,chnname,repeated_testtaker_lee
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
b5c49d52-764c-5561-ab15-0fb1310da88e,남훤,南翧,False
2170c17e-0ee4-51c5-a85e-fa9bdac8595a,남노성,南老星,False
85789025-50fc-58cb-9c0e-2d5925d695dc,이도,李禂,False
4f67f8a3-cb31-505f-9851-e2fc51b609b0,홍명일,洪命一,False
6c7d5fae-8ec9-5e41-8de1-3cc66ced6304,최계훈,崔繼勳,False
bcc41e44-5e60-5226-9e94-441342940e47,허계,許啓,False
32151664-286d-518b-97ef-7e500fc45191,신희계,辛喜季,False


Find the (Korean names that are 50% similar) and (Chinese names that are 50% similar) for 1-Many mappings (1 is WS). So we need to find the person BEST fits from the Lee version

In [456]:
df_similar_lee = pd.DataFrame([])
df_similar_ws = pd.DataFrame([])

# Find 1-1 mappings between the two 
# All of the 1-1 Mappings are the same person except for the year 1935. 

for year in ws_unique_years_remaining:

    ws_person = ws_unmerged[(ws_unmerged['year'] == year)]
    lee_persons = lee_unmerged[(lee_unmerged['pass_year'] == year)]

    # Only if there is a 1-1 mapping between the two
    if (len(ws_person) == 1):
#         print(len(lee_person), year)
        
        # Find the Lee person who is the closest 
        kr_highest = 0
        ch_highest = 0
        
        
        temp = pd.DataFrame([])
        for lee_index, lee_row in lee_persons.iterrows():
#             print(lee_persons)
            kr_similarity = similar(ws_person['namehg1'].iloc[0].replace('구', '귀'), lee_row['korname'])
            ch_similarity = similar(ws_person['namehj1'].iloc[0], lee_row['chnname'])
        
            if (kr_similarity >= 0.5 and ch_similarity >= 0.5):
                temp = temp.append([{'kr_similarity': kr_similarity,
                                 'ch_similarity': ch_similarity,
                                 'ws_korname': ws_person['namehg1'].iloc[0],
                                 'lee_korname': lee_row['korname'],
                                 'ws_chnname': ws_person['namehj1'].iloc[0],
                                 'lee_chnname': lee_row['chnname'],
                                 'lee_person': lee_row
                                }])
            
        
        if not temp.empty:
            temp = temp.reset_index(drop=True)
            tempmax = temp.loc[temp['kr_similarity'].idxmax()]
            print(year, tempmax)

            row_copy = tempmax['lee_person'].copy()
            row_copy['other_kor_name_in_WS_version'] = ws_person['namehg1'].iloc[0]
            row_copy['other_chi_name_in_WS_version'] = ws_person['namehj1'].iloc[0]
            row_copy['other_kor_name_in_WS_version_uuid'] = ws_person["ws_uuid"].iloc[0]
            row_copy['other_chi_name_in_WS_version_uuid'] = ws_person["ws_uuid"].iloc[0]
            df_similar_lee = df_similar_lee.append(row_copy)

            df_similar_ws = df_similar_ws.append(ws_person)
#         print(temp.max())
  


1427.0 kr_similarity                                             0.666667
ch_similarity                                             0.666667
ws_korname                                                     백효참
lee_korname                                                    백효삼
ws_chnname                                                     白效參
lee_chnname                                                    白效參
lee_person       lee_uuid                  ee212a3c-34cb-58a5-b...
Name: 0, dtype: object
1479.0 kr_similarity                                                    1
ch_similarity                                             0.666667
ws_korname                                                     강구손
lee_korname                                                    강귀손
ws_chnname                                                     姜龜孫
lee_chnname                                                    姜龜孫
lee_person       lee_uuid                  57219107-687b-5578-9...
Name: 0, dtype: object
14

In [457]:
df_similar_lee[['korname','other_kor_name_in_WS_version', 'chnname', 'other_chi_name_in_WS_version', 'repeated_testtaker_lee']]

Unnamed: 0,korname,other_kor_name_in_WS_version,chnname,other_chi_name_in_WS_version,repeated_testtaker_lee
ee212a3c-34cb-58a5-b235-af2bcad7d58c,백효삼,백효참,白效參,白效參,0.0
57219107-687b-5578-9eca-47809ea11741,강귀손,강구손,姜龜孫,姜龜孫,0.0
bb4342d1-d8ee-5bd9-a573-912b43845970,강삼,강참,姜參,姜參,0.0
ef1dbaeb-2aa6-54e2-b402-a764a8923ac6,이전손,이부손,李傳孫,李傅孫,0.0
aac9afba-fc0b-5328-a0d3-fbde83d21178,곽열,곽설,郭說,郭說,0.0
b5c49d52-764c-5561-ab15-0fb1310da88e,남훤,남선,南翧,南,0.0
2fa4e1b6-7668-52ec-b971-9c4d6ca73304,박현선,박견선,朴見善,朴見善,0.0
5b84fbb1-5440-5d9b-a0a1-5401c20eefc8,심귀서,심구서,沈龜瑞,沈龜瑞,0.0
64b2f7a5-030b-5e36-a805-f6358267d412,이진섭,이진엽,李震葉,李震葉,0.0
f71341b2-dd96-501f-9820-1b7b541dc44c,어유귀,어유구,魚有龜,魚有龜,0.0


In [458]:
len(df_similar_lee)

16

In [459]:
# MERGING only on the year, 1-many 
master2 = pd.merge(df_similar_lee, df_similar_ws, indicator=True, how='inner', left_on=['pass_year'], right_on = ['year'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master2 = master2.drop(columns=['namehg1', 'namehj1', 'year'])
master2 = master2.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

len(master2), len(master2['lee_uuid'].unique()), len(master2['ws_uuid'].unique())

(16, 16, 16)

In [460]:
# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge7 = master.copy()

In [461]:
# Update these values: 142 Left
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]
len(ws_unmerged),len(lee_unmerged)

(142, 678)

## `merge8` Merge Many-to-Many based on the most similar name for each name. Need to revisit this section based on HOW similar these names ACTUALLY are after having domain expert tell me so

In [462]:
master = merge7

In [463]:
len(ws_unmerged),len(lee_unmerged)

(142, 678)

In [464]:
ws_unique_years_remaining = ws_unmerged['year'].unique()
len(ws_unique_years_remaining)

62

In [465]:
ws_unique_years_remaining

array([1435.,   nan, 1466., 1486., 1492., 1507., 1514., 1519., 1552.,
       1553., 1556., 1572., 1579., 1583., 1594., 1597., 1603., 1605.,
       1606., 1612., 1613., 1616., 1624., 1630., 1635., 1648., 1650.,
       1663., 1666., 1675., 1676., 1678., 1680., 1691., 1702., 1705.,
       1721., 1722., 1723., 1727., 1735., 1739., 1744., 1752., 1754.,
       1756., 1761., 1763., 1766., 1768., 1774., 1775., 1786., 1789.,
       1792., 1795., 1815., 1825., 1865., 1870., 1887., 1891.])

In [466]:
ws_unmerged[ws_unmerged['year'] == 1870][['namehg1', 'namehj1', 'repeated_testtaker_ws']]


Unnamed: 0_level_0,namehg1,namehj1,repeated_testtaker_ws
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a63670da-c7fe-58cc-a9d2-2194cd4a7dba,백악흥,白樂興,False
4af84ca3-781c-5dc1-b1da-f82c84827a1f,신종구,愼鍾龜,False


In [467]:
lee_unmerged[lee_unmerged['pass_year'] == 1870][['korname', 'chnname', 'repeated_testtaker_lee']]


Unnamed: 0_level_0,korname,chnname,repeated_testtaker_lee
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bdda417a-35a9-5e2e-bcab-54d87962f399,신종귀,愼鍾龜,False
cbb481c4-427e-5b6e-97aa-4b8493f500b1,백낙흥,白樂興,False


In [468]:
pd.set_option('display.max_rows', 10)

In [469]:
lee_persons = lee_unmerged[(lee_unmerged['pass_year'] == year)].copy()
lee_persons.drop

<bound method DataFrame.drop of                                                                   lee_uuid  \
uuid                                                                         
646e87be-8e55-52e9-9fb0-70ff31998fac  646e87be-8e55-52e9-9fb0-70ff31998fac   
0f5f5bce-06b4-5fda-bf73-626d26da0617  0f5f5bce-06b4-5fda-bf73-626d26da0617   

                                      pass_year korname chnname  \
uuid                                                              
646e87be-8e55-52e9-9fb0-70ff31998fac       1891     황낙성     黃樂成   
0f5f5bce-06b4-5fda-bf73-626d26da0617       1891     박예양     朴澧陽   

                                      repeated_testtaker_lee  
uuid                                                          
646e87be-8e55-52e9-9fb0-70ff31998fac                   False  
0f5f5bce-06b4-5fda-bf73-626d26da0617                   False  >

In [470]:
df_similar_lee = pd.DataFrame([])
df_similar_ws = pd.DataFrame([])

# Find 1-1 mappings between the two 
# All of the 1-1 Mappings are the same person except for the year 1935. 

for year in ws_unique_years_remaining:

    ws_persons = ws_unmerged[(ws_unmerged['year'] == year)]
    lee_persons = lee_unmerged[(lee_unmerged['pass_year'] == year)]

    for ws_index, ws_row in ws_persons.iterrows():

        # Find the Lee person who is the closest 
        kr_highest = 0
        ch_highest = 0
        
        temp = pd.DataFrame([])
        for lee_index, lee_row in lee_persons.iterrows():
#             print(lee_persons)
            kr_similarity = similar(ws_row['namehg1'].replace('구', '귀'), lee_row['korname'])
            ch_similarity = similar(ws_row['namehj1'], lee_row['chnname'])
        
            if (kr_similarity >= 0.5 and ch_similarity >= 0.5):
                temp = temp.append([{'kr_similarity': kr_similarity,
                                 'ch_similarity': ch_similarity,
                                 'ws_korname': ws_row['namehg1'],
                                 'lee_korname': lee_row['korname'],
                                 'ws_chnname': ws_row['namehj1'],
                                 'lee_chnname': lee_row['chnname'],
                                 'lee_person': lee_row
                                }])
                # Remove this lee_row from lee_persons
#                 print(lee_index)
                lee_persons.drop(index=lee_index)
                
        if not temp.empty:
            temp = temp.reset_index(drop=True)
            tempmax = temp.loc[temp['kr_similarity'].idxmax()]
#             print(temp.max())
#             print(year, tempmax)

            row_copy = tempmax['lee_person'].copy()
            row_copy['other_kor_name_in_WS_version'] = ws_row['namehg1']
            row_copy['other_chi_name_in_WS_version'] = ws_row['namehj1']
            row_copy['other_kor_name_in_WS_version_uuid'] = ws_row["ws_uuid"]
            row_copy['other_chi_name_in_WS_version_uuid'] = ws_row["ws_uuid"]
            
            ws_copy = ws_row.copy()
            ws_copy['other_kor_name_in_Lee_version'] = row_copy['korname']
            
            df_similar_lee = df_similar_lee.append(row_copy)
            df_similar_ws = df_similar_ws.append(ws_copy)
  


In [471]:
# count

### NOTE: May need someone to manually check that these are the same person, but they look the same to me...

In [472]:
# namecheck = df_similar_lee[['pass_year', 'korname','other_kor_name_in_WS_version', 'chnname', 'other_chi_name_in_WS_version', 'repeated_testtaker_lee', 'lee_uuid']].sort_values(by='pass_year')
# namecheck.to_csv('namecheck.csv')

In [473]:
# NOTE: I am going to merge these for now. 

In [474]:
pd.set_option('display.max_rows', None)
df_similar_lee[['pass_year', 'korname','other_kor_name_in_WS_version', 'chnname', 'other_chi_name_in_WS_version', 'repeated_testtaker_lee', 'lee_uuid']].sort_values(by='pass_year')


Unnamed: 0,pass_year,korname,other_kor_name_in_WS_version,chnname,other_chi_name_in_WS_version,repeated_testtaker_lee,lee_uuid
0e0499fc-6df4-55a3-9d82-cd005ac1feef,1466.0,김귀,김구,金龜,金龜,0.0,0e0499fc-6df4-55a3-9d82-cd005ac1feef
adfd0902-7579-5e49-9054-5b57818193eb,1486.0,변희효,변희계,邊希孝,邊希季,0.0,adfd0902-7579-5e49-9054-5b57818193eb
bdc87d0b-2e57-50c8-aec0-b53b9a99ea84,1492.0,강숙돌,강숙요,姜叔突,姜叔,0.0,bdc87d0b-2e57-50c8-aec0-b53b9a99ea84
5da21798-30c7-54ed-8156-1e0d6ad8a2c1,1492.0,이귀,이구,李龜,李龜,0.0,5da21798-30c7-54ed-8156-1e0d6ad8a2c1
b5886799-19aa-5d80-b7b0-d34db467f368,1507.0,박귀원,박구원,朴龜元,朴龜元,0.0,b5886799-19aa-5d80-b7b0-d34db467f368
f0cf27a4-e2e4-53fb-8a0e-3dab60b2b3bf,1507.0,조삼,조참,趙參,趙參,0.0,f0cf27a4-e2e4-53fb-8a0e-3dab60b2b3bf
52ef0959-dcbd-5aff-bef2-5de97b8bf341,1514.0,최희삼,최희참,崔希參,崔希參,0.0,52ef0959-dcbd-5aff-bef2-5de97b8bf341
c8879ecf-b35a-5c71-97d6-7f138ae0843e,1514.0,이귀영,이구영,李龜齡,李龜齡,0.0,c8879ecf-b35a-5c71-97d6-7f138ae0843e
70cf46a8-d110-5c1f-bf7f-602f7bc90638,1519.0,김희열,김희설,金希說,金希說,0.0,70cf46a8-d110-5c1f-bf7f-602f7bc90638
3bf50186-81a5-50b1-858c-7f760d03c7b0,1519.0,이기,이규,李巙,李,0.0,3bf50186-81a5-50b1-858c-7f760d03c7b0


In [475]:
# The random duplicate because these were equally likely. Ah... 
df_similar_lee[df_similar_lee.duplicated(['lee_uuid'], keep=False)][['pass_year', 'korname','other_kor_name_in_WS_version', 'chnname', 'other_chi_name_in_WS_version', 'repeated_testtaker_lee', 'lee_uuid']].sort_values(by='pass_year')


Unnamed: 0,pass_year,korname,other_kor_name_in_WS_version,chnname,other_chi_name_in_WS_version,repeated_testtaker_lee,lee_uuid
a46c3d94-c08c-5ddb-8109-24c86eab2efd,1635.0,이성,이신,李垶,李,0.0,a46c3d94-c08c-5ddb-8109-24c86eab2efd
a46c3d94-c08c-5ddb-8109-24c86eab2efd,1635.0,이성,이예,李垶,李,0.0,a46c3d94-c08c-5ddb-8109-24c86eab2efd


In [476]:
# MERGING only on the year, 1-many ; and also the Korean name that we determined was most similar
master2 = pd.merge(df_similar_lee, df_similar_ws, indicator=True, how='inner', left_on=['pass_year', 'korname'], right_on = ['year', 'other_kor_name_in_Lee_version'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master2 = master2.drop(columns=['namehg1', 'namehj1', 'year'])
master2 = master2.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

len(master2), len(master2['lee_uuid'].unique()), len(master2['ws_uuid'].unique())

(134, 131, 132)

I will just drop all four of these because these duplicates are annoying and get in the way right now

In [477]:
master2[master2.duplicated(['ws_uuid'], keep=False)][['pass_year', 'kor_name','other_kor_name_in_WS_version', 'chi_name', 'other_chi_name_in_WS_version', 'repeated_testtaker_lee', 'lee_uuid', 'ws_uuid']].sort_values(by='pass_year')


Unnamed: 0,pass_year,kor_name,other_kor_name_in_WS_version,chi_name,other_chi_name_in_WS_version,repeated_testtaker_lee,lee_uuid,ws_uuid
43,1635.0,이성,이신,李垶,李,0.0,a46c3d94-c08c-5ddb-8109-24c86eab2efd,2202ff57-a1dd-522f-befe-078025b63fcb
44,1635.0,이성,이신,李垶,李,0.0,a46c3d94-c08c-5ddb-8109-24c86eab2efd,e935f2a7-258d-506b-ab77-e57a3685c3ad
45,1635.0,이성,이예,李垶,李,0.0,a46c3d94-c08c-5ddb-8109-24c86eab2efd,2202ff57-a1dd-522f-befe-078025b63fcb
46,1635.0,이성,이예,李垶,李,0.0,a46c3d94-c08c-5ddb-8109-24c86eab2efd,e935f2a7-258d-506b-ab77-e57a3685c3ad


In [478]:
master2 = master2.drop([43, 44, 45, 46])


In [479]:
master2[master2.duplicated(['ws_uuid'], keep=False)][['pass_year', 'kor_name','other_kor_name_in_WS_version', 'chi_name', 'other_chi_name_in_WS_version', 'repeated_testtaker_lee', 'lee_uuid', 'ws_uuid']].sort_values(by='pass_year')


Unnamed: 0,pass_year,kor_name,other_kor_name_in_WS_version,chi_name,other_chi_name_in_WS_version,repeated_testtaker_lee,lee_uuid,ws_uuid


In [480]:
# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge8 = master.copy()

In [481]:
# Update these values: 12 Left
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]
len(ws_unmerged),len(lee_unmerged)

(12, 548)

## `merge9` - Manual Merge!

In [584]:
master = merge8

In [585]:
len(ws_unmerged),len(lee_unmerged), len(master)

(12, 548, 14622)

In [586]:
14622 + 548 + 12

15182

In [587]:
master = master.filter(['lee_uuid', 'ws_uuid'])

For the ones that do not match, just add them with the corresponding uuid, and leave the other uuid blank

In [588]:
ws_unmerged_filter = ws_unmerged.filter(['ws_uuid'])
ws_unmerged_filter["lee_uuid"] = None

lee_unmerged_filter = lee_unmerged.filter(['lee_uuid'])
lee_unmerged_filter["ws_uuid"] = None

In [589]:
len(ws_unmerged_filter), len(lee_unmerged_filter)

(12, 548)

In [590]:
# Merge these with master
frames = [master, ws_unmerged_filter, lee_unmerged_filter]
master = pd.concat(frames, sort=False)

merge9 = master.copy()

In [591]:
len(master)

15182

# Clean up Master

In [592]:
len(master)

15182

In [593]:
master = master.reset_index(drop=True)

Last thing - each thing in master table gets an ID

In [594]:
master['uuid'] = [uuid.uuid4() for _ in range(len(master.index))]
master = master.set_index('uuid')

In [595]:
master.head()

Unnamed: 0_level_0,lee_uuid,ws_uuid
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1
89aa92b3-4343-420d-8d0b-2c648e24ce57,4b854af7-3d4b-51c9-802e-117ff4825291,660a2a20-d407-5ad0-b364-cd9e7edbb82c
2e90dc82-5b92-4762-a1f4-aadcc0761b7e,8a1e8a21-4d10-5ae3-b92d-51acfbb31ba6,85fe658e-5b35-50b1-a41b-aeb916ae13bb
defd7e17-7e15-457e-99f6-de7602627700,fecc75c4-0045-5485-b47c-851767e002e3,9b6c41ef-f960-5856-9a25-99c0f379e59f
3b7d01b1-4ffc-4648-9fbe-2afd3467d6db,8688b608-b1fb-55ab-a77e-a10268ac80af,ceff916a-aab9-58f5-a84e-9f424a5587f0
4d4981e8-d1be-487f-baf1-d98155806432,46963c7a-bb8b-52f1-a06a-f128133fe0ab,eb3b33fc-139f-5a8e-952e-c26a1c7b0b48


In [599]:
len(master['lee_uuid'].unique()), len(master['ws_uuid'].unique()), len(master)

(15152, 14608, 15182)

In [609]:
# Export
master.to_csv('/Users/yenniejun/Documents/code/JoseonMunkwa/data/Merged_Munkwa.csv')

In [608]:
master

Unnamed: 0_level_0,lee_uuid,ws_uuid
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1
89aa92b3-4343-420d-8d0b-2c648e24ce57,4b854af7-3d4b-51c9-802e-117ff4825291,660a2a20-d407-5ad0-b364-cd9e7edbb82c
2e90dc82-5b92-4762-a1f4-aadcc0761b7e,8a1e8a21-4d10-5ae3-b92d-51acfbb31ba6,85fe658e-5b35-50b1-a41b-aeb916ae13bb
defd7e17-7e15-457e-99f6-de7602627700,fecc75c4-0045-5485-b47c-851767e002e3,9b6c41ef-f960-5856-9a25-99c0f379e59f
3b7d01b1-4ffc-4648-9fbe-2afd3467d6db,8688b608-b1fb-55ab-a77e-a10268ac80af,ceff916a-aab9-58f5-a84e-9f424a5587f0
4d4981e8-d1be-487f-baf1-d98155806432,46963c7a-bb8b-52f1-a06a-f128133fe0ab,eb3b33fc-139f-5a8e-952e-c26a1c7b0b48
459c361e-be3f-4285-97fb-59a26b54b4e8,5a860ee8-ecb6-5702-9d17-1bcabacdf769,18ed6221-5556-5632-86ed-848515adeafb
3c5b10af-d99b-4d5b-b76c-d908d2ad93ad,62a2be53-4a81-5a7c-aa77-84e485b4d4a9,eb3ca555-53bc-507a-a8b2-bf549018afee
21d7c6f3-29ac-44c1-b115-8794cf063b63,656c27d1-89e6-5c88-a942-d9835ee49cfd,cfcbf8f9-04d1-521e-8ee9-9946c94e9ba6
292d2422-cbf3-40a3-a6bc-5a12186f3a75,b15efb59-4089-503c-bf88-dcacee144303,c241fb28-21f6-537c-a043-6c352077dc32
da1f2fd1-e291-4f80-a9d8-6ee1cb96ca3b,395b4a62-0a47-5a11-8888-a5e9a07c90cf,690945cd-c33a-5b55-871d-04ddd7461287
