In [3]:
import pandas as pd
import numpy as np
import re
import uuid

For Lee:
* `bpid`: unique id for each person
* `lee_uuid`: unique id for each instance of taking a test
* `repeated_testtaker`: True if the person has taken the test multiple times in the SAME YEAR, False otherwise


For WS:
* `ws_uuid`: unique id for each instance of taking a test. Directly tied to `id1`
* `repeated_testtaker`: True if the person has taken the test multiple times in the SAME YEAR, False otherwise
* Currently no unique id for each person


Goals:
* Need to detect and merge the same person from both datafiles
* Each instance of taking a test has a unique ID in each datafile

Edge Cases:
* One person takes exam multiple times in the same year. More instances in Lee than in WS --> Copy each person in WS to each instance in Lee
* One person takes exam multiple times in the same year. More instances in WS than in Lee --> Add extra rows to merged file, copying each instance in Lee to match each instance in WS

## Join the different tables of the Lee Version

In [900]:
bangmok = pd.read_csv('data/lee_bangmok.csv')
career = pd.read_csv('data/lee_bangmok_career.csv')
person = pd.read_csv('data/lee_bangmok_person.csv')
family = pd.read_csv('data/lee_bangmok_family.csv')

In [901]:
len(bangmok), len(career), len(person), len(family)

(15151, 15151, 14638, 70468)

In [902]:
# merge bangmok and career
bangmok_career = pd.merge(bangmok,career, on='bid', how='outer')

In [903]:
eq = bangmok_career['bpid_x'] == bangmok_career['bpid_y']
[i for i, x in enumerate(eq) if not x]

# This means we can merge these two columns
bangmok_career = bangmok_career.rename(columns={"bpid_x":"bpid"}).drop(columns=["bpid_y"])

In [904]:
# merge bankmok, career, person
bangmok_career_person = pd.merge(bangmok_career,person,on='bpid',how='left')

In [905]:
# Not sure how to merge family so will not do that for now
len(family['source']), len(family['source'].unique())

(70468, 14634)

In [906]:
data_lee = bangmok_career_person
len(data_lee), len(data_lee['bid'].unique())

(15151, 15151)

Create UUID for each unique BID. This UUID is based off of a hash from the BID, so it will be the same for each BID

In [907]:
data_lee['uuid'] = [uuid.uuid5(uuid.NAMESPACE_DNS,x) for x in data_lee['bid']]
data_lee = data_lee.set_index('uuid')
data_lee['lee_uuid'] = data_lee.index

In [908]:
# Verify there are no duplicates
data_lee[data_lee.duplicated()]

Unnamed: 0_level_0,bid,bpid,affilliation,rank,competitors,exam_type,pass_year,king,lid,previous,...,solardate,bcid,gid,chnname,korname,family_clan,birth,death,plastic,lee_uuid
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [909]:
# These are the people who took the same example multiple times in the SAME YEAR 
# i.e. they are the duplicates of the korean name/chinese name/pass year. 
data_lee[data_lee[['korname', 'chnname', 'pass_year']].duplicated(keep=False)][['korname', 'chnname', 'pass_year']].sort_values(by=['pass_year', 'korname'])
                                                                                                                                    

Unnamed: 0_level_0,korname,chnname,pass_year
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bce827d1-0d70-5d9b-8c3e-47692a94056e,이승소,李承召,1447
4dd2ba02-6605-52de-aa90-baedc1786856,이승소,李承召,1447
8b8eb1bb-dd28-5ead-8e78-3396a662b7c8,정종소,鄭從韶,1447
c11cc3b8-55a2-5c41-9cf3-4c1c7539a08f,정종소,鄭從韶,1447
1dc793b9-084b-52d0-bb6c-fb8d8b310f54,강희맹,姜希孟,1466
e324aaec-07fc-5a90-aabf-f85d70ce7182,강희맹,姜希孟,1466
b241eaf2-0615-58c0-b079-335335674107,김극검,金克儉,1466
7828bff3-2486-5330-8071-8bec0838a9ce,김극검,金克儉,1466
c52db075-f8bc-5bed-bc6e-4333771de0a0,김수온,金守溫,1466
ec7822da-ffb5-53b0-81c4-c2cc764e6fa2,김수온,金守溫,1466


In [910]:
# These are the people who took the test multiple times in any given year
data_lee[data_lee[['korname', 'chnname']].duplicated(keep=False)][['korname', 'chnname', 'pass_year']].sort_values(by=['korname', 'pass_year'])


Unnamed: 0_level_0,korname,chnname,pass_year
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
14a5a9ec-723b-5622-940b-e82f7e53c8d9,강경서,姜景叙,1477
169379ac-cd05-5e80-ac62-84aceec611d3,강경서,姜景叙,1497
ac2303ab-18c3-5e66-a621-5ed2f09cd9a0,강극성,姜克誠,1553
c4151007-f5a6-589c-88b8-29bc2cef77bd,강극성,姜克誠,1556
09e1cd5c-3920-5168-bf14-f5c4d1ef1db1,강문회,姜文會,1469
ae760054-22e6-59df-9255-8f07d3abecd5,강문회,姜文會,1771
6224c725-bf65-5e57-b71a-1cad21155d9c,강백연,姜栢年,1627
a893081e-9120-50c8-a8b1-a0d4fe2609c3,강백연,姜栢年,1646
04c1d22e-66d4-5bd2-9e23-22b1b9bb7fb6,강세귀,姜世龜,1678
9ae547cb-6379-5243-9135-c4c55efd54c5,강세귀,姜世龜,1679


In [911]:
# Marking those who took the test more than once as such
data_lee['repeated_testtaker'] = data_lee[['korname', 'chnname','pass_year']].duplicated(keep=False)

In [912]:
# This is the Lee file
data_lee.to_csv('/Users/yenniejun/Documents/code/JoseonMunkwa/data/Lee_Munkwa.csv')

In [17]:
len(data_lee)

15151

In [18]:
# data_lee.head(3).T

## Clean up WS Version

In [891]:
data_ws = pd.read_csv('data/WS_Munkwa.csv')
# Ensure there are no duplicate IDs
len(data_ws), len(data_ws['id1'].unique()), len(data_ws['source'].unique())

(14607, 14607, 10591)

Create UUID for each unique BID. This UUID is based off converting the id to a string because that is the only unique value we have in this df. Since this isn't the most secure, we could change this to be random. But I don't think it matters much

In [892]:
data_ws['uuid'] = [uuid.uuid5(uuid.NAMESPACE_DNS,str(x)) for x in data_ws['id1']]
data_ws = data_ws.set_index('uuid');
data_ws['ws_uuid'] = data_ws.index

In [893]:
# Verify there are no duplicates
data_ws[data_ws.duplicated()]

Unnamed: 0_level_0,id1,namehg1,namehj1,year,prevdegreehg,prevdegreehj,source,courtesynamehg,courtesynamehj,ancestralseathg,...,ancestralseathj,choronymhj,addresshg,addresshj,pennamehg,pennamehj,posthumoustitlehg,posthumoustitlehj,error,ws_uuid
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [894]:
# These are the people who took the same example multiple times in the SAME YEAR 
# i.e. they are the duplicates of the korean name/chinese name/pass year. 
data_ws[data_ws[['namehj1', 'namehg1', 'year']].duplicated(keep=False)][['namehg1', 'namehj1', 'year']].sort_values(by=['year', 'namehg1'])


Unnamed: 0_level_0,namehg1,namehj1,year
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a28023af-8791-5c39-9f49-2856c57cf410,이목,李,1612.0
1da79682-6fb5-5029-bca5-9969dd572b9b,이목,李,1612.0
4f879759-7396-5b4f-8d6c-2a7c57ba1754,김익진,金益振,1651.0
a29d8bb7-b3e1-5a90-b9db-29c779ae4513,김익진,金益振,1651.0
4e6fe550-8d80-5c69-89a4-5071bf539533,이육,李堉,1740.0
cea144cd-a435-5bdd-a69c-7f7cc1a37de7,이육,李堉,1740.0
4bd1bb73-339b-58bb-921a-ae2e33f6bfb4,홍종협,洪鍾協,1880.0
c0b57392-53c2-54a4-b0e4-e58e0f5e9e14,홍종협,洪鍾協,1880.0
26044751-d6bd-534a-ae6b-965fc8389ed4,이윤재,李允在,1887.0
5af472e3-2855-5288-851d-1b8698aae187,이윤재,李允在,1887.0


In [895]:
# Marking those who took the test more than once as such in the SAME YEAR
data_ws['repeated_testtaker'] = data_ws[['namehj1', 'namehg1', 'year']].duplicated(keep=False)

## Clean up encodings

In [899]:
lee_character = '金'
ws_character = '金'

data_ws = data_ws.replace(to_replace=r'金', value=ws_character, regex=True)
                                                                
# ws_unmerged.loc['namehj1'] = ws_unmerged.loc['namehj1'].str.replace(lee_character, ws_character)

## Merge the Overlapping Areas of the Two Versions

We want to create a master table, where we merge the WS into the Lee version

We want to:
* Make a list of different name spellings, if they exist
* Indicate if a person is in Lee but not in WS
* Indicate if a person is in WS but not Lee

In [1008]:
data_lee[(data_lee['chnname'] == '宋介臣') & (data_lee['korname'] == '송개신') & (data_lee['pass_year'] == 1393)]

Unnamed: 0_level_0,bid,bpid,affilliation,rank,competitors,exam_type,pass_year,king,lid,previous,...,bcid,gid,chnname,korname,family_clan,birth,death,plastic,lee_uuid,repeated_testtaker
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2827f0d2-c539-5988-9836-6ab6d955b060,http://people.aks.ac.kr/front/dirSer/exm/exmVi...,G002+AKS-KHF_13C1A1AC1CC2E0B1373X0,문과,1,33,식년시,1393,태조,96,생원(生員),...,bc4676,2229,宋介臣,송개신,홍주(洪州),1373.0,,silver,2827f0d2-c539-5988-9836-6ab6d955b060,False


In [1009]:
def person_exists(kor_name, chi_name, exam_year):
    lee_person = data_lee[(data_lee['chnname'] == chi_name) 
                          & (data_lee['korname'] == kor_name) 
                          & (data_lee['pass_year'] == exam_year)]
        
    ws_person = data_ws[(data_ws['namehj1'] == chi_name) 
                          & (data_ws['namehg1'] == kor_name) 
                          & (data_ws['year'] == exam_year)]
    
    return lee_person, ws_person

In [1010]:
len(data_lee), len(data_lee['lee_uuid'].unique())

(15151, 15151)

In [1011]:
# First, inner merge on the Korean name, Chinese name, and exam year
master = pd.merge(data_lee, data_ws,  indicator=True, how='inner', left_on=['korname','chnname','pass_year'], right_on = ['namehg1','namehj1','year'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master = master.drop(columns=['namehg1', 'namehj1', 'year'])
master = master.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

# For an edge case where there are repeats of the same person in both datasets, we get double the amount of
# repeats. For example, person A in lee takes test twice in same year, person A in WS takes twice in same year
# Then we end up with 4 entries, when we want 2, thus we will drop the duplicates
master = master.drop_duplicates(subset=['lee_uuid'])

len(master), len(master['lee_uuid'].unique()), len(master['ws_uuid'].unique())

(12052, 12052, 12043)

In [1012]:
# master.head().T

In [1014]:
merge1 = master.copy()

In [1015]:
len(master), len(master['lee_uuid'].unique()), len(master['ws_uuid'].unique())

(12052, 12052, 12043)

## Edge case for repeats

Cases:
* Lee: True, WS: False -- This case is okay. We will match the repeats with each other and de-duplicate
* Lee: False, WS: True -- Luckily, there is only one case of this, for 이윤재. Will just manually add this guy to data_lee to have an extra repeat of this dude

In [1016]:
# Make sure number of times one person retook a test in the same year is the same across both datasets
for index, row in data_lee.iterrows():
    exam_year = row['pass_year']
    chi_name = row['chnname']
    kor_name = row['korname']
    repeated = row['repeated_testtaker']
    
    # only matching Chinese name and exam year
    ws_person = data_ws[(data_ws['namehj1'] == chi_name) & (data_ws['year'] == exam_year)]
    if (len(ws_person) > 0):
        if (repeated != ws_person['repeated_testtaker'][0]):
            print(kor_name, exam_year, repeated, ws_person['repeated_testtaker'][0])
    

최호 1616 True 0.0
최호 1616 True 0.0
이승소 1447 True 0.0
정종소 1447 True 0.0
정종소 1447 True 0.0
이승소 1447 True 0.0
신승선 1466 True 0.0
신승선 1466 True 0.0
안대진 1586 True 0.0
안대진 1586 True 0.0
이윤재 1887 False 1.0
유내 1727 True 0.0
이제 1699 True 0.0
이제 1699 True 0.0
유내 1727 True 0.0


The following is an edge case. We have 8 different `ws_uuid` duplicated. This is shown below as happening because in Lee version, the person shows up as a repeated test taker, but in the WS version, the person is NOT shown up as a repeated test taker. So this duplication is OK to have.

In [1018]:
len(master[master['ws_uuid'].duplicated(keep=False)])

18

In [1019]:
master[master['ws_uuid'].duplicated(keep=False)][['kor_name', 'chi_name', 'pass_year', 'lee_uuid', 'ws_uuid', 'repeated_testtaker_x', 'repeated_testtaker_y']]

Unnamed: 0,kor_name,chi_name,pass_year,lee_uuid,ws_uuid,repeated_testtaker_x,repeated_testtaker_y
1110,최호,崔濩,1616,be273ba9-ceaf-5b7a-9d1e-ea34a4c48a7f,094f1a35-0dca-505f-bc57-e8c5bfe049bb,True,0.0
1111,최호,崔濩,1616,e8014037-f2ac-5686-beed-92a2cc2d9e91,094f1a35-0dca-505f-bc57-e8c5bfe049bb,True,0.0
4149,이승소,李承召,1447,bce827d1-0d70-5d9b-8c3e-47692a94056e,c0c3ba67-8bd0-5bb5-b5c2-49e8bada41ad,True,0.0
4150,이승소,李承召,1447,4dd2ba02-6605-52de-aa90-baedc1786856,c0c3ba67-8bd0-5bb5-b5c2-49e8bada41ad,True,0.0
4158,정종소,鄭從韶,1447,8b8eb1bb-dd28-5ead-8e78-3396a662b7c8,13ba0017-fd73-5524-9d09-97f2da35f6d7,True,0.0
4159,정종소,鄭從韶,1447,c11cc3b8-55a2-5c41-9cf3-4c1c7539a08f,13ba0017-fd73-5524-9d09-97f2da35f6d7,True,0.0
4383,신승선,愼承善,1466,4f8d3d5e-2087-5a3d-a49e-3f7784455da2,cb010d29-ea27-51ba-ad55-82d7e7fbcaf8,True,0.0
4384,신승선,愼承善,1466,2c1b6dc5-2624-51e2-a086-391d0ed444aa,cb010d29-ea27-51ba-ad55-82d7e7fbcaf8,True,0.0
5424,안대진,安大進,1586,2a509d8c-3cad-52b7-9251-9eaa2641e22a,548b703b-a662-5979-ad64-4d19775e4878,True,0.0
5425,안대진,安大進,1586,28022a2b-925f-538f-b69a-a433c686c89c,548b703b-a662-5979-ad64-4d19775e4878,True,0.0


Manually for 이윤재, make extra row of him in master table to indicate that this guy has taken the exam twice

In [1020]:
(len(data_lee[data_lee['korname'] == '이윤재']), 
len(data_ws[data_ws['namehg1'] == '이윤재']), 
len(master[master['kor_name'] == '이윤재']))

(1, 2, 1)

Add the one missing guy to master table

In [1021]:
from_ws['id1'][0]

13974.0

In [1022]:
from_ws = data_ws[data_ws['namehg1'] == '이윤재'][1:2]
missing_man = master[master['kor_name'] == '이윤재'].copy()
missing_man['ws_uuid'] = from_ws['ws_uuid'][0]
missing_man['id1'] = from_ws['id1'][0]
master = master.append(missing_man)

In [1023]:
(len(data_lee[data_lee['korname'] == '이윤재']), 
len(data_ws[data_ws['namehg1'] == '이윤재']), 
len(master[master['kor_name'] == '이윤재']))

(1, 2, 2)

## Manually Merge the Rest

Next, do a manual merge of the ones that were not merged

* Find those in data_lee not in master
* Find those in data_ws not in master
* See if any in data_ws meets 2/3 criteria for those in data_lee, and if so, add them to master

### Korean names are similar 2/3 of time

In [1024]:
# find the unique IDs in data_lee not in master
# len(data_lee['lee_uuid'].unique()), len(master['lee_uuid'].unique())
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
len(lee_unmerged)

3099

In [1025]:
# Sanity Check
# 4727 + 10424 = 15151 + 1 (the extra 1 is the repeat we had to add)
assert (len(lee_unmerged) + len(master) == len(data_lee) + 1)

In [1026]:
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]
len(ws_unmerged)

2566

In [1027]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [1028]:
# Find the dudes in Lee with same Chinese name and exam year as WS dudes
# If their Korean names are at least 60 percent similar, then add them to the merged master table
# TODO - add the UUID of the WS dude to the master table
# TODO - add a list of dictionary to master table including dude's other names, and what other sources these other names come from

These are the Korean names in WS that ALMOST match the names in Lee but are SUPER close but are not the same name. Can manually check the following list to make sure I'm right but most of them look like they're the same person

In [1029]:
# Make sure number of times one person retook a test in the same year is the same across both datasets
# for index, row in lee_unmerged.iterrows():
#     exam_year = row['pass_year']
#     chi_name = row['chnname']
#     kor_name = row['korname']
#     repeated = row['repeated_testtaker']
    
#     # only matching Chinese name and exam year
#     ws_person = ws_unmerged[(ws_unmerged['namehj1'] == chi_name) & (ws_unmerged['year'] == exam_year)]
#     if (len(ws_person) > 0):
#         if (repeated != ws_person['repeated_testtaker'][0]):
#             print(kor_name, exam_year, repeated, ws_person['repeated_testtaker'][0])
    

In [1030]:
df_similar_kor_name_lee = pd.DataFrame([])
df_similar_kor_name_ws = pd.DataFrame([])

# Iterate through each person in lee_unmerged
# Find that person in ws_person based on Chinese name and exam year
# Determine if if the person has similar Korean names, they are one person
# Look at the repeated_testtaker. If same numbers, fine
# If different numbers, take the larger of the values
# I guess for now, add repeat
for index, row in lee_unmerged.iterrows():
    exam_year = row['pass_year']
    chi_name = row['chnname']
    kor_name = row['korname']
    repeated = row['repeated_testtaker']
    
    # only matching Chinese name and exam year
    ws_person = ws_unmerged[(ws_unmerged['namehj1'] == chi_name) & (ws_unmerged['year'] == exam_year)]
    
    for person_index in range(len(ws_person)):
        if (similar(ws_person["namehg1"][person_index], kor_name) > 0.5):
            print(ws_person["namehg1"][person_index], kor_name, ws_person["ws_uuid"][person_index])
            row_copy = row.copy()
            row_copy['other_kor_name_in_WS_version'] = ws_person["namehg1"][person_index]
            row_copy['other_kor_name_in_WS_version_uuid'] = ws_person["ws_uuid"][person_index]
            df_similar_kor_name_lee = df_similar_kor_name_lee.append(row_copy)    
            df_similar_kor_name_ws = df_similar_kor_name_ws.append(ws_person)
        
       

김창륜 김창윤 ceff916a-aab9-58f5-a84e-9f424a5587f0
김여량 김여양 39032670-be49-5d62-8ecf-5d1373a8fa1e
박룡배 박용배 74caa469-d2ba-5a1b-9ae2-a9a3c63923b4
김원로 김원노 b256c444-6821-5192-9a8d-4257a45b8610
이언렬 이언열 0b347dc2-5a62-503f-bef1-1ddf590787eb
김종리 김종이 a5a4ee27-4652-5f7d-9e4d-652a965d288e
정윤로 정윤노 619a5b3a-5ec8-5ff7-b0b1-5070a7c17694
최맹량 최맹양 882ba5b2-4cff-521c-a603-305f9a1cebec
김려하 김여하 5d8e0b6c-6a9e-57a4-a0b1-75fbb09e50ae
유상리 유상이 df33a324-1b9a-55b1-9c63-10259a0d995e
김니로 김니노 fc23754d-ea1e-5902-a6f0-92f3390e5432
반제로 반제노 a18dd37e-b36e-5b63-a041-fe82b07d719e
강제로 강제노 0c3eabe9-04a7-5f84-a9d8-87654a519683
양봉래 양봉내 51e108c8-a11c-57ce-a179-1168ba501b6a
권칠림 권칠임 4d9ddd8a-2e2e-5ccc-91f9-7908b447765c
안자립 안자입 39711849-c869-54b7-b898-d35f504c0fd9
최수로 최수노 a3d3d396-dbf3-5493-bc06-cd191f2e8426
문여량 문여양 5a7ce886-1cec-57de-b0ed-4b60d46f7ac3
김중량 김중양 9f12959c-773e-52cc-8c3c-979d8344a1f6
김상렴 김상염 3f4c82dc-6349-51ef-a4e0-2f3bb8475806
강원량 강원양 3dfd158d-15bf-5d09-a919-086ce7b4ead9
이선로 이선노 b5ce283c-4c4a-547e-a2fd-57490a61f902
손계륜 손계윤 17

이종렬 이종열 d601d599-af18-5f0f-9bc3-70262faf6215
오석령 오석영 7efd1fe7-a2f3-5ae0-9999-f34217b8c46d
이한룡 이한용 7c423e51-0cb7-5885-bfc8-434d20937dbf
김리길 김이길 a0b81518-6207-5c1e-a070-44abdfcaafea
김린채 김인채 9335b15c-d87e-59bf-b28a-04e953bb0231
김용련 김용연 b4e8785e-e382-5ffa-9d2f-72d7bc7f642a
정량한 정양한 65d8dcac-7d6a-5f60-8a68-4d49e6e5bd79
김하련 김하연 a9f08f3d-2efa-5700-a3cc-df38b71095c3
박언룡 박언용 2a624f89-f071-502c-a088-fe80668a2d52
김정룡 김정용 5c7b7d23-db2a-535a-9a7b-651139555c0e
이성륜 이성윤 d0a05901-6b40-5013-b131-cd2713a9a65a
안봉래 안봉내 188fa2ff-a99e-5c71-9f91-a034a9383d14
정래성 정내성 bd048f28-1eba-5170-9caf-c1422f567000
김량척 김양척 3dbcab37-9e9a-5edd-a8c7-bb0d013699bc
민승룡 민승용 37d9e497-554b-5b81-a3e8-53d20353231f
이종렬 이종열 189289e0-96d9-59f8-af3b-ac4ff59103ab
김락일 김낙일 5ffda620-604a-51a7-938d-d3e1183cdd10
이룡주 이용주 7282153a-bd23-50f1-836e-4773c3bf8808
민치룡 민치용 b6f563c5-c979-578b-9480-de505ad4797e
민광로 민광노 1b2de68e-56e1-51ac-9e99-895b61e7955a
김형린 김형인 229e1a6d-30e1-50eb-bb4c-775d0111f167
이익렬 이익열 affa5ba2-9854-57bd-aeb9-74872d2ea733
김상려 김상여 07

지봉령 지봉영 da482a8f-780f-5c04-b4e9-1de59612d7d8
박시룡 박시용 bb2386b9-a7f8-5d47-a991-5e6b29b538b4
박룡표 박용표 4376d325-5ac6-57e1-b950-a2fc4a69be4a
계룡혁 계용혁 e72aa758-7d50-533c-ad5a-8fc66ecb9a8e
최영룡 최영용 4a018cca-3ec5-51f6-a6e9-3acd73403c28
장리관 장이관 ff96f9ae-3641-5a56-a3cb-4e69ca37f865
이병룡 이병용 5fc65e5c-82f7-5c08-ae83-e0075b13e55c
김려현 김여현 41b04aa9-c96d-5720-938a-5569cfc9ded6
김정룡 김정용 5028212d-f158-5a4c-b5fb-cf69a0bada56
조종룡 조종용 35929049-3839-5537-88ab-27f179d29c55
최우락 최우낙 4e2d3756-c8a4-597b-80b4-30a4c5811ab9
김룡기 김용기 bd79fcf4-d5ce-5390-84c8-19da94f12296
김명뢰 김명뇌 59029d73-c819-571e-8efc-304c353deb93
박례병 박예병 7e77065b-ae56-5b67-8fed-c7ee9f4babd8
천광록 천광녹 5bd5d616-e51c-59e0-a6f4-42a9f2015359
오년근 오연근 ca542852-d5a7-5d40-9e9d-c7f339594583
정재룡 정재용 b3ec017a-f367-53c8-9a11-d20952f733e9
장치량 장치양 fd2481d4-768c-5705-9d3f-3afbcc523e3c
김성룡 김성용 9f5d0119-2c99-541b-a2c2-3c216f541171
오계련 오계연 92fd929e-fa55-50b7-bab9-d7edde464464
김홍락 김홍낙 0da716e9-9932-5aff-93be-b51f6e9ac246
서상룡 서상용 8be805be-712d-53a2-b8e9-fdbd3be89eb2
윤경룡 윤경용 3e

김로진 김노진 6e3217ac-4844-5bd6-bb36-f77d2d6a32f9
임학령 임학영 707002e6-120c-57ea-91bf-2df21e48dce1
이유년 이유연 d9a605f5-d381-5e6b-9f95-a3c9f58a16a9
김극녕 김극영 4aeab85e-0c86-5d41-8ccb-3db692241d59
유신로 유신노 7a5a4835-9d04-5440-8b57-ec42759e80a2
김제룡 김제용 5eafd74e-7f04-55e3-a58e-009345267a5b
전명룡 전명용 68735a53-8bfb-53f7-ade5-bad21a0cb82a
윤홍립 윤홍입 ec196611-0cba-5bee-8cad-daf2a72e27b8
김하량 김하양 7ca4243e-9272-5d3e-a5e5-06dbfa727bc9
이흥록 이흥녹 3de5819f-bb2f-5c52-b604-3dc0c1a1d672
조시량 조시양 84ce92ab-20b3-59f7-b7b1-8fc5b2847aab
송정렴 송정염 114f6253-facf-52f6-8a25-1612cf16ab94
황립신 황입신 aecbaa49-058f-511a-9e51-87a3f13a6826
민여로 민여노 14e9f135-f91b-52b9-85d5-c63163b478c8
한리겸 한이겸 05d174f2-887e-5f22-a215-0309bea6aac6
윤리지 윤이지 9f3c84c6-547f-5b47-bada-5824e3b83c95
남몽뢰 남몽뇌 b9875f7f-9765-5f03-be3e-282bc453e569
유경립 유경입 82343510-975a-5560-976a-9e4778048c4b
이운림 이운임 7894cf6c-b162-5841-9f80-3a5a9ec9bad0
곽룡백 곽용백 41e7428d-6365-5131-9740-ddb498535c0b
김명룡 김명용 8550adc6-f39c-5d84-b9c6-431a3e99bb7c
성진령 성진영 bcddf0ff-e4ff-5b1b-b50c-18e6356b9063
임홍량 임홍양 60

윤춘년 윤춘연 8140228c-df5d-5c7a-88bb-4cf809f3af2e
김덕룡 김덕용 3883d966-6e17-567d-b494-eb56548b7e5b
정사량 정사양 218381e5-29b8-53f2-929a-9df3dfa6f79a
신국량 신국양 35f5c4be-eedc-52d6-bb93-74e0e2ebe976
이희량 이희양 7b0848d4-fe54-591e-9947-999dd5fc5a0c
김억령 김억영 d2d042da-8c2c-5cc7-80ef-a739429f18cf
조응룡 조응용 41ace463-3dc8-5155-bdb5-c91055eac93d
박소립 박소입 151f4a64-44da-5860-babd-f9dd57f36643
박희립 박희입 d928086c-2534-5a0b-bf7c-34190976ce2c
하응림 하응임 70fec4b0-9e8d-56db-ae56-90d616f3eac0
박란영 박난영 be023642-cae5-5bb8-a14e-2532cb323ef9
유희림 유희임 b9765a8d-1aae-56b0-9b08-aa002d3a4de7
임국로 임국노 f6a95b60-36c7-5c8c-8d32-caac85bdfe72
정희룡 정희용 b08755cd-00ab-5577-9d31-9bc3ce73cc6b
김례종 김예종 15c5e065-1e56-56ba-968d-5077f607d4dc
유성룡 유성용 fd47bb96-ecf3-5b55-a0f7-c96a7087a6bc
이경린 이경인 566a1b20-59ce-50a7-abff-92a543ee03ed
유영립 유영입 8e6c4450-c039-579f-a4f8-e015d0a6b559
임영로 임영노 d851e023-09be-55f8-b096-7f2bac37ea4d
홍종록 홍종녹 fd2fadc7-d1da-53fc-a594-20ef25b908d5
박동로 박동노 1183a060-4410-596f-a75c-9030d7a3c524
유동립 유동입 2576cfce-b227-5256-8bdc-deb894f80f85
조응록 조응녹 18

민항렬 민항열 28b4f6fa-a758-5ed9-9c87-040c7f6b1bb3
남강로 남강노 c5577c3d-ef88-55a8-bb3f-1efef52a5637
홍리건 홍이건 150a2201-b008-587b-bcb3-38222c6eacc3
김리정 김이정 4423648f-6a0b-5407-ac4d-369ce4752da8
김한로 김한노 afdce55c-f727-5f98-921b-f7b3544aed8b
김리용 김이용 2ba11654-0120-5865-9b27-4a5164b5ebb5
이량재 이양재 4bc2b7d5-5381-5544-8257-1ec63c965bc5
김로영 김노영 b6febc9c-e791-508a-8a63-68b7a7799d75
이경륜 이경윤 3c737104-00a2-5561-b992-341577aa87f5
신대년 신대연 7c820404-e2a8-57c9-9cc3-2b13b7fb3b94
서룡보 서용보 1df9a318-31b3-5494-9126-ac76a5287a65
서유련 서유연 0cc9ffdd-9608-59e8-b0c3-6c42e6f0334f
김리후 김이후 c4199cea-49fc-59b1-b4e6-35dadf3b325e
심념조 심염조 19aba08a-3b45-54dc-b236-531b30528c4e
김계락 김계낙 bcdff3a0-69eb-5ec0-bc24-eb4570851a93
정래백 정내백 69aea975-a7c5-55c1-93f1-eddd9a78c397
유한녕 유한영 8f6b1629-ea87-5f95-8181-34db9367d33a
김락순 김낙순 1ef81ef8-8b30-5c9c-b1f9-3f6a1937fcf1
윤익렬 윤익열 a59a73de-81c3-5103-91af-edd4946d65da
박륜수 박윤수 d345b217-f397-5870-b89e-fc9f3fbb6f67
이해린 이해인 7e1c7b1a-5c14-580f-8623-ddf09ad01829
남리익 남이익 b07a2ffd-b276-5b9d-9593-f824805ff19c
김리교 김이교 4b

In [1031]:
(len(df_similar_kor_name_lee),len(df_similar_kor_name_ws))

(1050, 1050)

In [1032]:
assert(len(df_similar_kor_name_lee) == len(df_similar_kor_name_ws))

In [1034]:
# MERGING only on Chinese name and year
master2 = pd.merge(df_similar_kor_name_lee, df_similar_kor_name_ws, indicator=True, how='inner', left_on=['chnname','pass_year'], right_on = ['namehj1','year'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master2 = master2.drop(columns=['namehg1', 'namehj1', 'year'])
master2 = master2.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

len(master2), len(master2['lee_uuid'].unique()), len(master2['ws_uuid'].unique())

(1050, 1050, 1050)

In [1036]:
# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge2 = master.copy()

In [1037]:
len(master[~master['other_kor_name_in_WS_version'].isnull()])

2100

In [1038]:
# Update these values
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]
len(ws_unmerged),len(lee_unmerged)

(1516, 2049)

In [1039]:
# We still have 3337 in data_ws that cannot be matched with data_lee
len(data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())])

1516

### Chinese names are similar 2/3 of time

These are the CHINESE names in WS that ALMOST match the names in Lee but are SUPER close but are not the same name. Can manually check the following list to make sure I'm right but most of them look like they're the same person

In [1040]:
df_similar_chi_name_lee = pd.DataFrame([])
df_similar_chi_name_ws = pd.DataFrame([])

In [1041]:
# Make sure number of times one person retook a test in the same year is the same across both datasets
# Same as above but for Chinese names
for index, row in lee_unmerged.iterrows():
    exam_year = row['pass_year']
    chi_name = row['chnname']
    kor_name = row['korname']
    repeated = row['repeated_testtaker']
    
    # only matching Chinese name and exam year
    ws_person = ws_unmerged[(ws_unmerged['namehg1'] == kor_name) & (ws_unmerged['year'] == exam_year)]
    if (len(ws_person) > 0):
        if (repeated != ws_person['repeated_testtaker'][0]):
            print(kor_name, chi_name, exam_year, repeated, ws_person['repeated_testtaker'][0])
    

이목 李楘 1612 False 1.0
이목 李莯 1612 False 1.0


In [1042]:
ws_unmerged[(ws_unmerged['year'] == 1612)][["namehj1", "namehg1", "year"]].sort_values(['namehg1'])

Unnamed: 0_level_0,namehj1,namehg1,year
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
f2885d9a-2896-53d4-b5b7-179641a51cef,權省吾,권생오,1612.0
d977fe7c-544d-5a03-b6e1-00281472a94e,金寧,김녕,1612.0
67583335-837c-598e-bb55-5d3dea7c85e5,金,김령,1612.0
e744163d-f908-5175-8004-65e4b881f08b,南宮,남궁경,1612.0
eb37f4eb-e102-5639-9309-1f9d9a54d2e9,申易于,신역우,1612.0
8b448c2e-c804-5616-8a60-5a8feb6e99e1,沈,심지,1612.0
5a91305f-c95c-5e51-8dde-3f7205b35227,吳,오숙,1612.0
ad4515cf-9fa2-565c-9fcc-ae8d3e29e252,李,이계,1612.0
a28023af-8791-5c39-9f49-2856c57cf410,李,이목,1612.0
1da79682-6fb5-5029-bca5-9969dd572b9b,李,이목,1612.0


In [1044]:
# Same as above, but with Chinese names

# TODO what to do if there are more than one people with similar names?

df_similar_chi_name_lee = pd.DataFrame([])
df_similar_chi_name_ws = pd.DataFrame([])

for index, row in lee_unmerged.iterrows():
    exam_year = row['pass_year']
    chi_name = row['chnname']
    kor_name = row['korname']
    
    # only matching Chinese name and exam year
    ws_person = ws_unmerged[(ws_unmerged['namehg1'] == kor_name) & (ws_unmerged['year'] == exam_year)]
    
    for person_index in range(len(ws_person)):
        if (similar(ws_person["namehj1"][person_index], chi_name) > 0.5):
            print(ws_person["namehj1"][person_index], chi_name, similar(ws_person["namehj1"][person_index], chi_name), ws_person['namehg1'][person_index])
            row_copy = row.copy()
            row_copy['other_chi_name_in_WS_version'] = ws_person.iloc[person_index]["namehj1"]
            row_copy['other_chi_name_in_WS_version_uuid'] = ws_person.iloc[person_index]["ws_uuid"]

            df_similar_chi_name_lee = df_similar_chi_name_lee.append(row_copy)
            df_similar_chi_name_ws = df_similar_chi_name_ws.append(ws_person.iloc[person_index])



李廷 李廷熽 0.8 이정소
鄭 鄭墀 0.6666666666666666 정지
李章 李章垕 0.8 이장후
許 許禝 0.6666666666666666 허직
金 金伈 0.6666666666666666 김심
朴 朴頎 0.6666666666666666 박기
田 田馪 0.6666666666666666 전빈
李 李敉 0.6666666666666666 이미
朴 朴耋 0.6666666666666666 박질
林 林薈 0.6666666666666666 임회
金 金鑌 0.6666666666666666 김빈
崔之 崔瀹之 0.8 최약지
金 金貔 0.6666666666666666 김비
金 金鏗 0.6666666666666666 김갱
朴 朴叙 0.6666666666666666 박서
李士 李士侗 0.8 이사동
金 金㽔 0.6666666666666666 김유
趙 趙忭 0.6666666666666666 조변
裵 裵爚 0.6666666666666666 배약
姜 姜仡 0.6666666666666666 강흘
洪疇 洪叙疇 0.8 홍서주
金時 金時霔 0.8 김시주
金 金煒 0.6666666666666666 김위
孔 孔頎 0.6666666666666666 공기
曹好智 曺好智 0.6666666666666666 조호지
金 金軺 0.6666666666666666 김초
呂 呂箎 0.6666666666666666 여호
崔 崔涐 0.6666666666666666 최아
金晋錫 金晉錫 0.6666666666666666 김진석
崔 崔湑 0.6666666666666666 최서
金壽 金鏗壽 0.8 김갱수
黃 黃㻶 0.6666666666666666 황필
李世 李世蓁 0.8 이세진
金 金瑊 0.6666666666666666 김감
許 許穧 0.6666666666666666 허제
南 南趎 0.6666666666666666 남주
姜 姜㶏 0.6666666666666666 강은
姜允權 姜胤權 0.6666666666666666 강윤권
梁 梁訒 0.6666666666666666 양인
李 李訔 0.6666666666666666 이은
林 林薈

閔 閔頤 0.6666666666666666 민이
金 金磶 0.6666666666666666 김석
李 李瑺 0.6666666666666666 이상
李 李黿 0.6666666666666666 이원
李 李㙉 0.6666666666666666 이전
姜 姜諿 0.6666666666666666 강집
姜 姜澂 0.6666666666666666 강징
安 安覯 0.6666666666666666 안구
金 金璫 0.6666666666666666 김당
金 金熠 0.6666666666666666 김습
李云 李云秠 0.8 이운비
閔 閔㥳 0.6666666666666666 민원
李 李堣 0.6666666666666666 이우
金克 金克愊 0.8 김극핍
李 李頫 0.6666666666666666 이부
權 權橃 0.6666666666666666 권벌
李 李蘋 0.6666666666666666 이빈
金 金磧 0.6666666666666666 김적
許 許硡 0.6666666666666666 허굉
申 申鏛 0.6666666666666666 신상
宋 宋澂 0.6666666666666666 송징
金 金璠 0.6666666666666666 김번
李 李迨 0.6666666666666666 이태
李 李胖 0.6666666666666666 이반
黃 黃𣉮 0.6666666666666666 황진
李之 李之蕡 0.8 이지분
金 金埏 0.6666666666666666 김연
金 金顒 0.6666666666666666 김옹
李 李構 0.6666666666666666 이구
林 林畡 0.6666666666666666 임해
金彦 金彦琚 0.8 김언거
許 許礦 0.6666666666666666 허광
趙邦禎 趙邦楨 0.6666666666666666 조방정
李 李畬 0.6666666666666666 이여
羅士 羅士愃 0.8 나사선
鄭 鄭鍠 0.6666666666666666 정굉
鄭 鄭荃 0.6666666666666666 정전
禹 禹鏛 0.6666666666666666 우상
柳仲 柳仲郢 0.8 유중영
申承 申承澡 0.8 신승조


朴世 朴世蓊 0.8 박세옹
尹世 尹世忱 0.8 윤세침
李賢 李賢讜 0.8 이현당
李 李秫 0.6666666666666666 이출
李無疆 李無彊 0.6666666666666666 이무강
閔 閔荃 0.6666666666666666 민전
李 李璖 0.6666666666666666 이거
尹 尹杲 0.6666666666666666 윤고
閔 閔箎 0.6666666666666666 민호
崔 崔堣 0.6666666666666666 최우
任 任珗 0.6666666666666666 임선
趙 趙𤩶 0.6666666666666666 조업
安 安馠 0.6666666666666666 안함
金 金虬 0.6666666666666666 김규
金士 金士銛 0.8 김사섬
鄭 鄭惕 0.6666666666666666 정척
柳 柳㙉 0.6666666666666666 유전
鄭 鄭䃫 0.6666666666666666 정담
李 李蘧 0.6666666666666666 이거
羅 羅恮 0.6666666666666666 나전
崔 崔顒 0.6666666666666666 최옹
具 具忭 0.6666666666666666 구변
崔弘 崔弘僴 0.8 최홍한
徐 徐崦 0.6666666666666666 서엄
李廷 李廷馣 0.8 이정암
李裕仁 李𥙿仁 0.6666666666666666 이유인
金 金澥 0.6666666666666666 김해
金 金戣 0.6666666666666666 김규
權 權𢢝 0.6666666666666666 권수
李 李璁 0.6666666666666666 이총
盧 盧㙉 0.6666666666666666 노전
許 許篈 0.6666666666666666 허봉
韓 韓顒 0.6666666666666666 한옹
韓 韓戭 0.6666666666666666 한인
羅允 羅允忱 0.8 나윤침
金 金僩 0.6666666666666666 김한
崔 崔錪 0.6666666666666666 최전
洪仁U 洪仁恕 0.6666666666666666 홍인서
柳 柳𧩦 0.6666666666666666 유극
尹 尹皞 0.666666666666

李 李㘾 0.6666666666666666 이우
睦仁 睦仁㘽 0.8 목인재
李裕膺 李𥙿膺 0.6666666666666666 이유응
林洙 林謇洙 0.8 임건수
洪承裕 洪承𥙿 0.6666666666666666 홍승유
趙定燮 趙定爕 0.6666666666666666 조정섭
李裕承 李𥙿承 0.6666666666666666 이유승
李承皐 李承臯 0.6666666666666666 이승고
姜 姜𧄽 0.6666666666666666 강찬
韓東 韓晢東 0.8 한절동
金 金㠎 0.6666666666666666 김집
金 金嶷 0.6666666666666666 김의
金翼 金隼翼 0.8 김준익
李丙 李丙逌 0.8 이병유
李大 李大頤 0.8 이대이
許 許㥌 0.6666666666666666 허국
玉晋輝 玉晉輝 0.6666666666666666 옥진휘
金昌 金昌鼐 0.8 김창내
徐道裕 徐道𥙿 0.6666666666666666 서도유
朴 朴鎤 0.6666666666666666 박황
李 李濰 0.6666666666666666 이유
權 權愃 0.6666666666666666 권선
李 李偰 0.6666666666666666 이설
張彦 張彦忱 0.8 장언침
權世 權世橚 0.8 권세숙
羅 羅㶷 0.6666666666666666 나순
朴柱 朴 0.6666666666666666 박기주
權 權偌 0.6666666666666666 권야
金 金琂 0.6666666666666666 김언
尹 尹㻻 0.6666666666666666 윤돈
李 李茳 0.6666666666666666 이강
琴 琴𢢜 0.6666666666666666 금업
李晩 李晩煃 0.8 이만규
張晋遠 張晉遠 0.6666666666666666 장진원
朴 朴嶔 0.6666666666666666 박금
閔 閔忱 0.6666666666666666 민침
李承 李㝡承 0.8 이최승
金晋鐸 金晉鐸 0.6666666666666666 김진탁
閔 閔惕 0.6666666666666666 민척
尹滋 尹滋畊 0.8 윤자경
權 權愭 0.6666666666666666 권기


In [1045]:
# THESE NEED TO BE THE SAME
assert(len(df_similar_chi_name_lee) == len(df_similar_chi_name_ws))
print(len(df_similar_chi_name_lee), len(df_similar_chi_name_ws))

1055 1055


In [1046]:
# There are repeates
len(df_similar_chi_name_lee['lee_uuid'].unique()),len(df_similar_chi_name_ws['ws_uuid'].unique())

(1053, 1051)

In [1047]:
df_similar_chi_name_lee[df_similar_chi_name_lee['lee_uuid'].duplicated(False)][['lee_uuid','korname', 'chnname', 'other_chi_name_in_WS_version', 'other_chi_name_in_WS_version_uuid']]


Unnamed: 0,lee_uuid,korname,chnname,other_chi_name_in_WS_version,other_chi_name_in_WS_version_uuid
c5b8a0c3-fe87-5004-b7bb-cd6d2c7da9be,c5b8a0c3-fe87-5004-b7bb-cd6d2c7da9be,이목,李楘,李,a28023af-8791-5c39-9f49-2856c57cf410
c5b8a0c3-fe87-5004-b7bb-cd6d2c7da9be,c5b8a0c3-fe87-5004-b7bb-cd6d2c7da9be,이목,李楘,李,1da79682-6fb5-5029-bca5-9969dd572b9b
a6382748-0883-5c51-afe7-1bebc60152f7,a6382748-0883-5c51-afe7-1bebc60152f7,이목,李莯,李,a28023af-8791-5c39-9f49-2856c57cf410
a6382748-0883-5c51-afe7-1bebc60152f7,a6382748-0883-5c51-afe7-1bebc60152f7,이목,李莯,李,1da79682-6fb5-5029-bca5-9969dd572b9b


In [1048]:
# Since there are so few I will just manually delete them
# GETTING RID OF REPEATS
df_similar_chi_name_lee = df_similar_chi_name_lee.reset_index()

i = df_similar_chi_name_lee[df_similar_chi_name_lee['lee_uuid'] == uuid.UUID('{c5b8a0c3-fe87-5004-b7bb-cd6d2c7da9be}')].index[0]
df_similar_chi_name_lee = df_similar_chi_name_lee.drop(i)

i = df_similar_chi_name_lee[df_similar_chi_name_lee['lee_uuid'] == uuid.UUID('{a6382748-0883-5c51-afe7-1bebc60152f7}')].index[1]
df_similar_chi_name_lee = df_similar_chi_name_lee.drop(i)


In [1049]:
# MERGING only on Korean name and year
master2 = pd.DataFrame([])
master2 = pd.merge(df_similar_chi_name_lee, df_similar_chi_name_ws, indicator=True, how='left', left_on=['korname','pass_year'], right_on = ['namehg1','year'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master2 = master2.drop(columns=['namehg1', 'namehj1', 'year'])
master2 = master2.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

# These numbers are unequal bc of all of the extra duplicates, etc
len(master2), len(master2['lee_uuid'].unique()), len(master2['ws_uuid'].unique())

(1063, 1053, 1051)

In [1050]:
# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge3 = master.copy()

In [1051]:
len(master[~master['other_chi_name_in_WS_version'].isnull()])

1063

In [1052]:
# Update these values
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]

len(ws_unmerged),len(lee_unmerged)

(465, 996)

In [1053]:
# We still have 467 in data_ws that cannot be matched with data_lee
len(data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())])

465

### Chinese names are similar, again, this time 50%

In [1054]:
# If Chinese names are the same and Korean names are FIFTY PERCENT (this is less than before)
# Use eyes to make sure these are right
# Manually choose the people who are the same

df_similar_chi_name_lee = pd.DataFrame([])
df_similar_chi_name_ws = pd.DataFrame([])

for index, row in lee_unmerged.iterrows():
    exam_year = row['pass_year']
    chi_name = row['chnname']
    kor_name = row['korname']
    
    # only matching Chinese name and exam year
    ws_person = ws_unmerged[(ws_unmerged['namehg1'] == kor_name) & (ws_unmerged['year'] == exam_year)]
    if (len(ws_person) > 0) :
        for person_index in range(len(ws_person)):
            if (similar(ws_person["namehj1"][person_index], chi_name) >= 0.5):
                print(ws_person["namehj1"][0], chi_name, similar(ws_person["namehj1"][0], chi_name), ws_person['namehg1'][0], row['korname'])
                row_copy = row.copy()
                row_copy['other_chi_name_in_WS_version'] = ws_person.iloc[person_index]["namehj1"]
                row_copy['other_chi_name_in_WS_version_uuid'] = ws_person.iloc[person_index]["ws_uuid"]

                df_similar_chi_name_lee = df_similar_chi_name_lee.append(row_copy)
                df_similar_chi_name_ws = df_similar_chi_name_ws.append(ws_person.iloc[person_index])
                

康輻 康福 0.5 강복 강복
安熹 安憙 0.5 안희 안희
張晋 張晉 0.5 장진 장진
李晋 李晉 0.5 이진 이진
李瑛 李𡎘 0.5 이영 이영
徐岡 徐崗 0.5 서강 서강
郭珣 郭恂 0.5 곽순 곽순
徐晋 徐晉 0.5 서진 서진
李韜 李蹈 0.5 이도 이도
韓卷 韓淃 0.5 한권 한권
康晋 康晉 0.5 강진 강진
成晋 成晉 0.5 성진 성진
孫澍 孫㴻 0.5 손주 손주
金沖 金冲 0.5 김충 김충
許晋 許晉 0.5 허진 허진
李沖 李冲 0.5 이충 이충
尹晋 尹晉 0.5 윤진 윤진
沈坪 沈枰 0.5 심평 심평
鄭協 鄭恊 0.5 정협 정협
鄭晋 鄭晉 0.5 정진 정진
李 李惕然 0.5 이척연 이척연
許潁 許頴 0.5 허영 허영
洪墩 洪墪 0.5 홍돈 홍돈
權華 權𠌶 0.5 권화 권화


In [1055]:
# THESE NEED TO BE THE SAME
assert(len(df_similar_chi_name_lee) == len(df_similar_chi_name_ws))
print(len(df_similar_chi_name_lee), len(df_similar_chi_name_ws))

24 24


In [1056]:
df_similar_chi_name_lee[df_similar_chi_name_lee['lee_uuid'].duplicated(False)][['lee_uuid','korname', 'chnname', 'other_chi_name_in_WS_version', 'other_chi_name_in_WS_version_uuid']]

Unnamed: 0,lee_uuid,korname,chnname,other_chi_name_in_WS_version,other_chi_name_in_WS_version_uuid


In [1058]:
# MERGING only on Korean name and year
master2 = pd.DataFrame([])
master2 = pd.merge(df_similar_chi_name_lee, df_similar_chi_name_ws, indicator=True, how='left', left_on=['korname','pass_year'], right_on = ['namehg1','year'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master2 = master2.drop(columns=['namehg1', 'namehj1', 'year'])
master2 = master2.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

# These numbers are unequal bc of all of the extra duplicates, etc
len(master2), len(master2['lee_uuid'].unique()), len(master2['ws_uuid'].unique())

# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge4 = master.copy()

In [1059]:
# Update these values
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]

len(ws_unmerged),len(lee_unmerged)

(441, 972)

### Korean names are similar, again, this time 50%

In [1060]:
df_similar_kor_name_lee = pd.DataFrame([])
df_similar_kor_name_ws = pd.DataFrame([])

# Iterate through each person in lee_unmerged
# Find that person in ws_person based on Chinese name and exam year
# Determine if if the person has similar Korean names, they are one person
# Look at the repeated_testtaker. If same numbers, fine
# If different numbers, take the larger of the values
# I guess for now, add repeat
for index, row in lee_unmerged.iterrows():
    exam_year = row['pass_year']
    chi_name = row['chnname']
    kor_name = row['korname']
    repeated = row['repeated_testtaker']
    
    # only matching Chinese name and exam year
    ws_person = ws_unmerged[(ws_unmerged['namehj1'] == chi_name) & (ws_unmerged['year'] == exam_year)]
    
    for person_index in range(len(ws_person)):
#         print(ws_person["namehg1"][person_index], kor_name, ws_person["namehj1"][person_index], similar(ws_person["namehg1"][person_index], kor_name))
        if (similar(ws_person["namehg1"][person_index], kor_name) >= 0.5):
            print(ws_person["namehg1"][person_index], kor_name, ws_person["namehj1"][person_index])
            row_copy = row.copy()
            row_copy['other_kor_name_in_WS_version'] = ws_person["namehg1"][person_index]
            row_copy['other_kor_name_in_WS_version_uuid'] = ws_person["ws_uuid"][person_index]
            df_similar_kor_name_lee = df_similar_kor_name_lee.append(row_copy)    
            df_similar_kor_name_ws = df_similar_kor_name_ws.append(ws_person)


김녕 김영 金寧
서륜 서윤 徐倫
강륜 강윤 姜綸
김렴 김염 金濂
이뢰 이뇌 李賴
강리 강이 康理
박린 박인 朴璘
김률 김율 金慄
옥려 옥여 玉礪
황린 황인 黃璘
이류 이유 李溜
표윤 표빈 表贇
권륜 권윤 權綸
허량 허양 許亮
이란 이난 李蘭
박률 박율 朴慄
김녕 김영 金寧
봉륜 봉윤 奉綸
정윤 정빈 鄭贇
김렬 김열 金洌
박린 박인 朴璘
강린 강인 姜麟
유송 유책 柳
권륙 권육 權
김련 김연 金輦
승륜 승윤 承綸
허륜 허윤 許綸
정려 정여 鄭旅
조녕 조영 趙寧
채륜 채윤 蔡倫
박려 박여 朴旅
강로 강노 姜老
권륜 권윤 權綸
권람 권남 權擥
김려 김여 金礪
박량 박양 朴良
권률 권율 權慄
이륙 이육 李陸
이래 이내 李徠
김륜 김윤 金崙
오릉 오능 吳凌
안림 안임 安琳
정륜 정윤 鄭綸
권류 권유 權瑠
최린 최인 崔潾
남률 남율 南慄
신류 신유 申瑠
임류 임유 任瀏
이령 이영 李翎
채락 채낙 蔡洛
박란 박난 朴蘭
임려 임여 任呂
신련 신연 辛璉
박률 박율 朴栗
김렴 김염 金濂
신륜 신윤 辛崙
장령 장영 張翎
이량 이양 李樑
이령 이영 李翎
권률 권율 權慄
이로 이노 李魯
이렴 이염 李濂
황뉴 황유 黃紐
한련 한연 韓璉
신률 신율 申慄
황림 황임 黃琳
김련 김연 金璉
윤리 윤이 尹理
윤윤 윤빈 尹贇
이류 이유 李瀏
신로 신노 申魯
강련 강연 姜鍊
최린 최인 崔璘
황념 황염 黃恬
임량 임양 任樑
성륜 성윤 成倫
김로 김노 金魯
이로 이노 李輅
성락 성낙 成洛
박렴 박염 朴簾
이립 이입 李砬
이윤 이빈 李贇
정륜 정윤 鄭崙
최렴 최염 崔濂
황락 황낙 黃洛
이람 이남 李覽
이옥 이욱 李稶
김륜 김윤 金崙
조윤 조빈 趙贇
임박 임부 林
유래 유내 柳徠
이륜 이윤 李綸
엄륜 엄윤 嚴綸
김려 김여 金礪
유래 유내 柳徠
엄린 엄인 嚴璘
이로 이노 李潞
남로 남노 南潞
장륙 장육 張陸
최윤 최빈 崔贇
이림 이임 李霖
김뉴 김유 金紐
문려 문여 文勵
조림 조임 趙琳


In [1062]:
(len(df_similar_kor_name_lee),len(df_similar_kor_name_ws))

(104, 104)

In [1063]:
assert(len(df_similar_kor_name_lee) == len(df_similar_kor_name_ws))

In [1065]:
master2 = pd.merge(df_similar_kor_name_lee, df_similar_kor_name_ws, indicator=True, how='inner', left_on=['chnname','pass_year'], right_on = ['namehj1','year'])


In [1066]:
# MERGING only on Chinese name and year
master2 = pd.merge(df_similar_kor_name_lee, df_similar_kor_name_ws, indicator=True, how='inner', left_on=['chnname','pass_year'], right_on = ['namehj1','year'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master2 = master2.drop(columns=['namehg1', 'namehj1', 'year'])
master2 = master2.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

len(master2), len(master2['lee_uuid'].unique()), len(master2['ws_uuid'].unique())

(106, 104, 103)

In [1072]:
# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge5 = master.copy()

In [1073]:
# Update these values
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]
len(ws_unmerged),len(lee_unmerged)

(338, 868)

# THIS NEEDS TO BE DOUBLE CHECKED

## Year is the Same ... Korean and Chinese names 50% similar or less 
For this one I will ONLY be checking if for the given year, the Korean and Chinese names are more than 50 percent similar

In [1074]:
master = master.reset_index(drop=True)

In [1075]:
len(ws_unmerged),len(lee_unmerged), len(master)

(338, 868, 15476)

In [1076]:
a = '省'
b = '省'
a == b

False

In [1077]:
a = '龜'
b = '龜'
a == b

False

In [864]:
pd.set_option('display.max_rows', None)


In [1078]:
df_similar_kor_name_lee = pd.DataFrame([])
df_similar_kor_name_ws = pd.DataFrame([])

# Iterate through each person in lee_unmerged
# Find that person in ws_person based on Chinese name and exam year
# Determine if if the person has similar Korean names, they are one person
# Look at the repeated_testtaker. If same numbers, fine
# If different numbers, take the larger of the values
# I guess for now, add repeat
for index, row in lee_unmerged.iterrows():
    exam_year = row['pass_year']
    chi_name = row['chnname']
    kor_name = row['korname']
    repeated = row['repeated_testtaker']
    
    # only matching Chinese name and exam year
    ws_person = ws_unmerged[(ws_unmerged['year'] == exam_year)]
    
    for person_index in range(len(ws_person)):
        if (similar(ws_person["namehg1"][person_index], kor_name) >= 0.5) and (similar(ws_person["namehj1"][person_index], chi_name) >= 0.5):
            print(kor_name, ws_person["namehg1"][person_index], chi_name, ws_person["namehj1"][person_index])
            row_copy = row.copy()
            row_copy['other_kor_name_in_WS_version'] = ws_person["namehg1"][person_index]
            row_copy['other_chi_name_in_WS_version'] = ws_person["namehj1"][person_index]
            row_copy['other_kor_name_in_WS_version_uuid'] = ws_person["ws_uuid"][person_index]
            df_similar_kor_name_lee = df_similar_kor_name_lee.append(row_copy)    
            df_similar_kor_name_ws = df_similar_kor_name_ws.append(ws_person)

곽성귀 곽성구 郭聖龜 郭聖龜
이하 이복 李𩡘 李馥
정인 정린 鄭鄰 鄭隣
곽상 정상 郭常 鄭常
김이 김니 金膩 金
손서윤 손서륜 孫叙倫 孫倫
김귀 김구 金龜 金龜
장인신 장린신 張鄰臣 張隣臣
강삼 강참 姜參 姜參
섭천지 엽천지 葉千枝 葉千枝
박원영 박원령 朴元秢 朴元
김유 김류 金鏐 金
박귀원 박구원 朴龜元 朴龜元
최희삼 최희참 崔希參 崔希參
이전손 이부손 李傳孫 李傅孫
성몽열 성몽설 成夢說 成夢說
한세해 최세해 韓世瀣 崔世瀣
구정열 구정설 仇廷說 仇廷說
장언인 장언린 張彦鄰 張彦隣
김이 김니 金柅 金
최희열 최희설 崔希說 崔希說
박몽열 박몽설 朴夢說 朴夢說
양몽열 양몽설 梁夢說 梁夢說
이삼성 이삼생 李三省 李三省
김의민 전의민 金義民 全義民
노인진 노인전 盧仁瑱 盧仁
김삼낙 김삼악 金三樂 金三樂
강시성 강시생 康時省 康時省
김명열 김명설 金命說 金命說
김염 김렴 金𥖝 金
오억 오의 吳嶷 吳
노열지 노의지 盧說之 盧議之
장진 장전 張瑱 張
홍유귀 홍유구 洪有龜 洪有龜
김영 김령 金坽 金
권희열 권희설 權希說 權希說
김진내 김진래 金晉來 金晋來
유귀감 유구감 柳龜鑑 柳龜鑑
김정귀 김정구 金正龜 金正龜
정단 정현 鄭𦒜 鄭
이경열 이경설 李景說 李景說
문덕귀 문덕구 文德龜 文德龜
이귀정 이구정 李龜禎 李龜禎
이절 이탈 李梲 李
홍수귀 홍수구 洪受龜 洪受龜
김염 김구 金𥖝 金龜
고명열 고명설 高命說 高命說
홍성귀 홍성구 洪聖龜 洪聖龜
김귀정 김구정 金龜禎 金龜禎
박효삼 박효참 朴孝參 朴孝參
문명귀 문명구 文命龜 文命龜
김귀상 김구상 金龜祥 金龜祥
김낙수 김악수 金樂洙 金樂洙
김광서 전광서 金光瑞 全光瑞
주만이 주만리 朱萬离 朱萬
강노 강로 姜栳 姜
허책 허속 許䇿 許
이귀운 이구운 李龜雲 李龜雲
우석귀 우석구 禹錫龜 禹錫龜
허준 허휴 許䥴 許
백치낙 백치악 白致樂 白致樂
김이후 김리후 金履垕 金履
김몽이 김몽니 金夢柅 金夢
김광정 전광정 金光鼎 全光鼎
김약귀 김약구 金若龜 金若龜
신종귀 신종구 愼鍾龜 愼鍾龜
안역인 안급인 安𤣻仁 安仁
김국헌 전국헌 金國憲 全國憲
이병귀

In [1079]:
(len(df_similar_kor_name_lee),len(df_similar_kor_name_ws))

(314, 627)

In [1080]:
# MERGING only on Chinese name and year
master2 = pd.merge(df_similar_kor_name_lee, df_similar_kor_name_ws, indicator=True, how='inner', left_on=['pass_year'], right_on = ['year'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master2 = master2.drop(columns=['namehg1', 'namehj1', 'year'])
master2 = master2.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

len(master2), len(master2['lee_uuid'].unique()), len(master2['ws_uuid'].unique())

(1627, 313, 323)

In [1081]:
# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge6 = master.copy()

In [1099]:
# Update these values
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]
len(ws_unmerged),len(lee_unmerged)

(15, 555)

### ALMOST DONE! Only 15 don't match

In [1102]:
len(ws_unmerged),len(lee_unmerged)

(12, 555)

In [1103]:
ws_unmerged[['namehg1', 'namehj1', 'year']]

Unnamed: 0_level_0,namehg1,namehj1,year
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cc2f6185-57d6-5995-91d1-4922a2849055,노진해,盧晋諧,1435.0
f4db3f2f-a1f5-5238-8df5-a5b00dbd959d,최니로,崔泥老,1441.0
b897a60d-1c85-5ad0-a1cb-f2c3cda5680b,이극배,李克培,
893bcf9f-b7b6-59fa-8e02-98c4b00cfb80,이운봉,李云,1453.0
4ed91eba-f1df-5747-8ad2-ec5c333a4e68,서근,徐土+甲,1462.0
78b1d4a1-c0e5-5092-84c7-2c55f44420dc,권구로,權龜老,1693.0
02a9d67c-53e1-589e-a023-0affbc08903b,장수구,張守龜,1708.0
cea144cd-a435-5bdd-a69c-7f7cc1a37de7,이육,李堉,1740.0
82b40807-e32a-53ec-ad79-df1cf83da7b0,신리록,申履祿,1822.0
537ff78f-c509-5ef7-9104-2af8e562dd50,권구락,權龜洛,1837.0


In [1142]:
# MANUAL CHECK
lee_unmerged[lee_unmerged['pass_year'] == 1876][['korname', 'chnname']][4:5]

Unnamed: 0_level_0,korname,chnname
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1
5aa6c870-9d40-508f-abcb-ec0fd7ea5818,박노삼,朴魯參


In [1112]:
ws_person["namehg1"][0]

'최니로'

Unnamed: 0_level_0,korname,chnname,pass_year
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
af5a2271-4040-573c-ad39-08c46f8a114f,이극배,李克培,1447


In [1205]:
# Just do some manual merging
df_similar_kor_name_lee = pd.DataFrame([])
df_similar_kor_name_ws = pd.DataFrame([])

years = [1441, 1453, 1462, 1693, 1708, 1822, 1837]

for year in years:
    lee_person = lee_unmerged[lee_unmerged['pass_year'] == year]
    ws_person = ws_unmerged[ws_unmerged['year'] == year]
    lee_person['other_kor_name_in_WS_version'] = ws_person["namehg1"][0]
    lee_person['other_chi_name_in_WS_version'] = ws_person["namehj1"][0]
    lee_person['other_kor_name_in_WS_version_uuid'] = ws_person["ws_uuid"][0]

    df_similar_kor_name_lee = df_similar_kor_name_lee.append(lee_person)    
    df_similar_kor_name_ws = df_similar_kor_name_ws.append(ws_person)
    
# Special case is the year 1876
year = 1876
lee_person = lee_unmerged[lee_unmerged['pass_year'] == year][4:5]
ws_person = ws_unmerged[ws_unmerged['year'] == year]
lee_person['other_kor_name_in_WS_version'] = ws_person["namehg1"][0]
lee_person['other_chi_name_in_WS_version'] = ws_person["namehj1"][0]
lee_person['other_kor_name_in_WS_version_uuid'] = ws_person["ws_uuid"][0]

df_similar_kor_name_lee = df_similar_kor_name_lee.append(lee_person)    
df_similar_kor_name_ws = df_similar_kor_name_ws.append(ws_person)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [1220]:
df_similar_kor_name_lee

Unnamed: 0_level_0,bid,bpid,affilliation,rank,competitors,exam_type,pass_year,king,lid,previous,...,korname,family_clan,birth,death,plastic,lee_uuid,repeated_testtaker,other_kor_name_in_WS_version,other_chi_name_in_WS_version,other_kor_name_in_WS_version_uuid
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
af5a2271-4040-573c-ad39-08c46f8a114f,http://people.aks.ac.kr/front/dirSer/exm/exmVi...,G002+AKS-KHF_13C774ADF9BC30B1422X0,문과,15,33,식년시,1447,세종,305,신방생원(新榜生員),...,이극배,광주(廣州),1422.0,1495.0,silver,af5a2271-4040-573c-ad39-08c46f8a114f,False,이극배,李克培,b897a60d-1c85-5ad0-a1cb-f2c3cda5680b


In [1217]:
# MERGING only on Chinese name and year
master2 = pd.merge(df_similar_kor_name_lee, df_similar_kor_name_ws, indicator=True, how='inner', left_on=['pass_year'], right_on = ['year'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master2 = master2.drop(columns=['namehg1', 'namehj1', 'year'])
master2 = master2.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

len(master2), len(master2['lee_uuid'].unique()), len(master2['ws_uuid'].unique())

# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge7 = master.copy()

In [1218]:
# Update these values
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]
len(ws_unmerged),len(lee_unmerged)

(7, 547)

### ALMOST DONE - 4 more to go

In [1227]:
# ws_unmerged.drop(ws_unmerged.tail(3).index,inplace=True)

In [1222]:
lee_unmerged[lee_unmerged['korname'] == '이극배'][['korname', 'chnname', 'pass_year']]

Unnamed: 0_level_0,korname,chnname,pass_year
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
af5a2271-4040-573c-ad39-08c46f8a114f,이극배,李克培,1447


In [1223]:
# Another special case
lee_person = lee_unmerged[lee_unmerged['korname'] == '이극배']
ws_person = ws_unmerged[ws_unmerged['namehg1'] == '이극배']
lee_person['other_kor_name_in_WS_version'] = ws_person["namehg1"][0]
lee_person['other_chi_name_in_WS_version'] = ws_person["namehj1"][0]
lee_person['other_kor_name_in_WS_version_uuid'] = ws_person["ws_uuid"][0]

df_similar_kor_name_lee = df_similar_kor_name_lee.append(lee_person)    
df_similar_kor_name_ws = df_similar_kor_name_ws.append(ws_person)

# MERGING only on Chinese name and year
master2 = pd.merge(df_similar_kor_name_lee, df_similar_kor_name_ws, indicator=True, how='inner', left_on=['korname'], right_on = ['namehg1'])

# Drop duplicate Korean name, Chinese name, and exam year, then rename those columns to be clearer
master2 = master2.drop(columns=['namehg1', 'namehj1', 'year'])
master2 = master2.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})

len(master2), len(master2['lee_uuid'].unique()), len(master2['ws_uuid'].unique())

# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge7 = master.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [1236]:
# Update these values
lee_unmerged = data_lee[~data_lee['lee_uuid'].isin(master['lee_uuid'].unique())]
ws_unmerged = data_ws[~data_ws['ws_uuid'].isin(master['ws_uuid'].unique())]
len(ws_unmerged),len(lee_unmerged)

(6, 546)

In [1237]:
len(ws_unmerged),len(lee_unmerged)

(6, 546)

### Can't match these. IDK ㅠㅠㅠㅠ

In [1238]:
ws_unmerged[['namehg1', 'namehj1', 'year']]

Unnamed: 0_level_0,namehg1,namehj1,year
uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cc2f6185-57d6-5995-91d1-4922a2849055,노진해,盧晋諧,1435.0
cea144cd-a435-5bdd-a69c-7f7cc1a37de7,이육,李堉,1740.0
c0b57392-53c2-54a4-b0e4-e58e0f5e9e14,홍종협,洪鍾協,1880.0
namehj1,,,
prevdegreehj,,,
choronymhj,,,


## Just merge the remaining unmatchable ones with Master

In [1239]:
len(lee_unmerged)

546

In [1240]:
lee_unmerged.head().T

uuid,8cee2162-e5a0-5ebf-b4ab-65cc828c8f87,4024acde-e21b-532c-b7c2-0191b1cf948e,ecf30d95-77b0-5eb8-a16b-09ddc8e20224,a4c797c1-72b7-56ca-96f0-9e635dba41fd,212661e1-c72d-5529-8031-4b0a4654b501
bid,http://people.aks.ac.kr/front/dirSer/exm/exmVi...,http://people.aks.ac.kr/front/dirSer/exm/exmVi...,http://people.aks.ac.kr/front/dirSer/exm/exmVi...,http://people.aks.ac.kr/front/dirSer/exm/exmVi...,http://people.aks.ac.kr/front/dirSer/exm/exmVi...
bpid,G002+AKS-KHF_13C720C5B8C220B1703X0,G002+AKS-KHF_13C815AD11C6D0U9999X0,G002+AKS-KHF_12C548C9C0FFFFB1377X0,G002+AKS-KHF_13D55CCC98BA85U9999X0,G002+AKS-KHF_13B178C120ACBDU9999X0
affilliation,문과,문과,문과,문과,문과
rank,3,3,5,7,6
competitors,7,5,5,12,33
exam_type,중시,중시,중시,중시,식년시
pass_year,1757,1416,1416,1427,1435
king,영조,태종,태종,세종,세종
lid,30,96,96,96,96
previous,세자시강원보덕(世子侍講院輔德),성균관학유(成均館學諭),승문원정자(承文院正字),교리(校理),신생원(新生員)


In [1246]:
lee_unmerged = lee_unmerged.rename(columns={'korname':'kor_name', 'chnname':'chi_name'})
lee_unmerged['ws_uuid'] = np.NaN

len(lee_unmerged), len(lee_unmerged['lee_uuid'].unique())

# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge8 = master.copy()

In [None]:
ws_unmerged = ws_unmerged.rename(columns={'namehg1':'kor_name', 'namehj1':'chi_name'})
ws_unmerged['lee_uuid'] = np.NaN

len(ws_unmerged), len(ws_unmerged['ws_uuid'].unique())

# Merge these with master
frames = [master, master2]
master = pd.concat(frames, sort=False)

merge8 = master.copy()

# Clean up Master

In [1248]:
len(master)

17133

In [1250]:
master = master.reset_index(drop=True)

Unnamed: 0,0,1,2,3,4
bid,http://people.aks.ac.kr/front/dirSer/exm/exmVi...,http://people.aks.ac.kr/front/dirSer/exm/exmVi...,http://people.aks.ac.kr/front/dirSer/exm/exmVi...,http://people.aks.ac.kr/front/dirSer/exm/exmVi...,http://people.aks.ac.kr/front/dirSer/exm/exmVi...
bpid,G002+AKS-KHF_13C774C218B4DDB1697X0,G002+AKS-KHF_13C2E0AE30D604B1747X0,G002+AKS-KHF_13BC15C885ACB8B1744X0,G002+AKS-KHF_13C5C4C775C870B1849X0,G002+AKS-KHF_13B0A8C815AD6CB1866X0
affilliation,문과,문과,문과,문과,문과
rank,3,15,5,29,5
competitors,15,18,10,44,8
exam_type,정시2,정시,정시,정시,함경도도과
pass_year,1753,1784,1785,1885,1887
king,영조,정조,정조,고종,고종
lid,30,30,30,80,80
previous,부사과(副司果),유학(幼學),유학(幼學),유학(幼學),유학(幼學)


In [331]:
# Find the dudes in Lee with same Chinese name and exam year as WS dudes
# If their Korean names are at least 60 percent similar, then add them to the merged master table
# TODO - add the UUID of the WS dude to the master table
# TODO - add a list of dictionary to master table including dude's other names, and what other sources these other names come from

In [273]:
# TO DO - add ID for the mastertable
