Built upon previous work [05212023 Country Names Record Linkage Part I (cosine-similarity).ipynb](https://github.com/tiangenglu/recordlinkage/blob/main/05212023%20Country%20Names%20Record%20Linkage%20Part%20I%20\(cosine-similarity\).ipynb)

In [1]:
import os
import numpy as np
import pandas as pd
import boto3
import json
import io
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Access data from S3

In [2]:
# Load the AWS settings from a local JSON file; this notebook reads the
# 'access_key', 'secret_key' and 'bucket' entries from it.
with open("aws_credential.txt", 'r') as credential_file:
    aws_credential = json.load(credential_file)

# Create a session pinned to us-east-2, then an S3 client that authenticates with the loaded keys
aws_session = boto3.Session(profile_name=None, region_name='us-east-2')
s3 = aws_session.client(
    's3',
    aws_access_key_id=aws_credential['access_key'],
    aws_secret_access_key=aws_credential['secret_key'],
)

## Country codes `.txt` to `bytes` to `pd.DataFrame`

In [3]:
# Ask S3 for every object stored under the resource/ prefix of the bucket
resource_listing = s3.list_objects(Bucket=aws_credential['bucket'], Prefix='resource/')
content_list1 = resource_listing['Contents']
# A zero-byte entry is the resource/ folder itself, so only keys with content are shown
[entry['Key'] for entry in content_list1 if entry['Size'] > 0]

['resource/country.txt']

In [4]:
# Download the country-code dictionary file from S3.
# The name ends in _bytes because reading the response body yields a "bytes" object.
country_code_response = s3.get_object(
    Bucket=aws_credential['bucket'],
    Key='resource/' + 'country.txt',
)
country_code_bytes = country_code_response['Body'].read()

In [5]:
# Parse the downloaded bytes as a tab-delimited table.
# header=3 and the iloc window are tailored to this particular file: the fourth line
# supplies the column label, the first row after it is skipped, and the next 240 rows are kept.
country_code_raw = (
    pd.read_csv(io.BytesIO(country_code_bytes), sep="\t", header=3)
    .iloc[1:241]
)
country_code_raw.head(2)

Unnamed: 0,Code | Name | ISO Code
1,1000 | United States of America ...
2,1010 | Greenland ...


In [6]:
print(country_code_raw.columns)
# The table was read as a single column whose label holds all headers separated by '|'.
# Keep the first word of each header, lower-cased, as the new column name.
header_cells = country_code_raw.columns[0].split('|')
new_cols_country_code = []
for header_cell in header_cells:
    first_word = header_cell.strip().split(' ')[0]
    new_cols_country_code.append(first_word.lower())
new_cols_country_code

Index(['Code     |    Name                                                | ISO Code'], dtype='object')


['code', 'name', 'iso']

In [7]:
# Build the dictionary data frame from a list of rows:
# split the single raw column on '|' to get the cells of each row ...
split_rows = country_code_raw.iloc[:, 0].str.split('|')
# ... then strip the padding spaces around every cell
cleaned_rows = [list(map(str.strip, row)) for row in split_rows]
country_code = pd.DataFrame(data=cleaned_rows, columns=new_cols_country_code)
country_code.head(2)

Unnamed: 0,code,name,iso
0,1000,United States of America,US
1,1010,Greenland,GL


### Best practices

- Make the label dictionary (the table that provides the standardized country names and codes) as inclusive as possible, so that common alternative names have something to match against.
- For example, manually add "Great Britain and Northern Ireland" to the United Kingdom entry.

In [8]:
# Manually edit the label dictionary, adding "Great Britain and Northern Ireland" to United Kingdom.
# Rows whose name mentions the United Kingdom; the first one is the entry that gets renamed.
uk_mask = country_code['name'].str.contains('United Kingdom')
uk_idx = country_code[uk_mask].index[0]
# Assign through a single .loc call on the frame itself. The chained form
# `country_code['name'].loc[uk_idx] = ...` writes to an intermediate Series, which raises
# a warning in current pandas and leaves the frame unchanged under Copy-on-Write.
country_code.loc[uk_idx, 'name'] = 'United Kingdom (Great Britain and Northern Ireland)'
# Display the edited row(s); the new name still contains 'United Kingdom', so the mask remains valid
country_code[uk_mask]

Unnamed: 0,code,name,iso
57,4120,United Kingdom (Great Britain and Northern Ire...,GB


## Messy country label data

In [9]:
# Download the list of messy country labels from the visa_output/ prefix of the bucket
country_messy_response = s3.get_object(
    Bucket=aws_credential['bucket'],
    Key='visa_output/' + 'country_list.txt',
)
country_messy_bytes = country_messy_response['Body'].read()
# Confirm what kind of object the download produced
print(type(country_messy_bytes))

<class 'bytes'>


In [10]:
# Read the messy labels as a tab-delimited table; the column name is supplied here,
# so every line of the file is treated as data and lands in the single 'country' column.
country_messy = pd.read_csv(
    io.BytesIO(country_messy_bytes),
    sep="\t",
    names=['country'],
)
country_messy.head(2)

Unnamed: 0,country
0,COTE D'IVOIRE
1,GREAT BRITAIN AND NORTHERN IRELAND


# Record linkage

## TfidfVectorizer

In [11]:
# Learn the TF-IDF vocabulary from the standardized names (English stop words are dropped)
# and vectorize those names in the same step; `corpus` is what the queries are compared against.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
dictionary_names = country_code['name']
corpus = tfidf_vectorizer.fit_transform(dictionary_names)
type(corpus)

scipy.sparse._csr.csr_matrix

In [12]:
# Report the dimensions of the vectorized dictionary, then show it as a dense array
corpus_shape = corpus.shape
print(f'corpus.shape: {corpus_shape}')
corpus.toarray()

corpus.shape: (240, 300)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
# Inspect the learned vocabulary: its container type, its size, and the first 20 features
feature_names = tfidf_vectorizer.get_feature_names_out()
print(
    f"The type of `tfidf_vectorizer.get_feature_names_out()` is {type(feature_names)}. \n"
    f"The length of `tfidf_vectorizer.get_feature_names_out()` is {len(feature_names)}.\n"
    f"First few items in it are: {feature_names[:20]}"
)

The type of `tfidf_vectorizer.get_feature_names_out()` is <class 'numpy.ndarray'>. 
The length of `tfidf_vectorizer.get_feature_names_out()` is 300.
First few items in it are: ['administered' 'afghanistan' 'africa' 'african' 'albania' 'algeria'
 'america' 'american' 'andorra' 'angola' 'anguilla' 'antarctic' 'antigua'
 'arab' 'arabia' 'argentina' 'armenia' 'aruba' 'australia' 'austria']


## Tfidf Matrix

In [14]:
# Put the dense TF-IDF matrix into a data frame whose column names are the vocabulary features
tfidf_matrix_df = pd.DataFrame(
    data=corpus.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

## Cosine-similarity

In [15]:
n = 2 # Keep top n best matches
# Rank labels derived from n ('match_1', 'match_2', ...), so changing n does not break the loop
rank_labels = [f'match_{rank}' for rank in range(1, n + 1)]
# Per-query results are collected in lists and concatenated once after the loop,
# instead of re-copying the growing frames on every iteration
output_parts = []
score_parts = []
# loop over every row in the messy data
for query in country_messy['country']:
    # tokenize and vectorize each individual query; it must be wrapped in a list
    query_vector = tfidf_vectorizer.transform([query])
    # run cosine similarity; corpus is the fitted and transformed (vectorized) label dictionary data
    cosine_sim = pd.DataFrame(cosine_similarity(corpus, query_vector),
                              columns=['cosine_similarity'],
                              index=country_code.index) # use index of label dictionary data
    cosine_sim = cosine_sim.sort_values(by=['cosine_similarity'], ascending=False)
    # Keep the top n best matches with all columns of the label dictionary;
    # `output` holds the candidate country rows, re-indexed by their rank label
    output = country_code.loc[cosine_sim.index[0:n], :]
    output.index = rank_labels
    # `scores` holds the cosine similarity of those same candidates
    scores = cosine_sim[0:n]
    output_parts.append(output)
    score_parts.append(scores)

# FINAL OUTPUTS: n rows per query, in query order.
# all_outputs is indexed by rank label, all_scores by the label dictionary index.
all_outputs = pd.concat(output_parts)
all_scores = pd.concat(score_parts)

In [16]:
# Copy the index values into a regular 'rank' column, then replace the index
# with a fresh 0..N-1 range, which is required before the frames are combined
all_outputs = all_outputs.assign(rank=all_outputs.index).reset_index(drop=True)
all_outputs.head()

Unnamed: 0,code,name,iso,rank
0,7480,Cote d'Ivoire,CI,match_1
1,1000,United States of America,US,match_2
2,4120,United Kingdom (Great Britain and Northern Ire...,GB,match_1
3,4190,Ireland,IE,match_2
4,7880,Madagascar,MG,match_1


In [17]:
# Replace the index of the scores with a fresh 0..N-1 range as well;
# every frame must be re-indexed this way before they are combined
all_scores = all_scores.reset_index(drop=True)
all_scores.head(5)

Unnamed: 0,cosine_similarity
0,1.0
1,0.0
2,0.83543
3,0.481546
4,1.0


In [18]:
# Repeat every messy label n times in a row, one copy per candidate match.
# The plain array (.values) is repeated rather than the Series: without .values
# the result still becomes a data frame but carries a repetitive index 0,0,1,1,...
repeated_queries = np.repeat(country_messy['country'].values, n)
messy_repeat_n = pd.DataFrame(repeated_queries, columns=['query'])
messy_repeat_n.tail()

Unnamed: 0,query
511,SINT MAARTEN
512,SERBIA
513,SERBIA
514,SRI LANKA
515,SRI LANKA


In [19]:
# Place three data frames side by side (aligned on their row index):
# (1) messy labels * n, (2) matched results, and (3) cosine similarity values
combined_matches = pd.concat([messy_repeat_n, all_outputs, all_scores], axis=1)
combined_matches = combined_matches.rename(columns={'name': 'match'})
# Highest similarity first; ties are ordered alphabetically by query
combined_matches = combined_matches.sort_values(
    by=['cosine_similarity', 'query'],
    ascending=[False, True],
).reset_index(drop=True)
# Keep only the top-ranked candidate of each query, and only if its similarity exceeds 0.5
is_top_rank = combined_matches['rank'] == 'match_1'
is_similar_enough = combined_matches['cosine_similarity'] > 0.5
all_matches = combined_matches.loc[is_top_rank & is_similar_enough].reset_index(drop=True)

In [20]:
# Show the accepted matches whose similarity is below 0.9
below_threshold = all_matches['cosine_similarity'] < 0.9
all_matches[below_threshold]

Unnamed: 0,query,code,match,iso,rank,cosine_similarity
214,TANZANIA,7830,Tanzania (United Republic of Tanzania),TZ,match_1,0.88954
215,SAMOA,6150,Samoa (Western Samoa),WS,match_1,0.880773
216,"MARSHALL ISLANDS, REPUBLIC OF THE",6810,Marshall Islands,MH,match_1,0.873917
217,ST KITTS AND NEVIS,2483,Saint Kitts and Nevis,KN,match_1,0.867664
218,ST PIERRE AND MIQUELON,1610,Saint Pierre and Miquelon,PM,match_1,0.867664
219,ST VINCENT AND THE GRENADINES,2488,Saint Vincent and the Grenadines,VC,match_1,0.867664
220,GERMANY,4280,Germany (Federal Republic of Germany),DE,match_1,0.857421
221,"CONGO, DEMOCRATIC REPUBLIC OF THE",7660,"Congo, Democratic Republic of the Congo (forme...",rCD,match_1,0.857177
222,DEMOCRATIC REPUBLIC OF THE CONGO,7660,"Congo, Democratic Republic of the Congo (forme...",rCD,match_1,0.857177
223,PITCAIRN,6225,Pitcairn Islands,PN,match_1,0.837638


# Manual Corrections

In [21]:
# Override the automatic match for this query with the United Kingdom entry
northern_ireland_rows = all_matches['query'] == 'NORTHERN IRELAND DV ONLY'
all_matches.loc[northern_ireland_rows, ['code', 'match', 'iso']] = (
    '4120',
    'United Kingdom (Great Britain and Northern Ireland)',
    'GB',
)
# Display the corrected row(s)
all_matches.loc[northern_ireland_rows]

Unnamed: 0,query,code,match,iso,rank,cosine_similarity
239,NORTHERN IRELAND DV ONLY,4120,United Kingdom (Great Britain and Northern Ire...,GB,match_1,0.707107


In [22]:
# Override the automatic match for this query with the Sint Maarten entry.
# NOTE(review): "Saint Martin" can also denote the French part of the island (ISO code MF),
# while SX is the Dutch part — confirm which one the source data means.
saint_martin_rows = all_matches['query'] == 'SAINT MARTIN'
all_matches.loc[saint_martin_rows, ['code', 'match', 'iso']] = (
    '2774',
    'Sint Maarten',
    'SX',
)
# Display the corrected row(s)
all_matches.loc[saint_martin_rows]

Unnamed: 0,query,code,match,iso,rank,cosine_similarity
240,SAINT MARTIN,2774,Sint Maarten,SX,match_1,0.629568


In [23]:
# Number of distinct dictionary labels that received at least one match
distinct_matches = all_matches['match'].unique()
len(distinct_matches)

219

In [24]:
# Number of rows in the label dictionary, for comparison with the count of matched labels
country_code.shape[0]

240

In [25]:
# Dictionary labels that no messy name was matched to.
# The matched labels are collected into a set once, instead of recomputing
# `.unique()` for every dictionary name inside the comprehension.
matched_labels = set(all_matches['match'])
unused_countries = [c for c in country_code['name'] if c not in matched_labels]
print(f'These are the unused country labels, assign manually if needed: {unused_countries}')

These are the unused country labels, assign manually if needed: ['United States of America', 'Martinique', 'French Guiana', 'Falkland Islands (Islas Malvinas)', 'Svalbard and Jan Mayen', 'Gaza Strip administered by Israel', 'West Bank administered by Israel', "Laos (Lao People's Democratic Republic)", 'Macao', 'Norfolk Island', 'Heard Island and McDonald Islands', 'Tokelau', 'Mayotte', 'Reunion', 'French Southern and Antarctic Lands', 'Puerto Rico', 'Virgin Islands of the United States', 'Guam', 'American Samoa', 'Northern Mariana Islands', 'United States Minor Outlying Islands']


In [26]:
# Messy names that did not make it into all_matches.
# The matched queries are collected into a set once, instead of recomputing
# `.unique()` for every messy name inside the comprehension.
matched_queries = set(all_matches['query'])
needs_manual_match = [c for c in country_messy['country'] if c not in matched_queries]
print(f"The following names from the messy data didn't find a match, needs manual efforts: {needs_manual_match}")

The following names from the messy data didn't find a match, needs manual efforts: ['KYRGYSTAN', 'UNKNOWN', 'LAOS', 'NON NATIONLITY BASED ISSUANCES', 'JERUSALEM', 'PALESTINIAN AUTHORITY TRAVEL DOCUMENT', 'MACAU', 'SWAZILAND', 'NON NATIONALITY BASED ISSUANCES', 'WESTERN SAHARA', 'MACAU S A R', 'OTHER', 'NO NATIONALITY']


In [27]:
# Hand-picked matches for messy names the automatic step could not resolve.
# Each pair is (messy name, dictionary name); the dictionary name has to be looked up
# in the full label dictionary so that it is spelled exactly as it appears there.
manual_matches_dict = dict([
    ('KYRGYSTAN', 'Kyrgyzstan'),
    ('LAOS', "Laos (Lao People's Democratic Republic)"),
    ('MACAU', 'Macao'),
    ('SWAZILAND', 'Eswatini'),
    ('MACAU S A R', 'Macao'),
])

In [28]:
# Turn the manual mapping into a two-column frame: the messy name ('query')
# and the dictionary label chosen for it ('match'), one row per dictionary entry
manual_pairs = list(manual_matches_dict.items())
manual_matches = pd.DataFrame(manual_pairs, columns=['query', 'match'])
manual_matches

Unnamed: 0,query,match
0,KYRGYSTAN,Kyrgyzstan
1,LAOS,Laos (Lao People's Democratic Republic)
2,MACAU,Macao
3,SWAZILAND,Eswatini
4,MACAU S A R,Macao


In [29]:
# Look up the code and ISO code of every manually chosen label in the label dictionary.
# The join key 'name' duplicates 'match', so it is dropped afterwards.
manual_matches_merge = manual_matches.merge(
    country_code,
    how='inner',
    left_on='match',
    right_on='name',
).drop(columns=['name'])
# An inner merge silently discards a manual label that is not spelled exactly like a
# dictionary name; fail loudly instead of losing that row without notice.
unresolved_manual = sorted(set(manual_matches['match']) - set(manual_matches_merge['match']))
assert not unresolved_manual, f"Manual matches not found in country_code['name']: {unresolved_manual}"
manual_matches_merge

Unnamed: 0,query,match,code,iso
0,KYRGYSTAN,Kyrgyzstan,4635,KG
1,LAOS,Laos (Lao People's Democratic Republic),5530,LA
2,MACAU,Macao,5660,MO
3,SWAZILAND,Eswatini,7950,SZ
4,MACAU S A R,Macao,5660,MO


In [30]:
# Stack the manual matches on top of the automatic ones, restricted to the columns
# the two frames share, then order the rows alphabetically by query
shared_columns = list(manual_matches_merge.columns)
automatic_matches = all_matches.loc[:, shared_columns]
matches_to_use = (
    pd.concat([manual_matches_merge, automatic_matches])
    .sort_values(by=['query', 'match', 'iso', 'code'])
    .reset_index(drop=True)
)
matches_to_use

Unnamed: 0,query,match,code,iso
0,AFGHANISTAN,Afghanistan,5310,AF
1,ALBANIA,Albania,4810,AL
2,ALGERIA,Algeria,7210,DZ
3,ANDORRA,Andorra,4271,AD
4,ANGOLA,Angola,7620,AO
...,...,...,...,...
245,VIETNAM,Vietnam,5520,VN
246,WALLIS AND FUTUNA,Wallis and Futuna,6413,WF
247,YEMEN,Yemen (Republic of Yemen),5210,YE
248,ZAMBIA,Zambia,7940,ZM


# Output results to S3

In [31]:
# Upload the final lookup table to S3: render it as CSV text in memory
# (without the index column) and send that text as the object body
csv_text = matches_to_use.to_csv(index=False)
s3.put_object(
    Body=csv_text,
    Bucket=aws_credential['bucket'],
    Key='visa_output/country_code_matches.csv',
)
print('Output to S3 completed.')

Output to S3 completed.


In [32]:
# Also write the final lookup table to a CSV file in the current working directory (no index column)
matches_to_use.to_csv(path_or_buf='country_code_matches.csv', index=False)