This notebook builds on previous work: [05212023 Country Names Record Linkage Part I (cosine-similarity).ipynb](https://github.com/tiangenglu/recordlinkage/blob/main/05212023%20Country%20Names%20Record%20Linkage%20Part%20I%20(cosine-similarity).ipynb)

In [1]:
import os
import numpy as np
import pandas as pd
import boto3
import json
import io
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display, HTML

# Access data from S3

In [2]:
# Read the access/secret key pair from a local JSON file so that no
# credential is written into the notebook itself.
with open("aws_credential.txt", 'r') as file:
    aws_credential = json.load(file)

# The session only pins the region; the key pair is handed to the client.
session = boto3.Session(region_name='us-east-2')
s3 = session.client(
    's3',
    aws_access_key_id=aws_credential['access_key'],
    aws_secret_access_key=aws_credential['secret_key'],
)

## Country codes `.txt` to `bytes` to `pd.DataFrame`

In [4]:
# List the objects stored under the resource/ prefix.
# S3 leaves the 'Contents' key out of the response when nothing matches the
# prefix, so fall back to an empty list instead of raising KeyError.
content_list1 = s3.list_objects(
    Bucket=aws_credential['bucket'],
    Prefix='resource/',
).get('Contents', [])
# if size = 0, it's the folder resource/ itself
[d['Key'] for d in content_list1 if d['Size'] > 0]

['resource/country.txt']

In [6]:
# Reading the response body returns a `bytes` object, hence the `_bytes`
# suffix on the variable name.
country_code_response = s3.get_object(
    Bucket=aws_credential['bucket'],
    Key='resource/' + 'country.txt',
)
country_code_bytes = country_code_response['Body'].read()

In [19]:
# header=3 and the 1:241 row slice are tied to the layout of this one file.
country_code_table = pd.read_csv(
    io.BytesIO(country_code_bytes),
    sep="\t",
    header=3,
)
country_code_raw = country_code_table.iloc[1:241]
country_code_raw.head(2)

Unnamed: 0,Code | Name | ISO Code
1,1000 | United States of America ...
2,1010 | Greenland ...


In [29]:
# The frame has a single column whose label still holds the '|'-separated
# titles; keep the first word of each title, lower-cased, as the new name.
print(country_code_raw.columns)
raw_header = country_code_raw.columns[0]
new_cols_country_code = [
    title.strip().split(' ')[0].lower()
    for title in raw_header.split('|')
]
new_cols_country_code

Index(['Code     |    Name                                                | ISO Code'], dtype='object')


['code', 'name', 'iso']

In [48]:
# Build the frame from a list of rows: each raw line is split on '|' into
# its fields, and every field has its surrounding whitespace stripped.
country_code = pd.DataFrame(
    data=[
        list(map(str.strip, fields))
        for fields in country_code_raw.iloc[:, 0].str.split('|')
    ],
    columns=new_cols_country_code,
)
country_code.head(2)

Unnamed: 0,code,name,iso
0,1000,United States of America,US
1,1010,Greenland,GL


## Messy country label data

In [49]:
# Download the list of messy country labels; the body is read into memory
# as raw bytes and its type is printed as a sanity check.
country_messy_response = s3.get_object(
    Bucket=aws_credential['bucket'],
    Key='visa_output/' + 'country_list.txt',
)
country_messy_bytes = country_messy_response['Body'].read()
print(type(country_messy_bytes))

<class 'bytes'>


In [54]:
# `names` supplies the column label, so the first line of the file is read
# as data; everything lands in a single `country` column.
country_messy = pd.read_csv(
    io.BytesIO(country_messy_bytes),
    sep="\t",
    names=['country'],
)
country_messy.head(2)

Unnamed: 0,country
0,COTE D'IVOIRE
1,GREAT BRITAIN AND NORTHERN IRELAND


# Record linkage

## TfidfVectorizer

In [56]:
# Fit the TF-IDF vocabulary on the reference country names (with
# scikit-learn's built-in English stop-word list) and vectorize them.
reference_names = country_code['name']
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
corpus = tfidf_vectorizer.fit_transform(reference_names)
type(corpus)

scipy.sparse._csr.csr_matrix

In [70]:
# Report the dimensions of the sparse matrix, then display its dense form.
corpus_shape = corpus.shape
print(f'corpus.shape: {corpus_shape}')
corpus.toarray()

corpus.shape: (240, 298)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [63]:
# Fetch the learned feature names once and describe them: container type,
# vocabulary size, and the first twenty entries.
feature_names = tfidf_vectorizer.get_feature_names_out()
print(
    f"The type of `tfidf_vectorizer.get_feature_names_out()` is {type(feature_names)}. \n"
    f"The length of `tfidf_vectorizer.get_feature_names_out()` is {len(feature_names)}.\n"
    f"First few items in it are: {feature_names[:20]}"
)

The type of `tfidf_vectorizer.get_feature_names_out()` is <class 'numpy.ndarray'>. 
The length of `tfidf_vectorizer.get_feature_names_out()` is 298.
First few items in it are: ['administered' 'afghanistan' 'africa' 'african' 'albania' 'algeria'
 'america' 'american' 'andorra' 'angola' 'anguilla' 'antarctic' 'antigua'
 'arab' 'arabia' 'argentina' 'armenia' 'aruba' 'australia' 'austria']


## Tfidf Matrix

In [68]:
# Dense TF-IDF matrix as a data frame, with one column per feature, labelled
# with that feature's name.
tfidf_matrix_df = pd.DataFrame(
    data=corpus.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)