In [1]:
import pandas as pd
import boto3
import json
import io
import os

# S3 Connection

In [2]:
with open("aws_credential.txt", 'r') as file:
    aws_credential=json.load(file)
s3=boto3.Session(
    profile_name = None, 
    region_name = 'us-east-2').client(
    's3',
    aws_access_key_id=aws_credential['access_key'],
    aws_secret_access_key=aws_credential['secret_key'])

In [3]:
content_list=s3.list_objects(Bucket = aws_credential['bucket'], Prefix ='visa_output/')['Contents']
# if size = 0, it's the folder resource/ itself
[d['Key'] for d in content_list if d['Size']>0] 

['visa_output/country_code_matches.csv',
 'visa_output/country_list.txt',
 'visa_output/df_iv.csv',
 'visa_output/df_niv.csv',
 'visa_output/iv_alltime.csv',
 'visa_output/niv_alltime.csv',
 'visa_output/time_iv.txt',
 'visa_output/time_niv.txt']

In [4]:
country_codes_bytes=s3.get_object(Bucket = aws_credential['bucket'], 
              Key = 'visa_output/country_code_matches.csv')['Body'].read()

In [5]:
# read-in well-built .csv files, set delimiter to ",", and then the first row as header
# set data type of country codes to string object, since none start w/ 0 so no need to zfill()
country_codes=pd.read_csv(io.BytesIO(country_codes_bytes), 
                          delimiter = ",", 
                          header = 0, 
                          dtype={'code':str})

In [10]:
country_codes.head(3)

Unnamed: 0,query,match,code,iso
0,AFGHANISTAN,Afghanistan,5310,AF
1,ALBANIA,Albania,4810,AL
2,ALGERIA,Algeria,7210,DZ


In [None]:
df_iv_bytes = s3.get_object(Bucket=aws_credential['bucket'],
                           Key='visa_output/df_iv.csv')['Body'].read()
df_niv_bytes = s3.get_object(Bucket=aws_credential['bucket'],
                           Key='visa_output/df_niv.csv')['Body'].read()
df_iv=pd.read_csv(io.BytesIO(df_iv_bytes), decimal=",", header=0)
df_niv=pd.read_csv(io.BytesIO(df_niv_bytes), decimal=",", header=0)

In [8]:
df_iv.head(3)

Unnamed: 0,nationality,visa,count,time
0,AFGHANISTAN,CR1,11,2017-03-31
1,AFGHANISTAN,DV1,2,2017-03-31
2,AFGHANISTAN,DV2,1,2017-03-31


In [9]:
df_niv.head(3)

Unnamed: 0,nationality,visa,count,time
0,NON NATIONALITY BASED ISSUANCES,A2,1,2017-03-31
1,NON NATIONALITY BASED ISSUANCES,B1,1,2017-03-31
2,NON NATIONALITY BASED ISSUANCES,B1/B2,163,2017-03-31


# Apply Labels

In [15]:
iv_alltime_raw=df_iv.merge(country_codes, 
            left_on='nationality', 
            right_on = 'query', 
            how = 'left')#.drop(columns=['query'])

In [23]:
niv_alltime_raw=df_niv.merge(country_codes, 
            left_on='nationality', 
            right_on = 'query', 
            how = 'left')#.drop(columns=['query'])

In [22]:
iv_na_rows=iv_alltime_raw.loc[iv_alltime_raw.isnull().any(axis=1)] # don't omit axis=1
iv_na_rows['nationality'].unique()

array(['NAMIBIA', 'WESTERN SAHARA', 'JERUSALEM', 'OTHER',
       'PALESTINIAN AUTHORITY TRAVEL DOCUMENT', 'NO NATIONALITY'],
      dtype=object)

In [24]:
niv_na_rows=niv_alltime_raw.loc[niv_alltime_raw.isnull().any(axis=1)] # don't omit axis=1
niv_na_rows['nationality'].unique()

array(['NON NATIONALITY BASED ISSUANCES', 'NAMIBIA',
       'PALESTINIAN AUTHORITY TRAVEL DOCUMENT', 'WESTERN SAHARA',
       'NON NATIONLITY BASED ISSUANCES', 'UNKNOWN'], dtype=object)

In [16]:
iv_alltime_raw.isnull().sum()

nationality      0
visa             0
count            0
time             0
query            9
match            9
code             9
iso            115
dtype: int64

In [25]:
niv_alltime_raw.head(3)

Unnamed: 0,nationality,visa,count,time,query,match,code,iso
0,NON NATIONALITY BASED ISSUANCES,A2,1,2017-03-31,,,,
1,NON NATIONALITY BASED ISSUANCES,B1,1,2017-03-31,,,,
2,NON NATIONALITY BASED ISSUANCES,B1/B2,163,2017-03-31,,,,


In [27]:
niv_na_rows['query'].fillna(niv_na_rows['nationality'])

0               NON NATIONALITY BASED ISSUANCES
1               NON NATIONALITY BASED ISSUANCES
2               NON NATIONALITY BASED ISSUANCES
3               NON NATIONALITY BASED ISSUANCES
4               NON NATIONALITY BASED ISSUANCES
                          ...                  
320398    PALESTINIAN AUTHORITY TRAVEL DOCUMENT
320399    PALESTINIAN AUTHORITY TRAVEL DOCUMENT
320400    PALESTINIAN AUTHORITY TRAVEL DOCUMENT
320401    PALESTINIAN AUTHORITY TRAVEL DOCUMENT
320402    PALESTINIAN AUTHORITY TRAVEL DOCUMENT
Name: query, Length: 2893, dtype: object

In [29]:
niv_alltime_raw['nationality_new']=niv_alltime_raw['query'].fillna(niv_alltime_raw['nationality'])

In [30]:
iv_alltime_raw['nationality_new']=iv_alltime_raw['query'].fillna(iv_alltime_raw['nationality'])

In [31]:
niv_alltime_raw.head()

Unnamed: 0,nationality,visa,count,time,query,match,code,iso,nationality_new
0,NON NATIONALITY BASED ISSUANCES,A2,1,2017-03-31,,,,,NON NATIONALITY BASED ISSUANCES
1,NON NATIONALITY BASED ISSUANCES,B1,1,2017-03-31,,,,,NON NATIONALITY BASED ISSUANCES
2,NON NATIONALITY BASED ISSUANCES,B1/B2,163,2017-03-31,,,,,NON NATIONALITY BASED ISSUANCES
3,NON NATIONALITY BASED ISSUANCES,B2,1,2017-03-31,,,,,NON NATIONALITY BASED ISSUANCES
4,NON NATIONALITY BASED ISSUANCES,C1/D,42,2017-03-31,,,,,NON NATIONALITY BASED ISSUANCES


In [35]:
niv_alltime=niv_alltime_raw.\
drop(columns=['nationality','query','match','code','iso']).\
rename(columns={'nationality_new':'nationality'})
niv_alltime = niv_alltime[['nationality','visa','count','time']]

In [37]:
niv_alltime.head(3)

Unnamed: 0,nationality,visa,count,time
0,NON NATIONALITY BASED ISSUANCES,A2,1,2017-03-31
1,NON NATIONALITY BASED ISSUANCES,B1,1,2017-03-31
2,NON NATIONALITY BASED ISSUANCES,B1/B2,163,2017-03-31


In [38]:
iv_alltime_raw.head(3)

Unnamed: 0,nationality,visa,count,time,query,match,code,iso,nationality_new
0,AFGHANISTAN,CR1,11,2017-03-31,AFGHANISTAN,Afghanistan,5310,AF,AFGHANISTAN
1,AFGHANISTAN,DV1,2,2017-03-31,AFGHANISTAN,Afghanistan,5310,AF,AFGHANISTAN
2,AFGHANISTAN,DV2,1,2017-03-31,AFGHANISTAN,Afghanistan,5310,AF,AFGHANISTAN


In [36]:
iv_alltime=iv_alltime_raw.\
drop(columns=['nationality','query','match','code','iso']).\
rename(columns={'nationality_new':'nationality'})
iv_alltime = iv_alltime[['nationality','visa','count','time']]

In [39]:
iv_alltime.head(3)

Unnamed: 0,nationality,visa,count,time
0,AFGHANISTAN,CR1,11,2017-03-31
1,AFGHANISTAN,DV1,2,2017-03-31
2,AFGHANISTAN,DV2,1,2017-03-31


# Export data w/ standard labels