# Objectives

- Builds on previous work: [2023-06-30_cleaning_df_bothdata.py](https://github.com/tiangenglu/WebScrape/blob/main/06302023_cleaning_df_bothdata.py)
- Detect new raw data and ONLY process the newly scraped data
- After cleaning, append to the existing all-time visa data

In [52]:
import os
import pandas as pd
import boto3
import json
import io

In [2]:
%%time
# Load the compiled all-time visa table; the cell magic above reports how long the read takes.
old_data = pd.read_csv("visa_alltime.csv")

CPU times: user 183 ms, sys: 32.3 ms, total: 215 ms
Wall time: 242 ms


In [3]:
# Inspect column names, dtypes and non-null counts of the existing compiled data.
old_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488523 entries, 0 to 488522
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   nationality  488523 non-null  object
 1   visa         488523 non-null  object
 2   count        488523 non-null  int64 
 3   time         488523 non-null  object
 4   type         488523 non-null  object
dtypes: int64(1), object(4)
memory usage: 18.6+ MB


In [4]:
# Read the two catalog tables (NIV first, then IV) from local CSV files.
niv_catalog, iv_catalog = (
    pd.read_csv(csv_path) for csv_path in ('niv_catalog.csv', 'iv_catalog.csv')
)

# Scraped raw data in AWS S3

In [8]:
# Parse the local JSON file holding the AWS access key, secret key and bucket name.
with open("aws_credential.txt", 'r') as file:
    aws_credential = json.load(file)

# Create an S3 client in us-east-2, authenticated with the keys loaded above.
s3 = boto3.Session(profile_name=None, region_name='us-east-2').client(
    's3',
    aws_access_key_id=aws_credential['access_key'],
    aws_secret_access_key=aws_credential['secret_key'],
)

In [17]:
# List the objects stored under each raw-data prefix (NIV first, then IV) and
# keep the 'Contents' entry of each response.
bucket_niv_content, bucket_iv_content = (
    s3.list_objects(Bucket=aws_credential['bucket'], Prefix=prefix)['Contents']
    for prefix in ('messy_data/visa_scraped/niv/', 'messy_data/visa_scraped/iv/')
)

In [22]:
# Reduce every S3 key to its file name, i.e. the text after the final '/'.
bucket_niv_items = [obj['Key'].rsplit('/', 1)[-1] for obj in bucket_niv_content]
bucket_iv_items = [obj['Key'].rsplit('/', 1)[-1] for obj in bucket_iv_content]

In [37]:
# Extract the date token from each file name: drop everything from the first
# '.' onwards, then keep the part after the last '_'.
bucket_iv_date = [name.partition('.')[0].rpartition('_')[2] for name in bucket_iv_items]
bucket_niv_date = [name.partition('.')[0].rpartition('_')[2] for name in bucket_niv_items]

# How many months of raw data need processing?

In [33]:
# Number of scraped IV files minus the number of distinct months already compiled.
len_diff = len(bucket_iv_items) - len(old_data['time'].unique())

if len_diff <= 0:
    print("No actions required, compiled visa data is up to date.")
else:
    print(f'Need to append {len_diff} month(s) to existing data.')

Need to append 1 month(s) to existing data.


In [32]:
# Latest month already present in the compiled data.
old_data['time'].max()

'2025-01-31'

In [38]:
# Latest date among the scraped NIV files. This is a string comparison; it
# presumably relies on the YYYY-MM-DD naming seen in the output sorting chronologically.
max(bucket_niv_date)

'2025-02-28'

In [41]:
# Sanity check: stepping back `len_diff` entries from the newest scraped date
# should land exactly on the newest month already in the compiled data.
old_data['time'].max() == bucket_niv_date[-(len_diff + 1)]

True

In [102]:
# Select the last `len_diff` scraped dates as the months still to be processed.
# A bare `[-len_diff:]` slice is only correct for len_diff > 0: when len_diff is
# 0 it evaluates to `[0:]` and returns EVERY date, and for a negative len_diff
# it turns into an unrelated forward slice. In both cases nothing needs adding.
niv_date_to_add = bucket_niv_date[-len_diff:] if len_diff > 0 else []
print(f'Needs the following month(s): {niv_date_to_add}')

Needs the following month(s): ['2025-02-28']


In [94]:
# Peek at the two most recent entries of the scraped NIV date list.
bucket_niv_date[-2:]

['2025-01-31', '2025-02-28']

# Read in raw data from S3

In [51]:
# Pre-size the container: one placeholder slot per month that must be read.
niv_df_raw = [None for _ in range(len_diff)]

In [101]:
# File name the read loop picks for i = 0, since -(0 + 1) == -1 (the last listed file).
bucket_niv_items[-1]

'niv_2025-02-28.txt'

In [123]:
for i in range(len_diff):
    print(i)
    # Index -1 - i walks the listing backwards, so slot 0 receives the last
    # (most recent) file, slot 1 the one before it, and so on.
    file = s3.get_object(
        Bucket=aws_credential['bucket'],
        Key='messy_data/visa_scraped/niv/' + bucket_niv_items[-1 - i],
    )['Body'].read()
    # Parse the downloaded bytes directly; with a tab separator and no header
    # row, each text line is expected to land in a single column.
    niv_df_raw[i] = pd.read_csv(io.BytesIO(file), sep="\t", header=None)

0


In [60]:
# Download one raw NIV file as bytes so its layout can be inspected by hand.
test_object = s3.get_object(
    Bucket=aws_credential['bucket'],
    Key='messy_data/visa_scraped/niv/' + bucket_niv_items[-len_diff],
)['Body'].read()

In [61]:
# Confirm what kind of object the S3 body read returned.
type(test_object)

bytes

In [81]:
# Decode the bytes as UTF-8, split on newlines and show the first five lines.
test_object.decode("utf-8").split('\n')[:5]

['Nationality Visa Class Issuances',
 'Afghanistan A2 1',
 'Afghanistan B1/B2 127',
 'Afghanistan F1 18',
 'Afghanistan F2 1']

In [82]:
# Parsing option 1: wrap the decoded, newline-split lines in a one-column DataFrame.
pd.DataFrame(test_object.decode("utf-8").split('\n')).iloc[:5]

Unnamed: 0,0
0,Nationality Visa Class Issuances
1,Afghanistan A2 1
2,Afghanistan B1/B2 127
3,Afghanistan F1 18
4,Afghanistan F2 1


In [83]:
# Parsing option 2: let pandas read the raw bytes with a tab delimiter and no
# header row; show the first five rows for comparison with option 1.
pd.read_csv(io.BytesIO(test_object), delimiter = "\t", header = None).iloc[:5]

Unnamed: 0,0
0,Nationality Visa Class Issuances
1,Afghanistan A2 1
2,Afghanistan B1/B2 127
3,Afghanistan F1 18
4,Afghanistan F2 1


# Cleaning

## A list of raw dataframes

In [124]:
# Clean each raw monthly frame in place: name the single column 'V', normalise
# its text, drop the grand-total row and everything after it, and tag the rows
# with the month they belong to.
for i in range(len(niv_df_raw)):
    # Work on a copy so the raw frame is only replaced once cleaning succeeded.
    month_df = niv_df_raw[i].copy()
    month_df.columns = ['V']  # more robust than .rename; fails loudly if >1 column
    # Normalise THIS month's text. (Reading niv_df_raw[0] here would overwrite
    # every later month with the first month's rows.)
    month_df['V'] = month_df['V'].str.strip().str.upper()
    # Locate the grand-total row(s) of this month with a literal, NaN-safe match.
    is_total = month_df['V'].str.contains('GRAND TOTAL', regex=False, na=False)
    total_positions = [pos for pos, hit in enumerate(is_total) if hit]
    if not total_positions:
        # Without this guard idx_rm_below would be undefined on the first
        # iteration or silently reused from the previous month.
        raise ValueError(
            f"No 'GRAND TOTAL' row found in raw NIV frame {i} "
            f"({niv_date_to_add[-(i+1)]}); the scraped file may be incomplete."
        )
    # Keep the rows strictly above the last grand-total row.
    idx_rm_below = total_positions[-1]
    month_df = month_df.iloc[:idx_rm_below]
    # offset zero indexing: -(i+1), start from most recent (same order as the
    # read loop). assign() returns a new frame, avoiding SettingWithCopyWarning.
    niv_df_raw[i] = month_df.assign(time=niv_date_to_add[-(i+1)])

## Concatenate into one long dataframe

In [131]:
# Stack the cleaned monthly frames (original row labels are kept), then take an
# independent deep copy to clean further while df_raw_niv stays untouched.
df_raw_niv = pd.concat(list(niv_df_raw))
df_niv = df_raw_niv.copy(deep=True)

## Remove non-data rows

In [132]:
# Regex fragments that identify non-data lines; '(' and '#' are escaped so they
# match literally once the fragments are joined into one pattern.
niv_headers = [
    'NONIMMIGRANT',
    'NATIONALITY VISA',
    '\\(FY',
    '\\#SBU',
    'PAGE',
    'SENSITIVE',
]
# Preview the combined alternation pattern.
'|'.join(niv_headers)

'NONIMMIGRANT|NATIONALITY VISA|\\(FY|\\#SBU|PAGE|SENSITIVE'

In [134]:
# Display every row whose text matches at least one of the header fragments
# (the fragments are OR-ed together into a single regex).
df_niv.loc[df_niv['V'].str.contains('|'.join(niv_headers))]

Unnamed: 0,V,time
0,NATIONALITY VISA CLASS ISSUANCES,2025-02-28
42,ALGERIA G2 11NONIMMIGRANT VISA ISSUANCES BY NA...,2025-02-28
43,FEBRUARY 2025 (FY 2025),2025-02-28
44,PAGE 1 OF 83,2025-02-28
45,NATIONALITY VISA CLASS ISSUANCESNONIMMIGRANT V...,2025-02-28
...,...,...
3645,NATIONALITY VISA CLASS ISSUANCESNONIMMIGRANT V...,2025-02-28
3646,FEBRUARY 2025 (FY 2025),2025-02-28
3689,PAGE 82 OF 83,2025-02-28
3690,NATIONALITY VISA CLASS ISSUANCESNONIMMIGRANT V...,2025-02-28
