# Objectives

- Builds on previous work: [2023-06-30_cleaning_df_bothdata.py](https://github.com/tiangenglu/WebScrape/blob/main/06302023_cleaning_df_bothdata.py)
- Detect new raw data and ONLY process the newly scraped data
- After cleaning, append to the existing all-time visa data

In [1]:
import os
import pandas as pd
import boto3
import json
import io
import sys

In [2]:
%%time
# Load the existing all-time visa data that newly cleaned months get appended to.
old_data = pd.read_csv("visa_alltime.csv")

CPU times: user 184 ms, sys: 22.5 ms, total: 207 ms
Wall time: 207 ms


In [3]:
# Inspect column names, dtypes and non-null counts of the compiled data.
old_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488523 entries, 0 to 488522
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   nationality  488523 non-null  object
 1   visa         488523 non-null  object
 2   count        488523 non-null  int64 
 3   time         488523 non-null  object
 4   type         488523 non-null  object
dtypes: int64(1), object(4)
memory usage: 18.6+ MB


In [4]:
# Load the NIV and IV catalog tables from their local CSV files.
catalog_paths = {'niv': 'niv_catalog.csv', 'iv': 'iv_catalog.csv'}
niv_catalog = pd.read_csv(catalog_paths['niv'])
iv_catalog = pd.read_csv(catalog_paths['iv'])

# Accessing scraped raw data in AWS S3

In [5]:
# Read the AWS settings (access key, secret key, bucket name) from a local JSON file.
with open("aws_credential.txt", 'r') as credential_file:
    aws_credential = json.load(credential_file)

# Build the session first, then derive the S3 client from it with explicit keys.
session = boto3.Session(profile_name=None, region_name='us-east-2')
s3 = session.client(
    's3',
    aws_access_key_id=aws_credential['access_key'],
    aws_secret_access_key=aws_credential['secret_key'],
)

In [6]:
# List the objects stored under the NIV and IV prefixes of the bucket.
bucket_name = aws_credential['bucket']
bucket_niv_content = s3.list_objects(
    Bucket=bucket_name,
    Prefix='messy_data/visa_scraped/niv/',
)['Contents']
bucket_iv_content = s3.list_objects(
    Bucket=bucket_name,
    Prefix='messy_data/visa_scraped/iv/',
)['Contents']

In [7]:
# Keep only the file name, i.e. the text after the last '/' of each object key.
bucket_niv_items = [entry['Key'].rsplit('/', 1)[-1] for entry in bucket_niv_content]
bucket_iv_items = [entry['Key'].rsplit('/', 1)[-1] for entry in bucket_iv_content]

In [8]:
# The date is the text between the last '_' and the first '.' of a file name,
# e.g. 'niv_2025-02-28.txt' -> '2025-02-28'.
bucket_iv_date = [name.partition('.')[0].rpartition('_')[2] for name in bucket_iv_items]
bucket_niv_date = [name.partition('.')[0].rpartition('_')[2] for name in bucket_niv_items]

# How many months (raw data) need processing? 

In [9]:
# Months still to process = files listed under the IV prefix minus the
# distinct months already present in the compiled data.
len_diff = len(bucket_iv_items) - len(old_data['time'].unique())
if len_diff <= 0:
    print("No actions required, compiled visa data is up to date.")
else:
    print(f'Need to append {len_diff} month(s) to existing data.')

Need to append 1 month(s) to existing data.


In [10]:
# Latest month already present in the compiled data.
old_data.time.max()

'2025-01-31'

In [11]:
# Latest month available among the scraped NIV files in the bucket.
max(bucket_niv_date)

'2025-02-28'

In [12]:
# Sanity check: the bucket date sitting just before the last `len_diff` entries
# should equal the latest month already present in the compiled data.
bucket_niv_date[-len_diff-1] == old_data.time.max()

True

In [13]:
# Dates of the months that still need processing (the last `len_diff` entries).
# Slice only when something is new: with len_diff == 0 the expression
# bucket_niv_date[-0:] would return the WHOLE list, and a negative len_diff
# would slice from the front instead of the back.
niv_date_to_add = bucket_niv_date[-len_diff:] if len_diff > 0 else []
print(f'Needs the following month(s): {niv_date_to_add}')

Needs the following month(s): ['2025-02-28']


In [14]:
# Peek at the two most recent dates in the NIV listing.
bucket_niv_date[-2:]

['2025-01-31', '2025-02-28']

# Read in raw data from S3

In [15]:
# One placeholder slot per month to be read; filled by the S3 read loop.
niv_df_raw = [None for _ in range(len_diff)]

In [16]:
# File name selected by the index expression -(i+1) when i == 0: the last listed item.
bucket_niv_items[-1]

'niv_2025-02-28.txt'

In [17]:
niv_prefix = 'messy_data/visa_scraped/niv/'
for i in range(len_diff):
    print(i)
    # -(i+1) walks the listing backwards: i == 0 is the last (most recent) file.
    item_name = bucket_niv_items[-(i + 1)]
    response = s3.get_object(Bucket=aws_credential['bucket'], Key=niv_prefix + item_name)
    file = response['Body'].read()
    # Parse with a tab delimiter and no header row, storing the frame in slot i.
    # (Decoding the bytes and splitting on '\n' is an equivalent alternative.)
    niv_df_raw[i] = pd.read_csv(io.BytesIO(file), delimiter="\t", header=None)

0


In [18]:
# Fetch one raw NIV file (the item `len_diff` positions from the end) for inspection.
test_key = 'messy_data/visa_scraped/niv/' + bucket_niv_items[-len_diff]
test_response = s3.get_object(Bucket=aws_credential['bucket'], Key=test_key)
test_object = test_response['Body'].read()

In [19]:
# Check what type the S3 body was read as.
type(test_object)

bytes

In [20]:
# Decode the bytes as UTF-8 and preview the first five lines.
test_object.decode("utf-8").split('\n')[:5]

['Nationality Visa Class Issuances',
 'Afghanistan A2 1',
 'Afghanistan B1/B2 127',
 'Afghanistan F1 18',
 'Afghanistan F2 1']

In [21]:
# Option 1: build a one-column DataFrame from the decoded, newline-split text.
pd.DataFrame(test_object.decode("utf-8").split('\n')).iloc[:5]

Unnamed: 0,0
0,Nationality Visa Class Issuances
1,Afghanistan A2 1
2,Afghanistan B1/B2 127
3,Afghanistan F1 18
4,Afghanistan F2 1


In [22]:
# Option 2: let pandas parse the raw bytes directly; the first five rows match option 1.
pd.read_csv(io.BytesIO(test_object), delimiter = "\t", header = None).iloc[:5]

Unnamed: 0,0
0,Nationality Visa Class Issuances
1,Afghanistan A2 1
2,Afghanistan B1/B2 127
3,Afghanistan F1 18
4,Afghanistan F2 1


# Cleaning

## A list of raw dataframes

In [23]:
# For each raw monthly frame: normalise the text, record its GRAND TOTAL row,
# keep only the rows above that row, and stamp the month onto the result.
grand_total = []
for i in range(len(niv_df_raw)):
    # Always read from month i's own frame so each month keeps its own rows.
    month_raw = niv_df_raw[i]
    month_raw.columns = ['V']  # single text column; more robust than .rename
    month_raw['V'] = month_raw['V'].str.strip().str.upper()

    # Positions (not labels) of the rows mentioning the grand total, so the
    # positional .iloc cut below is valid whatever the index looks like.
    is_total = month_raw['V'].str.contains('GRAND TOTAL', regex=False, na=False)
    total_positions = [pos for pos, hit in enumerate(is_total) if hit]
    if not total_positions:
        # Without this guard a previous month's cut-off would be reused silently
        # (or a NameError raised on the first month).
        raise ValueError(f"No 'GRAND TOTAL' row found in raw NIV file #{i}.")
    grand_total.extend(month_raw.iloc[pos] for pos in total_positions)
    idx_rm_below = total_positions[-1]

    # offset zero indexing: -(i+1), start from most recent.
    # .assign returns a new frame, avoiding a write onto an .iloc slice.
    niv_df_raw[i] = month_raw.iloc[:idx_rm_below].assign(time=niv_date_to_add[-(i + 1)])
print(grand_total)

[V    GRAND TOTAL 914,989
Name: 3692, dtype: object]


## Concatenate into one long dataframe

In [24]:
# Stack the monthly frames into one long frame. ignore_index=True gives every
# row a unique label; otherwise each month restarts at 0 and the later
# label-based header removal / restore steps would match rows from the wrong
# month whenever more than one month is processed.
df_raw_niv = pd.concat(niv_df_raw, ignore_index=True)
# Work on an independent copy so the raw concatenation stays untouched.
df_niv = df_raw_niv.copy(deep=True)

## Remove non-data rows

In [25]:
# Regex fragments that identify header/footer (non-data) lines in the raw text.
niv_headers = [
    'NONIMMIGRANT',
    'NATIONALITY VISA',
    '\\(FY',
    '\\#SBU',
    'PAGE',
    'SENSITIVE',
]
# Combined alternation pattern, shown for inspection.
'|'.join(niv_headers)

'NONIMMIGRANT|NATIONALITY VISA|\\(FY|\\#SBU|PAGE|SENSITIVE'

In [26]:
# Rows whose text matches any of the header/footer patterns.
header_mask = df_niv['V'].str.contains('|'.join(niv_headers))
df_niv_headers = df_niv.loc[header_mask]

In [27]:
# Row labels of the lines identified as headers/footers.
df_niv_headers.index

Index([   0,   42,   43,   44,   45,   46,   89,   90,   91,  134,
       ...
       3556, 3599, 3600, 3601, 3644, 3645, 3646, 3689, 3690, 3691],
      dtype='int64', length=249)

In [28]:
# Drop the header/footer rows by label. Not final yet: some of those rows also
# carry data and are restored further down.
keep_row = ~df_niv.index.isin(df_niv_headers.index)
df_niv = df_niv[keep_row]

## Split all-in-one column

In [29]:
# Preview the all-in-one text column before splitting it.
df_niv.head(2)

Unnamed: 0,V,time
1,AFGHANISTAN A2 1,2025-02-28
2,AFGHANISTAN B1/B2 127,2025-02-28


In [30]:
# Enable copy-on-write so the column assignments below do not emit warnings.
pd.options.mode.copy_on_write = True
# Split every line on single spaces once, then read the pieces from the right:
# last token = issuance count, second to last = visa class, the rest = nationality.
tokens_per_row = [text.split(' ') for text in df_niv['V']]
df_niv['nationality'] = [' '.join(tokens[:-2]).strip() for tokens in tokens_per_row]
df_niv['visa'] = [tokens[-2].strip() for tokens in tokens_per_row]
# Strip the thousands separator from the count.
df_niv['issue'] = [tokens[-1].replace(',', '').strip() for tokens in tokens_per_row]

In [31]:
# Preview the result of the split: nationality, visa and issue columns.
df_niv.head(2)

Unnamed: 0,V,time,nationality,visa,issue
1,AFGHANISTAN A2 1,2025-02-28,AFGHANISTAN,A2,1
2,AFGHANISTAN B1/B2 127,2025-02-28,AFGHANISTAN,B1/B2,127


# Validation

## Data type

In [32]:
# Demonstrate the filter on a small mixed list: keep only the items whose
# string form is not made up entirely of digits.
[s for s in ['34',23,'a1','20','b '] if not str(s).isdigit()]

['a1', 'b ']

In [33]:
# Are there non-numeric values in the visa issuance count column?
check_numeric = [s for s in df_niv['issue'] if not str(s).isdigit()]
if check_numeric:
    print("At least one row has non-numeric values in the visa issuance column. Go back and check.")
    print(check_numeric)
    # Stop with a non-zero status: a bare sys.exit() exits with status 0, which
    # reports success to whatever launched the script.
    sys.exit(1)
else:
    print("No non-numeric values were detected in the visa issuance column. Good to proceed.")
    # Safe to convert now that every value is a digit string.
    df_niv['issue'] = df_niv['issue'].astype(int)

No non-numeric values were detected in the visa issuance column. Good to proceed.


## Restoring rows when data got mixed with headers

- This is the most challenging part of cleaning this dataset.

In [34]:
# Find header rows that also contain data: a removed row is flagged when its
# text contains any nationality already seen in the cleaned data.
restore_idx = []
known_nationalities = df_niv.nationality.unique()
for idx, row in df_niv_headers.iterrows():
    header_text = row['V']
    mentions_nationality = any(c in header_text for c in known_nationalities)
    if mentions_nationality:
        print(idx, row)
        restore_idx.append(idx)

42 V       ALGERIA G2 11NONIMMIGRANT VISA ISSUANCES BY NA...
time                                           2025-02-28
Name: 42, dtype: object


In [35]:
# Select, by label, the header rows flagged in restore_idx.
df_restore=df_niv_headers.loc[restore_idx]

In [36]:
# Show the rows to restore before parsing them.
df_restore

Unnamed: 0,V,time
42,ALGERIA G2 11NONIMMIGRANT VISA ISSUANCES BY NA...,2025-02-28


In [37]:
# The data sits in front of the fused header text: cut each line at the first
# 'NONIMMIGRANT', split the leading part on spaces, then read the pieces from
# the right exactly as for regular rows.
data_tokens = [text.split('NONIMMIGRANT')[0].split(' ') for text in df_restore['V']]
df_restore['nationality'] = [' '.join(tokens[:-2]).strip() for tokens in data_tokens]
df_restore['visa'] = [tokens[-2].strip() for tokens in data_tokens]
df_restore['issue'] = [tokens[-1].strip().replace(',', '') for tokens in data_tokens]
df_restore['issue'] = df_restore['issue'].astype(int)

In [38]:
# Show the restored rows with their parsed columns.
df_restore

Unnamed: 0,V,time,nationality,visa,issue
42,ALGERIA G2 11NONIMMIGRANT VISA ISSUANCES BY NA...,2025-02-28,ALGERIA,G2,11


## Concatenating

In [39]:
# Column order used for the cleaned output.
col_order = [
    'nationality',
    'visa',
    'issue',
    'time',
]

In [40]:
# Add the restored rows back, put everything in label order, drop the raw text
# column, apply the output column order and remove exact duplicate rows.
combined = pd.concat([df_niv, df_restore]).sort_index()
combined = combined.drop(columns=['V'])
df_niv = combined[col_order].drop_duplicates()

In [41]:
# Show the captured GRAND TOTAL row(s) as a table, for comparison with the sum below.
pd.DataFrame(grand_total)

Unnamed: 0,V
3692,"GRAND TOTAL 914,989"


In [42]:
# Sum of the cleaned issuance counts; compare with the GRAND TOTAL row shown above.
df_niv['issue'].sum()

914989