# Objectives

- Built upon previous work [2023-06-30_cleaning_df_bothdata.py](https://github.com/tiangenglu/WebScrape/blob/main/06302023_cleaning_df_bothdata.py)
- Detect new raw data and ONLY process the newly scraped data
- After cleaning, append to the existing all-time visa data

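**NOTE**: This notebook still needs minor revisions to read the "old" (previously compiled) data from S3 instead of local disk. Until then, the cells below that reference `old_data` will not run, because the line that loaded it from a local CSV is commented out.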

In [1]:
import os
import pandas as pd
import boto3
import json
import io
import sys

In [None]:
# retire this chunk
# move aws s3 connection up
# use niv_alltime and iv_alltime
# old_data = pd.read_csv("visa_alltime.csv")

# Accessing scraped raw data in AWS S3

In [2]:
# Load the AWS settings (access key, secret key, bucket name) from a local JSON file.
# NOTE(review): the credentials live in a plain-text file in the working
# directory -- confirm it is excluded from version control.
with open("aws_credential.txt", 'r') as credential_file:
    aws_credential = json.load(credential_file)

# Build an S3 client for region us-east-2, authenticated with the keys loaded above.
aws_session = boto3.Session(profile_name=None, region_name='us-east-2')
s3 = aws_session.client(
    's3',
    aws_access_key_id=aws_credential['access_key'],
    aws_secret_access_key=aws_credential['secret_key'],
)

In [3]:
# List the objects stored under the visa_output/ prefix of the project bucket.
output_listing = s3.list_objects(Bucket=aws_credential['bucket'], Prefix='visa_output/')
output_content = output_listing['Contents']
# Keep only the file-name part of every key (the text after the last '/').
output_items = [entry['Key'].rsplit('/', 1)[-1] for entry in output_content]

In [4]:
output_items

['',
 'country_code_matches.csv',
 'country_list.txt',
 'df_iv.csv',
 'df_niv.csv',
 'iv_alltime.csv',
 'niv_alltime.csv',
 'time_iv.txt',
 'time_niv.txt']

**Stop here for now (2025-05-05), use iv_alltime and niv_alltime once new data are scraped**

In [None]:
# List the raw scraped files for each visa type (NIV first, then IV).
niv_listing = s3.list_objects(Bucket=aws_credential['bucket'],
                              Prefix='messy_data/visa_scraped/niv/')
bucket_niv_content = niv_listing['Contents']
iv_listing = s3.list_objects(Bucket=aws_credential['bucket'],
                             Prefix='messy_data/visa_scraped/iv/')
bucket_iv_content = iv_listing['Contents']

In [None]:
# File names (text after the last '/') of every raw NIV / IV object.
# A key that ends with '/' (a folder placeholder, like the '' entry seen in the
# visa_output/ listing) yields an empty name; it is not a data file, so it is
# dropped here to keep file counts and the derived date lists accurate.
bucket_niv_items = [name
                    for name in (d['Key'].split('/')[-1] for d in bucket_niv_content)
                    if name]
bucket_iv_items = [name
                   for name in (d['Key'].split('/')[-1] for d in bucket_iv_content)
                   if name]

In [None]:
# Date stamp of each raw file: take the name up to the first '.', then the
# part after the last '_'.
bucket_iv_date = [name.partition('.')[0].rpartition('_')[2] for name in bucket_iv_items]
bucket_niv_date = [name.partition('.')[0].rpartition('_')[2] for name in bucket_niv_items]

# How many months (raw data) need processing? 

In [None]:
# Compare the number of raw NIV files in S3 with the number of months already
# compiled. (A list of unique time stamps would be a better basis than counts.)
# NOTE(review): `old_data` is not assigned anywhere in the visible notebook --
# its read_csv line in the "retire this chunk" cell is commented out -- so this
# cell raises NameError until the compiled data are loaded under that name.
niv_file_count = len(bucket_niv_items)
compiled_month_count = len(old_data['time'].unique())
len_diff = niv_file_count - compiled_month_count
if len_diff <= 0:
    print("No actions required, compiled NIV data is up to date.")
else:
    print(f'Need to append {len_diff} month(s) to existing NIV data.')

In [None]:
# Same comparison for the raw IV files. (A list of unique time stamps would be
# a better basis than counts.)
# NOTE(review): this overwrites the `len_diff` computed for NIV, and the later
# NIV cells read `len_diff` too, so both visa types end up using the IV value.
# NOTE(review): the IV count is compared against the same `old_data` as NIV;
# confirm that one frame is meant to represent both data sets.
iv_file_count = len(bucket_iv_items)
compiled_month_count = len(old_data['time'].unique())
len_diff = iv_file_count - compiled_month_count
if len_diff <= 0:
    print("No actions required, compiled IV data is up to date.")
    sys.exit(0)  # nothing to do: stop with a success status
else:
    print(f'Need to append {len_diff} month(s) to existing IV data.')

In [None]:
old_data.time.max()

In [None]:
max(bucket_niv_date)

In [None]:
# last item minus the difference in length
bucket_niv_date[-len_diff-1] == old_data.time.max()

In [None]:
# The last `len_diff` date stamps are the months still missing from the compiled data.
# Guard the slice: with len_diff == 0, `[-0:]` would return the whole list, and
# a negative len_diff would produce an arbitrary slice, so both yield no months.
niv_date_to_add = bucket_niv_date[-len_diff:] if len_diff > 0 else []
print(f'Needs the following month(s): {niv_date_to_add}')

In [None]:
bucket_niv_date[-2:]

In [None]:
# The last `len_diff` date stamps are the IV months still missing from the compiled data.
# Guard the slice: with len_diff == 0, `[-0:]` would return the whole list, and
# a negative len_diff would produce an arbitrary slice, so both yield no months.
iv_date_to_add = bucket_iv_date[-len_diff:] if len_diff > 0 else []
print(f'Needs the following month(s): {iv_date_to_add} for IV.')

# Read in raw data from S3

## Non-immigrant data

In [None]:
niv_df_raw = [None] * len_diff

In [None]:
bucket_niv_items[-(0+1)]

In [None]:
# Download the last `len_diff` raw NIV files into niv_df_raw, starting from the
# last listed key (the most recent file, per the original note).
for offset in range(1, len_diff + 1):
    print(f'Getting the {-offset} item from the .txt folder:')
    # Negative offsets count from the end of the listing: -1 is the last key.
    niv_key = 'messy_data/visa_scraped/niv/' + bucket_niv_items[-offset]
    file = s3.get_object(Bucket=aws_credential['bucket'], Key=niv_key)['Body'].read()
    # Alternative that also works: pd.DataFrame(file.decode("utf-8").split('\n'))
    # Tab delimiter and no header row, so each line is read as data.
    niv_df_raw[offset - 1] = pd.read_csv(io.BytesIO(file), delimiter="\t", header=None)

In [None]:
test_object=s3.get_object(Bucket = aws_credential['bucket'], 
              Key = 'messy_data/visa_scraped/niv/' + bucket_niv_items[-len_diff])['Body'].read()

In [None]:
type(test_object)

In [None]:
test_object.decode("utf-8").split('\n')[:5]

In [None]:
pd.DataFrame(test_object.decode("utf-8").split('\n')).iloc[:5]

In [None]:
pd.read_csv(io.BytesIO(test_object), delimiter = "\t", header = None).iloc[:5]

## Immigrant visa data

In [None]:
iv_df_raw = [None] * len_diff

In [None]:
# Download the last `len_diff` raw IV files into iv_df_raw, starting from the
# last listed key (the most recent file, per the original note).
for offset in range(1, len_diff + 1):
    print(f'Getting the {-offset} item from the .txt folder:')
    # Negative offsets count from the end of the listing: -1 is the last key.
    iv_key = 'messy_data/visa_scraped/iv/' + bucket_iv_items[-offset]
    file = s3.get_object(Bucket=aws_credential['bucket'], Key=iv_key)['Body'].read()
    # Alternative that also works: pd.DataFrame(file.decode("utf-8").split('\n'))
    # Tab delimiter and no header row, so each line is read as data.
    iv_df_raw[offset - 1] = pd.read_csv(io.BytesIO(file), delimiter="\t", header=None)

In [None]:
print(len(iv_df_raw))

# Cleaning

## A list of raw dataframes

In [None]:
# Normalise each raw NIV frame, cut it at its "GRAND TOTAL" row, and stamp the
# remaining rows with the month the file belongs to. The grand-total rows are
# collected in `grand_total` so they can be compared with the cleaned sums.
grand_total = []
for i, raw_df in enumerate(niv_df_raw):
    raw_df.columns = ['V']  # single free-text column; more robust than .rename
    raw_df['V'] = raw_df['V'].str.strip().str.upper()
    # Vectorised literal search instead of scanning rows with iterrows();
    # na=False treats missing text as "no match" rather than raising.
    is_total = raw_df['V'].str.contains('GRAND TOTAL', regex=False, na=False)
    if not is_total.any():
        # Without this check the cut-off index would be undefined (first file)
        # or silently reused from the previous file.
        raise ValueError(f'No GRAND TOTAL row found in raw NIV file number {i}.')
    grand_total.extend(row for _, row in raw_df[is_total].iterrows())
    # Position of the last grand-total row; that row and everything after it is dropped.
    idx_rm_below = int(is_total.to_numpy().nonzero()[0][-1])
    # .copy() so adding the 'time' column does not write into a slice of the raw frame.
    trimmed = raw_df.iloc[:idx_rm_below].copy()
    # offset zero indexing: -(i+1), start from most recent
    trimmed['time'] = niv_date_to_add[-(i + 1)]
    niv_df_raw[i] = trimmed
print(grand_total)

In [None]:
# Normalise each raw IV frame, cut it at its "GRAND TOTAL" row, and stamp the
# remaining rows with the month the file belongs to. The grand-total rows are
# collected in `iv_grand_total` for later comparison with the cleaned sums.
iv_grand_total = []
for i, raw_df in enumerate(iv_df_raw):
    raw_df.columns = ['V']  # single free-text column
    raw_df['V'] = raw_df['V'].str.strip().str.upper()
    # Vectorised literal search instead of scanning rows with iterrows();
    # na=False treats missing text as "no match" rather than raising.
    is_total = raw_df['V'].str.contains('GRAND TOTAL', regex=False, na=False)
    if not is_total.any():
        # Without this check the cut-off index would be undefined (first file)
        # or silently reused from the previous file.
        raise ValueError(f'No GRAND TOTAL row found in raw IV file number {i}.')
    iv_grand_total.extend(row for _, row in raw_df[is_total].iterrows())
    # Position of the last grand-total row; that row and everything after it is dropped.
    idx_rm_below = int(is_total.to_numpy().nonzero()[0][-1])
    # .copy() so adding the 'time' column does not write into a slice of the raw frame.
    trimmed = raw_df.iloc[:idx_rm_below].copy()
    # offset zero indexing: -(i+1), start from most recent
    trimmed['time'] = iv_date_to_add[-(i + 1)]
    iv_df_raw[i] = trimmed
print(iv_grand_total)

## Concatenate into one long dataframe

In [None]:
# Stack the monthly NIV frames into one long frame.
# ignore_index=True gives every row a unique label. Without it, each month keeps
# its own 0..n labels, so the later label-based steps (index.isin() header
# removal, .loc[restore_idx]) would also hit rows from other months.
df_raw_niv = pd.concat(niv_df_raw, ignore_index=True)
# Work on a deep copy so the raw concatenation stays available for inspection.
df_niv = df_raw_niv.copy(deep=True)

In [None]:
# Stack the monthly IV frames into one long frame.
# ignore_index=True gives every row a unique label. Without it, each month keeps
# its own 0..n labels, so the later label-based steps (index.isin() header
# removal, .loc[restore_idx_iv]) would also hit rows from other months.
df_raw_iv = pd.concat(iv_df_raw, ignore_index=True)
# Work on a deep copy so the raw concatenation stays available for inspection.
df_iv = df_raw_iv.copy(deep=True)

## Remove non-data rows

### NIV

In [None]:
# Regex fragments that identify header / footer lines in the NIV text.
# '(' and '#' are backslash-escaped so they are matched literally.
niv_headers = [
    'NONIMMIGRANT',
    'NATIONALITY VISA',
    '\\(FY',
    '\\#SBU',
    'PAGE',
    'SENSITIVE',
]
# Preview the combined alternation pattern passed to str.contains() below.
'|'.join(niv_headers)

In [None]:
# (Very short rows can be inspected with: df_niv[df_niv['V'].str.len() <= 1])
# Rows whose text matches any header/footer pattern. They are kept in their own
# frame so data lines that got fused with header text can be restored later.
niv_header_pattern = '|'.join(niv_headers)
is_niv_header = df_niv['V'].str.contains(niv_header_pattern)
df_niv_headers = df_niv[is_niv_header]

In [None]:
df_niv_headers.index

In [None]:
# Remove rows that were headers & footers (not final yet: rows where data got
# mixed with a header are restored in the validation section).
# The rows are selected by content rather than by index label. When several
# months are concatenated the labels repeat, and a label-based index.isin()
# would also drop data rows from other months that share a header's label.
df_niv = df_niv.loc[~df_niv['V'].str.contains('|'.join(niv_headers))]

### IV

In [None]:
# Drop empty / single-character rows, then separate header & footer lines from data.
df_iv['V'] = df_iv['V'].str.strip()
df_iv = df_iv[df_iv['V'].str.len() > 1]
# Regex fragments that identify IV header / footer lines ('(' is escaped).
iv_headers = ['PAGE ', 'FOREIGN STATE OF', 'CHARGEABILITY',
              'PLACE OF BIRTH', '\\(FY 20', '\\(FY20',
              'IMMIGRANT VISA', 'SENSITIVE']
is_iv_header = df_iv['V'].str.contains('|'.join(iv_headers))
# Header rows are kept in their own frame so data lines fused with header text
# can be restored later.
df_iv_headers = df_iv.loc[is_iv_header]
# The rows are selected by content rather than by index label. When several
# months are concatenated the labels repeat, and a label-based index.isin()
# would also drop data rows from other months that share a header's label.
df_iv = df_iv.loc[~is_iv_header]

In [None]:
df_iv_headers.shape

In [None]:
df_iv.shape

## Split all-in-one column

### NIV

In [None]:
df_niv.head(2)

In [None]:
# Copy-on-write mode is switched on to silence the chained-assignment warnings.
pd.options.mode.copy_on_write = True
# Each row reads "<nationality ...> <visa class> <issuances>"; split once on
# single spaces and reuse the tokens for all three columns.
niv_tokens = [row.split(' ') for row in df_niv['V']]
# everything except the last two tokens
df_niv['nationality'] = [' '.join(tokens[:-2]).strip() for tokens in niv_tokens]
# visa class: second-to-last token
df_niv['visa'] = [tokens[-2].strip() for tokens in niv_tokens]
# issuance count: last token, thousands separator removed
df_niv['issue'] = [tokens[-1].replace(',', '').strip() for tokens in niv_tokens]

In [None]:
df_niv.head(2)

### IV

In [None]:
# Each row reads "<nationality ...> <visa class> <issuances>"; split once on
# single spaces and reuse the tokens for all three columns.
iv_tokens = [row.split(' ') for row in df_iv['V']]
# everything except the last two tokens
df_iv['nationality'] = [' '.join(tokens[:-2]).strip() for tokens in iv_tokens]
# visa class: second-to-last token
df_iv['visa'] = [tokens[-2].strip() for tokens in iv_tokens]
# issuance count: last token, thousands separator removed
df_iv['issue'] = [tokens[-1].replace(',', '').strip() for tokens in iv_tokens]

In [None]:
df_iv.head(2)

# Validation

## Data type

In [None]:
# test list comprehension with a short and simple list
[s for s in ['34',23,'a1','20','b '] if not str(s).isdigit()]

In [None]:
# Are there non-numeric values in the NIV issuance count column?
# Every value must consist of digits only before the column is cast to int.
check_numeric = [value for value in df_niv['issue'] if not str(value).isdigit()]
if check_numeric:
    print("At least one row has non-numeric values in the NIV issuance column. Go back and check.")
    print(check_numeric)
    # Non-zero status: a bare sys.exit() reports success (0) even though
    # validation failed, which would mislead any scheduler or calling script.
    sys.exit(1)
else:
    print("No non-numeric values were detected in the NIV issuance column. Good to proceed.")
    df_niv['issue'] = df_niv['issue'].astype(int)

In [None]:
# Are there non-numeric values in the IV issuance count column?
# Every value must consist of digits only before the column is cast to int.
check_numeric = [value for value in df_iv['issue'] if not str(value).isdigit()]
if check_numeric:
    print("At least one row has non-numeric values in the IV issuance column. Go back and check.")
    print(check_numeric)
    # Non-zero status: a bare sys.exit() reports success (0) even though
    # validation failed, which would mislead any scheduler or calling script.
    sys.exit(1)
else:
    print("No non-numeric values were detected in the IV issuance column. Good to proceed.")
    df_iv['issue'] = df_iv['issue'].astype(int)

## Restoring rows when data got mixed with headers

- This is the most challenging part of cleaning this dataset.
- `if any(pattern in input_text for pattern in pattern_list):`

### NIV

In [None]:
# Find header rows that also contain real data: a removed row is flagged when
# its text contains any nationality label seen in the cleaned NIV rows.
restore_idx = []
# Computed once instead of once per header row. Empty labels are skipped
# because '' is a substring of every string and would flag every header row.
known_niv_nationalities = [c for c in df_niv.nationality.unique() if c]
# here's how any() works: True as soon as one label occurs in the row text
for idx, row in df_niv_headers.iterrows():
    if any(c in row['V'] for c in known_niv_nationalities):
        print(idx, row)
        restore_idx.append(idx)

In [None]:
df_restore=df_niv_headers.loc[restore_idx]

In [None]:
df_restore

In [None]:
# The data part of a mixed row is the text before 'NONIMMIGRANT'; tokenise it
# once and reuse the tokens for the three columns.
niv_restore_tokens = [row.split('NONIMMIGRANT')[0].strip().split(' ')
                      for row in df_restore['V']]
# everything except the last two tokens
df_restore['nationality'] = [' '.join(tokens[:-2]).strip() for tokens in niv_restore_tokens]
# visa class: second-to-last token
df_restore['visa'] = [tokens[-2].strip() for tokens in niv_restore_tokens]
# issuance count: last token, thousands separator removed, then cast to int
df_restore['issue'] = [tokens[-1].strip().replace(',', '') for tokens in niv_restore_tokens]
df_restore['issue'] = df_restore['issue'].astype(int)

In [None]:
df_restore

### IV

In [None]:
# Find header rows that also contain real data: a removed row is flagged when
# its text contains any nationality label seen in the cleaned IV rows.
restore_idx_iv = []
# Computed once instead of once per header row. Empty labels are skipped
# because '' is a substring of every string and would flag every header row.
known_iv_nationalities = [c for c in df_iv.nationality.unique() if c]
# here's how any() works: True as soon as one label occurs in the row text
for idx, row in df_iv_headers.iterrows():
    if any(c in row['V'] for c in known_iv_nationalities):
        print(idx, row)
        restore_idx_iv.append(idx)

In [None]:
# Pull the flagged header rows and recover the data that precedes 'IMMIGRANT'.
df_restore_iv = df_iv_headers.loc[restore_idx_iv]
# Tokenise the data part once and reuse the tokens for the three columns.
iv_restore_tokens = [row.split('IMMIGRANT')[0].strip().split(' ')
                     for row in df_restore_iv['V']]
# everything except the last two tokens
df_restore_iv['nationality'] = [' '.join(tokens[:-2]).strip() for tokens in iv_restore_tokens]
# visa class: second-to-last token
df_restore_iv['visa'] = [tokens[-2].strip() for tokens in iv_restore_tokens]
# issuance count: last token, thousands separator removed, then cast to int
df_restore_iv['issue'] = [tokens[-1].strip().replace(',', '') for tokens in iv_restore_tokens]
df_restore_iv['issue'] = df_restore_iv['issue'].astype(int)
df_restore_iv

## Concatenating

In [None]:
col_order = ['nationality', 'visa', 'issue','time']

In [None]:
# Put the restored rows back among the cleaned NIV rows (ordered by index),
# drop the raw text column, fix the column order and remove exact duplicates.
niv_with_restored = pd.concat([df_niv, df_restore]).sort_index()
niv_without_raw = niv_with_restored.drop(columns=['V'])
df_niv = niv_without_raw[col_order].drop_duplicates()

In [None]:
pd.DataFrame(grand_total)

In [None]:
df_niv['issue'].sum()

In [None]:
# Put the restored rows back among the cleaned IV rows (ordered by index),
# drop the raw text column, fix the column order and remove exact duplicates.
iv_with_restored = pd.concat([df_iv, df_restore_iv]).sort_index()
iv_without_raw = iv_with_restored.drop(columns=['V'])
df_iv = iv_without_raw[col_order].drop_duplicates()

In [None]:
df_iv['issue'].sum()

In [None]:
# The issuance column is renamed from 'issue' to 'count' in both frames.
count_rename = {'issue': 'count'}
df_niv = df_niv.rename(columns=count_rename)
df_iv = df_iv.rename(columns=count_rename)

# Appending to existing data

In [None]:
# Fetch the compiled all-time NIV table from S3 and parse the CSV bytes in memory.
niv_alltime_response = s3.get_object(Bucket=aws_credential['bucket'],
                                     Key='visa_output/niv_alltime.csv')
niv_alltime = niv_alltime_response['Body'].read()
df_niv_alltime = pd.read_csv(io.BytesIO(niv_alltime), low_memory=True)

In [None]:
# Fetch the compiled all-time IV table from S3 and parse the CSV bytes in memory.
iv_alltime_response = s3.get_object(Bucket=aws_credential['bucket'],
                                    Key='visa_output/iv_alltime.csv')
iv_alltime = iv_alltime_response['Body'].read()
df_iv_alltime = pd.read_csv(io.BytesIO(iv_alltime), low_memory=True)

In [None]:
df_niv_alltime.shape

In [None]:
df_iv_alltime.shape

In [None]:
# Append the newly cleaned NIV months to the all-time table with a fresh
# 0..n-1 index, then drop rows that are exact duplicates.
niv_appended = pd.concat([df_niv_alltime, df_niv], ignore_index=True)
df_niv_alltime_new = niv_appended.drop_duplicates()

In [None]:
# Append the newly cleaned IV months to the all-time table with a fresh
# 0..n-1 index, then drop rows that are exact duplicates.
iv_appended = pd.concat([df_iv_alltime, df_iv], ignore_index=True)
df_iv_alltime_new = iv_appended.drop_duplicates()

# Removing special characters

In [None]:
# Every distinct nationality label that appears in either all-time table.
iv_labels = list(df_iv_alltime_new['nationality'].unique())
niv_labels = list(df_niv_alltime_new['nationality'].unique())
country_list = list(set(iv_labels + niv_labels))
print("Total unique country/nationality labels before cleaning: ",len(country_list))

In [None]:
# Collect each character that is neither a letter nor a space, in order of
# first appearance, and print it with the first label it was found in.
special_chars = []
# instead of iterrows, this works on the list of unique nationalities
for country in country_list:
    for char in country:
        # skip ordinary characters and special characters already recorded
        if char.isalpha() or char == ' ' or char in special_chars:
            continue
        special_chars.append(char)
        print(char, country)

In [None]:
# Commas and apostrophes are potentially legitimate in a country label, so they
# are not treated as special characters.
# Filtering instead of list.remove() avoids a ValueError when one of them
# was never found in the labels.
special_chars = [char for char in special_chars if char not in (',', "'")]
special_chars

In [None]:
# Build parallel lists of original and cleaned labels for every country label
# that contains at least one special character.
old_country_label = []
new_country_label = []
for country in country_list:
    # any() also covers labels with several special characters, e.g. '(' and ')'
    if not any(char in country for char in special_chars):
        continue
    # Keep letters and spaces; every other character becomes a space.
    # NOTE(review): this also replaces ',' and "'" in these labels, although
    # they were taken out of special_chars as potentially legitimate.
    spaced = ''.join(char if (char.isalpha() or char == ' ') else ' '
                     for char in country)
    # split()/join collapses repeated spaces; then 'BORN' is removed.
    # NOTE(review): 'BORN' is removed after the collapse, so a 'BORN' in the
    # middle of a label leaves two consecutive spaces behind.
    new_country = ' '.join(spaced.split()).replace('BORN', '').strip()
    old_country_label.append(country)
    new_country_label.append(new_country)
    print("\nold: ",country,'\nnew: ', new_country)

In [None]:
no_sp_char_label=dict(zip(old_country_label,new_country_label))

In [None]:
# Replace each nationality by its cleaned label where one exists; labels
# without a mapping keep their original value. The cleaned column is appended
# after the old one is dropped, so it ends up as the last column.
niv_clean_labels = df_niv_alltime_new['nationality'].map(
    no_sp_char_label).fillna(df_niv_alltime_new['nationality'])
iv_clean_labels = df_iv_alltime_new['nationality'].map(
    no_sp_char_label).fillna(df_iv_alltime_new['nationality'])
df_niv_alltime_new = df_niv_alltime_new.drop(
    columns=['nationality']).assign(nationality=niv_clean_labels)
df_iv_alltime_new = df_iv_alltime_new.drop(
    columns=['nationality']).assign(nationality=iv_clean_labels)

In [None]:
# Distinct nationality labels across both all-time tables after cleaning.
# Sorted so the list (and the country_list.txt written from it) has the same
# order on every run; plain set iteration order for strings varies between runs.
country_list_new = sorted(
    set(df_iv_alltime_new['nationality'].unique())
    | set(df_niv_alltime_new['nationality'].unique())
)
print("Total unique country/nationality labels after cleaning: ",len(country_list_new))
print(f'After removing special characters, {len(country_list) - len(country_list_new)} labels were reduced.')

# Output

## Local

In [None]:
# Save the cleaned labels locally, one per line (no trailing newline).
country_text = '\n'.join(country_list_new)
with open('country_list.txt','w') as country_file:
    country_file.write(country_text)

In [None]:
# Final column layout shared by both all-time tables.
new_col_order = ['nationality','visa', 'count', 'time']
df_niv_alltime_new = df_niv_alltime_new.loc[:, new_col_order]
df_iv_alltime_new = df_iv_alltime_new.loc[:, new_col_order]

In [None]:
# Distinct time stamps in each all-time table, in order of first appearance.
time_niv = [stamp for stamp in df_niv_alltime_new['time'].unique()]
time_iv = [stamp for stamp in df_iv_alltime_new['time'].unique()]

In [None]:
# Write each list of time stamps to its own local text file, one stamp per line.
for txt_name, stamps in (('time_niv.txt', time_niv), ('time_iv.txt', time_iv)):
    with open(txt_name, 'w') as stamp_file:
        stamp_file.write('\n'.join(stamps))

## S3

In [None]:
# Upload the three text lists to visa_output/. Each list is joined into one
# newline-separated string because the upload body cannot be a list.
text_uploads = {
    'visa_output/country_list.txt': country_list_new,
    'visa_output/time_niv.txt': time_niv,
    'visa_output/time_iv.txt': time_iv,
}
for object_key, lines in text_uploads.items():
    s3.put_object(Body="\n".join(lines),
                  Bucket=aws_credential['bucket'],
                  Key=object_key)
# List the folder again after the upload and show its .txt objects.
output_folder_items = s3.list_objects(Bucket=aws_credential['bucket'], Prefix='visa_output')['Contents']
item_names = [entry['Key'] for entry in output_folder_items]
[item for item in item_names if item.endswith('.txt')]

In [None]:
# Upload both all-time tables to S3 as CSV, serialised through an in-memory
# text buffer. A fresh buffer is created for each dataframe.
# NOTE(review): the tables are written to df_niv.csv / df_iv.csv, whereas the
# existing data were read from niv_alltime.csv / iv_alltime.csv -- confirm
# which objects the next run is supposed to treat as the compiled data.
csv_uploads = (
    ('visa_output/df_niv.csv', df_niv_alltime_new),
    ('visa_output/df_iv.csv', df_iv_alltime_new),
)
for object_key, frame in csv_uploads:
    csv_buffer = io.StringIO()
    frame.to_csv(csv_buffer, index=False)
    s3.put_object(Body=csv_buffer.getvalue(),
                  Bucket=aws_credential['bucket'],
                  Key=object_key)
# List the folder again after the upload and show its .csv objects.
output_folder_items = s3.list_objects(Bucket=aws_credential['bucket'], Prefix='visa_output')['Contents']
item_names = [entry['Key'] for entry in output_folder_items]
[item for item in item_names if item.endswith('.csv')]