# H1B 2024

Source data (Bloomberg Graphics on GitHub): https://github.com/BloombergGraphics/2024-h1b-immigration-data/blob/main/TRK_13139_FY2024_single_reg.zip

In [3]:
%run ../../notebooks/Setup.ipynb

import pandas

In [5]:
# A Parquet copy of the source CSV is kept alongside it as a compressed cache.
# NOTE(review): assumes `workspace_path` (provided by Setup.ipynb) is a
# pathlib.Path, so the joined paths support `.exists()` — confirm.
raw_csv_path = workspace_path.joinpath('data/raw/TRK_13139_FY2024_single_reg.csv')
raw_parquet_path = workspace_path.joinpath('data/raw/TRK_13139_FY2024_single_reg.parquet')

# Build the cache on first use rather than through a manually un-commented
# one-off step, so a fresh kernel with only the CSV present still runs.
if not raw_parquet_path.exists():
    pandas.read_csv(raw_csv_path, low_memory=False).to_parquet(raw_parquet_path, engine="pyarrow")

# load the compressed parquet file
df = pandas.read_parquet(raw_parquet_path, engine="pyarrow")
df

Unnamed: 0,bcn,country_of_birth,country_of_nationality,ben_date_of_birth,ben_year_of_birth,gender,employer_name,FEIN,mail_addr,city,...,S1Q1B,BEN_EDUCATION_CODE,ED_LEVEL_DEFINITION,BEN_PFIELD_OF_STUDY,BEN_COMP_PAID,DOT_CODE,NAICS_CODE,S3Q1,S4Q1,T_U_VAWA_FLAG
0,(b)(6),IND,IND,(b)(6),1994,male,Mphasis Corporation,954759720,41 Madison Ave,New York,...,,,,,,,,,,
1,(b)(6),KOR,KOR,(b)(6),1994,male,"Tennessee Dental Professionals, PC",200418100,1200 Network Centre Drive,Effingham,...,,,,,,,,,,
2,(b)(6),IND,IND,(b)(6),1982,male,KPMG LLP,135565207,2323 Ross Ave,Dallas,...,,,,,,,,,,
3,(b)(6),IND,IND,(b)(6),1995,male,3D TECHNOLOGIES LLC,384053952,423 W Wheatland Rd,Duncanville,...,,,,,,,,,,
4,(b)(6),GHA,GHA,(b)(6),1990,male,"LER TechForce, LLC",352139176,1888 Poshard Drive,Columbus,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350098,(b)(6),CHN,CHN,(b)(6),1987,male,Google LLC,770493581,1600 Amphitheatre Parkway,Mountain View,...,,,,,,,,,,
350099,(b)(6),GBR,IRL,(b)(6),1989,male,KLA Corporation,042564110,One Technology Drive,Milpitas,...,,,,,,,,,,
350100,(b)(6),CHN,CHN,(b)(6),2000,female,Deloitte & Touche LLP,133891517,1700 Market Street,Philadelphia,...,,,,,,,,,,
350101,(b)(6),IND,IND,(b)(6),1987,male,PRIAMBA SOFT LLC,462435519,"517 Us Highway 1 S, Suite 1193",Iselin,...,,,,,,,,,,


In [6]:
# Normalise column dtypes before writing the processed dataset:
#   * numeric_cols -> numeric (unparseable values become NaN)
#   * date_cols    -> naive datetimes (unparseable values become NaT)
#   * everything else -> text, with missing values left missing
numeric_cols = [
    'BEN_COMP_PAID', 'NUM_OF_EMP_IN_US', 'WAGE_AMT', 'NUMBER_OF_BENEFICIARIES', 'lottery_year', 'ben_year_of_birth'
]
date_cols = [
    'valid_to', 'valid_from', 'first_decision_date', 'rec_date', 'ben_date_of_birth'
]

# Convert numeric columns: invalid parsing will result in NaN.
# Columns absent from the frame are skipped.
for col in numeric_cols:
    if col in df.columns:
        df[col] = pandas.to_numeric(df[col], errors='coerce')

# Convert date columns to timestamps: invalid parsing will result in NaT.
# The format is assumed to be like "9/30/2026".
# Note: in the displayed rows `ben_date_of_birth` holds the redaction marker
# "(b)(6)", which does not match the format and is therefore coerced to NaT.
for col in date_cols:
    if col in df.columns:
        df[col] = pandas.to_datetime(df[col], format='%m/%d/%Y', errors='coerce').dt.tz_localize(None)

# Convert all other columns to string.
# A plain `.astype(str)` would turn missing values into the literal text
# 'nan' / 'None'; masking with the original null positions keeps them null, so
# they stay distinguishable from real values in the processed dataset.
# This also makes the cell safe to re-run.
special_cols = set(numeric_cols + date_cols)
other_cols = [col for col in df.columns if col not in special_cols]
other_values = df[other_cols]
df[other_cols] = other_values.astype(str).where(other_values.notna())

In [7]:
# Persist the typed frame as the processed dataset.
# Timestamps are coerced to microsecond resolution, and truncation of finer
# precision is allowed rather than treated as an error.
processed_path = workspace_path.joinpath('data/processed/h1b2024.parquet')
df.to_parquet(processed_path, engine="pyarrow", coerce_timestamps="us", allow_truncated_timestamps=True)