In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
import glob

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['font.size'] = 10


In [3]:
data_fn_tx = "../data/round3/transactions_normalized_2026-01-09_13-04-38.json"
data_fn_aw = "../data/round3/awards_normalized_2026-01-10_09-30-16.json"

with open(data_fn_tx, 'r') as f:
    transactions_data = json.load(f)
df_tx = pd.DataFrame(transactions_data)

with open(data_fn_aw, 'r') as f:
    awards_data = json.load(f)
df_aw = pd.DataFrame(awards_data)

In [4]:
df_tx.shape, df_aw.shape

((21067, 22), (10000, 20))

In [5]:
# new_tx

In [6]:
mask_new_tx = ((df_tx['modification_number'] == '0') | (df_tx['action_type_description'] == 'NEW'))

In [7]:
df_new_tx = df_tx[mask_new_tx]

In [8]:
df_new_tx.shape

(1643, 22)

In [9]:
# threshold amount

In [10]:
thresh_amount = 900_000
mask_amt = df_new_tx.federal_action_obligation >= thresh_amount
df_new_tx = df_new_tx[mask_amt]
df_new_tx.shape

(1548, 22)

In [11]:
# joining

In [12]:
# Check unique award_ids in each dataset
print("=== Award ID Coverage ===")
print(f"Unique award_ids in new transactions: {df_new_tx['award_id'].nunique():,}")
print(f"Unique award_ids in awards dataset: {df_aw['award_id'].nunique():,}")

# Check for overlaps
transaction_award_ids = set(df_new_tx['award_id'])
awards_award_ids = set(df_aw['award_id'])

overlap = transaction_award_ids.intersection(awards_award_ids)
only_in_transactions = transaction_award_ids - awards_award_ids
only_in_awards = awards_award_ids - transaction_award_ids

print(f"\nAward IDs in both datasets: {len(overlap):,}")
print(f"Award IDs only in transactions: {len(only_in_transactions):,}")
print(f"Award IDs only in awards: {len(only_in_awards):,}")

=== Award ID Coverage ===
Unique award_ids in new transactions: 1,522
Unique award_ids in awards dataset: 9,872

Award IDs in both datasets: 739
Award IDs only in transactions: 783
Award IDs only in awards: 9,133


In [13]:
print("=== Performing Left Join ===")
print("Joining new_transactions to awards on award_id...")

joined_df = df_new_tx.merge(
    df_aw,
    on='award_id',
    how='left',
    suffixes=('_transaction', '_award'),
    indicator=True
)

print(f"\nJoined dataframe shape: {joined_df.shape}")
print(f"Columns: {joined_df.shape[1]}")

=== Performing Left Join ===
Joining new_transactions to awards on award_id...

Joined dataframe shape: (1550, 42)
Columns: 42


In [14]:
sum(joined_df._merge == 'both'), sum(joined_df._merge == 'left_only')

(755, 795)

In [15]:
joined_df

Unnamed: 0,transaction_id,award_id,action_date,action_type,action_type_description,modification_number,federal_action_obligation,total_dollars_obligated,award_type_transaction,award_description_transaction,period_of_performance_start_date,period_of_performance_current_end_date,awarding_agency_name,awarding_sub_agency_name,funding_agency_name,recipient_name_transaction,recipient_uei_transaction,naics_code_transaction,product_or_service_code,place_of_performance_state_transaction,ingested_at_transaction,source_url_transaction,award_type_award,award_amount,award_date,start_date,end_date,last_modified_date,base_obligation_date,awarding_agency,awarding_sub_agency,funding_agency,recipient_name_award,recipient_uei_award,recipient_business_categories,award_description_award,naics_code_award,psc_code,place_of_performance_state_award,ingested_at_award,source_url_award,_merge
0,354207030,140L0626C0003,2026-01-07,,Unknown,0,4182501.33,4182501.33,DEFINITIVE CONTRACT,GAOA HOT WELL DUNES CAMPGRND IMPRVMNTS,2026-01-07,,Department of the Interior,Bureau of Land Management,Department of the Interior,CAPEX CONSTRUCTION LLC,VJWLNDEBBDM5,237990,Y1PA,AZ,2026-01-09T18:04:38.179Z,https://www.usaspending.gov/award/140L0626C0003,Unknown,4182501.33,,2026-01-07,2026-09-16,2026-01-07,2026-01-07,Department of the Interior,Bureau of Land Management,Department of the Interior,CAPEX CONSTRUCTION LLC,VJWLNDEBBDM5,[],GAOA HOT WELL DUNES CAMPGRND IMPRVMNTS,237990,,AZ,2026-01-10T14:30:16.569Z,https://www.usaspending.gov/award/354207030,both
1,354207621,36C10G26K0214,2026-01-07,,Unknown,0,31668429.52,31668429.52,DELIVERY ORDER,EXPRESS REPORT: DECEMBER FY26 Q1,2025-12-01,,Department of Veterans Affairs,Department of Veterans Affairs,Department of Veterans Affairs,TRIWEST HEALTHCARE ALLIANCE CORP,J7M9HPTGJ1S9,524292,Q999,AK,2026-01-09T18:04:38.180Z,https://www.usaspending.gov/award/36C10G26K0214,Unknown,31668429.52,,2025-12-01,2025-12-31,2026-01-07,2026-01-07,Department of Veterans Affairs,Department of Veterans Affairs,Department of Veterans Affairs,TRIWEST HEALTHCARE ALLIANCE CORP,J7M9HPTGJ1S9,[],EXPRESS REPORT: DECEMBER FY26 Q1,524292,,AK,2026-01-10T14:30:16.553Z,https://www.usaspending.gov/award/354207621,both
2,354207876,36C77626K0030,2026-01-07,,Unknown,0,914136.52,914136.52,DELIVERY ORDER,EXPRESS REPORT: DECEMBER 2025 ORDERING OFFICER...,2025-12-01,,Department of Veterans Affairs,Department of Veterans Affairs,Department of Veterans Affairs,"MANUS MEDICAL, LLC",M4UBTVGBJMT7,339113,6515,VA,2026-01-09T18:04:38.180Z,https://www.usaspending.gov/award/36C77626K0030,,,,,,,,,,,,,,,,,,,,left_only
3,354212325,70Z08526F37008B00,2026-01-07,,Unknown,0,1970353.72,1970353.72,DELIVERY ORDER,VARIOUS PARTS TO BE USED FOR MAINTENANCE OF RO...,2026-01-07,,Department of Homeland Security,U.S. Coast Guard,Department of Homeland Security,ROLLS-ROYCE SOLUTIONS AMERICA INC,HYGLK2BNFKB3,333618,2815,MI,2026-01-09T18:04:38.180Z,https://www.usaspending.gov/award/70Z08526F370...,,,,,,,,,,,,,,,,,,,,left_only
4,354212144,693KA826F00060,2026-01-07,,Unknown,0,1197780.00,1197780.00,DELIVERY ORDER,"TASK ORDER 1 83"" X 25 DEEP, WITH SPECIAL DESIG...",2026-01-07,,Department of Transportation,Federal Aviation Administration,Department of Transportation,GAW ASSOCIATES INC,HJLHM7VX3WM3,332322,7H20,NJ,2026-01-09T18:04:38.180Z,https://www.usaspending.gov/award/693KA826F00060,,,,,,,,,,,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1545,280243879,36C25226N0090,2025-10-10,,Unknown,0,1387158.87,1387158.87,DELIVERY ORDER,"ORDERING PERIOD 3 PATHOLOGY, TRANSPLANT, AND E...",2025-11-01,,Department of Veterans Affairs,Department of Veterans Affairs,Department of Veterans Affairs,UNIVERSITY OF WISCONSIN HOSPITALS AND CLINICS ...,UEYKTPWSV9Q9,621511,Q515,WI,2026-01-09T18:04:38.263Z,https://www.usaspending.gov/award/36C25226N0090,,,,,,,,,,,,,,,,,,,,left_only
1546,291044254,693C7326F00001N,2025-10-10,,Unknown,0,5734999.50,5734999.50,DELIVERY ORDER,AR ERFO FS OZSTF810 2022-1(1): THE PROJECT CON...,2025-10-10,,Department of Transportation,Federal Highway Administration,Department of Transportation,"ECLIPSE COMPANIES, LLC",RMCMTCKHMFH9,237310,Y1LB,AR,2026-01-09T18:04:38.263Z,https://www.usaspending.gov/award/693C7326F00001N,Unknown,5734999.50,,2025-10-10,2027-05-10,2025-10-10,2025-10-10,Department of Transportation,Federal Highway Administration,Department of Transportation,"ECLIPSE COMPANIES, LLC",RMCMTCKHMFH9,[],AR ERFO FS OZSTF810 2022-1(1): THE PROJECT CON...,237310,,AR,2026-01-10T14:30:16.567Z,https://www.usaspending.gov/award/291044254,both
1547,280159963,36C24826N0088,2025-10-10,,Unknown,0,3796534.85,3796534.85,DELIVERY ORDER,LENALIDOMIDE,2025-10-10,,Department of Veterans Affairs,Department of Veterans Affairs,Department of Veterans Affairs,EXELAN PHARMACEUTICALS INC,FYZXNHSQTMG8,325412,6505,FL,2026-01-09T18:04:38.263Z,https://www.usaspending.gov/award/36C24826N0088,Unknown,3796534.85,,2025-10-10,2026-09-30,2025-10-10,2025-10-10,Department of Veterans Affairs,Department of Veterans Affairs,Department of Veterans Affairs,EXELAN PHARMACEUTICALS INC,FYZXNHSQTMG8,[],LENALIDOMIDE,325412,,FL,2026-01-10T14:30:16.569Z,https://www.usaspending.gov/award/280159963,both
1548,280309099,36C25726N0072,2025-10-10,,Unknown,0,2787350.00,2787350.00,DELIVERY ORDER,STX SPECIALIZED REFERENCE LABORATORY TESTING,2026-01-01,,Department of Veterans Affairs,Department of Veterans Affairs,Department of Veterans Affairs,"NEOGENOMICS LABORATORIES, INC.",ML9LFC5CK7K9,621511,Q301,FL,2026-01-09T18:04:38.263Z,https://www.usaspending.gov/award/36C25726N0072,Unknown,2787350.00,,2026-01-01,2026-12-31,2025-10-10,2025-10-10,Department of Veterans Affairs,Department of Veterans Affairs,Department of Veterans Affairs,"NEOGENOMICS LABORATORIES, INC.",ML9LFC5CK7K9,[],STX SPECIALIZED REFERENCE LABORATORY TESTING,621511,,FL,2026-01-10T14:30:16.573Z,https://www.usaspending.gov/award/280309099,both


In [18]:
sum(df_aw.base_obligation_date == df_aw.last_modified_date), len(df_aw)

(471, 10000)

In [19]:
sum(joined_df.base_obligation_date == joined_df.last_modified_date)

472

In [23]:
df_aw.last_modified_date.nunique(), df_aw.last_modified_date.min(), df_aw.last_modified_date.max()

(80, '2025-10-09', '2026-01-08')

In [10]:
df_aw.head(2)

Unnamed: 0,award_id,award_type,award_amount,award_date,start_date,end_date,last_modified_date,base_obligation_date,awarding_agency,awarding_sub_agency,funding_agency,recipient_name,recipient_uei,recipient_business_categories,award_description,naics_code,psc_code,place_of_performance_state,ingested_at,source_url
0,DEAC0500OR22725,Unknown,40399143444.29,,1999-10-15,2030-03-31,2025-12-23,1999-10-15,Department of Energy,Department of Energy,Department of Energy,UT-BATTELLE LLC,ZLHJJ57QA2H8,[],MANAGEMENT AND OPERATION OF THE OAK RIDGE NATI...,561210,,TN,2026-01-10T14:30:16.542Z,https://www.usaspending.gov/award/295476106
1,DENA0003525,Unknown,39308850163.32,,2017-01-18,2027-04-30,2025-12-30,2016-12-16,Department of Energy,Department of Energy,Department of Defense,NATIONAL TECHNOLOGY & ENGINEERING SOLUTIONS OF...,LUJEPCRRT377,[],"IGF::CL,CT::IGF CONTRACT AWARD DE-NA0003525 TO...",561210,,NM,2026-01-10T14:30:16.543Z,https://www.usaspending.gov/award/295527116


In [11]:
df_tx.head(2)

Unnamed: 0,transaction_id,award_id,action_date,action_type,action_type_description,modification_number,federal_action_obligation,total_dollars_obligated,award_type,award_description,period_of_performance_start_date,period_of_performance_current_end_date,awarding_agency_name,awarding_sub_agency_name,funding_agency_name,recipient_name,recipient_uei,naics_code,product_or_service_code,place_of_performance_state,ingested_at,source_url
0,277837703,140P2025C0024,2026-01-07,L,L,P00006,18758.44,18758.44,DEFINITIVE CONTRACT,GOGA 253655 REHABILITATE DETERIORATED CHINA BE...,2025-02-07,,Department of the Interior,National Park Service,Department of the Interior,ARANA GROUP INC,VUGLHDG2RCH9,236220,Z2JZ,CA,2026-01-09T18:04:38.178Z,https://www.usaspending.gov/award/140P2025C0024
1,291029792,6913G625C100006,2026-01-07,M,M,P00001,0.0,0.0,DEFINITIVE CONTRACT,"FMCSA PHASE IIB AWARD WITH PULSAR INFORMATICS,...",2025-04-21,,Department of Transportation,Immediate Office of the Secretary of Transport...,Department of Transportation,"PULSAR INFORMATICS, INC.",G6MEDJJBE8F7,541715,AS12,WA,2026-01-09T18:04:38.179Z,https://www.usaspending.gov/award/6913G625C100006
