In [109]:
# reconcile downloaded vs scraped
# a: downloaded, completely unfiltered
# b: scraped, partially filtered
# after applying more filters...
# count: df_a: 710 | df_b: 735
# amount: a: 6,322,104,713 | b: 6,508,169,561
# period: ~55 days

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
import glob

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['font.size'] = 10


In [None]:
# === Fetch Summary ===
# Total Transactions: 21067
# Unique Awards: 14649
# Total Obligation: $56,339,936,114.12
# Date Range: 2025-10-10 to 2026-01-08

# https://www.usaspending.gov/download_center/award_data_archive

In [4]:
data_fn_a = "../data/dl/FY2026_All_Contracts_Full_20251207_1.csv"
data_fn_b = "../data/round3/transactions_normalized_2026-01-09_13-04-38.json"

df_a = pd.read_csv(data_fn_a)

with open(data_fn_b, 'r') as f:
    transactions_data = json.load(f)
df_b = pd.DataFrame(transactions_data)

  df_a = pd.read_csv(data_fn_a)


In [5]:
df_a.shape, df_b.shape

((192181, 297), (21067, 22))

In [10]:
# both are only contracts

In [8]:
df_a.award_type.value_counts()

award_type
DELIVERY ORDER         72464
BPA CALL               50759
PURCHASE ORDER         33715
DEFINITIVE CONTRACT     6017
Name: count, dtype: int64

In [9]:
df_b.award_type.value_counts()

award_type
DELIVERY ORDER         12708
DEFINITIVE CONTRACT     5200
BPA CALL                2300
PURCHASE ORDER           859
Name: count, dtype: int64

In [24]:
# Utilities

In [11]:
def psize():
    print(f"df_a: {len(df_a):,} | df_b: {len(df_b):,}")

In [12]:
psize()

df_a: 192,181 | df_b: 21,067


In [73]:
# reset
df_a = pd.read_csv(data_fn_a)

with open(data_fn_b, 'r') as f:
    transactions_data = json.load(f)
df_b = pd.DataFrame(transactions_data)

  df_a = pd.read_csv(data_fn_a)


In [74]:
# Date Filtering

In [75]:
df_a.action_date.min(), df_b.action_date.min()

('2025-10-01', '2025-10-10')

In [76]:
df_a.action_date.max(), df_b.action_date.max()

('2025-12-04', '2026-01-07')

In [77]:
# so let's go 2025-10-11 -> 2025-12-03
start_date, end_date = '2025-10-11', '2025-12-03'

In [78]:
mask_a = (df_a.action_date >= start_date) & (df_a.action_date <= end_date)
mask_b = (df_b.action_date >= start_date) & (df_b.action_date <= end_date)

In [79]:
psize()
df_a = df_a[mask_a]
df_b = df_b[mask_b]
psize()

df_a: 192,181 | df_b: 21,067
df_a: 147,917 | df_b: 11,199


In [80]:
# Amount Filtering

In [81]:
thresh_amount = 900_000
mask_a = df_a.federal_action_obligation >= thresh_amount
mask_b = df_b.federal_action_obligation >= thresh_amount

psize()
df_a = df_a[mask_a]
df_b = df_b[mask_b]
psize()

df_a: 147,917 | df_b: 11,199
df_a: 1,978 | df_b: 1,945


In [107]:
# Comparison of id's, no shared id across datasets

In [83]:
txid_a = set(df_a.contract_transaction_unique_key)
len(txid_a)

1978

In [84]:
txid_b = set(df_b.transaction_id)
len(txid_b)

1775

In [85]:
# they are not the same form
txid_a.pop(), txid_b.pop()

('3600_3600_36C10F25N0022_P00004_36C10F23D0005_0', 279969798)

In [86]:
# new transactions

In [87]:
# action_type_description = "NEW" <-> action_type = "A", df_a doesnt have descr field

mask_a = ((df_a['modification_number'] == '0') | (df_a['action_type'] == 'A'))
mask_b = ((df_b['modification_number'] == '0') | (df_b['action_type_description'] == 'NEW'))

In [89]:
psize()
df_a = df_a[mask_a]
df_b = df_b[mask_b]
psize()

df_a: 1,978 | df_b: 1,945
df_a: 710 | df_b: 735


In [103]:
# comparison

In [105]:
total_a = df_a.federal_action_obligation.sum()
total_b = df_b.federal_action_obligation.sum()
print(f"a: {int(total_a):,} | b: {int(total_b):,}")

a: 6,322,104,713 | b: 6,508,169,561


In [106]:
# more comparisons

In [99]:
cols = ['action_date', 'federal_action_obligation']

In [101]:
df_b[cols].sort_values(by='federal_action_obligation', ascending=True)

Unnamed: 0,action_date,federal_action_obligation
11904,2025-11-25,900000.00
11605,2025-11-26,900000.00
13451,2025-11-20,900000.00
13125,2025-11-20,900628.48
11214,2025-11-26,903277.44
...,...,...
17108,2025-10-31,300000000.00
13000,2025-11-21,372832130.00
14840,2025-11-14,598444871.04
16578,2025-11-04,711388881.21


In [102]:
df_a[cols].sort_values(by='federal_action_obligation', ascending=True)

Unnamed: 0,action_date,federal_action_obligation
38637,2025-11-25,900000.00
33427,2025-11-26,900000.00
65296,2025-11-20,900000.00
66420,2025-11-20,900628.48
31094,2025-11-26,903277.44
...,...,...
112023,2025-10-31,300000000.00
60693,2025-11-21,372832130.00
86948,2025-11-14,598444871.04
104737,2025-11-04,711388881.21


In [110]:
# load new dataset

In [111]:
data_fn_c = "../data/round3/transactions_normalized_2026-01-09_16-58-06.json"

with open(data_fn_c, 'r') as f:
    transactions_data = json.load(f)
df_c = pd.DataFrame(transactions_data)

In [112]:
df_c.head()

Unnamed: 0,transaction_id,award_id,generated_internal_id,action_date,action_type,action_type_description,modification_number,federal_action_obligation,total_dollars_obligated,award_type,award_description,period_of_performance_start_date,period_of_performance_current_end_date,awarding_agency_name,awarding_sub_agency_name,funding_agency_name,recipient_name,recipient_uei,naics_code,product_or_service_code,place_of_performance_state,ingested_at,source_url
0,277837703,140P2025C0024,CONT_AWD_140P2025C0024_1443_-NONE-_-NONE-,2026-01-07,L,L,P00006,18758.44,18758.44,DEFINITIVE CONTRACT,GOGA 253655 REHABILITATE DETERIORATED CHINA BE...,2025-02-07,,Department of the Interior,National Park Service,Department of the Interior,ARANA GROUP INC,VUGLHDG2RCH9,236220,Z2JZ,CA,2026-01-09T21:58:06.923Z,https://www.usaspending.gov/award/140P2025C0024
1,291029792,6913G625C100006,CONT_AWD_6913G625C100006_6901_-NONE-_-NONE-,2026-01-07,M,M,P00001,0.0,0.0,DEFINITIVE CONTRACT,"FMCSA PHASE IIB AWARD WITH PULSAR INFORMATICS,...",2025-04-21,,Department of Transportation,Immediate Office of the Secretary of Transport...,Department of Transportation,"PULSAR INFORMATICS, INC.",G6MEDJJBE8F7,541715,AS12,WA,2026-01-09T21:58:06.924Z,https://www.usaspending.gov/award/6913G625C100006
2,277758078,140D0425F0414,CONT_AWD_140D0425F0414_1406_140D8121D0001_1406,2026-01-07,K,K,P00003,0.0,0.0,DELIVERY ORDER,FIXED WING SMOKEJUMPER FLIGHT SERVICE IN SUPPO...,2025-04-02,,Department of the Interior,Departmental Offices,Department of the Interior,BIGHORN AIRWAYS INC,C9G9FQRLDEV1,481211,V221,IN,2026-01-09T21:58:06.924Z,https://www.usaspending.gov/award/140D0425F0414
3,291259871,70US0923C70093600,CONT_AWD_70US0923C70093600_7009_-NONE-_-NONE-,2026-01-07,M,M,P00005,0.0,0.0,DEFINITIVE CONTRACT,INCORPORATE DD254,2023-09-25,,Department of Homeland Security,U.S. Secret Service,Department of Homeland Security,ACTIVE SECURITY CONSULTING LLC,S4HGJBWRBAR5,541513,R425,DC,2026-01-09T21:58:06.924Z,https://www.usaspending.gov/award/70US0923C700...
4,278279333,15JA0524F00000031,CONT_AWD_15JA0524F00000031_1501_DJF171200S0000...,2026-01-07,M,M,P00014,-208.84,-208.84,DELIVERY ORDER,USABUDGET,2023-12-29,,Department of Justice,"Offices, Boards and Divisions",Department of Justice,CBEYONDATA LLC,CLBUNVMCNN98,541512,R499,VA,2026-01-09T21:58:06.924Z,https://www.usaspending.gov/award/15JA0524F000...


In [113]:
df_c.generated_internal_id

0              CONT_AWD_140P2025C0024_1443_-NONE-_-NONE-
1            CONT_AWD_6913G625C100006_6901_-NONE-_-NONE-
2         CONT_AWD_140D0425F0414_1406_140D8121D0001_1406
3          CONT_AWD_70US0923C70093600_7009_-NONE-_-NONE-
4      CONT_AWD_15JA0524F00000031_1501_DJF171200S0000...
                             ...                        
995    CONT_AWD_6933A225F00272N_6938_693JF725D000020_...
996            CONT_AWD_36C24822P0488_3600_-NONE-_-NONE-
997            CONT_AWD_36C25720C0109_3600_-NONE-_-NONE-
998       CONT_AWD_47PJ0025F0055_4740_47PJ0020D0015_4740
999     CONT_AWD_75N95025F00037_7529_75N95021D00012_7529
Name: generated_internal_id, Length: 1000, dtype: object

In [115]:
df_a.contract_transaction_unique_key

2602         1540_-NONE-_15BBNF26C00000018_0_-NONE-_0
2606            6920_-NONE-_6973GH26C00007_0_-NONE-_0
2974      6920_6920_697DCK26F00028_0_697DCK22D00001_0
3222             3600_-NONE-_36C25926P0023_0_-NONE-_0
3365        3600_3600_36C10G26N0009_0_36C10G25D0030_0
                             ...                     
148586      3600_3600_36C25026N1001_0_36C25025D5411_0
149002      3600_3600_36C26126N0163_0_36C26120D0089_0
149299    7527_7527_75H70526F06008_0_75H70524D00003_0
149345      3600_3600_36C26226N0132_0_36C26221A0017_0
149421      3600_3600_36C24726N0034_0_36C24723D0021_0
Name: contract_transaction_unique_key, Length: 710, dtype: object

In [124]:
df_a.contract_award_unique_key

2602         CONT_AWD_15BBNF26C00000018_1540_-NONE-_-NONE-
2606            CONT_AWD_6973GH26C00007_6920_-NONE-_-NONE-
2974      CONT_AWD_697DCK26F00028_6920_697DCK22D00001_6920
3222             CONT_AWD_36C25926P0023_3600_-NONE-_-NONE-
3365        CONT_AWD_36C10G26N0009_3600_36C10G25D0030_3600
                                ...                       
148586      CONT_AWD_36C25026N1001_3600_36C25025D5411_3600
149002      CONT_AWD_36C26126N0163_3600_36C26120D0089_3600
149299    CONT_AWD_75H70526F06008_7527_75H70524D00003_7527
149345      CONT_AWD_36C26226N0132_3600_36C26221A0017_3600
149421      CONT_AWD_36C24726N0034_3600_36C24723D0021_3600
Name: contract_award_unique_key, Length: 710, dtype: object

In [127]:
df_b.columns

Index(['transaction_id', 'award_id', 'action_date', 'action_type',
       'action_type_description', 'modification_number',
       'federal_action_obligation', 'total_dollars_obligated', 'award_type',
       'award_description', 'period_of_performance_start_date',
       'period_of_performance_current_end_date', 'awarding_agency_name',
       'awarding_sub_agency_name', 'funding_agency_name', 'recipient_name',
       'recipient_uei', 'naics_code', 'product_or_service_code',
       'place_of_performance_state', 'ingested_at', 'source_url'],
      dtype='object')

In [126]:
len(set(df_a))

297

In [123]:
df_a.iloc[0]

contract_transaction_unique_key                 1540_-NONE-_15BBNF26C00000018_0_-NONE-_0
contract_award_unique_key                  CONT_AWD_15BBNF26C00000018_1540_-NONE-_-NONE-
award_id_piid                                                          15BBNF26C00000018
modification_number                                                                    0
transaction_number                                                                  0.00
                                                             ...                        
highly_compensated_officer_5_name                                                    NaN
highly_compensated_officer_5_amount                                                  NaN
usaspending_permalink                  https://www.usaspending.gov/award/CONT_AWD_15B...
initial_report_date                                                           2025-11-20
last_modified_date                                                            2025-12-03
Name: 2602, Length: 2

In [122]:
sum(df_a.contract_transaction_unique_key.map(lambda x:   "6913G625C100006" in x))

0