In [109]:
# reconcile downloaded vs scraped
# a: downloaded, completely unfiltered
# b: scraped, partially filtered
# after applying more filters...
# count: df_a: 710 | df_b: 735
# amount: a: 6,322,104,713 | b: 6,508,169,561
# period: ~55 days

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
import glob

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['font.size'] = 10


In [None]:
# === Fetch Summary ===
# Total Transactions: 21067
# Unique Awards: 14649
# Total Obligation: $56,339,936,114.12
# Date Range: 2025-10-10 to 2026-01-08

# https://www.usaspending.gov/download_center/award_data_archive

In [4]:
data_fn_a = "../data/dl/FY2026_All_Contracts_Full_20251207_1.csv"
data_fn_b = "../data/round3/transactions_normalized_2026-01-09_13-04-38.json"

df_a = pd.read_csv(data_fn_a)

with open(data_fn_b, 'r') as f:
    transactions_data = json.load(f)
df_b = pd.DataFrame(transactions_data)

  df_a = pd.read_csv(data_fn_a)


In [5]:
df_a.shape, df_b.shape

((192181, 297), (21067, 22))

In [10]:
# both are only contracts

In [8]:
df_a.award_type.value_counts()

award_type
DELIVERY ORDER         72464
BPA CALL               50759
PURCHASE ORDER         33715
DEFINITIVE CONTRACT     6017
Name: count, dtype: int64

In [9]:
df_b.award_type.value_counts()

award_type
DELIVERY ORDER         12708
DEFINITIVE CONTRACT     5200
BPA CALL                2300
PURCHASE ORDER           859
Name: count, dtype: int64

In [24]:
# Utilities

In [11]:
def psize():
    print(f"df_a: {len(df_a):,} | df_b: {len(df_b):,}")

In [12]:
psize()

df_a: 192,181 | df_b: 21,067


In [73]:
# reset
df_a = pd.read_csv(data_fn_a)

with open(data_fn_b, 'r') as f:
    transactions_data = json.load(f)
df_b = pd.DataFrame(transactions_data)

  df_a = pd.read_csv(data_fn_a)


In [74]:
# Date Filtering

In [75]:
df_a.action_date.min(), df_b.action_date.min()

('2025-10-01', '2025-10-10')

In [76]:
df_a.action_date.max(), df_b.action_date.max()

('2025-12-04', '2026-01-07')

In [77]:
# so let's go 2025-10-11 -> 2025-12-03
start_date, end_date = '2025-10-11', '2025-12-03'

In [78]:
mask_a = (df_a.action_date >= start_date) & (df_a.action_date <= end_date)
mask_b = (df_b.action_date >= start_date) & (df_b.action_date <= end_date)

In [79]:
psize()
df_a = df_a[mask_a]
df_b = df_b[mask_b]
psize()

df_a: 192,181 | df_b: 21,067
df_a: 147,917 | df_b: 11,199


In [80]:
# Amount Filtering

In [81]:
thresh_amount = 900_000
mask_a = df_a.federal_action_obligation >= thresh_amount
mask_b = df_b.federal_action_obligation >= thresh_amount

psize()
df_a = df_a[mask_a]
df_b = df_b[mask_b]
psize()

df_a: 147,917 | df_b: 11,199
df_a: 1,978 | df_b: 1,945


In [107]:
# Comparison of id's, no shared id across datasets

In [83]:
txid_a = set(df_a.contract_transaction_unique_key)
len(txid_a)

1978

In [84]:
txid_b = set(df_b.transaction_id)
len(txid_b)

1775

In [85]:
# they are not the same form
txid_a.pop(), txid_b.pop()

('3600_3600_36C10F25N0022_P00004_36C10F23D0005_0', 279969798)

In [86]:
# new transactions

In [87]:
# action_type_description = "NEW" <-> action_type = "A", df_a doesnt have descr field

mask_a = ((df_a['modification_number'] == '0') | (df_a['action_type'] == 'A'))
mask_b = ((df_b['modification_number'] == '0') | (df_b['action_type_description'] == 'NEW'))

In [89]:
psize()
df_a = df_a[mask_a]
df_b = df_b[mask_b]
psize()

df_a: 1,978 | df_b: 1,945
df_a: 710 | df_b: 735


In [103]:
# comparison

In [105]:
total_a = df_a.federal_action_obligation.sum()
total_b = df_b.federal_action_obligation.sum()
print(f"a: {int(total_a):,} | b: {int(total_b):,}")

a: 6,322,104,713 | b: 6,508,169,561


In [106]:
# more comparisons

In [99]:
cols = ['action_date', 'federal_action_obligation']

In [101]:
df_b[cols].sort_values(by='federal_action_obligation', ascending=True)

Unnamed: 0,action_date,federal_action_obligation
11904,2025-11-25,900000.00
11605,2025-11-26,900000.00
13451,2025-11-20,900000.00
13125,2025-11-20,900628.48
11214,2025-11-26,903277.44
...,...,...
17108,2025-10-31,300000000.00
13000,2025-11-21,372832130.00
14840,2025-11-14,598444871.04
16578,2025-11-04,711388881.21


In [102]:
df_a[cols].sort_values(by='federal_action_obligation', ascending=True)

Unnamed: 0,action_date,federal_action_obligation
38637,2025-11-25,900000.00
33427,2025-11-26,900000.00
65296,2025-11-20,900000.00
66420,2025-11-20,900628.48
31094,2025-11-26,903277.44
...,...,...
112023,2025-10-31,300000000.00
60693,2025-11-21,372832130.00
86948,2025-11-14,598444871.04
104737,2025-11-04,711388881.21
