# ETL for Complete Data

In [9]:
import json
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Load Data

In [3]:
# Load the complete fetch data: each JSON file is parsed and the result is
# handed to pd.DataFrame.
# The encoding is passed explicitly because JSON is UTF-8 by specification,
# while open() otherwise falls back to the platform's locale encoding
# (which can mis-decode non-ASCII text on some systems).
with open('../data/test1/complete_transactions_2026-01-11_17-08-28.json', 'r', encoding='utf-8') as f:
    transactions = json.load(f)
df_tx = pd.DataFrame(transactions)

with open('../data/test1/complete_awards_2026-01-11_17-08-28.json', 'r', encoding='utf-8') as f:
    awards = json.load(f)
df_aw = pd.DataFrame(awards)

## Slice + Join Data

In [4]:
# Awards row/column count before de-duplicating on award_id.
df_aw.shape

(891, 20)

In [5]:
# Keep a single record per award_id: order the awards newest-first by
# last_modified_date, then retain the first (most recently modified) row
# encountered for each award_id.
df_aw = (
    df_aw
    .sort_values(by='last_modified_date', ascending=False)
    .drop_duplicates(subset='award_id', keep='first')
)

In [6]:
# Awards row/column count after keeping one record per award_id.
df_aw.shape

(721, 20)

In [7]:
# Attach award-level attributes to every transaction row.
# - how='left' keeps all transaction rows, matched or not.
# - Columns present in both frames are disambiguated with _tx / _aw suffixes.
# - indicator=True adds a `_merge` column ('both' / 'left_only') so unmatched
#   transactions can be identified afterwards.
# - validate='many_to_one' makes pandas raise MergeError if award_id is not
#   unique in df_aw (e.g. the de-duplication cell was skipped or run out of
#   order) instead of silently multiplying transaction rows.
joined = df_tx.merge(
    df_aw,
    on='award_id',
    how='left',
    suffixes=('_tx', '_aw'),
    indicator=True,
    validate='many_to_one',
)

In [8]:
# Shape of the merged frame: the left join retains every df_tx row, and the
# columns include the `_merge` indicator added by the merge.
joined.shape

(735, 43)

## Augment Data

In [10]:
# Flag rows whose award-level (_aw) value genuinely differs from the
# transaction-level (_tx) value.
# A plain `!=` is not enough: NaN compares unequal to everything (including
# NaN), so transactions with no matching award would be flagged as
# "different" even though there is no award value to show. Requiring the _aw
# value to be present restricts the masks to real disagreements.
mask_diff_recipient = (
    joined['recipient_name_aw'].notna()
    & (joined['recipient_name_aw'] != joined['recipient_name_tx'])
)
mask_diff_description = (
    joined['award_description_aw'].notna()
    & (joined['award_description_aw'] != joined['award_description_tx'])
)

In [12]:
# Add "secondary" columns that carry the award-level (_aw) text only on rows
# where the corresponding mask is True; all other rows get an empty string.
joined['secondary_award_description'] = np.where(mask_diff_description, joined['award_description_aw'], "")
joined['secondary_recipient_name'] = np.where(mask_diff_recipient, joined['recipient_name_aw'], "")

In [15]:
# Build the usaspending.gov award-page link for each row from its
# generated_internal_id.
# astype(str) renders a missing id as the text "nan" (or "None"), which would
# produce a dead ".../award/nan" link, so rows without an id get "" instead.
# NOTE(review): `source_url_aw` is exported later in this notebook, which
# suggests the merge already produced a `source_url_tx` column that this
# assignment overwrites — confirm that is intended.
joined['source_url_tx'] = (
    "https://www.usaspending.gov/award/" + joined['generated_internal_id'].astype(str)
).where(joined['generated_internal_id'].notna(), "")

In [24]:
# Disabled debugging aid: widen the column display and eyeball the first ten
# generated links. Left commented out so the notebook produces no extra output.
# pd.set_option('display.max_colwidth', 1000)
# joined.source_url_tx[:10]

## Output Data

In [20]:
# Columns to export, in the order they should appear in the CSV.
output_cols = [
    'action_date',
    'awarding_agency_name',
    'awarding_sub_agency_name',
    'award_description_tx',
    # _aw value, only if diff from _tx
    'secondary_award_description',
    'recipient_name_tx',
    # _aw value, only if diff from _tx
    'secondary_recipient_name',
    'naics_code_aw',
    'source_url_aw',
    # for https://www.usaspending.gov/award/<generated_internal_id>
    'source_url_tx',
]

# Project the merged frame down to the export columns (all rows kept).
output_df = joined.loc[:, output_cols]

In [23]:
# Disabled preview of the export frame (kept commented out so the full table
# is not dumped into the notebook output).
# output_df

In [22]:
# Write the export without the DataFrame index.
# The target folder is created first so the cell does not fail with OSError
# when ../data/output_csvs does not exist yet (e.g. on a fresh checkout).
output_path = Path('../data/output_csvs/output-1.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
output_df.to_csv(output_path, index=False)