# Auto-Load Most Recent Dataset & Analyze Date Fields

This notebook:
1. Automatically finds and loads the most recent normalized dataset (awards or transactions)
2. Identifies all date-related columns
3. Analyzes date field completeness and patterns

## Setup

In [1]:
1+1

2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path
from datetime import datetime
import glob

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['font.size'] = 10

print("Libraries imported successfully!")

Libraries imported successfully!


## Auto-Detect and Load Most Recent Dataset

In [3]:
def find_most_recent_file(data_dir, pattern):
    """Find the most recent file matching pattern in data_dir"""
    files = glob.glob(str(Path(data_dir) / pattern))
    if not files:
        return None
    # Sort by modification time, most recent first
    files.sort(key=lambda x: Path(x).stat().st_mtime, reverse=True)
    return files[0]

# Look for most recent normalized datasets
data_dir = Path('../data/awards')

# awards_file = find_most_recent_file(data_dir, 'awards_normalized_*.json')
# transactions_file = None
transactions_file = find_most_recent_file(data_dir, 'transactions_normalized_*.json')
awards_file = None

print("=== Available Datasets ===")
if awards_file:
    print(f"Most recent awards: {Path(awards_file).name}")
if transactions_file:
    print(f"Most recent transactions: {Path(transactions_file).name}")

# Decide which to load (prefer transactions if available, as they have more detail)
if transactions_file:
    data_file = transactions_file
    data_type = 'transactions'
elif awards_file:
    data_file = awards_file
    data_type = 'awards'
else:
    raise FileNotFoundError("No normalized datasets found!")

print(f"\n=== Loading: {data_type.upper()} ===")
print(f"File: {Path(data_file).name}")

=== Available Datasets ===
Most recent transactions: transactions_normalized_2026-01-08_22-47-13.json

=== Loading: TRANSACTIONS ===
File: transactions_normalized_2026-01-08_22-47-13.json


In [4]:
# Load the dataset
with open(data_file, 'r') as f:
    data = json.load(f)

df = pd.DataFrame(data)

print(f"Loaded {len(df):,} {data_type}")
print(f"Shape: {df.shape}")
print(f"Columns: {df.shape[1]}")

Loaded 1,000 transactions
Shape: (1000, 22)
Columns: 22


## Identify Date Columns

In [5]:
def identify_date_columns(df):
    """
    Identify columns that likely contain dates based on:
    1. Column name contains 'date' or 'time'
    2. Column values match date patterns
    """
    date_columns = []
    
    for col in df.columns:
        # Check column name
        col_lower = col.lower()
        name_has_date = any(keyword in col_lower for keyword in ['date', 'time', 'timestamp', 'at'])
        
        # Check if values look like dates (sample first non-null value)
        sample_val = df[col].dropna().iloc[0] if len(df[col].dropna()) > 0 else None
        value_looks_like_date = False
        
        if sample_val:
            # Check if it matches common date patterns
            val_str = str(sample_val)
            # YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS patterns
            if len(val_str) >= 8 and ('-' in val_str or 'T' in val_str):
                value_looks_like_date = True
        
        if name_has_date or value_looks_like_date:
            date_columns.append(col)
    
    return date_columns

# Find date columns
date_cols = identify_date_columns(df)

print(f"=== DATE COLUMNS FOUND: {len(date_cols)} ===")
for i, col in enumerate(date_cols, 1):
    print(f"{i}. {col}")

=== DATE COLUMNS FOUND: 12 ===
1. action_date
2. action_type_description
3. modification_number
4. federal_action_obligation
5. total_dollars_obligated
6. award_description
7. period_of_performance_start_date
8. period_of_performance_current_end_date
9. recipient_name
10. recipient_uei
11. place_of_performance_state
12. ingested_at


In [6]:
[col for col in date_cols if 'date' in col.lower()]

['action_date',
 'period_of_performance_start_date',
 'period_of_performance_current_end_date']

## Dates Analysis

In [7]:
# Display preview of dataframe with only the date columns
# Filter to actual date columns (not misidentified ones)
actual_date_cols = [col for col in date_cols if 'date' in col.lower() or col == 'ingested_at']

print("=== PREVIEW: Date Columns ===")
print(f"Showing {len(actual_date_cols)} date columns\n")

# Show just the date columns
df[actual_date_cols].head(10)

=== PREVIEW: Date Columns ===
Showing 4 date columns



Unnamed: 0,action_date,period_of_performance_start_date,period_of_performance_current_end_date,ingested_at
0,2026-01-06,2022-07-01,,2026-01-09T03:47:13.583Z
1,2026-01-06,2025-09-03,,2026-01-09T03:47:13.583Z
2,2026-01-06,2025-10-01,,2026-01-09T03:47:13.583Z
3,2026-01-06,2025-01-01,,2026-01-09T03:47:13.583Z
4,2026-01-06,2021-09-24,,2026-01-09T03:47:13.583Z
5,2026-01-06,2025-02-18,,2026-01-09T03:47:13.583Z
6,2026-01-06,2026-01-12,,2026-01-09T03:47:13.583Z
7,2026-01-06,2022-06-01,,2026-01-09T03:47:13.583Z
8,2026-01-06,2023-09-30,,2026-01-09T03:47:13.583Z
9,2026-01-06,2021-02-01,,2026-01-09T03:47:13.583Z


In [8]:
# Display statistics for date columns: min, max, unique count
# Filter to actual date columns
actual_date_cols = [col for col in date_cols if 'date' in col.lower() or col == 'ingested_at']

print("=== DATE COLUMN STATISTICS ===")
print()

date_stats = []

for col in actual_date_cols:
    # Try to parse as dates
    date_series = pd.to_datetime(df[col].replace('', pd.NA), errors='coerce')
    valid_dates = date_series.dropna()
    
    if len(valid_dates) > 0:
        min_date = valid_dates.min()
        max_date = valid_dates.max()
        unique_count = valid_dates.nunique()
        
        # Calculate average (mean date)
        avg_timestamp = valid_dates.astype('int64').mean()
        avg_date = pd.to_datetime(avg_timestamp)
        
        date_stats.append({
            'Column': col,
            'Min': min_date.strftime('%Y-%m-%d'),
            'Max': max_date.strftime('%Y-%m-%d'),
            'Avg': avg_date.strftime('%Y-%m-%d'),
            'Unique Values': unique_count,
            'Valid Records': len(valid_dates)
        })
    else:
        date_stats.append({
            'Column': col,
            'Min': 'N/A',
            'Max': 'N/A',
            'Avg': 'N/A',
            'Unique Values': 0,
            'Valid Records': 0
        })

# Create and display summary dataframe
stats_df = pd.DataFrame(date_stats)
print(stats_df.to_string(index=False))

=== DATE COLUMN STATISTICS ===

                                Column        Min        Max        Avg  Unique Values  Valid Records
                           action_date 2025-12-31 2026-01-06 2026-01-03              7           1000
      period_of_performance_start_date 2007-02-24 2026-01-30 2023-06-18            525           1000
period_of_performance_current_end_date        N/A        N/A        N/A              0              0
                           ingested_at 2026-01-09 2026-01-09 2026-01-09              3           1000


## General

In [9]:
df.head()

Unnamed: 0,transaction_id,award_id,action_date,action_type,action_type_description,modification_number,federal_action_obligation,total_dollars_obligated,award_type,award_description,period_of_performance_start_date,period_of_performance_current_end_date,awarding_agency_name,awarding_sub_agency_name,funding_agency_name,recipient_name,recipient_uei,naics_code,product_or_service_code,place_of_performance_state,ingested_at,source_url
0,280404420,36C26222N0660,2026-01-06,B,CONTINUATION,P00007,-216974.33,-216974.33,DELIVERY ORDER,36C26222N0660 - OB# 600C20345 OB# 600C30063 OB...,2022-07-01,,Department of Veterans Affairs,Department of Veterans Affairs,Department of Veterans Affairs,GLORY TO THE LORD INVESTMENTS INC,DY6NBGBTCLG3,561720,S201,CA,2026-01-09T03:47:13.583Z,https://www.usaspending.gov/award/36C26222N0660
1,277757699,140D0425C0081,2026-01-06,M,M,P00002,0.0,0.0,DEFINITIVE CONTRACT,THIS CONTRACT IS TO PROVIDE PROJECT DELIVERY S...,2025-09-03,,Department of the Interior,Departmental Offices,Department of Veterans Affairs,ONES PLEXUS JOINT VENTURE LLC,UHNZUQL3ACW6,541330,R425,OK,2026-01-09T03:47:13.583Z,https://www.usaspending.gov/award/140D0425C0081
2,291848799,80NSSC25C0504,2026-01-06,M,M,P00001,0.0,0.0,DEFINITIVE CONTRACT,SBIR PHASE III XOGDOR VEHICLE ENHANCEMENTS,2025-10-01,,National Aeronautics and Space Administration,National Aeronautics and Space Administration,National Aeronautics and Space Administration,ASTROBOTIC TECHNOLOGY INC,JAQ3W2MGVNV1,541715,AJ14,PA,2026-01-09T03:47:13.583Z,https://www.usaspending.gov/award/80NSSC25C0504
3,278171310,15B61525F00000051,2026-01-06,C,REVISION,P00008,5806.0,5806.0,DELIVERY ORDER,B2 / FY25 / JANUARY / J. ALLEN / OUTSIDE MEDIC...,2025-01-01,,Department of Justice,Federal Prison System / Bureau of Prisons,Department of Justice,"J ALLEN AND ASSOCIATES OF TEXAS, LLC",CKPHNT439N69,622110,Q999,AZ,2026-01-09T03:47:13.583Z,https://www.usaspending.gov/award/15B61525F000...
4,281083758,47QFLA21F0245,2026-01-06,C,REVISION,P00090,5826923.08,5826923.08,DELIVERY ORDER,SMC ZA DIGITAL ENGINEERING ENVIRONMENT DEE TO...,2021-09-24,,General Services Administration,Federal Acquisition Service,Department of Defense,"SABEL SYSTEMS TECHNOLOGY SOLUTIONS, LLC",F65SGFW2TUC6,541511,DA10,CA,2026-01-09T03:47:13.583Z,https://www.usaspending.gov/award/47QFLA21F0245


## Analyze Each Date Column

In [10]:
# Create detailed analysis of each date column
date_analysis = []

for col in date_cols:
    non_null = df[col].notna().sum()
    null_count = df[col].isna().sum()
    unique_count = df[col].nunique()
    
    # Get sample values (first 3 non-null)
    samples = df[col].dropna().head(3).tolist()
    
    # Try to parse as date to check format
    date_format = "Unknown"
    if len(samples) > 0:
        sample = str(samples[0])
        if 'T' in sample:
            date_format = "ISO 8601 (with time)"
        elif len(sample) == 10 and sample.count('-') == 2:
            date_format = "YYYY-MM-DD"
        elif len(sample) == 0:
            date_format = "Empty string"
    
    date_analysis.append({
        'column': col,
        'non_null_count': non_null,
        'null_count': null_count,
        'pct_filled': f"{(non_null/len(df)*100):.1f}%",
        'unique_values': unique_count,
        'format': date_format,
        'sample_1': samples[0] if len(samples) > 0 else None,
        'sample_2': samples[1] if len(samples) > 1 else None,
        'sample_3': samples[2] if len(samples) > 2 else None,
    })

# Create summary dataframe
date_summary_df = pd.DataFrame(date_analysis)

print("=== DATE COLUMN SUMMARY ===")
print(date_summary_df.to_string(index=False))

=== DATE COLUMN SUMMARY ===
                                column  non_null_count  null_count pct_filled  unique_values               format                                                                                                                                          sample_1                                                                                      sample_2                                   sample_3
                           action_date            1000           0     100.0%              7           YYYY-MM-DD                                                                                                                                        2026-01-06                                                                                    2026-01-06                                 2026-01-06
               action_type_description            1000           0     100.0%             11 ISO 8601 (with time)                                                                 

## Detailed Analysis of Each Date Field

In [11]:
# Detailed look at each date column
for col in date_cols:
    print(f"\n{'='*80}")
    print(f"Column: {col}")
    print(f"{'='*80}")
    
    print(f"\nData Type: {df[col].dtype}")
    print(f"Total Records: {len(df):,}")
    print(f"Non-null: {df[col].notna().sum():,} ({df[col].notna().sum()/len(df)*100:.1f}%)")
    print(f"Null: {df[col].isna().sum():,} ({df[col].isna().sum()/len(df)*100:.1f}%)")
    print(f"Unique values: {df[col].nunique():,}")
    
    # Show value counts for top 5
    print(f"\nTop 5 values:")
    print(df[col].value_counts().head(5))
    
    # Show some examples
    print(f"\nSample values:")
    samples = df[col].dropna().head(10).tolist()
    for i, val in enumerate(samples[:5], 1):
        print(f"  {i}. {val}")


Column: action_date

Data Type: object
Total Records: 1,000
Non-null: 1,000 (100.0%)
Null: 0 (0.0%)
Unique values: 7

Top 5 values:
action_date
2026-01-06    316
2025-12-31    292
2026-01-05    274
2026-01-02     92
2026-01-04     16
Name: count, dtype: int64

Sample values:
  1. 2026-01-06
  2. 2026-01-06
  3. 2026-01-06
  4. 2026-01-06
  5. 2026-01-06

Column: action_type_description

Data Type: object
Total Records: 1,000
Non-null: 1,000 (100.0%)
Null: 0 (0.0%)
Unique values: 11

Top 5 values:
action_type_description
REVISION        264
CONTINUATION    213
M               182
G               150
Unknown          87
Name: count, dtype: int64

Sample values:
  1. CONTINUATION
  2. M
  3. M
  4. REVISION
  5. REVISION

Column: modification_number

Data Type: object
Total Records: 1,000
Non-null: 1,000 (100.0%)
Null: 0 (0.0%)
Unique values: 142

Top 5 values:
modification_number
0         87
P00001    82
P00002    67
P00003    55
P00006    54
Name: count, dtype: int64

Sample values:
 

## Parse Dates and Analyze Date Ranges

In [None]:
# Try to parse date columns and analyze ranges
print("=== DATE RANGE ANALYSIS ===")

for col in date_cols:
    # Try to convert to datetime
    try:
        # Handle empty strings as NaN
        date_series = df[col].replace('', pd.NA)
        date_series = pd.to_datetime(date_series, errors='coerce')
        
        # Get valid dates only
        valid_dates = date_series.dropna()
        
        if len(valid_dates) > 0:
            print(f"\n{col}:")
            print(f"  Valid dates: {len(valid_dates):,}")
            print(f"  Earliest: {valid_dates.min()}")
            print(f"  Latest: {valid_dates.max()}")
            print(f"  Span: {(valid_dates.max() - valid_dates.min()).days:,} days")
        else:
            print(f"\n{col}: No valid dates could be parsed")
    except Exception as e:
        print(f"\n{col}: Error parsing dates - {str(e)}")

## Visualize Date Distribution

In [None]:
# Pick the most useful date column for visualization
# Priority: action_date > award_date > start_date > any other date
viz_col = None
priority_cols = ['action_date', 'award_date', 'start_date']

for pcol in priority_cols:
    if pcol in date_cols:
        viz_col = pcol
        break

if not viz_col and date_cols:
    viz_col = date_cols[0]

if viz_col:
    print(f"Visualizing: {viz_col}")
    
    # Parse dates
    df['_parsed_date'] = pd.to_datetime(df[viz_col].replace('', pd.NA), errors='coerce')
    
    # Plot distribution
    fig, ax = plt.subplots(figsize=(14, 6))
    
    valid_dates = df['_parsed_date'].dropna()
    if len(valid_dates) > 0:
        ax.hist(valid_dates, bins=50, edgecolor='black', alpha=0.7, color='#3498db')
        ax.set_xlabel('Date')
        ax.set_ylabel('Count')
        ax.set_title(f'Distribution of {viz_col}', fontweight='bold', fontsize=14)
        ax.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
        
        # Clean up temp column
        df.drop('_parsed_date', axis=1, inplace=True)
    else:
        print("No valid dates to visualize")
else:
    print("No date columns available for visualization")

## Summary & Export

In [None]:
# Export date column analysis
output_file = Path('../data/awards/date_columns_analysis.csv')
date_summary_df.to_csv(output_file, index=False)

print(f"=== SUMMARY ===")
print(f"Dataset type: {data_type.upper()}")
print(f"Total records: {len(df):,}")
print(f"Total columns: {df.shape[1]}")
print(f"Date columns found: {len(date_cols)}")
print(f"\nDate columns: {', '.join(date_cols)}")
print(f"\nAnalysis exported to: {output_file}")

## Dataset Preview

In [None]:
# Show first few rows with date columns highlighted
if date_cols:
    print("Date columns + key identifiers:")
    
    # Build column list: identifiers + date columns
    id_cols = []
    if 'award_id' in df.columns:
        id_cols.append('award_id')
    if 'transaction_id' in df.columns:
        id_cols.append('transaction_id')
    if 'recipient_name' in df.columns:
        id_cols.append('recipient_name')
    
    display_cols = id_cols + date_cols
    print(df[display_cols].head(10))
else:
    print("Full dataset preview:")
    print(df.head())