In [None]:
# No DynamoDB table used in this part.

# 1. Convert all XLSX files in the salary directory to CSV
import os
import csv
from openpyxl import load_workbook
from concurrent.futures import ThreadPoolExecutor
import threading

# Stage 1 locations: convert_file() reads each XLSX workbook from input_dir
# and writes the converted CSV into output_dir.
input_dir = "Datasets/salary"
output_dir = "Datasets/csv_output"
# Create the output directory up front; exist_ok=True makes re-runs harmless.
os.makedirs(output_dir, exist_ok=True)
# Shared by every worker thread: whoever holds it is the only one printing.
print_lock = threading.Lock()

def log(msg):
    """Print ``msg`` while holding ``print_lock``.

    Taking the lock around ``print`` keeps messages emitted through this
    helper by concurrently running threads from interleaving.
    """
    print_lock.acquire()
    try:
        print(msg)
    finally:
        # Always release, even if print() itself raises.
        print_lock.release()

def convert_file(filename):
    """Convert one XLSX workbook from ``input_dir`` into a CSV in ``output_dir``.

    Only the workbook's active sheet is exported; every row is written as-is
    (cell values only). The CSV keeps the workbook's base name with a ``.csv``
    extension. Names that do not end in ``.xlsx`` are ignored.

    Any exception raised while reading or writing is logged through ``log``
    and swallowed, so one bad workbook does not stop the other conversions.

    Args:
        filename: File name (not a path) of the workbook inside ``input_dir``.

    Returns:
        None.
    """
    if not filename.endswith(".xlsx"):
        return
    input_path = os.path.join(input_dir, filename)
    output_filename = os.path.splitext(filename)[0] + ".csv"
    output_path = os.path.join(output_dir, output_filename)
    log(f"🔄 START: {filename}")
    try:
        wb = load_workbook(filename=input_path, read_only=True)
        try:
            ws = wb.active
            with open(output_path, 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                # Read-only mode streams rows lazily, so this never holds the
                # whole sheet in memory.
                writer.writerows(ws.iter_rows(values_only=True))
        finally:
            # A read-only workbook keeps the source file open until it is
            # closed explicitly; without this every conversion leaks a handle.
            wb.close()
        log(f"✅ DONE:  {filename} → {output_filename}")
    except Exception as e:
        log(f"❌ ERROR: {filename}: {e}")

# Collect the workbook names first so the banner can report how many there are.
files = [entry for entry in os.listdir(input_dir) if entry.endswith(".xlsx")]

log(f"\n🚀 Processing {len(files)} files from '{input_dir}' to '{output_dir}'...\n")
# One task per workbook; leaving the ``with`` block waits for all of them.
# convert_file() logs its own failures, so the futures are not inspected.
with ThreadPoolExecutor() as executor:
    for xlsx_name in files:
        executor.submit(convert_file, xlsx_name)
log("\n🎯 All XLSX files processed.\n")

# 2. Convert the resulting CSV files to DynamoDB-ready format
import os
import csv

# Stage 2 locations: the CSV files produced by stage 1 are the input, and the
# DynamoDB-ready CSV files are written to output_dir.
# NOTE: this rebinds the same input_dir/output_dir globals that convert_file()
# reads at call time, so calling convert_file() after this point would use
# these stage-2 directories.
input_dir = "Datasets/csv_output"
output_dir = "Datasets/dynamodb_ready_by_year"
# Create the output directory up front; exist_ok=True makes re-runs harmless.
os.makedirs(output_dir, exist_ok=True)

# Header and column order of every DynamoDB-ready CSV written below.
fieldnames = [
    "occ_code", "salary_key",            # identifying columns
    "a_median",                          # annual median
    "m_median", "m_pct10", "m_pct90",    # monthly values (annual / 12)
]

def safe_div(val):
    """Return ``val / 12`` formatted with two decimals, or ``""`` if not numeric.

    Args:
        val: Anything ``float()`` accepts (number or numeric string).

    Returns:
        The twelfth of ``val`` as a string such as ``"2916.67"``; an empty
        string when ``float(val)`` raises ``ValueError`` or ``TypeError``
        (e.g. ``None``, ``""`` or a non-numeric marker).
    """
    try:
        monthly = float(val) / 12
    except (ValueError, TypeError):
        return ""
    return f"{monthly:.2f}"

# Turn every stage-1 CSV in input_dir into a DynamoDB-ready CSV in output_dir.
for filename in os.listdir(input_dir):
    if not filename.endswith(".csv"):
        continue

    # Extract year from filename: the first underscore-separated token made of
    # exactly four digits (e.g. "national_M_2016_dl" -> "2016"), else "unknown".
    basename = os.path.splitext(filename)[0]
    year = next(
        (part for part in basename.split('_') if part.isdigit() and len(part) == 4),
        "unknown",
    )

    input_path = os.path.join(input_dir, filename)
    output_path = os.path.join(output_dir, f"{basename}_dynamodb_ready.csv")

    print(f"🔄 Converting {filename} to DynamoDB format as {output_path}...")

    with open(input_path, mode='r', encoding='utf-8') as infile, \
         open(output_path, mode='w', newline='', encoding='utf-8') as outfile:

        reader = csv.DictReader(infile)
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        count = 0
        skipped = 0
        log_interval = 10000

        # Numbering starts at 2 because line 1 of the file is the header.
        for row_num, row in enumerate(reader, 2):
            try:
                # Column headers are accepted in lower or upper case.
                occ_code = row.get('occ_code') or row.get('OCC_CODE')
                # A row without an occupation code (e.g. an all-blank trailing
                # row from the XLSX export) would yield a record with an empty
                # key, which cannot be loaded into DynamoDB, so drop it.
                if not (occ_code and occ_code.strip()):
                    skipped += 1
                    continue

                a_median = row.get('a_median') or row.get('A_MEDIAN')
                a_pct10 = row.get('a_pct10') or row.get('A_PCT10')
                a_pct90 = row.get('a_pct90') or row.get('A_PCT90')
                # Convert to float if possible; otherwise, default to 0.
                # Consequence: an unparseable median is emitted as 0 / "0.00",
                # whereas unparseable percentiles become "" via safe_div().
                try:
                    a_median = float(a_median)
                except (TypeError, ValueError):
                    a_median = 0

                # Key fields
                area_title = row.get('area_title') or row.get('AREA_TITLE') or 'U.S.'
                naics_title = row.get('naics_title') or row.get('NAICS_TITLE') or 'Cross-Industry'
                salary_key = f"{year}#{area_title}#{naics_title}"

                m_median = safe_div(a_median)
                m_pct10 = safe_div(a_pct10)
                m_pct90 = safe_div(a_pct90)

                writer.writerow({
                    "occ_code": occ_code,
                    "salary_key": salary_key,
                    "a_median": a_median,
                    "m_median": m_median,
                    "m_pct10": m_pct10,
                    "m_pct90": m_pct90,
                })
                count += 1

                if count % log_interval == 0:
                    print(f"   ...Processed {count} rows so far in {filename}")

            except Exception as e:
                # Best effort: report the bad row and keep going.
                print(f"❌ Failed on row {row_num} in {filename}: {e}")

    print(f"✅ Wrote {count} rows to {output_path}")
    if skipped:
        print(f"⚠️ Skipped {skipped} rows without an occ_code in {filename}")

print("\n🎯 All files converted and saved to:", output_dir)

🔄 Converting national_M_2016_dl.csv to DynamoDB format as Datasets/dynamodb_ready_by_year\national_M_2016_dl_dynamodb_ready.csv...
✅ Wrote 1394 rows to Datasets/dynamodb_ready_by_year\national_M_2016_dl_dynamodb_ready.csv
🔄 Converting national_M_2017_dl.csv to DynamoDB format as Datasets/dynamodb_ready_by_year\national_M_2017_dl_dynamodb_ready.csv...
✅ Wrote 1382 rows to Datasets/dynamodb_ready_by_year\national_M_2017_dl_dynamodb_ready.csv
🔄 Converting national_M_2018_dl.csv to DynamoDB format as Datasets/dynamodb_ready_by_year\national_M_2018_dl_dynamodb_ready.csv...
✅ Wrote 1379 rows to Datasets/dynamodb_ready_by_year\national_M_2018_dl_dynamodb_ready.csv
🔄 Converting national_M_2019_dl.csv to DynamoDB format as Datasets/dynamodb_ready_by_year\national_M_2019_dl_dynamodb_ready.csv...
✅ Wrote 1329 rows to Datasets/dynamodb_ready_by_year\national_M_2019_dl_dynamodb_ready.csv
🔄 Converting national_M_2020_dl.csv to DynamoDB format as Datasets/dynamodb_ready_by_year\national_M_2020_dl_dyn