In [1]:
#API_ID = "yb45tcct7vg3le2oc3cxt3hdoy" #staging
API_ID = "alek4h7jlreffeoe5tocxgnx2u" #Trupti
CAREER_SALARY_TABLE      = f"careerSalary-{API_ID}-NONE"
CAREER_EDUCATION_TABLE   = f"careerEducation-{API_ID}-NONE"
CAREER_SKILLS_TABLE      = f"careerSkills-{API_ID}-NONE"
CAREER_DESCRIPTION_TABLE = f"careerDescription-{API_ID}-NONE"
SOC_CODES_TABLE          = f"socCodes-{API_ID}-NONE"
SOC_CODE_SINGLE_LINE_TABLE = f"socCodesCompressed-{API_ID}-NONE"

In [2]:
import os
import csv
import time
import threading
from decimal import Decimal
from datetime import datetime, timezone
import boto3
from queue import Queue

# Config
CLEANED_DIR = "Datasets/dynamodb_ready_by_year"
REGION = "us-west-2"
BATCH_SIZE = 1000
NUM_THREADS = 4
REJECTED_FILE = "rejected_dynamodb_rows.csv"

# AWS setup
dynamodb = boto3.resource("dynamodb", region_name=REGION)
table = dynamodb.Table(CAREER_SALARY_TABLE)

queue = Queue()
rejected_rows = []
lock = threading.Lock()

def to_decimal(val):
    try:
        return Decimal(str(round(float(val), 2))) if val not in [None, '', 'null'] else None
    except:
        return None

def current_timestamp():
    return datetime.now(timezone.utc).isoformat()

def process_batch(batch, batch_number):
    start = time.time()
    try:
        with table.batch_writer(overwrite_by_pkeys=['occCode']) as writer:
            for item in batch:
                try:
                    writer.put_item(Item=item)
                except Exception as e:
                    with lock:
                        rejected_rows.append({**item, 'error': str(e)})
        print(f"✅ Batch {batch_number} inserted in {time.time() - start:.2f}s ({len(batch)} records)")
    except Exception as e:
        with lock:
            for item in batch:
                rejected_rows.append({**item, 'error': str(e)})
        print(f"❌ Batch {batch_number} failed with error: {str(e)}")

def worker():
    batch_number = 1
    while True:
        batch = queue.get()
        if batch is None:
            break
        process_batch(batch, batch_number)
        batch_number += 1
        queue.task_done()

def read_and_queue_csv(filepath):
    with open(filepath, mode='r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        batch = []
        for row in reader:
            try:
                now = current_timestamp()
                item = {
                    'occCode': str(row['occ_code']),
                    'salaryKey': str(row['salary_key']),
                    'aMedian': to_decimal(row.get('a_median')),
                    'mMedian': to_decimal(row.get('m_median')),
                    'mPct10': to_decimal(row.get('m_pct10')),
                    'mPct90': to_decimal(row.get('m_pct90')),
                    'createdAt': now,
                    'updatedAt': now,
                }
                # Remove None fields
                item = {k: v for k, v in item.items() if v is not None}
                batch.append(item)
                if len(batch) == BATCH_SIZE:
                    queue.put(batch)
                    batch = []
            except Exception as e:
                with lock:
                    rejected_rows.append({**row, 'error': str(e)})

        if batch:
            queue.put(batch)

def save_rejected_rows():
    if rejected_rows:
        keys = rejected_rows[0].keys()
        with open(REJECTED_FILE, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(rejected_rows)
        print(f"⚠️ {len(rejected_rows)} records rejected. Saved to {REJECTED_FILE}")

# Main
print(f"🚀 Starting upload from all CSVs in: {CLEANED_DIR}")
start = time.time()

threads = []
for _ in range(NUM_THREADS):
    t = threading.Thread(target=worker)
    t.start()
    threads.append(t)

for filename in os.listdir(CLEANED_DIR):
    if filename.endswith("_dynamodb_ready.csv"):
        filepath = os.path.join(CLEANED_DIR, filename)
        print(f"📂 Queuing data from {filename}")
        read_and_queue_csv(filepath)

queue.join()

# Stop workers
for _ in range(NUM_THREADS):
    queue.put(None)
for t in threads:
    t.join()

save_rejected_rows()
print(f"🎉 All batches uploaded in {time.time() - start:.2f} seconds.")


🚀 Starting upload from all CSVs in: Datasets/dynamodb_ready_by_year
📂 Queuing data from national_M_2016_dl_dynamodb_ready.csv
📂 Queuing data from national_M_2017_dl_dynamodb_ready.csv
📂 Queuing data from national_M_2018_dl_dynamodb_ready.csv
📂 Queuing data from national_M_2019_dl_dynamodb_ready.csv
📂 Queuing data from national_M_2020_dl_dynamodb_ready.csv
📂 Queuing data from national_M_2021_dl_dynamodb_ready.csv
📂 Queuing data from national_M_2022_dl_dynamodb_ready.csv
📂 Queuing data from national_M_2023_dl_dynamodb_ready.csv
📂 Queuing data from national_M_2024_dl_dynamodb_ready.csv
📂 Queuing data from oes_research_2016_allsectors_dynamodb_ready.csv
✅ Batch 1 inserted in 2.87s (394 records)
✅ Batch 1 inserted in 2.74s (382 records)
✅ Batch 2 inserted in 1.37s (379 records)
✅ Batch 1 inserted in 4.98s (1000 records)
✅ Batch 1 inserted in 5.26s (1000 records)
✅ Batch 2 inserted in 3.38s (1000 records)
✅ Batch 2 inserted in 1.24s (329 records)
✅ Batch 3 inserted in 0.92s (329 records)
✅ B

In [3]:
#insert education data
import os
import csv
import time
import threading
from collections import defaultdict
from datetime import datetime, timezone
import boto3
from concurrent.futures import ThreadPoolExecutor, as_completed
from botocore.exceptions import ClientError

# Input CSV and table config
input_file = "Datasets/education_data.csv"
region = "us-west-2"

# Updated field mapping to match new schema (camelCase)
CODE_TO_FIELD = {
    'Less_than_hs': 'lessThanHs',
    'hs_or_eq': 'hsOrEq',
    'Associate_degree': 'associateDegree',
    'Bachelor_degree': 'bachelorDegree',
    'Master_degree': 'masterDegree',
    'Doctorate_degree': 'doctorateDegree',
    'No_requirement': 'noRequirement',
    'Professional_degree': 'professionalDegree',
}

# Thread-safe logging
print_lock = threading.Lock()
def log(msg):
    with print_lock:
        print(msg)

def current_timestamp():
    return datetime.now(timezone.utc).isoformat()

def read_and_reshape(input_file):
    """Reads the CSV and pivots to {occCode: {fields...}} for DynamoDB"""
    edu_data = defaultdict(dict)
    for row in csv.DictReader(open(input_file, newline='', encoding='utf-8')):
        occ = row['SOC']
        code = row['ESTIMATECODE']
        field = CODE_TO_FIELD.get(code)
        if not field:
            continue
        value = row['ESTIMATE']
        edu_data[occ].setdefault('occCode', occ)
        edu_data[occ][field] = value

    # Add timestamps to each record
    now = current_timestamp()
    for record in edu_data.values():
        record['createdAt'] = now
        record['updatedAt'] = now

    return list(edu_data.values())

def dynamodb_batch_write(table, items):
    with table.batch_writer(overwrite_by_pkeys=['occCode']) as batch:
        for item in items:
            batch.put_item(Item=item)

def batch_iterable(iterable, batch_size):
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

def insert_batches_singlethreaded(items, table):
    start = time.time()
    total = len(items)
    batch_num = 0
    for batch in batch_iterable(items, 25):
        batch_num += 1
        try:
            dynamodb_batch_write(table, batch)
            log(f"✅ Batch {batch_num} ({len(batch)}) inserted [{(batch_num-1)*25+1}-{batch_num*25}]")
        except ClientError as e:
            log(f"❌ Batch {batch_num} error: {e}")
    log(f"Total items inserted: {total}")
    return time.time() - start

def insert_batches_multithreaded(items, table, n_workers=4):
    start = time.time()
    batches = list(batch_iterable(items, 100))
    batch_num = 0
    def upload_batch(batch):
        nonlocal batch_num
        batch_num += 1
        try:
            dynamodb_batch_write(table, batch)
            log(f"✅ Batch {batch_num} ({len(batch)}) inserted")
        except ClientError as e:
            log(f"❌ Batch {batch_num} error: {e}")

    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        futures = [executor.submit(upload_batch, batch) for batch in batches]
        for f in as_completed(futures):
            pass
    log(f"Total items inserted: {len(items)}")
    return time.time() - start

if __name__ == "__main__":
    # 1. Read and reshape input
    log(f"🔄 Reading and pivoting CSV: {input_file}")
    items = read_and_reshape(input_file)
    log(f"📦 Total DynamoDB items to insert: {len(items)}")

    # 2. Setup DynamoDB
    session = boto3.Session(region_name=region)
    dynamodb = session.resource('dynamodb')
    table = dynamodb.Table(CAREER_EDUCATION_TABLE)

    # 3. Single-threaded insert
    log("🚀 Starting single-threaded DynamoDB insert...")
    t1 = insert_batches_singlethreaded(items, table)
    log(f"⏱️ Single-threaded insert time: {t1:.2f} sec")

    # 4. Multi-threaded insert
    log("🚀 Starting multi-threaded DynamoDB insert (4 threads)...")
    t2 = insert_batches_multithreaded(items, table, n_workers=4)
    log(f"⏱️ Multi-threaded insert time: {t2:.2f} sec")

    log("🎯 Script completed.")

🔄 Reading and pivoting CSV: Datasets/education_data.csv
📦 Total DynamoDB items to insert: 175
🚀 Starting single-threaded DynamoDB insert...
✅ Batch 1 (25) inserted [1-25]
✅ Batch 2 (25) inserted [26-50]
✅ Batch 3 (25) inserted [51-75]
✅ Batch 4 (25) inserted [76-100]
✅ Batch 5 (25) inserted [101-125]
✅ Batch 6 (25) inserted [126-150]
✅ Batch 7 (25) inserted [151-175]
Total items inserted: 175
⏱️ Single-threaded insert time: 1.37 sec
🚀 Starting multi-threaded DynamoDB insert (4 threads)...
✅ Batch 2 (100) inserted
✅ Batch 2 (75) inserted
Total items inserted: 175
⏱️ Multi-threaded insert time: 1.22 sec
🎯 Script completed.


In [4]:
#insert career skills data
import os
import csv
import ast
import time
import threading
from datetime import datetime, timezone
import boto3
from queue import Queue

# CONFIGURATION
CSV_FILE = "Datasets/skills_data.csv"
REGION = "us-west-2"
BATCH_SIZE = 100
NUM_THREADS = 4
REJECTED_FILE = "rejected_career_skills.csv"

# AWS setup
dynamodb = boto3.resource("dynamodb", region_name=REGION)
table = dynamodb.Table(CAREER_SKILLS_TABLE)

queue = Queue()
rejected_rows = []
lock = threading.Lock()

def current_timestamp():
    return datetime.now(timezone.utc).isoformat()

def process_batch(batch, batch_number):
    start = time.time()
    try:
        with table.batch_writer(overwrite_by_pkeys=['occCode']) as writer:
            for item in batch:
                try:
                    writer.put_item(Item=item)
                except Exception as e:
                    with lock:
                        rejected_rows.append({**item, 'error': str(e)})
        print(f"✅ Batch {batch_number} inserted in {time.time() - start:.2f}s ({len(batch)} records)")
    except Exception as e:
        with lock:
            for item in batch:
                rejected_rows.append({**item, 'error': str(e)})
        print(f"❌ Batch {batch_number} failed with error: {str(e)}")

def worker():
    batch_number = 1
    while True:
        batch = queue.get()
        if batch is None:
            break
        process_batch(batch, batch_number)
        batch_number += 1
        queue.task_done()

def read_and_queue_csv(filepath):
    with open(filepath, mode='r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        batch = []
        for row_num, row in enumerate(reader, 2):
            try:
                occ_code = str(row['SOC_CODE'])
                skills_str = row['TYPICAL_SKILLS']
                skills = ast.literal_eval(skills_str)
                now = current_timestamp()
                item = {
                    'occCode': occ_code,
                    'skills': skills,
                    'createdAt': now,
                    'updatedAt': now
                }
                batch.append(item)
                if len(batch) == BATCH_SIZE:
                    queue.put(batch)
                    batch = []
                if row_num % 1000 == 0:
                    print(f"   ...Processed {row_num} rows so far")
            except Exception as e:
                with lock:
                    rejected_rows.append({**row, 'error': str(e)})
        if batch:
            queue.put(batch)

def save_rejected_rows():
    if rejected_rows:
        keys = rejected_rows[0].keys()
        with open(REJECTED_FILE, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(rejected_rows)
        print(f"⚠️ {len(rejected_rows)} records rejected. Saved to {REJECTED_FILE}")

# Main
print(f"🚀 Starting upload from: {CSV_FILE}")
start = time.time()

threads = []
for _ in range(NUM_THREADS):
    t = threading.Thread(target=worker)
    t.start()
    threads.append(t)

read_and_queue_csv(CSV_FILE)
queue.join()

# Stop workers
for _ in range(NUM_THREADS):
    queue.put(None)
for t in threads:
    t.join()

save_rejected_rows()
print(f"🎉 All batches uploaded in {time.time() - start:.2f} seconds.")


🚀 Starting upload from: Datasets/skills_data.csv
✅ Batch 1 inserted in 1.74s (100 records)
✅ Batch 1 inserted in 1.86s (100 records)
✅ Batch 2 inserted in 0.38s (100 records)
✅ Batch 1 inserted in 2.01s (100 records)
✅ Batch 1 inserted in 2.04s (100 records)
✅ Batch 2 inserted in 0.25s (63 records)
✅ Batch 2 inserted in 0.46s (100 records)
✅ Batch 3 inserted in 0.33s (100 records)
🎉 All batches uploaded in 3.64 seconds.


In [5]:
#inserts career description data
import os
import csv
import time
import threading
from datetime import datetime, timezone
import boto3
from queue import Queue

# CONFIGURATION
CSV_FILE = "Datasets/description.csv"
REGION = "us-west-2"
BATCH_SIZE = 100
NUM_THREADS = 4
REJECTED_FILE = "rejected_career_description.csv"

# AWS setup
dynamodb = boto3.resource("dynamodb", region_name=REGION)
table = dynamodb.Table(CAREER_DESCRIPTION_TABLE)

queue = Queue()
rejected_rows = []
lock = threading.Lock()

def current_timestamp():
    return datetime.now(timezone.utc).isoformat()

def process_batch(batch, batch_number):
    start = time.time()
    try:
        with table.batch_writer(overwrite_by_pkeys=['occCode']) as writer:
            for item in batch:
                try:
                    writer.put_item(Item=item)
                except Exception as e:
                    with lock:
                        rejected_rows.append({**item, 'error': str(e)})
        print(f"✅ Batch {batch_number} inserted in {time.time() - start:.2f}s ({len(batch)} records)")
    except Exception as e:
        with lock:
            for item in batch:
                rejected_rows.append({**item, 'error': str(e)})
        print(f"❌ Batch {batch_number} failed with error: {str(e)}")

def worker():
    batch_number = 1
    while True:
        batch = queue.get()
        if batch is None:
            break
        process_batch(batch, batch_number)
        batch_number += 1
        queue.task_done()

def read_and_queue_csv(filepath):
    with open(filepath, mode='r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        batch = []
        for row_num, row in enumerate(reader, 2):
            try:
                now = current_timestamp()
                item = {
                    'occCode': str(row['Code']),
                    'description': str(row['Description']),
                    'createdAt': now,
                    'updatedAt': now,
                }
                batch.append(item)
                if len(batch) == BATCH_SIZE:
                    queue.put(batch)
                    batch = []
                if row_num % 1000 == 0:
                    print(f"   ...Processed {row_num} rows so far")
            except Exception as e:
                with lock:
                    rejected_rows.append({**row, 'error': str(e)})
        if batch:
            queue.put(batch)

def save_rejected_rows():
    if rejected_rows:
        keys = rejected_rows[0].keys()
        with open(REJECTED_FILE, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(rejected_rows)
        print(f"⚠️ {len(rejected_rows)} records rejected. Saved to {REJECTED_FILE}")

# Main
print(f"🚀 Starting upload from: {CSV_FILE}")
start = time.time()

threads = []
for _ in range(NUM_THREADS):
    t = threading.Thread(target=worker)
    t.start()
    threads.append(t)

read_and_queue_csv(CSV_FILE)
queue.join()

# Stop workers
for _ in range(NUM_THREADS):
    queue.put(None)
for t in threads:
    t.join()

save_rejected_rows()
print(f"🎉 All batches uploaded in {time.time() - start:.2f} seconds.")



🚀 Starting upload from: Datasets/description.csv
   ...Processed 1000 rows so far
✅ Batch 1 inserted in 1.36s (100 records)
✅ Batch 1 inserted in 1.41s (100 records)
✅ Batch 1 inserted in 1.43s (100 records)
✅ Batch 1 inserted in 1.43s (100 records)
✅ Batch 2 inserted in 0.14s (100 records)
✅ Batch 2 inserted in 0.15s (100 records)✅ Batch 2 inserted in 0.18s (100 records)

✅ Batch 3 inserted in 0.05s (16 records)
✅ Batch 2 inserted in 0.20s (100 records)
✅ Batch 3 inserted in 0.19s (100 records)
✅ Batch 3 inserted in 0.16s (100 records)
🎉 All batches uploaded in 2.60 seconds.


In [6]:
#insert soc codes data
import os
import csv
import time
import threading
from datetime import datetime, timezone
import boto3
from queue import Queue

# CONFIGURATION
CSV_FILE = "Datasets/unique_occ_codes.csv"
REGION = "us-west-2"
BATCH_SIZE = 100
NUM_THREADS = 4
REJECTED_FILE = "rejected_soc_codes.csv"

# AWS setup
dynamodb = boto3.resource("dynamodb", region_name=REGION)
table = dynamodb.Table(SOC_CODES_TABLE)

queue = Queue()
rejected_rows = []
lock = threading.Lock()

def current_timestamp():
    return datetime.now(timezone.utc).isoformat()

def process_batch(batch, batch_number):
    start = time.time()
    try:
        with table.batch_writer(overwrite_by_pkeys=['occCode']) as writer:
            for item in batch:
                try:
                    writer.put_item(Item=item)
                except Exception as e:
                    with lock:
                        rejected_rows.append({**item, 'error': str(e)})
        print(f"✅ Batch {batch_number} inserted in {time.time() - start:.2f}s ({len(batch)} records)")
    except Exception as e:
        with lock:
            for item in batch:
                rejected_rows.append({**item, 'error': str(e)})
        print(f"❌ Batch {batch_number} failed with error: {str(e)}")

def worker():
    batch_number = 1
    while True:
        batch = queue.get()
        if batch is None:
            break
        process_batch(batch, batch_number)
        batch_number += 1
        queue.task_done()

def read_and_queue_csv(filepath):
    with open(filepath, mode='r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        batch = []
        for row_num, row in enumerate(reader, 2):
            try:
                now = current_timestamp()
                item = {
                    'occCode': str(row['OCC_CODE']),
                    'occTitle': str(row['OCC_TITLE']),
                    'createdAt': now,
                    'updatedAt': now,
                }
                batch.append(item)
                if len(batch) == BATCH_SIZE:
                    queue.put(batch)
                    batch = []
                if row_num % 1000 == 0:
                    print(f"   ...Processed {row_num} rows so far")
            except Exception as e:
                with lock:
                    rejected_rows.append({**row, 'error': str(e)})
        if batch:
            queue.put(batch)

def save_rejected_rows():
    if rejected_rows:
        keys = rejected_rows[0].keys()
        with open(REJECTED_FILE, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(rejected_rows)
        print(f"⚠️ {len(rejected_rows)} records rejected. Saved to {REJECTED_FILE}")

# Main
print(f"🚀 Starting upload from: {CSV_FILE}")
start = time.time()

threads = []
for _ in range(NUM_THREADS):
    t = threading.Thread(target=worker)
    t.start()
    threads.append(t)

read_and_queue_csv(CSV_FILE)
queue.join()

# Stop workers
for _ in range(NUM_THREADS):
    queue.put(None)
for t in threads:
    t.join()

save_rejected_rows()
print(f"🎉 All batches uploaded in {time.time() - start:.2f} seconds.")


🚀 Starting upload from: Datasets/unique_occ_codes.csv
   ...Processed 1000 rows so far
✅ Batch 1 inserted in 1.34s (100 records)
✅ Batch 1 inserted in 1.50s (100 records)
✅ Batch 1 inserted in 1.55s (100 records)
✅ Batch 1 inserted in 1.55s (100 records)
✅ Batch 2 inserted in 0.25s (100 records)
✅ Batch 2 inserted in 0.18s (100 records)
✅ Batch 2 inserted in 0.18s (100 records)
✅ Batch 2 inserted in 0.17s (100 records)
✅ Batch 3 inserted in 0.15s (100 records)
✅ Batch 3 inserted in 0.07s (43 records)
✅ Batch 3 inserted in 0.13s (100 records)
🎉 All batches uploaded in 2.03 seconds.


In [None]:
import boto3
from datetime import datetime, timezone

# Config
REGION = "us-west-2"
FIELD_NAME = "occList"

# Helper to generate ISO timestamp
def current_timestamp():
    return datetime.now(timezone.utc).isoformat()

# Read and process CSV file (only the first line in this case)
file_path = "Datasets/compressed-soc-codes.csv"
with open(file_path, encoding='utf-8') as f:
    header_line = f.readline().strip()

# Initialize DynamoDB
dynamodb = boto3.resource("dynamodb", region_name=REGION)
table = dynamodb.Table(SOC_CODE_SINGLE_LINE_TABLE)

# Create the item with the compressed SOC code list
item = {
    "id": "compressed_soc_codes",  # ✅ fixed: use string key
    FIELD_NAME: header_line,
    "createdAt": current_timestamp(),
    "updatedAt": current_timestamp(),
}

# Insert into the table
try:
    table.put_item(Item=item)
    print("✅ SOC codes inserted successfully.")
except Exception as e:
    print(f"❌ Failed to insert: {e}")


❌ Failed to insert: An error occurred (ResourceNotFoundException) when calling the PutItem operation: Requested resource not found
