In [None]:
# Run this script only once per dataset downloaded from https://www.bls.gov/oes/tables.htm (download only the "All data" XLSX file for each year).
# Once it has run, the XLSX files are no longer needed: use the generated CSV files instead.
import os
import csv
from openpyxl import load_workbook
from concurrent.futures import ThreadPoolExecutor
import threading

# Directory that holds the downloaded XLSX workbooks.
input_dir = "Datasets/salary"
# Directory that receives one CSV per workbook; created here if it is missing.
output_dir = "Datasets/csv_output"
os.makedirs(output_dir, exist_ok=True)
# Held while printing so that only one thread writes to the console at a time.
print_lock = threading.Lock()


def log(msg):
    """Print *msg* while holding ``print_lock``."""
    print_lock.acquire()
    try:
        print(msg)
    finally:
        print_lock.release()

def convert_file(filename):
    """Convert the active sheet of one XLSX workbook in ``input_dir`` to CSV.

    The CSV is written to ``output_dir`` under the same base name. Names that
    do not end in ``.xlsx`` are ignored. Any failure is logged instead of
    raised, so one bad workbook does not stop the other conversions.

    Args:
        filename: File name (not a full path) inside ``input_dir``.
    """
    if not filename.endswith(".xlsx"):
        return
    input_path = os.path.join(input_dir, filename)
    output_filename = os.path.splitext(filename)[0] + ".csv"
    output_path = os.path.join(output_dir, output_filename)
    log(f"🔄 START: {filename}")
    try:
        wb = load_workbook(filename=input_path, read_only=True)
        try:
            ws = wb.active
            with open(output_path, 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f)
                # values_only=True yields plain cell values, one tuple per row.
                writer.writerows(ws.iter_rows(values_only=True))
        finally:
            # A read-only workbook keeps the source file open until it is
            # closed explicitly.
            wb.close()
        log(f"✅ DONE:  {filename} → {output_filename}")
    except Exception as e:
        log(f"❌ ERROR: {filename}: {e}")

# Only .xlsx files are scheduled; convert_file applies the same check again.
files = [f for f in os.listdir(input_dir) if f.endswith(".xlsx")]

log(f"\n🚀 Processing {len(files)} files from '{input_dir}' to '{output_dir}'...\n")
# One task per workbook. Leaving the with-block waits for all tasks to finish.
with ThreadPoolExecutor() as executor:
    executor.map(convert_file, files)
log("\n🎯 All XLSX files processed.\n")



In [None]:
import os
import csv

# Directory with the raw CSV files to convert.
input_dir = "Datasets/csv_output"
# Directory that receives the reduced CSV files; created here if it is missing.
output_dir = "Datasets/dynamodb_ready_by_year"
os.makedirs(output_dir, exist_ok=True)

# Columns of every output CSV, in output order.
fieldnames = [
    "occ_code",
    "salary_key",
    "a_median",
    "m_median",
    "m_pct10",
    "m_pct90"
]

def safe_div(val):
    """Convert an annual amount to a monthly amount with two decimals.

    Args:
        val: Annual figure as a number or numeric string.

    Returns:
        ``val / 12`` formatted with two decimal places, or ``""`` when
        ``val`` is missing, not numeric, or not finite (NaN / infinity).
    """
    import math  # local import: this cell only imports os and csv

    try:
        annual = float(val)
    except (ValueError, TypeError):
        return ""
    # float() accepts "nan" and "inf"; without this check they would be
    # written to the output as the strings "nan" / "inf".
    if not math.isfinite(annual):
        return ""
    return f"{annual / 12:.2f}"

# Reduce every raw CSV in input_dir to the columns listed in `fieldnames`.
for filename in os.listdir(input_dir):
    if not filename.endswith(".csv"):
        continue

    # The year is the first 4-digit token of the underscore-separated base
    # name; "unknown" is used when there is none.
    basename = os.path.splitext(filename)[0]
    year = next(
        (part for part in basename.split('_') if part.isdigit() and len(part) == 4),
        "unknown",
    )

    input_path = os.path.join(input_dir, filename)
    output_path = os.path.join(output_dir, f"{basename}_dynamodb_ready.csv")

    print(f"🔄 Converting {filename} to DynamoDB format as {output_path}...")

    # newline='' on both files leaves line-ending handling to the csv module,
    # so line breaks embedded in quoted fields are read back unchanged.
    with open(input_path, mode='r', newline='', encoding='utf-8') as infile, \
         open(output_path, mode='w', newline='', encoding='utf-8') as outfile:

        reader = csv.DictReader(infile)
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        count = 0
        log_interval = 10000

        # Row numbers start at 2 because line 1 of the file is the header.
        for row_num, row in enumerate(reader, 2):
            try:
                # Each column is looked up in lower case first, then upper case.
                occ_code = row.get('occ_code') or row.get('OCC_CODE')
                a_median = row.get('a_median') or row.get('A_MEDIAN')
                a_pct10 = row.get('a_pct10') or row.get('A_PCT10')
                a_pct90 = row.get('a_pct90') or row.get('A_PCT90')
                # Key fields
                area_title = row.get('area_title') or row.get('AREA_TITLE')
                naics_title = row.get('naics_title') or row.get('NAICS_TITLE')
                salary_key = f"{year}#{area_title}#{naics_title}"

                # Monthly figures are the annual figures divided by 12;
                # values that are not numeric become "".
                m_median = safe_div(a_median)
                m_pct10 = safe_div(a_pct10)
                m_pct90 = safe_div(a_pct90)

                writer.writerow({
                    "occ_code": occ_code,
                    "salary_key": salary_key,
                    "a_median": a_median,
                    "m_median": m_median,
                    "m_pct10": m_pct10,
                    "m_pct90": m_pct90,
                })
                count += 1

                if count % log_interval == 0:
                    print(f"   ...Processed {count} rows so far in {filename}")

            except Exception as e:
                # A bad row is reported and skipped; the file continues.
                print(f"❌ Failed on row {row_num} in {filename}: {e}")

    print(f"✅ Wrote {count} rows to {output_path}")

print("\n🎯 All files converted and saved to:", output_dir)


In [None]:
import os
import csv
import time
import threading
from decimal import Decimal
import boto3
from queue import Queue

# Config
CLEANED_DIR = "Datasets/dynamodb_ready_by_year"
TABLE_NAME = "careerSalary-alek4h7jlreffeoe5tocxgnx2u-NONE"
REGION = "us-west-2"
# Number of rows handed to a worker as one queue entry.
BATCH_SIZE = 1000
# Number of uploader threads started by the main section.
NUM_THREADS = 4
# CSV that receives every row that could not be uploaded.
REJECTED_FILE = "rejected_dynamodb_rows.csv"

# AWS setup
dynamodb = boto3.resource("dynamodb", region_name=REGION)
table = dynamodb.Table(TABLE_NAME)

# Work queue of row batches. It has no size limit, so the reader can enqueue
# a whole file before the workers catch up.
queue = Queue()
# Rows that failed; appended to by several threads while holding `lock`.
rejected_rows = []
lock = threading.Lock()

def to_decimal(val):
    """Convert *val* to a ``Decimal`` rounded to two places, or ``None``.

    Args:
        val: A number or numeric string.

    Returns:
        The rounded ``Decimal``. ``None`` is returned for ``None``, ``''``,
        ``'null'``, anything ``float()`` cannot parse, and NaN / infinity,
        which DynamoDB's number type cannot represent.
    """
    if val in (None, '', 'null'):
        return None
    try:
        # Round as a float first, then go through str() so the Decimal holds
        # the short decimal text rather than the binary float expansion.
        number = Decimal(str(round(float(val), 2)))
    except (ValueError, TypeError, ArithmeticError):
        return None
    return number if number.is_finite() else None

def process_batch(batch, batch_number):
    """Write one batch of items to ``table`` and record failures.

    ``put_item`` on a batch writer only buffers the item; the request is sent
    when the buffer is flushed. An exception therefore concerns a whole
    flushed chunk, not just the item being added. On any failure every item
    of ``batch`` is recorded in ``rejected_rows``. Some of them may already
    have been written; uploading them again just overwrites the same item.

    Args:
        batch: List of item dicts to write.
        batch_number: Sequence number, used only in the log output.
    """
    start = time.time()
    try:
        # NOTE(review): overwrite_by_pkeys de-duplicates buffered items on
        # occ_code alone. If salary_key is the table's sort key, rows sharing
        # an occ_code within one buffered chunk would be dropped. Confirm the
        # key schema and add 'salary_key' here if so.
        with table.batch_writer(overwrite_by_pkeys=['occ_code']) as writer:
            for item in batch:
                writer.put_item(Item=item)
    except Exception as e:
        with lock:
            rejected_rows.extend({**item, 'error': str(e)} for item in batch)
        print(f"❌ Batch {batch_number} failed with error: {str(e)}")
    else:
        print(f"✅ Batch {batch_number} inserted in {time.time() - start:.2f}s ({len(batch)} records)")

def worker():
    """Consume batches from ``queue`` until a ``None`` sentinel arrives.

    Every entry taken from the queue, the sentinel included, is acknowledged
    with ``task_done()`` in a ``finally`` clause, so an error while handling
    a batch cannot leave ``queue.join()`` waiting forever.

    ``batch_number`` is local to each thread, so different threads log the
    same numbers.
    """
    batch_number = 1
    while True:
        batch = queue.get()
        try:
            if batch is None:
                break
            process_batch(batch, batch_number)
            batch_number += 1
        finally:
            queue.task_done()

def read_and_queue_csv(filepath):
    """Read one prepared CSV and enqueue its rows in batches of ``BATCH_SIZE``.

    Rows that cannot be converted, or that have no ``occ_code``, are appended
    to ``rejected_rows`` together with the error text instead of being
    queued. ``occ_code`` is the attribute the batch writer treats as the
    primary key, so a row without it can never be written.

    Args:
        filepath: Path of a CSV with the columns occ_code, salary_key,
            a_median, m_median, m_pct10 and m_pct90.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(filepath, mode='r', newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        batch = []
        for row in reader:
            try:
                occ_code = row['occ_code']
                salary_key = row['salary_key']
                # Checked before str(): a short row yields None, which str()
                # would turn into the text "None".
                if not occ_code:
                    raise ValueError("missing occ_code")
                item = {
                    'occ_code': str(occ_code),
                    'salary_key': str(salary_key),
                    'a_median': to_decimal(row.get('a_median')),
                    'm_median': to_decimal(row.get('m_median')),
                    'm_pct10': to_decimal(row.get('m_pct10')),
                    'm_pct90': to_decimal(row.get('m_pct90')),
                }
                # Remove None fields for optional numbers (cleaner for DynamoDB)
                item = {k: v for k, v in item.items() if v is not None}
            except Exception as e:
                with lock:
                    rejected_rows.append({**row, 'error': str(e)})
                continue

            batch.append(item)
            if len(batch) == BATCH_SIZE:
                queue.put(batch)
                batch = []

        # Flush the final, possibly short, batch.
        if batch:
            queue.put(batch)

def save_rejected_rows():
    """Write ``rejected_rows`` to ``REJECTED_FILE``; do nothing if it is empty.

    Rejected entries do not all have the same keys (raw CSV rows and
    converted items are both stored), so the header is the union of all keys
    in first-seen order. Taking the keys of the first entry only would make
    ``DictWriter`` raise ``ValueError`` on the first entry with another key.
    """
    if not rejected_rows:
        return
    # dict.fromkeys de-duplicates while keeping insertion order.
    columns = list(dict.fromkeys(key for row in rejected_rows for key in row))
    with open(REJECTED_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=columns, restval='')
        writer.writeheader()
        writer.writerows(rejected_rows)
    print(f"⚠️ {len(rejected_rows)} records rejected. Saved to {REJECTED_FILE}")

# Main
print(f"🚀 Starting upload from all CSVs in: {CLEANED_DIR}")
start = time.time()

threads = []
for _ in range(NUM_THREADS):
    t = threading.Thread(target=worker)
    t.start()
    threads.append(t)

try:
    for filename in os.listdir(CLEANED_DIR):
        if filename.endswith("_dynamodb_ready.csv"):
            filepath = os.path.join(CLEANED_DIR, filename)
            print(f"📂 Queuing data from {filename}")
            read_and_queue_csv(filepath)

    # Wait until every queued batch has been acknowledged by a worker.
    queue.join()
finally:
    # Stop workers. This runs even when reading a file failed; otherwise the
    # non-daemon threads would block on queue.get() forever.
    for _ in range(NUM_THREADS):
        queue.put(None)
    for t in threads:
        t.join()

    save_rejected_rows()

print(f"🎉 All batches uploaded in {time.time() - start:.2f} seconds.")


In [None]:
# Insert education data in DynamoDB
import os
import csv
import time
import threading
from collections import defaultdict
import boto3
from concurrent.futures import ThreadPoolExecutor, as_completed
from botocore.exceptions import ClientError

# Input CSV and table config
input_file = "Datasets/education_data.csv"
dynamodb_table = "careerEducation-alek4h7jlreffeoe5tocxgnx2u-NONE"  # replace with your table name
region = "us-west-2"

# Map ESTIMATECODE values to the attribute names stored on each item.
# Rows whose ESTIMATECODE is not listed here are skipped by read_and_reshape.
CODE_TO_FIELD = {
    'Less_than_hs': 'less_than_highschool',
    'hs_or_eq': 'high_school_or_equivalent',
    'Associate_degree': 'associate_degree',
    'Bachelor_degree': 'bachelor_degree',
    'Master_degree': 'master_degree',
    'Doctorate_degree': 'doctorate_degree',
    'No_requirement': 'no_requirement',
    'Professional_degree': 'professional_degree',
}

# Console output goes through one lock so only one thread prints at a time.
print_lock = threading.Lock()


def log(msg):
    """Print *msg* while holding ``print_lock``."""
    print_lock.acquire()
    try:
        print(msg)
    finally:
        print_lock.release()

def read_and_reshape(input_file):
    """Pivot the education CSV into one dict per SOC code.

    Each row contributes one attribute: its ESTIMATECODE is translated through
    ``CODE_TO_FIELD`` and the row's ESTIMATE is stored under that name. Rows
    with an unmapped ESTIMATECODE are ignored. Every resulting dict also
    carries the SOC code under ``occ_code``.

    Returns:
        A list of dicts in order of first appearance of each SOC code.
    """
    pivoted = {}
    with open(input_file, newline='', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            soc = record['SOC']
            target = CODE_TO_FIELD.get(record['ESTIMATECODE'])
            if target:
                estimate = record['ESTIMATE']
                # The first mapped row for a SOC code creates its entry.
                entry = pivoted.setdefault(soc, {'occ_code': soc})
                entry[target] = estimate
    return list(pivoted.values())

def dynamodb_batch_write(table, items):
    """Put every item of *items* through one batch writer of *table*.

    The writer is opened with ``overwrite_by_pkeys=['occ_code']``.
    """
    writer_cm = table.batch_writer(overwrite_by_pkeys=['occ_code'])
    with writer_cm as writer:
        for record in items:
            writer.put_item(Item=record)

def batch_iterable(iterable, batch_size):
    """Yield consecutive lists of up to *batch_size* items from *iterable*.

    Every list except possibly the last holds exactly ``batch_size`` items;
    nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    while True:
        chunk = []
        for element in source:
            chunk.append(element)
            if len(chunk) == batch_size:
                break
        if not chunk:
            return
        yield chunk

def insert_batches_singlethreaded(items, table):
    """Insert *items* one batch of 25 after another.

    A batch that raises ``ClientError`` is logged and skipped. The final
    total counts only the items of batches that completed without error.

    Args:
        items: Item dicts to write.
        table: Table object passed on to ``dynamodb_batch_write``.

    Returns:
        Elapsed wall-clock time in seconds.
    """
    start = time.time()
    batch_size = 25
    inserted = 0
    for batch_num, batch in enumerate(batch_iterable(items, batch_size), 1):
        first = (batch_num - 1) * batch_size + 1
        # The last batch may be short, so the upper bound uses its real length.
        last = first + len(batch) - 1
        try:
            dynamodb_batch_write(table, batch)
        except ClientError as e:
            log(f"❌ Batch {batch_num} error: {e}")
        else:
            inserted += len(batch)
            log(f"✅ Batch {batch_num} ({len(batch)}) inserted [{first}-{last}]")
    log(f"Total items inserted: {inserted}")
    return time.time() - start

def insert_batches_multithreaded(items, table, n_workers=4):
    """Insert *items* in batches of 100 using a pool of worker threads.

    Batch numbers are assigned before submission, so each log line shows the
    number of the batch it describes. A failed batch is logged and left out
    of the final total.

    Args:
        items: Item dicts to write.
        table: Table object passed on to ``dynamodb_batch_write``.
        n_workers: Number of threads in the pool.

    Returns:
        Elapsed wall-clock time in seconds.
    """
    start = time.time()
    batches = list(batch_iterable(items, 100))

    def upload_batch(batch_num, batch):
        """Write one batch and return how many items it inserted."""
        try:
            dynamodb_batch_write(table, batch)
        except ClientError as e:
            log(f"❌ Batch {batch_num} error: {e}")
            return 0
        log(f"✅ Batch {batch_num} ({len(batch)}) inserted")
        return len(batch)

    inserted = 0
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        futures = [
            executor.submit(upload_batch, batch_num, batch)
            for batch_num, batch in enumerate(batches, 1)
        ]
        for future in as_completed(futures):
            try:
                inserted += future.result()
            except Exception as e:
                # Anything other than ClientError ends up here; report it
                # rather than leaving it unseen inside the future.
                log(f"❌ Batch upload failed unexpectedly: {e}")
    log(f"Total items inserted: {inserted}")
    return time.time() - start

# Entry point: pivot the education CSV, then write the same items to the table
# twice (single-threaded, then multi-threaded) and log the time of each pass.
if __name__ == "__main__":
    # 1. Read and reshape input
    log(f"🔄 Reading and pivoting CSV: {input_file}")
    items = read_and_reshape(input_file)
    log(f"📦 Total DynamoDB items to insert: {len(items)}")

    # 2. Setup DynamoDB
    session = boto3.Session(region_name=region)
    dynamodb = session.resource('dynamodb')
    table = dynamodb.Table(dynamodb_table)

    # 3. Single-threaded insert and timing
    log("🚀 Starting single-threaded DynamoDB insert...")
    t1 = insert_batches_singlethreaded(items, table)
    log(f"⏱️ Single-threaded insert time: {t1:.2f} sec")

    # 4. Multi-threaded insert and timing (writes the same items again)
    log("🚀 Starting multi-threaded DynamoDB insert (4 threads)...")
    t2 = insert_batches_multithreaded(items, table, n_workers=4)
    log(f"⏱️ Multi-threaded insert time: {t2:.2f} sec")

    log("🎯 Script completed.")


In [None]:
#insert skills data in the database
import os
import csv
import ast
import time
import threading
import boto3
from decimal import Decimal
from queue import Queue

# CONFIGURATION
CSV_FILE = "Datasets/skills_data.csv"   # Path to your skills CSV file
TABLE_NAME = "careerSkills-alek4h7jlreffeoe5tocxgnx2u-NONE"  # DynamoDB table name
REGION = "us-west-2"
BATCH_SIZE = 100  # Rows per queue entry; not a DynamoDB limit, the batch writer does its own buffering
NUM_THREADS = 4  # Adjust as needed
REJECTED_FILE = "rejected_career_skills.csv"

# AWS setup
dynamodb = boto3.resource("dynamodb", region_name=REGION)
table = dynamodb.Table(TABLE_NAME)

# Work queue of row batches; it has no size limit.
queue = Queue()
# Rows that failed; appended to by several threads while holding `lock`.
rejected_rows = []
lock = threading.Lock()

def process_batch(batch, batch_number):
    """Write one batch of items to ``table`` and record failures.

    ``put_item`` on a batch writer only buffers the item; the request is sent
    when the buffer is flushed. An exception therefore concerns a whole
    flushed chunk, not just the item being added. On any failure every item
    of ``batch`` is recorded in ``rejected_rows``. Some of them may already
    have been written; uploading them again just overwrites the same item.

    Args:
        batch: List of item dicts to write.
        batch_number: Sequence number, used only in the log output.
    """
    start = time.time()
    try:
        with table.batch_writer(overwrite_by_pkeys=['occ_code']) as writer:
            for item in batch:
                writer.put_item(Item=item)
    except Exception as e:
        with lock:
            rejected_rows.extend({**item, 'error': str(e)} for item in batch)
        print(f"❌ Batch {batch_number} failed with error: {str(e)}")
    else:
        print(f"✅ Batch {batch_number} inserted in {time.time() - start:.2f}s ({len(batch)} records)")

def worker():
    """Consume batches from ``queue`` until a ``None`` sentinel arrives.

    Every entry taken from the queue, the sentinel included, is acknowledged
    with ``task_done()`` in a ``finally`` clause, so an error while handling
    a batch cannot leave ``queue.join()`` waiting forever.

    ``batch_number`` is local to each thread, so different threads log the
    same numbers.
    """
    batch_number = 1
    while True:
        batch = queue.get()
        try:
            if batch is None:
                break
            process_batch(batch, batch_number)
            batch_number += 1
        finally:
            queue.task_done()

def read_and_queue_csv(filepath):
    """Read the skills CSV and enqueue its rows in batches of ``BATCH_SIZE``.

    Each row becomes ``{'occ_code': SOC_CODE, 'skills': <parsed list>}``.
    Rows without a SOC_CODE, or whose TYPICAL_SKILLS cannot be parsed, are
    appended to ``rejected_rows`` with the error text instead of being queued.

    Args:
        filepath: Path of a CSV with the columns SOC_CODE and TYPICAL_SKILLS.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(filepath, mode='r', newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        batch = []
        # Counting from 1 makes the progress message report rows actually read.
        for rows_read, row in enumerate(reader, 1):
            try:
                occ_code = row['SOC_CODE']
                skills_str = row['TYPICAL_SKILLS']
                # Checked before str(): a short row yields None, which str()
                # would turn into the text "None".
                if not occ_code:
                    raise ValueError("missing SOC_CODE")
                # Convert the stringified Python list to an actual list.
                # literal_eval parses literals only; it does not execute code.
                skills = ast.literal_eval(skills_str)
                item = {
                    'occ_code': str(occ_code),
                    'skills': skills
                }
            except Exception as e:
                with lock:
                    rejected_rows.append({**row, 'error': str(e)})
            else:
                batch.append(item)
                if len(batch) == BATCH_SIZE:
                    queue.put(batch)
                    batch = []
            if rows_read % 1000 == 0:
                print(f"   ...Processed {rows_read} rows so far")
        # Flush the final, possibly short, batch.
        if batch:
            queue.put(batch)

def save_rejected_rows():
    """Write ``rejected_rows`` to ``REJECTED_FILE``; do nothing if it is empty.

    Rejected entries do not all have the same keys (raw CSV rows and
    converted items are both stored), so the header is the union of all keys
    in first-seen order. Taking the keys of the first entry only would make
    ``DictWriter`` raise ``ValueError`` on the first entry with another key.
    """
    if not rejected_rows:
        return
    # dict.fromkeys de-duplicates while keeping insertion order.
    columns = list(dict.fromkeys(key for row in rejected_rows for key in row))
    with open(REJECTED_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=columns, restval='')
        writer.writeheader()
        writer.writerows(rejected_rows)
    print(f"⚠️ {len(rejected_rows)} records rejected. Saved to {REJECTED_FILE}")

# Main
print(f"🚀 Starting upload from: {CSV_FILE}")
start = time.time()

threads = []
for _ in range(NUM_THREADS):
    t = threading.Thread(target=worker)
    t.start()
    threads.append(t)

try:
    read_and_queue_csv(CSV_FILE)
    # Wait until every queued batch has been acknowledged by a worker.
    queue.join()
finally:
    # Stop workers. This runs even when reading the file failed; otherwise
    # the non-daemon threads would block on queue.get() forever.
    for _ in range(NUM_THREADS):
        queue.put(None)
    for t in threads:
        t.join()

    save_rejected_rows()

print(f"🎉 All batches uploaded in {time.time() - start:.2f} seconds.")


In [None]:
#inserts career description data
import os
import csv
import time
import threading
import boto3
from queue import Queue

# CONFIGURATION
CSV_FILE = "Datasets/description.csv"   # Update this to your CSV file
TABLE_NAME = "careerDescription-alek4h7jlreffeoe5tocxgnx2u-NONE"  # Update to your DynamoDB table name
REGION = "us-west-2"
BATCH_SIZE = 100  # Rows per queue entry; not a DynamoDB limit, the batch writer does its own buffering
NUM_THREADS = 4
REJECTED_FILE = "rejected_career_description.csv"

# AWS setup
dynamodb = boto3.resource("dynamodb", region_name=REGION)
table = dynamodb.Table(TABLE_NAME)

# Work queue of row batches; it has no size limit.
queue = Queue()
# Rows that failed; appended to by several threads while holding `lock`.
rejected_rows = []
lock = threading.Lock()

def process_batch(batch, batch_number):
    """Write one batch of items to ``table`` and record failures.

    ``put_item`` on a batch writer only buffers the item; the request is sent
    when the buffer is flushed. An exception therefore concerns a whole
    flushed chunk, not just the item being added. On any failure every item
    of ``batch`` is recorded in ``rejected_rows``. Some of them may already
    have been written; uploading them again just overwrites the same item.

    Args:
        batch: List of item dicts to write.
        batch_number: Sequence number, used only in the log output.
    """
    start = time.time()
    try:
        with table.batch_writer(overwrite_by_pkeys=['occ_code']) as writer:
            for item in batch:
                writer.put_item(Item=item)
    except Exception as e:
        with lock:
            rejected_rows.extend({**item, 'error': str(e)} for item in batch)
        print(f"❌ Batch {batch_number} failed with error: {str(e)}")
    else:
        print(f"✅ Batch {batch_number} inserted in {time.time() - start:.2f}s ({len(batch)} records)")

def worker():
    """Consume batches from ``queue`` until a ``None`` sentinel arrives.

    Every entry taken from the queue, the sentinel included, is acknowledged
    with ``task_done()`` in a ``finally`` clause, so an error while handling
    a batch cannot leave ``queue.join()`` waiting forever.

    ``batch_number`` is local to each thread, so different threads log the
    same numbers.
    """
    batch_number = 1
    while True:
        batch = queue.get()
        try:
            if batch is None:
                break
            process_batch(batch, batch_number)
            batch_number += 1
        finally:
            queue.task_done()

def read_and_queue_csv(filepath):
    """Read the description CSV and enqueue rows in batches of ``BATCH_SIZE``.

    Each row becomes ``{'occ_code': Code, 'description': Description}``.
    Rows without a Code, or lacking either column, are appended to
    ``rejected_rows`` with the error text instead of being queued.

    Args:
        filepath: Path of a CSV with the columns Code and Description.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(filepath, mode='r', newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        batch = []
        # Counting from 1 makes the progress message report rows actually read.
        for rows_read, row in enumerate(reader, 1):
            try:
                code = row['Code']
                description = row['Description']
                # Checked before str(): a short row yields None, which str()
                # would turn into the text "None".
                if not code:
                    raise ValueError("missing Code")
                item = {
                    'occ_code': str(code),
                    'description': str(description),
                }
            except Exception as e:
                with lock:
                    rejected_rows.append({**row, 'error': str(e)})
            else:
                batch.append(item)
                if len(batch) == BATCH_SIZE:
                    queue.put(batch)
                    batch = []
            if rows_read % 1000 == 0:
                print(f"   ...Processed {rows_read} rows so far")
        # Flush the final, possibly short, batch.
        if batch:
            queue.put(batch)

def save_rejected_rows():
    """Write ``rejected_rows`` to ``REJECTED_FILE``; do nothing if it is empty.

    Rejected entries do not all have the same keys (raw CSV rows and
    converted items are both stored), so the header is the union of all keys
    in first-seen order. Taking the keys of the first entry only would make
    ``DictWriter`` raise ``ValueError`` on the first entry with another key.
    """
    if not rejected_rows:
        return
    # dict.fromkeys de-duplicates while keeping insertion order.
    columns = list(dict.fromkeys(key for row in rejected_rows for key in row))
    with open(REJECTED_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=columns, restval='')
        writer.writeheader()
        writer.writerows(rejected_rows)
    print(f"⚠️ {len(rejected_rows)} records rejected. Saved to {REJECTED_FILE}")

# Main
print(f"🚀 Starting upload from: {CSV_FILE}")
start = time.time()

threads = []
for _ in range(NUM_THREADS):
    t = threading.Thread(target=worker)
    t.start()
    threads.append(t)

try:
    read_and_queue_csv(CSV_FILE)
    # Wait until every queued batch has been acknowledged by a worker.
    queue.join()
finally:
    # Stop workers. This runs even when reading the file failed; otherwise
    # the non-daemon threads would block on queue.get() forever.
    for _ in range(NUM_THREADS):
        queue.put(None)
    for t in threads:
        t.join()

    save_rejected_rows()

print(f"🎉 All batches uploaded in {time.time() - start:.2f} seconds.")


In [None]:
# inserts soc codes data in the database
import os
import csv
import time
import threading
import boto3
from queue import Queue

# CONFIGURATION
CSV_FILE = "Datasets/unique_occ_codes.csv"   # Update this to your CSV file
TABLE_NAME = "socCodes-alek4h7jlreffeoe5tocxgnx2u-NONE"  # Update to your DynamoDB table name
REGION = "us-west-2"
# Number of rows handed to a worker as one queue entry.
BATCH_SIZE = 100
# Number of uploader threads started by the main section.
NUM_THREADS = 4
# CSV that receives every row that could not be uploaded.
REJECTED_FILE = "rejected_soc_codes.csv"

# AWS setup
dynamodb = boto3.resource("dynamodb", region_name=REGION)
table = dynamodb.Table(TABLE_NAME)

# Work queue of row batches; it has no size limit.
queue = Queue()
# Rows that failed; appended to by several threads while holding `lock`.
rejected_rows = []
lock = threading.Lock()

def process_batch(batch, batch_number):
    """Write one batch of items to ``table`` and record failures.

    ``put_item`` on a batch writer only buffers the item; the request is sent
    when the buffer is flushed. An exception therefore concerns a whole
    flushed chunk, not just the item being added. On any failure every item
    of ``batch`` is recorded in ``rejected_rows``. Some of them may already
    have been written; uploading them again just overwrites the same item.

    Args:
        batch: List of item dicts to write.
        batch_number: Sequence number, used only in the log output.
    """
    start = time.time()
    try:
        with table.batch_writer(overwrite_by_pkeys=['occ_code']) as writer:
            for item in batch:
                writer.put_item(Item=item)
    except Exception as e:
        with lock:
            rejected_rows.extend({**item, 'error': str(e)} for item in batch)
        print(f"❌ Batch {batch_number} failed with error: {str(e)}")
    else:
        print(f"✅ Batch {batch_number} inserted in {time.time() - start:.2f}s ({len(batch)} records)")

def worker():
    """Consume batches from ``queue`` until a ``None`` sentinel arrives.

    Every entry taken from the queue, the sentinel included, is acknowledged
    with ``task_done()`` in a ``finally`` clause, so an error while handling
    a batch cannot leave ``queue.join()`` waiting forever.

    ``batch_number`` is local to each thread, so different threads log the
    same numbers.
    """
    batch_number = 1
    while True:
        batch = queue.get()
        try:
            if batch is None:
                break
            process_batch(batch, batch_number)
            batch_number += 1
        finally:
            queue.task_done()

def read_and_queue_csv(filepath):
    """Read the SOC-code CSV and enqueue rows in batches of ``BATCH_SIZE``.

    Each row becomes ``{'occ_code': OCC_CODE, 'occ_title': OCC_TITLE}``.
    Rows without an OCC_CODE, or lacking either column, are appended to
    ``rejected_rows`` with the error text instead of being queued.

    Args:
        filepath: Path of a CSV with the columns OCC_CODE and OCC_TITLE.
    """
    # newline='' leaves line-ending handling to the csv module.
    with open(filepath, mode='r', newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        batch = []
        # Counting from 1 makes the progress message report rows actually read.
        for rows_read, row in enumerate(reader, 1):
            try:
                occ_code = row['OCC_CODE']
                occ_title = row['OCC_TITLE']
                # Checked before str(): a short row yields None, which str()
                # would turn into the text "None".
                if not occ_code:
                    raise ValueError("missing OCC_CODE")
                item = {
                    'occ_code': str(occ_code),
                    'occ_title': str(occ_title),
                }
            except Exception as e:
                with lock:
                    rejected_rows.append({**row, 'error': str(e)})
            else:
                batch.append(item)
                if len(batch) == BATCH_SIZE:
                    queue.put(batch)
                    batch = []
            if rows_read % 1000 == 0:
                print(f"   ...Processed {rows_read} rows so far")
        # Flush the final, possibly short, batch.
        if batch:
            queue.put(batch)

def save_rejected_rows():
    """Write ``rejected_rows`` to ``REJECTED_FILE``; do nothing if it is empty.

    Rejected entries do not all have the same keys (raw CSV rows and
    converted items are both stored), so the header is the union of all keys
    in first-seen order. Taking the keys of the first entry only would make
    ``DictWriter`` raise ``ValueError`` on the first entry with another key.
    """
    if not rejected_rows:
        return
    # dict.fromkeys de-duplicates while keeping insertion order.
    columns = list(dict.fromkeys(key for row in rejected_rows for key in row))
    with open(REJECTED_FILE, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=columns, restval='')
        writer.writeheader()
        writer.writerows(rejected_rows)
    print(f"⚠️ {len(rejected_rows)} records rejected. Saved to {REJECTED_FILE}")

# Main
print(f"🚀 Starting upload from: {CSV_FILE}")
start = time.time()

threads = []
for _ in range(NUM_THREADS):
    t = threading.Thread(target=worker)
    t.start()
    threads.append(t)

try:
    read_and_queue_csv(CSV_FILE)
    # Wait until every queued batch has been acknowledged by a worker.
    queue.join()
finally:
    # Stop workers. This runs even when reading the file failed; otherwise
    # the non-daemon threads would block on queue.get() forever.
    for _ in range(NUM_THREADS):
        queue.put(None)
    for t in threads:
        t.join()

    save_rejected_rows()

print(f"🎉 All batches uploaded in {time.time() - start:.2f} seconds.")
