In [1]:
# Converts the salary data from Excel files to CSV format for the next script
import pandas as pd
import os
from glob import glob

# Define input and output folders
input_folder = "Datasets/salary"
output_folder = "Datasets/Salaries"

# Ensure output directory exists
os.makedirs(output_folder, exist_ok=True)

# Required columns in lowercase
required_columns = [
    "occ_code", "occ_title", "area_title",
    "naics_title", "a_median", "a_pct10", "a_pct90"
]

# Get all Excel files from the folder
excel_files = glob(os.path.join(input_folder, "*.xlsx"))

# Process each file
for file_path in excel_files:
    try:
        filename = os.path.basename(file_path)
        year = next((part for part in filename.split('_') if part.isdigit()), None)

        # Read the Excel file
        df = pd.read_excel(file_path)

        # Convert column names to lowercase
        df.columns = [col.lower() for col in df.columns]

        # Add year column
        df["year"] = year

        # Reorder and filter columns (now all lowercase)
        filtered_df = df[required_columns + ["year"]]

        # Write to CSV
        output_path = os.path.join(output_folder, f"filtered_occupation_data_{year}.csv")
        filtered_df.to_csv(output_path, index=False)

        # Uncomment for debug:
        print(f"✅ Processed and saved: {output_path}")

    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")


KeyboardInterrupt: 

In [1]:
# import pandas as pd
# import json
# import csv
# import os
# from glob import glob

# # Directory setup
# dataset_dir = "Datasets/"
# salary_dir = f"{dataset_dir}Salaries/"
# salary_files = sorted(glob(os.path.join(salary_dir, "filtered_occupation_data_*.csv")))

# # Check salary files
# if not salary_files:
#     raise FileNotFoundError("❌ No salary_data_*.csv files found.")

# print(f"✅ Found salary files: {[os.path.basename(f) for f in salary_files]}")

# # Load all salary CSVs
# salary_dfs = []
# for file in salary_files:
#     try:
#         df = pd.read_csv(file)
#         df["YEAR"] = int(file.split("_")[-1].split(".")[0])
#         salary_dfs.append(df)
#     except Exception as e:
#         print(f"⚠️ Error reading {file}: {e}")

# salary_df = pd.concat(salary_dfs, ignore_index=True)

# # Load education, skills, and descriptions
# education_df = pd.read_csv(os.path.join(dataset_dir, "education_data.csv"))
# skills_df = pd.read_csv(os.path.join(dataset_dir, "skills_data.csv"))
# description_df = pd.read_csv(os.path.join(dataset_dir, "description.csv"))
# description_df["Code"] = description_df["Code"].astype(str)
# description_map = description_df.set_index("Code")["Description"].to_dict()

# # Ensure numeric conversion
# salary_df["a_pct10"] = pd.to_numeric(salary_df["a_pct10"], errors="coerce")
# salary_df["a_median"] = pd.to_numeric(salary_df["a_median"], errors="coerce")
# salary_df["a_pct90"] = pd.to_numeric(salary_df["a_pct90"], errors="coerce")

# # Monthly salaries
# salary_df["M_PCT10"] = (salary_df["a_pct10"] / 12).round(2)
# salary_df["M_MEDIAN"] = (salary_df["a_median"] / 12).round(2)
# salary_df["M_PCT90"] = (salary_df["a_pct90"] / 12).round(2)

# # Education mapping
# edu_map = {
#     "Less_than_hs": "LESS_THAN_HS",
#     "hs_or_eq": "HIGH_SCHOOL",
#     "Associate_degree": "ASSOCIATE",
#     "Bachelor_degree": "BACHELOR",
#     "Master_degree": "MASTERS",
#     "Doctorate_degree": "DOCTORATE",
#     "No_requirement": "NO_REQ",
#     "Professional_degree": "PROFESSIONAL"
# }

# # Build records
# records = []
# grouped_salary = salary_df.groupby(["occ_code", "year"])

# for (soc_code, year), group in grouped_salary:
#     existing_record = next((r for r in records if r["soc_code"] == soc_code), None)

#     if not existing_record:
#         desc = description_map.get(soc_code, "")
#         existing_record = {
#             "soc_code": soc_code,
#             "title": group.iloc[0]["occ_title"],
#             "description": description_map.get(soc_code, ""),
#             "salary": {},
#             "education": {key: "" for key in edu_map.values()},
#             "typicalSkills": []
#         }
#         records.append(existing_record)

#     year_str = str(year)
#     existing_record["salary"].setdefault(year_str, {})

#     for _, row in group.iterrows():
#         state = row["area_title"]
#         industry = row["naics_title"]
#         existing_record["salary"][year_str].setdefault(state, {})[industry] = {
#             "A_MEDIAN": float(row["a_median"]),
#             "M_PCT10": float(row["M_PCT10"]),
#             "M_MEDIAN": float(row["M_MEDIAN"]),
#             "M_PCT90": float(row["M_PCT90"])
#         }

# # Add education data
# for record in records:
#     soc_code = record["soc_code"]
#     edu_rows = education_df[(education_df["SOC"] == soc_code) | (education_df["SOC"] == "00-0000")]
#     for _, edu_row in edu_rows.iterrows():
#         est_code = edu_row["ESTIMATECODE"]
#         est_value = str(edu_row["ESTIMATE"])
#         if est_code in edu_map:
#             record["education"][edu_map[est_code]] = est_value

# # Add skills data
# for record in records:
#     soc_code = record["soc_code"]
#     skills_row = skills_df[skills_df["SOC_CODE"] == soc_code]
#     if not skills_row.empty:
#         raw_skills = skills_row.iloc[0]["TYPICAL_SKILLS"].replace("'", "\"")
#         try:
#             skills_list = json.loads(raw_skills)
#             record["typicalSkills"] = sorted(set(skills_list))
#         except json.JSONDecodeError:
#             print(f"⚠️ Error decoding skills for SOC {soc_code}")
#             record["typicalSkills"] = []

# # Write to output CSV
# output_file = "soc_compiled_output.csv"
# with open(output_file, "w", newline='', encoding="utf-8") as csvfile:
#     fieldnames = ["soc_code", "title", "description", "salary", "education", "typicalSkills"]
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#     writer.writeheader()
#     for rec in records:
#         writer.writerow({
#             "soc_code": rec["soc_code"],
#             "title": rec["title"],
#             "description": rec["description"],
#             "salary": json.dumps(rec["salary"]),
#             "education": json.dumps(rec["education"]),
#             "typicalSkills": json.dumps(rec["typicalSkills"])
#         })

# print(f"✅ Done! Output written to '{output_file}' with {len(records)} SOC records.")

import pandas as pd
import json
import csv
import os
from glob import glob
from datetime import datetime

# Directory setup
dataset_dir = "Datasets/"
salary_dir = f"{dataset_dir}Salaries/"
salary_files = sorted(glob(os.path.join(salary_dir, "filtered_occupation_data_*.csv")))

# Check salary files
if not salary_files:
    raise FileNotFoundError("❌ No salary_data_*.csv files found.")

print(f"✅ Found salary files: {[os.path.basename(f) for f in salary_files]}")

# Load all salary CSVs
salary_dfs = []
for file in salary_files:
    try:
        df = pd.read_csv(file)
        df["YEAR"] = int(file.split("_")[-1].split(".")[0])
        salary_dfs.append(df)
    except Exception as e:
        print(f"⚠️ Error reading {file}: {e}")

salary_df = pd.concat(salary_dfs, ignore_index=True)

# Load education, skills, and descriptions
education_df = pd.read_csv(os.path.join(dataset_dir, "education_data.csv"))
skills_df = pd.read_csv(os.path.join(dataset_dir, "skills_data.csv"))
description_df = pd.read_csv(os.path.join(dataset_dir, "description.csv"))
description_df["Code"] = description_df["Code"].astype(str)
description_map = description_df.set_index("Code")["Description"].to_dict()

# Ensure numeric conversion
salary_df["a_pct10"] = pd.to_numeric(salary_df["a_pct10"], errors="coerce")
salary_df["a_median"] = pd.to_numeric(salary_df["a_median"], errors="coerce")
salary_df["a_pct90"] = pd.to_numeric(salary_df["a_pct90"], errors="coerce")

# Monthly salaries
salary_df["M_PCT10"] = (salary_df["a_pct10"] / 12).round(2)
salary_df["M_MEDIAN"] = (salary_df["a_median"] / 12).round(2)
salary_df["M_PCT90"] = (salary_df["a_pct90"] / 12).round(2)

# Education mapping
edu_map = {
    "Less_than_hs": "LESS_THAN_HS",
    "hs_or_eq": "HIGH_SCHOOL",
    "Associate_degree": "ASSOCIATE",
    "Bachelor_degree": "BACHELOR",
    "Master_degree": "MASTERS",
    "Doctorate_degree": "DOCTORATE",
    "No_requirement": "NO_REQ",
    "Professional_degree": "PROFESSIONAL"
}

# Build records
records = []
grouped_salary = salary_df.groupby(["occ_code", "year"])

for (soc_code, year), group in grouped_salary:
    existing_record = next((r for r in records if r["soc_code"] == soc_code), None)

    if not existing_record:
        existing_record = {
            "soc_code": soc_code,
            "title": group.iloc[0]["occ_title"],
            "description": description_map.get(soc_code, ""),
            "salary": {},
            "education": {key: "" for key in edu_map.values()},
            "typicalSkills": []
        }
        records.append(existing_record)

    year_str = str(year)
    existing_record["salary"].setdefault(year_str, {})

    for _, row in group.iterrows():
        state = row["area_title"]
        industry = row["naics_title"]
        existing_record["salary"][year_str].setdefault(state, {})[industry] = {
            "A_MEDIAN": float(row["a_median"]),
            "M_PCT10": float(row["M_PCT10"]),
            "M_MEDIAN": float(row["M_MEDIAN"]),
            "M_PCT90": float(row["M_PCT90"])
        }

# Add education data
for record in records:
    soc_code = record["soc_code"]
    edu_rows = education_df[(education_df["SOC"] == soc_code) | (education_df["SOC"] == "00-0000")]
    for _, edu_row in edu_rows.iterrows():
        est_code = edu_row["ESTIMATECODE"]
        est_value = str(edu_row["ESTIMATE"])
        if est_code in edu_map:
            record["education"][edu_map[est_code]] = est_value

# Add skills data
for record in records:
    soc_code = record["soc_code"]
    skills_row = skills_df[skills_df["SOC_CODE"] == soc_code]
    if not skills_row.empty:
        raw_skills = skills_row.iloc[0]["TYPICAL_SKILLS"].replace("'", "\"")
        try:
            skills_list = json.loads(raw_skills)
            record["typicalSkills"] = sorted(set(skills_list))
        except json.JSONDecodeError:
            print(f"⚠️ Error decoding skills for SOC {soc_code}")
            record["typicalSkills"] = []

# Add timestamps
timestamp = datetime.utcnow().isoformat()

# Write to DynamoDB-ready CSV
output_file = "dynamodb_ready_soc_output.csv"
with open(output_file, "w", newline='', encoding="utf-8") as csvfile:
    fieldnames = [
        "soc_code",       # Partition key
        "title",
        "description",
        "salary",         # JSON string
        "education",      # JSON string
        "typicalSkills",  # JSON string
        "createdAt",
        "updatedAt"
    ]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for rec in records:
        writer.writerow({
            "soc_code": rec["soc_code"],
            "title": rec["title"],
            "description": rec["description"],
            "salary": json.dumps(rec["salary"], ensure_ascii=False),
            "education": json.dumps(rec["education"], ensure_ascii=False),
            "typicalSkills": json.dumps(rec["typicalSkills"], ensure_ascii=False),
            "createdAt": timestamp,
            "updatedAt": timestamp
        })

print(f"✅ DynamoDB-ready CSV written to '{output_file}' with {len(records)} records.")

✅ Found salary files: ['filtered_occupation_data_2016.csv', 'filtered_occupation_data_2017.csv', 'filtered_occupation_data_2018.csv', 'filtered_occupation_data_2019.csv', 'filtered_occupation_data_2020.csv', 'filtered_occupation_data_2021.csv', 'filtered_occupation_data_2022.csv', 'filtered_occupation_data_2023.csv', 'filtered_occupation_data_2024.csv']


  timestamp = datetime.utcnow().isoformat()


✅ DynamoDB-ready CSV written to 'dynamodb_ready_soc_output.csv' with 1550 records.


In [None]:
# # Cleans up the CSV file generated
# import csv
# import json
# import sys

# # ✅ Raise max CSV field size limit (handles very large salary fields)
# csv.field_size_limit(sys.maxsize)

# # ✅ File paths
# INPUT_CSV = "soc_compiled_output.csv"
# OUTPUT_CSV = "dynamodb_ready_output.csv"
# OUTPUT_JSON = "dynamodb_ready_output.json"

# # ✅ Utility functions
# def clean_json_field(raw_value):
#     try:
#         if raw_value.startswith('"') and raw_value.endswith('"'):
#             raw_value = raw_value[1:-1]
#         return json.loads(raw_value.replace('\\"', '"'))
#     except Exception as e:
#         print(f"⚠️ Failed to parse JSON object: {raw_value[:100]}... — {e}")
#         return {}

# def clean_array_field(raw_value):
#     try:
#         if raw_value.startswith('"') and raw_value.endswith('"'):
#             raw_value = raw_value[1:-1]
#         return json.loads(raw_value.replace('\\"', '"'))
#     except Exception as e:
#         print(f"⚠️ Failed to parse JSON array: {raw_value[:100]}... — {e}")
#         return []

# # ✅ Clean the data
# cleaned_rows = []

# with open(INPUT_CSV, "r", encoding="utf-8") as infile:
#     reader = csv.DictReader(infile)
#     for row in reader:
#         row["education"] = clean_json_field(row["education"])
#         row["salary"] = clean_json_field(row["salary"])
#         row["typicalSkills"] = clean_array_field(row["typicalSkills"])
#         cleaned_rows.append(row)

# # ✅ Write cleaned CSV
# with open(OUTPUT_CSV, "w", newline='', encoding="utf-8") as outfile:
#     fieldnames = cleaned_rows[0].keys()
#     writer = csv.DictWriter(outfile, fieldnames=fieldnames)
#     writer.writeheader()
#     for row in cleaned_rows:
#         row["education"] = json.dumps(row["education"], ensure_ascii=False)
#         row["salary"] = json.dumps(row["salary"], ensure_ascii=False)
#         row["typicalSkills"] = json.dumps(row["typicalSkills"], ensure_ascii=False)
#         writer.writerow(row)

# print(f"✅ Cleaned CSV written to: {OUTPUT_CSV}")

# # # ✅ Write cleaned JSON
# # with open(OUTPUT_JSON, "w", encoding="utf-8") as jsonfile:
# #     json.dump(cleaned_rows, jsonfile, indent=2, ensure_ascii=False)

# # print(f"✅ DynamoDB-ready JSON written to: {OUTPUT_JSON}")


⚠️ Failed to parse JSON object: {"2016": {"U.S.": {"Cross-industry": {"A_MEDIAN": 53640.0, "M_PCT10": 2530.83, "M_MEDIAN": 4470.0, "... — Invalid control character at: line 1 column 407243 (char 407242)
✅ Cleaned CSV written to: dynamodb_ready_output.csv


In [None]:
# import csv
# import json
# import sys

# csv.field_size_limit(sys.maxsize)

# INPUT_CSV = "dynamodb_ready_output.csv"
# OUTPUT_CSV = "dynamodb_ready_output_fixed.csv"

# def clean_json_field(raw_value):
#     try:
#         if raw_value.startswith('"') and raw_value.endswith('"'):
#             raw_value = raw_value[1:-1]
#         return json.loads(raw_value.replace('\\"', '"'))
#     except Exception as e:
#         print(f"⚠️ Failed to parse JSON object: {raw_value[:100]}... — {e}")
#         return {}

# def clean_array_field(raw_value):
#     try:
#         if raw_value.startswith('"') and raw_value.endswith('"'):
#             raw_value = raw_value[1:-1]
#         return json.loads(raw_value.replace('\\"', '"'))
#     except Exception as e:
#         print(f"⚠️ Failed to parse JSON array: {raw_value[:100]}... — {e}")
#         return []

# cleaned_rows = []

# with open(INPUT_CSV, "r", encoding="utf-8") as infile:
#     reader = csv.DictReader(infile)
#     for row in reader:
#         row["education"] = clean_json_field(row.get("education", ""))
#         row["salary"] = clean_json_field(row.get("salary", ""))
#         row["typicalSkills"] = clean_array_field(row.get("typicalSkills", "[]"))
#         cleaned_rows.append(row)

# with open(OUTPUT_CSV, "w", newline='', encoding="utf-8") as outfile:
#     fieldnames = cleaned_rows[0].keys()
#     writer = csv.DictWriter(outfile, fieldnames=fieldnames)
#     writer.writeheader()
#     for row in cleaned_rows:
#         row["education"] = json.dumps(row["education"], ensure_ascii=False)
#         row["salary"] = json.dumps(row["salary"], ensure_ascii=False)
#         row["typicalSkills"] = json.dumps(row["typicalSkills"], ensure_ascii=False)
#         writer.writerow(row)

# print(f"✅ Cleaned file written to: {OUTPUT_CSV}")


✅ Cleaned file written to: dynamodb_ready_output_fixed.csv


In [2]:
#Inserts the csv file into the DynamoDB database
import csv
import json
import boto3
from datetime import datetime

# Configuration
REGION = 'us-west-2'
#Chetna Table name: careerData-eh7gingt5zao7c7znywp6lvh7q-NONE
#Gary Table name: careerData-5oaexymterhv5cq3utxzzbxrqu-NONE
#Trupti Table name: careerData-mnwl2lvfqzhuzcdsnpnfazqvfm-NONE
TABLE_NAME = 'careerData-mnwl2lvfqzhuzcdsnpnfazqvfm-NONE'  
CSV_FILE_PATH = 'dynamodb_ready_soc_output.csv'
REJECTED_FILE_PATH = f'rejected-{CSV_FILE_PATH}'

# DynamoDB client
dynamodb = boto3.resource('dynamodb', region_name=REGION)
table = dynamodb.Table(TABLE_NAME)

# Tracking
total_uploaded = 0
total_rejected = 0
rejected_rows = []

# Upload a single record
def upload_record(row):
    global total_uploaded, total_rejected

    try:
        education = json.loads(row['education'] or '{}')
        salary = json.loads(row['salary'] or '{}')
    except Exception as e:
        rejected_rows.append(row)
        total_rejected += 1
        return

    item = {
        'occ_code': row['soc_code'],
        'occ_title': row['title'],
        'description': row.get('description', ''),
        'education': education,
        'salary': salary,
        'createdAt': datetime.utcnow().isoformat(),
        'updatedAt': datetime.utcnow().isoformat(),
    }

    try:
        table.put_item(Item=item)
        total_uploaded += 1
    except Exception as e:
        rejected_rows.append(row)
        total_rejected += 1

# Main CSV upload process
def upload_csv(filepath):
    with open(filepath, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            upload_record(row)

    # Write rejected rows if any
    if rejected_rows:
        with open(REJECTED_FILE_PATH, 'w', newline='', encoding='utf-8') as reject_file:
            writer = csv.DictWriter(reject_file, fieldnames=rejected_rows[0].keys())
            writer.writeheader()
            writer.writerows(rejected_rows)
        print(f"⚠️ {total_rejected} records rejected. Saved to {REJECTED_FILE_PATH}")

    print(f"✅ Upload complete. {total_uploaded} records uploaded, {total_rejected} rejected.")

# Start the process
if __name__ == '__main__':
    upload_csv(CSV_FILE_PATH)


Error: field larger than field limit (131072)

In [None]:
import pandas as pd
import json
import csv
import os
from glob import glob
from datetime import datetime, timezone
import boto3
from botocore.exceptions import BotoCoreError, ClientError

# DynamoDB Table Name
TABLE_NAME = 'careerData-mnwl2lvfqzhuzcdsnpnfazqvfm-NONE'

# Directory setup
dataset_dir = "Datasets/"
salary_dir = f"{dataset_dir}Salaries/"
salary_files = sorted(glob(os.path.join(salary_dir, "filtered_occupation_data_*.csv")))

# Check salary files
if not salary_files:
    raise FileNotFoundError("❌ No salary_data_*.csv files found.")

print(f"✅ Found salary files: {[os.path.basename(f) for f in salary_files]}")

# Load all salary CSVs
salary_dfs = []
for file in salary_files:
    try:
        df = pd.read_csv(file)
        df["YEAR"] = int(file.split("_")[-1].split(".")[0])
        salary_dfs.append(df)
    except Exception as e:
        print(f"⚠️ Error reading {file}: {e}")

salary_df = pd.concat(salary_dfs, ignore_index=True)

# Load education, skills, and descriptions
education_df = pd.read_csv(os.path.join(dataset_dir, "education_data.csv"))
skills_df = pd.read_csv(os.path.join(dataset_dir, "skills_data.csv"))
description_df = pd.read_csv(os.path.join(dataset_dir, "description.csv"))
description_df["Code"] = description_df["Code"].astype(str)
description_map = description_df.set_index("Code")["Description"].to_dict()

# Ensure numeric conversion
salary_df["a_pct10"] = pd.to_numeric(salary_df["a_pct10"], errors="coerce")
salary_df["a_median"] = pd.to_numeric(salary_df["a_median"], errors="coerce")
salary_df["a_pct90"] = pd.to_numeric(salary_df["a_pct90"], errors="coerce")

# Monthly salaries
salary_df["M_PCT10"] = (salary_df["a_pct10"] / 12).round(2)
salary_df["M_MEDIAN"] = (salary_df["a_median"] / 12).round(2)
salary_df["M_PCT90"] = (salary_df["a_pct90"] / 12).round(2)

# Education mapping
edu_map = {
    "Less_than_hs": "LESS_THAN_HS",
    "hs_or_eq": "HIGH_SCHOOL",
    "Associate_degree": "ASSOCIATE",
    "Bachelor_degree": "BACHELOR",
    "Master_degree": "MASTERS",
    "Doctorate_degree": "DOCTORATE",
    "No_requirement": "NO_REQ",
    "Professional_degree": "PROFESSIONAL"
}

# Build records
records = []
grouped_salary = salary_df.groupby(["occ_code", "year"])

for (soc_code, year), group in grouped_salary:
    existing_record = next((r for r in records if r["soc_code"] == soc_code), None)

    if not existing_record:
        existing_record = {
            "soc_code": soc_code,
            "title": group.iloc[0]["occ_title"],
            "description": description_map.get(soc_code, ""),
            "salary": {},
            "education": {key: "" for key in edu_map.values()},
            "typicalSkills": []
        }
        records.append(existing_record)

    year_str = str(year)
    existing_record["salary"].setdefault(year_str, {})

    for _, row in group.iterrows():
        state = row["area_title"]
        industry = row["naics_title"]
        existing_record["salary"][year_str].setdefault(state, {})[industry] = {
            "A_MEDIAN": float(row["a_median"]),
            "M_PCT10": float(row["M_PCT10"]),
            "M_MEDIAN": float(row["M_MEDIAN"]),
            "M_PCT90": float(row["M_PCT90"])
        }

# Add education data
for record in records:
    soc_code = record["soc_code"]
    edu_rows = education_df[(education_df["SOC"] == soc_code) | (education_df["SOC"] == "00-0000")]
    for _, edu_row in edu_rows.iterrows():
        est_code = edu_row["ESTIMATECODE"]
        est_value = str(edu_row["ESTIMATE"])
        if est_code in edu_map:
            record["education"][edu_map[est_code]] = est_value

# Add skills data
for record in records:
    soc_code = record["soc_code"]
    skills_row = skills_df[skills_df["SOC_CODE"] == soc_code]
    if not skills_row.empty:
        raw_skills = skills_row.iloc[0]["TYPICAL_SKILLS"].replace("'", "\"")
        try:
            skills_list = json.loads(raw_skills)
            record["typicalSkills"] = sorted(set(skills_list))
        except json.JSONDecodeError:
            print(f"⚠️ Error decoding skills for SOC {soc_code}")
            record["typicalSkills"] = []

# Add timestamps
timestamp = datetime.now(timezone.utc).isoformat()

# Write to CSV (optional, for backup)
output_file = "dynamodb_ready_soc_output.csv"
with open(output_file, "w", newline='', encoding="utf-8") as csvfile:
    fieldnames = [
        "soc_code", "title", "description", "salary", "education", "typicalSkills", "createdAt", "updatedAt"
    ]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for rec in records:
        writer.writerow({
            "soc_code": rec["soc_code"],
            "title": rec["title"],
            "description": rec["description"],
            "salary": json.dumps(rec["salary"], ensure_ascii=False),
            "education": json.dumps(rec["education"], ensure_ascii=False),
            "typicalSkills": json.dumps(rec["typicalSkills"], ensure_ascii=False),
            "createdAt": timestamp,
            "updatedAt": timestamp
        })

print(f"✅ DynamoDB-ready CSV written to '{output_file}' with {len(records)} records.")

# Upload to DynamoDB
def batch_write_to_dynamodb(table_name, items):
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table(table_name)

    with table.batch_writer(overwrite_by_pkeys=["soc_code"]) as batch:
        for item in items:
            try:
                batch.put_item(Item=item)
            except (BotoCoreError, ClientError) as e:
                print(f"❌ Failed to insert item {item.get('soc_code')}: {e}")

# Format items for DynamoDB insertion
dynamodb_items = []
for rec in records:
    item = {
        "soc_code": rec["soc_code"],
        "title": rec["title"],
        "description": rec["description"],
        "salary": json.dumps(rec["salary"], ensure_ascii=False),
        "education": json.dumps(rec["education"], ensure_ascii=False),
        "typicalSkills": json.dumps(rec["typicalSkills"], ensure_ascii=False),
        "createdAt": timestamp,
        "updatedAt": timestamp
    }
    dynamodb_items.append(item)

print(f"☁️ Uploading {len(dynamodb_items)} records to DynamoDB table '{TABLE_NAME}'...")
batch_write_to_dynamodb(TABLE_NAME, dynamodb_items)
print(f"✅ Upload complete!")


✅ Found salary files: ['filtered_occupation_data_2016.csv', 'filtered_occupation_data_2017.csv', 'filtered_occupation_data_2018.csv', 'filtered_occupation_data_2019.csv', 'filtered_occupation_data_2020.csv', 'filtered_occupation_data_2021.csv', 'filtered_occupation_data_2022.csv', 'filtered_occupation_data_2023.csv', 'filtered_occupation_data_2024.csv']


  timestamp = datetime.utcnow().isoformat()


✅ DynamoDB-ready CSV written to 'dynamodb_ready_soc_output.csv' with 1550 records.
☁️ Uploading 1550 records to DynamoDB table 'careerData-mnwl2lvfqzhuzcdsnpnfazqvfm-NONE'...
❌ Failed to insert item 11-3020: An error occurred (413) when calling the BatchWriteItem operation: 
❌ Failed to insert item 11-9039: An error occurred (ValidationException) when calling the BatchWriteItem operation: Item size has exceeded the maximum allowed size
❌ Failed to insert item 11-9171: An error occurred (ValidationException) when calling the BatchWriteItem operation: Item size has exceeded the maximum allowed size
❌ Failed to insert item 13-1081: An error occurred (ValidationException) when calling the BatchWriteItem operation: Item size has exceeded the maximum allowed size
❌ Failed to insert item 13-2041: An error occurred (413) when calling the BatchWriteItem operation: 
❌ Failed to insert item 15-1131: An error occurred (ValidationException) when calling the BatchWriteItem operation: Item size has e

In [None]:
import math
import pandas as pd
import json
import csv
import os
from glob import glob
from datetime import datetime, timezone
import uuid
import boto3
from botocore.exceptions import ClientError, BotoCoreError
import time
from decimal import Decimal

# Insert your table names here
CAREER_TABLE = 'your-careerData-table-name'
SALARY_TABLE = 'your-salaryData-table-name'

# Directory setup
dataset_dir = "Datasets/"
salary_dir = f"{dataset_dir}Salaries/"
salary_files = sorted(glob(os.path.join(salary_dir, "filtered_occupation_data_*.csv")))

if not salary_files:
    raise FileNotFoundError("❌ No salary_data_*.csv files found.")

print(f"✅ Found salary files: {[os.path.basename(f) for f in salary_files]}")

def safe_decimal(val):
    try:
        if val is None:
            return None
        dec = Decimal(str(val))
        if math.isnan(float(dec)) or math.isinf(float(dec)):
            return None
        return dec
    except (InvalidOperation, ValueError, TypeError):
        return None

# Load salary data
salary_dfs = []
for file in salary_files:
    try:
        df = pd.read_csv(file)
        df["YEAR"] = int(file.split("_")[-1].split(".")[0])
        salary_dfs.append(df)
    except Exception as e:
        print(f"⚠️ Error reading {file}: {e}")

salary_df = pd.concat(salary_dfs, ignore_index=True)

# Load metadata
education_df = pd.read_csv(os.path.join(dataset_dir, "education_data.csv"))
skills_df = pd.read_csv(os.path.join(dataset_dir, "skills_data.csv"))
description_df = pd.read_csv(os.path.join(dataset_dir, "description.csv"))
description_df["Code"] = description_df["Code"].astype(str)
description_map = description_df.set_index("Code")["Description"].to_dict()

# Convert salary fields to numeric
salary_df["a_pct10"] = pd.to_numeric(salary_df["a_pct10"], errors="coerce")
salary_df["a_median"] = pd.to_numeric(salary_df["a_median"], errors="coerce")
salary_df["a_pct90"] = pd.to_numeric(salary_df["a_pct90"], errors="coerce")
salary_df["M_PCT10"] = (salary_df["a_pct10"] / 12).round(2)
salary_df["M_MEDIAN"] = (salary_df["a_median"] / 12).round(2)
salary_df["M_PCT90"] = (salary_df["a_pct90"] / 12).round(2)

# Education mapping
edu_map = {
    "Less_than_hs": "LESS_THAN_HS",
    "hs_or_eq": "HIGH_SCHOOL",
    "Associate_degree": "ASSOCIATE",
    "Bachelor_degree": "BACHELOR",
    "Master_degree": "MASTERS",
    "Doctorate_degree": "DOCTORATE",
    "No_requirement": "NO_REQ",
    "Professional_degree": "PROFESSIONAL"
}

# Timestamps
timestamp = datetime.now(timezone.utc).isoformat()

# Generate salaryData records and build salary_map
career_records = []
salary_records = []
salary_map_by_occ = {}

for _, row in salary_df.iterrows():
    occ_code = row["occ_code"]
    year = str(row["YEAR"])
    area = row["area_title"]
    industry = row["naics_title"]

    guid = str(uuid.uuid4())

    # Create salary record
    salary_records.append({
        "guid": guid,
        "year": year,
        "area": area,
        "occupation": occ_code,
        "industry": industry,
        "annual_median": safe_decimal(str(row["a_median"])),
        "monthly_median": safe_decimal(str(row["M_MEDIAN"])),
        "monthly_pct10": safe_decimal(str(row["M_PCT10"])),
        "monthly_pct90": safe_decimal(str(row["M_PCT90"]))
    })

    # Build mapping for careerData
    if occ_code not in salary_map_by_occ:
        salary_map_by_occ[occ_code] = {}
    if year not in salary_map_by_occ[occ_code]:
        salary_map_by_occ[occ_code][year] = []
    salary_map_by_occ[occ_code][year].append(guid)

# Build careerData records
unique_occ_codes = salary_df["occ_code"].unique()

for occ_code in unique_occ_codes:
    title = salary_df[salary_df["occ_code"] == occ_code].iloc[0]["occ_title"]
    description = description_map.get(occ_code, "")

    # Education
    education = {val: "" for val in edu_map.values()}
    edu_rows = education_df[(education_df["SOC"] == occ_code) | (education_df["SOC"] == "00-0000")]
    for _, edu_row in edu_rows.iterrows():
        est_code = edu_row["ESTIMATECODE"]
        if est_code in edu_map:
            education[edu_map[est_code]] = str(edu_row["ESTIMATE"])

    # Skills
    skills_row = skills_df[skills_df["SOC_CODE"] == occ_code]
    typicalSkills = []
    if not skills_row.empty:
        raw_skills = skills_row.iloc[0]["TYPICAL_SKILLS"].replace("'", "\"")
        try:
            typicalSkills = sorted(set(json.loads(raw_skills)))
        except json.JSONDecodeError:
            print(f"⚠️ Error decoding skills for SOC {occ_code}")

    # Final record
    career_records.append({
        "occ_code": occ_code,
        "occ_title": title,
        "description": description,
        "salary": salary_map_by_occ.get(occ_code, {}),
        "education": education,
        "skills": typicalSkills
    })

# Optional backup to CSV
with open("careerData_output.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=[
        "occ_code", "occ_title", "description", "salary", "education", "skills"
    ])
    writer.writeheader()
    for rec in career_records:
        writer.writerow({
            "occ_code": rec["occ_code"],
            "occ_title": rec["occ_title"],
            "description": rec["description"],
            "salary": json.dumps(rec["salary"]),
            "education": json.dumps(rec["education"]),
            "skills": json.dumps(rec["skills"])
        })

with open("salaryData_output.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=[
        "guid", "year", "area", "occupation", "industry",
        "annual_median", "monthly_median", "monthly_pct10", "monthly_pct90"
    ])
    writer.writeheader()
    for rec in salary_records:
        writer.writerow(rec)

print("✅ Backup files written: 'careerData_output.csv', 'salaryData_output.csv'")

# Upload function
def batch_write(table_name, items, key_field, max_retries=3):
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table(table_name)
    failed_items = []

    with table.batch_writer(overwrite_by_pkeys=[key_field]) as batch:
        for item in items:
            retries = 0
            while retries < max_retries:
                try:
                    batch.put_item(Item=item)
                    break
                except (ClientError, BotoCoreError) as e:
                    retries += 1
                    print(f"❌ Retry {retries}/{max_retries} for item {item.get(key_field)} — {e}")
                    time.sleep(1)
            else:
                failed_items.append(item.get(key_field))

    if failed_items:
        print(f"⚠️ Failed to upload {len(failed_items)} items: {failed_items}")
    else:
        print(f"✅ All items uploaded successfully to {table_name}")

# Upload salaryData
print(f"☁️ Uploading {len(salary_records)} salaryData items to DynamoDB...")
batch_write(SALARY_TABLE, salary_records, key_field="guid")

# Upload careerData
print(f"☁️ Uploading {len(career_records)} careerData items to DynamoDB...")
batch_write(CAREER_TABLE, career_records, key_field="occ_code")

print("🎉 Done! All records processed and uploaded.")

✅ Found salary files: ['filtered_occupation_data_2016.csv', 'filtered_occupation_data_2017.csv', 'filtered_occupation_data_2018.csv', 'filtered_occupation_data_2019.csv', 'filtered_occupation_data_2020.csv', 'filtered_occupation_data_2021.csv', 'filtered_occupation_data_2022.csv', 'filtered_occupation_data_2023.csv', 'filtered_occupation_data_2024.csv']
