In [1]:
import pandas as pd
import json
import os

def load_file(filepath):
    try:
        df = pd.read_csv(filepath, low_memory=False)
        df = clean_column_names(df)
        df = normalize_column_aliases(df)
        return df
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return pd.DataFrame()

def load_education(base_path='Datasets/'): 
    try:
        edu_df = pd.read_csv(os.path.join(base_path, 'ors_data.csv'), dtype=str, low_memory=False)
        edu_df = clean_column_names(edu_df)

        if 'SOC' not in edu_df.columns or 'ESTIMATECODE' not in edu_df.columns or 'ESTIMATE' not in edu_df.columns:
            print("❌ Required columns not found in ORS file.")
            return {}

        # Define mapping from raw ESTIMATECODE to standard keys
        code_map = {
            'Less_than_hs': 'LESS_THAN_HS',
            'hs_or_eq': 'HIGH_SCHOOL',
            'Associate_degree': 'ASSOCIATE',
            'Bachelor_degree': 'BACHELOR',
            'Master_degree': 'MASTERS',
            'Doctorate_degree': 'DOCTORATE',
            'No_requirement': 'NO_REQ',
            'Professional_degree': 'PROFESSIONAL'
        }

        edu_df = edu_df[edu_df['ESTIMATECODE'].isin(code_map.keys())]
        edu_df['ESTIMATECODE'] = edu_df['ESTIMATECODE'].map(code_map)

        # Remove < signs and convert to numeric safely
        edu_df['ESTIMATE'] = edu_df['ESTIMATE'].str.replace('<', '', regex=False).str.strip()

        pivot_df = edu_df.pivot_table(index='SOC', columns='ESTIMATECODE', values='ESTIMATE', aggfunc='first').fillna('')
        pivot_df.index.name = 'OCC_CODE'

        return pivot_df.to_dict('index')

    except Exception as e:
        print(f"Error loading education file: {e}")
        return {}

def clean_column_names(df):
    df.columns = [c.strip().upper().replace(' ', '_') for c in df.columns]
    return df

def normalize_column_aliases(df):
    column_renames = {
        'STATE': 'AREA_TITLE',
        'area_title': 'AREA_TITLE',
        'naics_title': 'NAICS_TITLE'
    }
    df.rename(columns={k: v for k, v in column_renames.items() if k in df.columns}, inplace=True)
    return df

def process_national(df, year, education_data):
    return process_generic(df, year, education_data, area_col='AREA_TITLE', industry_col=None, industry_default='Cross-industry')

def process_state(df, year, education_data):
    return process_generic(df, year, education_data, area_col='AREA_TITLE', industry_col=None, industry_default='Cross-industry')

def process_industry(df, year, education_data):
    return process_generic(df, year, education_data, area_col='AREA_TITLE', industry_col='NAICS_TITLE', industry_default='Unknown')

def process_allsectors(df, year, education_data):
    return process_generic(df, year, education_data, area_col='AREA_TITLE', industry_col=None, industry_default='Cross-industry')

def process_generic(df, year, education_data, area_col='AREA_TITLE', industry_col=None, industry_default=None):
    records = {}
    for _, row in df.iterrows():
        occ_code = str(row.get('OCC_CODE', '')).strip()
        occ_title = str(row.get('OCC_TITLE', '')).strip()
        area = str(row.get(area_col, 'U.S.')).strip()
        industry = str(row.get(industry_col, industry_default)) if industry_col else industry_default

        if not occ_code:
            continue

        key = (occ_code, occ_title)

        if key not in records:
            education_entry = education_data.get(occ_code, {})
            records[key] = {
                'occ_code': occ_code,
                'occ_title': occ_title,
                'salary': {},
                'education': {
                    'LESS_THAN_HS': education_entry.get('LESS_THAN_HS', ''),
                    'HIGH_SCHOOL': education_entry.get('HIGH_SCHOOL', ''),
                    'ASSOCIATE': education_entry.get('ASSOCIATE', ''),
                    'BACHELOR': education_entry.get('BACHELOR', ''),
                    'MASTERS': education_entry.get('MASTERS', ''),
                    'DOCTORATE': education_entry.get('DOCTORATE', ''),
                    'NO_REQ': education_entry.get('NO_REQ', ''),
                    'PROFESSIONAL': education_entry.get('PROFESSIONAL', '')
                }
            }

        salary_entry = {
            'A_MEDIAN': try_float(row.get('A_MEDIAN')),
            'M_PCT10': safe_divide_by_12(row.get('A_PCT10', '')),
            'M_MEDIAN': safe_divide_by_12(row.get('A_MEDIAN', '')),
            'M_PCT90': safe_divide_by_12(row.get('A_PCT90', '')),
        }

        if area not in records[key]['salary']:
            records[key]['salary'][area] = {}

        if year not in records[key]['salary']:
            records[key]['salary'][year] = {}
        if area not in records[key]['salary'][year]:
            records[key]['salary'][year][area] = {}
            records[key]['salary'][year][area][industry] = salary_entry
    
        return records

def try_float(val):
    try:
        return float(val)
    except (ValueError, TypeError):
        return None
    
def safe_divide_by_12(value):
    try:
        return str(round(float(value) / 12, 2))
    except:
        return ""

def merge_records(record_sets):
    merged = {}
    for record_set in record_sets:
        for key, value in record_set.items():
            if key not in merged:
                merged[key] = value
            else:
                for area, industries in value['salary'].items():
                    if area not in merged[key]['salary']:
                        merged[key]['salary'][area] = {}
                    merged[key]['salary'][area].update(industries)
    return merged

def generate_output(merged_records, output_csv_path):
    output_rows = []

    for record in merged_records.values():
        output_rows.append({
            'occ_code': record['occ_code'],
            'occ_title': record['occ_title'],
            'salary': json.dumps(record['salary']).replace('"', '\\"'),
            'education': json.dumps(record['education']).replace('"', '\\"')
        })

    df_output = pd.DataFrame(output_rows)
    df_output.to_csv(output_csv_path, index=False)
    print(f"✅ Output CSV generated: {output_csv_path}")

def load_skills(filepath='Datasets/grouped_skills.csv'):
    try:
        skills_df = pd.read_csv(filepath, dtype=str, low_memory=False)
        skills_df = clean_column_names(skills_df)
        print (f"✅ Skills file loaded: {skills_df.columns}")
        # Ensure column names are consistent
        # Check for the presence of 'SOC_CODE' or 'SOC' columns
        if 'SOC_CODE' not in skills_df.columns and 'SOC' not in skills_df.columns:
            print(f"❌ 'SOC_CODE' or 'SOC' column not found in skills file. Available columns: {skills_df.columns.tolist()}")
            return {}
        if 'SOC' in skills_df.columns:
            skills_df.rename(columns={'SOC': 'SOC_CODE'}, inplace=True)

        if 'TYPICAL_SKILLS' not in skills_df.columns:
            print("❌ 'TYPICAL_SKILLS' column not found in skills file.")
            return {}

        skills_map = skills_df.set_index('SOC_CODE')['TYPICAL_SKILLS'].to_dict()
        return skills_map
    except Exception as e:
        print(f"Error loading skills file: {e}")
        return {}

if __name__ == "__main__":
    combined_records = []
    education_data = load_education()  # Load once for all years
    skills_data = load_skills()  # Load skills data

    for year in range(2016, 2017):
        print(f"\n🔄 Processing year: {year}")
        base_path = f'Datasets/{year}'
        
        national_df = load_file(os.path.join(base_path, f'national_{year}.csv'))
        state_df = load_file(os.path.join(base_path, f'state_{year}.csv'))
        industry_df = load_file(os.path.join(base_path, f'industry_{year}.csv'))
        allsectors_df = load_file(os.path.join(base_path, f'allsectors_{year}.csv'))

        records_national = process_national(national_df, year, education_data)
        records_state = process_state(state_df, year, education_data)
        records_industry = process_industry(industry_df, year, education_data)
        records_allsectors = process_allsectors(allsectors_df, year, education_data)

        merged = merge_records([records_national, records_state, records_industry, records_allsectors])
        combined_records.append(merged)

    # Merge across all years
    all_years_merged = merge_records(combined_records)

    # Append skills data to the merged records
    for key, record in all_years_merged.items():
        occ_code = record['occ_code']
        record['TYPICAL_SKILLS'] = skills_data.get(occ_code, '')

    output_path = 'dynamodb_insert_2016_2024_combined.csv'
    generate_output(all_years_merged, output_path)


Error loading education file: [Errno 2] No such file or directory: 'Datasets/ors_data.csv'
Error loading skills file: [Errno 2] No such file or directory: 'Datasets/grouped_skills.csv'

🔄 Processing year: 2016
Error loading Datasets/2016/national_2016.csv: [Errno 2] No such file or directory: 'Datasets/2016/national_2016.csv'
Error loading Datasets/2016/state_2016.csv: [Errno 2] No such file or directory: 'Datasets/2016/state_2016.csv'
Error loading Datasets/2016/industry_2016.csv: [Errno 2] No such file or directory: 'Datasets/2016/industry_2016.csv'
Error loading Datasets/2016/allsectors_2016.csv: [Errno 2] No such file or directory: 'Datasets/2016/allsectors_2016.csv'


AttributeError: 'NoneType' object has no attribute 'items'