# In [2]:
import csv
import json

def _clean_field(value):
    """Return *value* stripped, with embedded line breaks turned into spaces."""
    # The CSV is opened with newline='', so quoted fields keep '\r\n' and bare
    # '\r' verbatim. Replace '\r\n' first so a Windows line break becomes one
    # space rather than a leftover carriage return plus a space.
    return (
        value.strip()
        .replace('\r\n', ' ')
        .replace('\r', ' ')
        .replace('\n', ' ')
    )


def csv_to_json(input_file, output_file):
    """Convert a three-column CSV (id, text, status) into a JSON array file.

    Each accepted row becomes an object with the keys "id", "text" and
    "status". The id and text values are stripped and have embedded line
    breaks replaced by spaces; the status is stripped before it is checked
    against the allowed values 'OnSite', 'Remote' and 'Hybrid'.

    Rows that do not have exactly three columns, or whose status is not one
    of the allowed values, are reported on stdout and skipped. The first
    record is treated as a header (and skipped without a message) unless it
    is itself a well-formed data row, so header-less files keep their first
    row.

    Args:
        input_file: Path of the UTF-8 encoded CSV file to read.
        output_file: Path of the JSON file to write (overwritten if present).

    Raises:
        OSError: If either file cannot be opened.
    """
    valid_statuses = {'OnSite', 'Remote', 'Hybrid'}
    output_data = []

    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        # row_num counts CSV records, not physical lines: a quoted field that
        # spans several lines still counts as a single row.
        for row_num, row in enumerate(csv.reader(infile), 1):
            # Status is normalised the same way as the other columns so that
            # stray surrounding whitespace does not invalidate a good row.
            status = row[2].strip() if len(row) == 3 else None

            # Only drop the first record when it really looks like a header,
            # i.e. when it is not a valid data row in its own right.
            if row_num == 1 and status not in valid_statuses:
                continue

            if len(row) != 3:
                print(f"Invalid row number: {row_num} expected 3 columns, got {len(row)} columns, skipped")
                continue

            if status not in valid_statuses:
                # Report the raw value so the message shows exactly what was read.
                print(f"Invalid row number: {row_num} invalid status '{row[2]}', skipped")
                continue

            output_data.append({
                "id": _clean_field(row[0]),
                "text": _clean_field(row[1]),
                "status": status,
            })

    # Write the collected data as JSON with pretty formatting
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(output_data, outfile, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    # Script entry point: convert the development-set CSV into its JSON form.
    source_csv = '../raw_data/work_arrangements_development_set.csv'
    target_json = '../processed_data/processed_work_arrangements_development_set.json'
    csv_to_json(source_csv, target_json)
