This code works; run experiments below.

// Careers that can be entered with a bachelor’s degree are highlighted in BLUE,
// and careers that require a graduate degree are highlighted in GREEN.

In [6]:
import re
import json
from docx import Document
from docx.oxml.ns import qn

# Function to check if run is highlighted and get the highlight color
def get_highlight_color(run):
    """Return the highlight value of *run*, or ``None`` when it has none.

    Searches the run's XML element for ``w:highlight`` descendants and reads
    the ``w:val`` attribute of the first match.
    """
    matches = run.element.xpath('.//w:highlight')
    if not matches:
        return None
    # Only the first highlight element is consulted.
    return matches[0].get(qn('w:val'))

def read_docx(file_path, start_line, end_line):
    """Read a window of paragraphs from a .docx file.

    Paragraphs are numbered from 0 in document order. Those whose index
    satisfies ``start_line <= index < end_line`` are returned.

    Returns:
        A list of ``(text, highlight_color)`` tuples, where ``text`` is the
        stripped paragraph text and ``highlight_color`` is the first truthy
        highlight value found among the paragraph's runs (``None`` if no run
        is highlighted).
    """
    collected = []
    for index, para in enumerate(Document(file_path).paragraphs):
        if index < start_line or index >= end_line:
            continue
        # next() stops at the first run that reports a highlight.
        first_color = next(
            (shade for shade in map(get_highlight_color, para.runs) if shade),
            None,
        )
        collected.append((para.text.strip(), first_color))
    return collected

def _new_job_entry(highlight_color):
    """Return an empty job record; the degree label is derived from the highlight value."""
    if highlight_color == 'cyan':
        degree = "Bachelor's"
    elif highlight_color == 'green':
        degree = "Graduate Degree"
    else:
        degree = ''
    return {'links': [], 'videos': [], 'degree_required': degree}


def parse_lines(lines):
    """Group ``(text, highlight_color)`` lines into categories, jobs, links and videos.

    Each line is classified in this order:

    1. Contains a YouTube URL: stored under the current job's ``'videos'``.
    2. Contains any other http(s) URL: stored under the current job's
       ``'links'``, with the text before the URL as its ``'category'``.
    3. Starts with "undefined" (case-insensitive): skipped.
    4. Blank or whitespace-only: acts as a separator. If the current
       category has jobs, it is saved and the state is reset; otherwise the
       line is ignored. Blank lines never become category or job titles.
    5. Any other text: the first such line of a group is the main category,
       every later one starts a new job.

    URLs and videos that appear before any job has been started are dropped.

    Args:
        lines: Iterable of ``(text, highlight_color)`` tuples.

    Returns:
        A list of ``{'main_category': str, 'jobs': dict}`` entries, where
        ``jobs`` maps each job title to
        ``{'links': [...], 'videos': [...], 'degree_required': str}``.
        A category with no jobs is not emitted.
    """
    url_pattern = re.compile(r'https?://\S+')
    youtube_pattern = re.compile(r'(https?://(?:www\.)?youtube\.com/watch\?v=[\w-]+|https?://(?:www\.)?youtu\.be/[\w-]+)')
    video_id_pattern = re.compile(r'(?:v=|youtu\.be/)([\w-]+)')
    data = []
    current_main_category = None
    current_job = None
    jobs = {}

    for line, highlight_color in lines:
        youtube_match = youtube_pattern.search(line)
        if youtube_match:
            if current_job is not None:
                youtube_url = youtube_match.group()
                jobs[current_job]['videos'].append({
                    # Only the video ID is stored, not iframe HTML.
                    'video_id': video_id_pattern.search(youtube_url).group(1),
                    'url': youtube_url,
                })
            continue

        url_match = url_pattern.search(line)
        if url_match:
            if current_job is not None:
                jobs[current_job]['links'].append({
                    'url': url_match.group(),
                    # The label is everything that precedes the URL.
                    'category': line[:url_match.start()].strip(),
                })
            continue

        if line.lower().startswith("undefined"):
            continue

        if not line.strip():
            # Separator: close the current category only if it has jobs, so
            # leading or repeated blank lines cannot corrupt the state.
            if jobs:
                data.append({
                    'main_category': current_main_category,
                    'jobs': jobs
                })
                current_main_category = None
                current_job = None
                jobs = {}
            continue

        if current_main_category is None:
            current_main_category = line
        else:
            current_job = line
            jobs[current_job] = _new_job_entry(highlight_color)

    # Flush the last category; jobs can only be non-empty when one is set.
    if jobs:
        data.append({
            'main_category': current_main_category,
            'jobs': jobs
        })

    return data

def save_to_json(data, output_file):
    """Serialize *data* as JSON with 4-space indentation into *output_file*.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(output_file, 'w') as handle:
        json.dump(obj=data, fp=handle, indent=4)

def main(file_path, start_line, end_line, output_file):
    """Run the full pipeline: read the .docx window, parse it, write JSON.

    Args:
        file_path: Path of the .docx document passed to ``read_docx``.
        start_line: First paragraph index to read, passed to ``read_docx``.
        end_line: Stop paragraph index, passed to ``read_docx``.
        output_file: Destination path passed to ``save_to_json``.
    """
    save_to_json(
        parse_lines(read_docx(file_path, start_line, end_line)),
        output_file,
    )

# Input document, destination JSON file, and the paragraph window to convert
file_path = 'careers.docx'
output_file = 'jobs.json'
start_line = 171  # First paragraph index to read (inclusive)
end_line = start_line + 3000  # Stop index (exclusive); widen the window as needed

main(
    file_path=file_path,
    start_line=start_line,
    end_line=end_line,
    output_file=output_file,
)


// Careers that can be entered with a bachelor’s degree are highlighted in BLUE,
// and careers that require a graduate degree are highlighted in GREEN.

## Generating a site with jobs tables in json file