In [3]:
import re
import json
from docx import Document

def read_docx(file_path, start_line, end_line):
    """Return the stripped text of a window of paragraphs from a .docx file.

    Paragraphs are numbered from 0 in the order ``Document(file_path).paragraphs``
    yields them; a paragraph is kept when ``start_line <= index < end_line``.

    Args:
        file_path: Path passed to ``Document`` to open the file.
        start_line: Index of the first paragraph to keep (inclusive).
        end_line: Index at which to stop (exclusive).

    Returns:
        list[str]: Text of each selected paragraph with leading and trailing
        whitespace removed.
    """
    document = Document(file_path)
    return [
        para.text.strip()
        for index, para in enumerate(document.paragraphs)
        if start_line <= index < end_line
    ]

def parse_lines(lines):
    """Group job titles and their links under main categories.

    The input is treated as sections separated by blank lines. Inside a
    section the first plain line (non-blank, no URL) is the main category and
    every later plain line starts a job. A line containing an ``http(s)://``
    URL is attached to the most recent job as ``{'url', 'category'}``, where
    ``category`` is the text preceding the URL. URL lines seen before any job
    exists are dropped. Lines starting with "undefined" (case-insensitive)
    are ignored.

    Blank (empty or whitespace-only) lines are separators only: they are
    never stored as a category or job name. A blank line closes the current
    section once it has at least one job; otherwise it is skipped, so leading
    or repeated blank lines cannot swallow the section that follows them.

    A job title that repeats within a section reuses the existing entry, so
    links collected earlier for that title are kept.

    Args:
        lines: Iterable of text lines (e.g. paragraph texts).

    Returns:
        list[dict]: One ``{'main_category': str, 'jobs': {title: {'links':
        [...]}}}`` entry per section that has at least one job, in input order.
    """
    url_pattern = re.compile(r'https?://\S+')
    data = []
    current_main_category = None
    current_job = None
    jobs = {}

    for line in lines:
        url_match = url_pattern.search(line)
        if url_match:
            # Links belong to the most recent job; without one there is
            # nothing to attach them to.
            if current_job is not None:
                jobs[current_job]['links'].append({
                    'url': url_match.group(),
                    # Everything before the URL labels the link.
                    'category': line[:url_match.start()].strip()
                })
            continue

        if line.lower().startswith("undefined"):
            # Skip lines starting with "undefined"
            continue

        if not line.strip():
            # Blank line: section separator. Only close a section that
            # actually has jobs; never treat the blank as a name.
            if current_job is not None:
                data.append({
                    'main_category': current_main_category,
                    'jobs': jobs
                })
                current_main_category = None
                current_job = None
                jobs = {}
            continue

        if current_main_category is None:
            # First plain line of a section is its main category.
            current_main_category = line
        else:
            # Any later plain line starts (or resumes) a job.
            current_job = line
            jobs.setdefault(current_job, {'links': []})

    # Flush the last section when the input does not end with a blank line.
    if current_main_category is not None and jobs:
        data.append({
            'main_category': current_main_category,
            'jobs': jobs
        })

    return data

def save_to_json(data, output_file):
    """Serialize ``data`` as JSON (4-space indent) into ``output_file``.

    The target is opened in text write mode, so any existing content is
    replaced.

    Args:
        data: JSON-serializable object to write.
        output_file: Destination path.
    """
    with open(output_file, mode='w') as handle:
        json.dump(data, handle, indent=4)

def main(file_path, start_line, end_line, output_file):
    """Run the pipeline: read a paragraph window, parse it, write JSON.

    Args:
        file_path: Path of the .docx file to read.
        start_line: First paragraph index to read (inclusive).
        end_line: Paragraph index at which to stop (exclusive).
        output_file: Path of the JSON file to write.
    """
    paragraphs = read_docx(file_path, start_line, end_line)
    parsed = parse_lines(paragraphs)
    save_to_json(parsed, output_file)

# Input document, output file, and the window of paragraphs to process
file_path = 'careers.docx'
output_file = 'jobs.json'
start_line = 171  # First paragraph index to read (inclusive)
end_line = start_line + 3000  # Stop index (exclusive); adjust as needed

main(
    file_path=file_path,
    start_line=start_line,
    end_line=end_line,
    output_file=output_file,
)


In [4]:
from docx import Document
from docx.oxml.ns import qn

# Function to check if run is highlighted
def is_highlighted(run):
    """Tell whether a run contains a ``w:highlight`` element.

    Args:
        run: Object exposing ``element.xpath``; here, the runs of a loaded
            document.

    Returns:
        bool: True when the XPath query ``.//w:highlight`` under the run's
        element yields a non-empty result, otherwise False.
    """
    matches = run.element.xpath('.//w:highlight')
    return True if matches else False

# Open the document; the path is named once and reused
file_path = 'careers.docx'
doc = Document(file_path)

# Body paragraphs: report every run that carries highlighting
for para in doc.paragraphs:
    for run in para.runs:
        if not is_highlighted(run):
            continue
        print(f"Highlighted text: {run.text}")

# Same check for the paragraphs inside every table cell
for table in doc.tables:
    for row in table.rows:
        for cell in row.cells:
            for para in cell.paragraphs:
                for run in para.runs:
                    if not is_highlighted(run):
                        continue
                    print(f"Highlighted text in table: {run.text}")


Highlighted text: YELLOW
Highlighted text: BLUE
Highlighted text: GREEN
Highlighted text: Business
Highlighted text: , Advertising,
Highlighted text:  and Finance
Highlighted text: Advertising Sales Representative 
Highlighted text: / Agent / Executive
Highlighted text: Assistant Bank Manager
Highlighted text: Claims Supervisor
Highlighted text: Customer Service Representative
Highlighted text: Customer Service Representative Supervisor
Highlighted text: Department Manager
Highlighted text:  
Highlighted text: Financial Advisor
Highlighted text:  
Highlighted text: Fund Raiser
Highlighted text: General Operations Manager
Highlighted text: Loan Counselor
Highlighted text: Loan Officer
Highlighted text: Loss Prevention Manager
Highlighted text: Management Analyst
Highlighted text: Market Research Analyst
Highlighted text: Medical 
Highlighted text: and Health 
Highlighted text: Services Manager
Highlighted text: Media Buyer
Highlighted text: Pharmaceutical Sales Representative
Highlighte

In [5]:
from docx import Document
from docx.oxml.ns import qn

# Function to check if run is highlighted and get the highlight color
def get_highlight_color(run):
    """Return the ``w:val`` attribute of a run's first ``w:highlight`` element.

    Args:
        run: Object exposing ``element.xpath``; here, the runs of a loaded
            document.

    Returns:
        The value of the ``w:val`` attribute on the first element matched by
        ``.//w:highlight``, or None when the run has no such element (or the
        attribute is absent).
    """
    matches = run.element.xpath('.//w:highlight')
    if not matches:
        return None
    first_match = matches[0]
    return first_match.get(qn('w:val'))

# Load the document
doc = Document('careers.docx')

# Body paragraphs: print each highlighted run together with its highlight
# color value. Runs for which get_highlight_color returns a falsy value
# (no highlight element, or no color value) are skipped.
for para in doc.paragraphs:
    for run in para.runs:
        highlight_color = get_highlight_color(run)
        if highlight_color:
            print(f"Highlighted text: {run.text}, Color: {highlight_color}")

# Same report for the paragraphs inside every table cell
for table in doc.tables:
    for row in table.rows:
        for cell in row.cells:
            for para in cell.paragraphs:
                for run in para.runs:
                    highlight_color = get_highlight_color(run)
                    if highlight_color:
                        print(f"Highlighted text in table: {run.text}, Color: {highlight_color}")


Highlighted text: YELLOW, Color: yellow
Highlighted text: BLUE, Color: cyan
Highlighted text: GREEN, Color: green
Highlighted text: Business, Color: yellow
Highlighted text: , Advertising,, Color: yellow
Highlighted text:  and Finance, Color: yellow
Highlighted text: Advertising Sales Representative , Color: cyan
Highlighted text: / Agent / Executive, Color: cyan
Highlighted text: Assistant Bank Manager, Color: cyan
Highlighted text: Claims Supervisor, Color: cyan
Highlighted text: Customer Service Representative, Color: cyan
Highlighted text: Customer Service Representative Supervisor, Color: cyan
Highlighted text: Department Manager, Color: cyan
Highlighted text:  , Color: yellow
Highlighted text: Financial Advisor, Color: cyan
Highlighted text:  , Color: cyan
Highlighted text: Fund Raiser, Color: cyan
Highlighted text: General Operations Manager, Color: cyan
Highlighted text: Loan Counselor, Color: cyan
Highlighted text: Loan Officer, Color: cyan
Highlighted text: Loss Prevention Ma

## Make a website with embedded YouTube videos

In [6]:
import re
import json
from docx import Document

def read_docx(file_path, start_line, end_line):
    """Collect the stripped text of paragraphs in a half-open index range.

    Args:
        file_path: Path passed to ``Document`` to open the file.
        start_line: Index (0-based) of the first paragraph to keep.
        end_line: Index at which collection stops; this paragraph is excluded.

    Returns:
        list[str]: ``paragraph.text.strip()`` for every paragraph whose index
        lies in ``[start_line, end_line)``.
    """
    selected = []
    for position, paragraph in enumerate(Document(file_path).paragraphs):
        # Skip anything outside the requested window.
        if position < start_line or position >= end_line:
            continue
        selected.append(paragraph.text.strip())
    return selected

def parse_lines(lines):
    """Group job titles with their links and embedded YouTube videos.

    The input is treated as sections separated by blank lines. Inside a
    section the first plain line (non-blank, no URL) is the main category and
    every later plain line starts a job. For lines containing a URL:

    * a ``youtube.com/watch?v=<id>`` or ``youtu.be/<id>`` URL is turned into
      an ``<iframe>`` embed snippet and stored under the job's ``'videos'``;
    * any other ``http(s)://`` URL is stored under the job's ``'links'``.

    In both cases ``'category'`` is the text preceding the matched URL, and
    URL lines seen before any job exists are dropped. Lines starting with
    "undefined" (case-insensitive) are ignored.

    Blank (empty or whitespace-only) lines are separators only: they are
    never stored as a category or job name. A blank line closes the current
    section once it has at least one job; otherwise it is skipped, so leading
    or repeated blank lines cannot swallow the section that follows them.

    A job title that repeats within a section reuses the existing entry, so
    links and videos collected earlier for that title are kept.

    Args:
        lines: Iterable of text lines (e.g. paragraph texts).

    Returns:
        list[dict]: One ``{'main_category': str, 'jobs': {title: {'links':
        [...], 'videos': [...]}}}`` entry per section that has at least one
        job, in input order.
    """
    url_pattern = re.compile(r'https?://\S+')
    youtube_pattern = re.compile(r'(https?://(?:www\.)?youtube\.com/watch\?v=[\w-]+|https?://(?:www\.)?youtu\.be/[\w-]+)')
    # Compiled once here instead of being looked up for every video line.
    video_id_pattern = re.compile(r'(?:v=|youtu\.be/)([\w-]+)')
    data = []
    current_main_category = None
    current_job = None
    jobs = {}

    for line in lines:
        youtube_match = youtube_pattern.search(line)
        if youtube_match:
            # Videos belong to the most recent job; without one there is
            # nothing to attach them to.
            if current_job is not None:
                youtube_url = youtube_match.group()
                # Transform the YouTube URL into an iframe HTML
                video_id = video_id_pattern.search(youtube_url).group(1)
                iframe_html = f'<iframe width="560" height="315" src="https://www.youtube.com/embed/{video_id}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>'
                jobs[current_job]['videos'].append({
                    'iframe': iframe_html,
                    # Everything before the URL labels the video.
                    'category': line[:youtube_match.start()].strip()
                })
            continue

        url_match = url_pattern.search(line)
        if url_match:
            if current_job is not None:
                jobs[current_job]['links'].append({
                    'url': url_match.group(),
                    # Everything before the URL labels the link.
                    'category': line[:url_match.start()].strip()
                })
            continue

        if line.lower().startswith("undefined"):
            # Skip lines starting with "undefined"
            continue

        if not line.strip():
            # Blank line: section separator. Only close a section that
            # actually has jobs; never treat the blank as a name.
            if current_job is not None:
                data.append({
                    'main_category': current_main_category,
                    'jobs': jobs
                })
                current_main_category = None
                current_job = None
                jobs = {}
            continue

        if current_main_category is None:
            # First plain line of a section is its main category.
            current_main_category = line
        else:
            # Any later plain line starts (or resumes) a job.
            current_job = line
            jobs.setdefault(current_job, {'links': [], 'videos': []})

    # Flush the last section when the input does not end with a blank line.
    if current_main_category is not None and jobs:
        data.append({
            'main_category': current_main_category,
            'jobs': jobs
        })

    return data

def save_to_json(data, output_file):
    """Write ``data`` to ``output_file`` as JSON indented by four spaces.

    The file is opened in text write mode, replacing any previous content.

    Args:
        data: JSON-serializable object to dump.
        output_file: Path of the file to write.
    """
    with open(output_file, 'w') as out_handle:
        json.dump(obj=data, fp=out_handle, indent=4)

def main(file_path, start_line, end_line, output_file):
    """Read a paragraph window from a .docx file, parse it, and save as JSON.

    Args:
        file_path: Path of the .docx file to read.
        start_line: First paragraph index to read (inclusive).
        end_line: Paragraph index at which to stop (exclusive).
        output_file: Path of the JSON file to write.
    """
    raw_lines = read_docx(file_path, start_line, end_line)
    structured = parse_lines(raw_lines)
    save_to_json(structured, output_file)

# Input document, output file, and the window of paragraphs to process
file_path = 'careers.docx'
output_file = 'jobs.json'
start_line = 171  # First paragraph index to read (inclusive)
end_line = start_line + 3000  # Stop index (exclusive); adjust as needed

main(
    file_path=file_path,
    start_line=start_line,
    end_line=end_line,
    output_file=output_file,
)
