In [164]:
from docx import Document
import os

# Load the dossier; the file is opened in binary mode and handed to Document
# as a file object. Later cells only read paragraphs from this document.
with open(os.path.join('data', 'dossier.docx'), 'rb') as f:
    dossier = Document(f)

# Load the CV the same way. This is the document that later cells modify and
# finally save under a different name ('modified-CV.docx').
with open(os.path.join('data', 'CV.docx'), 'rb') as f:
    cv = Document(f)    

In [165]:
# "Directed Student Learning"
# use the dossier details and insert here

# Dossier: "Supervision of, and Membership on, Graduate and Undergraduate Dissertations, Theses, Projects, Monographs, Performances, Productions, and Exhibitions Required for Degrees; Types of Degrees and Years Granted"
doss_start = "Supervision of, and Membership on, Graduate and Undergraduate Dissertations, Theses, Projects, Monographs, Performances, Productions, and Exhibitions Required for Degrees; Types of Degrees and Years Granted"
doss_end = "Professional Development Activities Related to Teaching Responsibilities"

# Locate the dossier paragraphs strictly between the start and end headings
# (case-insensitive, whole-paragraph match).
# None (rather than 0) marks "start heading not seen yet", so a heading that
# happens to be paragraph 0 is still recognised.
p_start = None
for p, it in enumerate(dossier.paragraphs):
    if doss_start.lower() == it.text.lower():
        p_start = p
    if doss_end.lower() == it.text.lower():
        if p_start is None:
            raise ValueError('Never encountered the start token in Dossier.')
        p_doss = range(p_start+1, p)
        break
else:
    # Without this, p_doss would be left undefined (or stale from an earlier
    # run of the cell) and the failure would surface somewhere else.
    raise ValueError('Never encountered the end token in Dossier.')

# CV: "Directed Student Learning"
cv_start = "Directed Student Learning"
cv_end = "RESEARCH"

# Same search in the CV: p_cv covers the paragraphs strictly between the
# "Directed Student Learning" heading and the "RESEARCH" heading.
p_start = None
for p, it in enumerate(cv.paragraphs):
    if cv_start.lower() == it.text.lower():
        p_start = p
    if cv_end.lower() == it.text.lower():
        if p_start is None:
            raise ValueError('Never encountered the start token in CV.')
        p_cv = range(p_start+1, p)
        break
else:
    raise ValueError('Never encountered the end token in CV.')

In [166]:
def remove_paragraphs_by_index(doc, paragraph_indices):
    """Remove the paragraphs at the given positions from ``doc``.

    Parameters
    ----------
    doc
        Object exposing ``paragraphs`` (a sequence whose items carry an
        ``_element`` that can be detached from its parent).
    paragraph_indices : iterable of int
        Positions in ``doc.paragraphs`` to delete. Any iterable is accepted
        (list, range, set, ...) and it is not modified. Duplicate indices are
        collapsed; indices outside ``0 <= i < len(doc.paragraphs)`` are ignored.

    Returns
    -------
    The same ``doc`` object, modified in place.
    """
    # Take one snapshot so every index refers to the positions as they were
    # before any removal, and so the paragraph list is not rebuilt per index.
    paragraphs = doc.paragraphs
    n_paragraphs = len(paragraphs)

    # De-duplicate: removing the same position twice would otherwise delete
    # whichever paragraph shifted into that slot.
    for index in sorted(set(paragraph_indices), reverse=True):
        if 0 <= index < n_paragraphs:
            element = paragraphs[index]._element
            element.getparent().remove(element)

    return doc

# Delete the old CV entries at the indices in p_cv (the paragraphs strictly
# between the "Directed Student Learning" and "RESEARCH" headings). The
# document is modified in place and the same object is returned.
cv = remove_paragraphs_by_index(cv, list(p_cv))

In [167]:
from docx.shared import Pt

def insert_paragraphs_at_index(doc, new_content, before):
    """Copy paragraphs in front of ``before``, re-applying bold and indentation.

    For every paragraph in ``new_content`` a new paragraph is inserted
    immediately before ``before`` (so the copies end up in the same order as
    ``new_content``). Each source run is copied as text only; the copy is bold
    exactly when the run text contains no comma.

    Indentation depends on the source paragraph's text:
    * contains a comma -> left indent of 3 units with a first-line indent of
      -1 unit (hanging indent);
    * otherwise        -> left indent of 1 unit.
    One unit is 18 points.

    ``doc`` is not touched by this function directly; it is returned so the
    call can be written as ``cv = insert_paragraphs_at_index(cv, ...)``.
    """
    indent_unit = 18  # points per indent level

    for source_para in new_content:
        target = before.insert_paragraph_before()

        # Text-only copy of the runs; bold is derived from the text itself.
        for source_run in source_para.runs:
            run_text = source_run.text
            copied = target.add_run(run_text)
            copied.bold = not (',' in run_text)

        fmt = target.paragraph_format
        if ',' not in source_para.text:
            fmt.left_indent = Pt(indent_unit)
        else:
            fmt.left_indent = Pt(3 * indent_unit)    # indent the whole paragraph
            fmt.first_line_indent = Pt(-indent_unit)  # pull the first line back out

    return doc

# Dossier paragraphs to copy. `dossier.paragraphs` is read once here instead
# of once per index (python-docx builds a fresh list on every access).
new_content = [para for idx, para in enumerate(dossier.paragraphs) if idx in p_doss]

# Anchor: the first CV paragraph whose text equals cv_end (case-insensitive).
# The new content is inserted directly in front of it.
before_para = next((p for p in cv.paragraphs if p.text.lower() == cv_end.lower()), None)
if before_para is None:
    # Fail with a message that names the missing heading instead of a bare
    # IndexError from indexing an empty list.
    raise ValueError(f'Heading {cv_end!r} not found in CV; nowhere to insert.')
cv = insert_paragraphs_at_index(cv, new_content, before_para)

In [168]:
# "Teaching Experience"
# use the dossier details and insert here 
# clean up unnecessary details as appropriate

# Dossier: "List of Credit Courses Taught at Penn State for Each Semester with Enrollments in Each Course"
doss_start = "List of Credit Courses Taught at Penn State for Each Semester with Enrollments in Each Course"
doss_end = "Concise Compilation of Results of Student Feedback from Multiple Sources, Documented Evaluation of Candidate’s Programs, Activities, and Skills in Relating to Clientele"

# Locate the dossier paragraphs strictly between the start and end headings
# (case-insensitive, whole-paragraph match).
# None (rather than 0) marks "start heading not seen yet", so a heading that
# happens to be paragraph 0 is still recognised.
p_start = None
for p, it in enumerate(dossier.paragraphs):
    if doss_start.lower() == it.text.lower():
        p_start = p
    if doss_end.lower() == it.text.lower():
        if p_start is None:
            raise ValueError('Never encountered the start token in Dossier.')
        p_doss = range(p_start+1, p)
        break
else:
    # Without this, p_doss would silently keep the value computed for the
    # previous section.
    raise ValueError('Never encountered the end token in Dossier.')

# CV: "Teaching Experience"
cv_start = "Teaching Experience"
cv_end = "Directed Student Learning"

# Same search in the CV: p_cv covers the paragraphs strictly between the
# "Teaching Experience" heading and the "Directed Student Learning" heading.
p_start = None
for p, it in enumerate(cv.paragraphs):
    if cv_start.lower() == it.text.lower():
        p_start = p
    if cv_end.lower() == it.text.lower():
        if p_start is None:
            raise ValueError('Never encountered the start token in CV.')
        p_cv = range(p_start+1, p)
        break
else:
    raise ValueError('Never encountered the end token in CV.')

In [169]:
# Delete the old CV entries at the indices in p_cv (the paragraphs strictly
# between the "Teaching Experience" and "Directed Student Learning" headings).
# The document is modified in place and the same object is returned.
cv = remove_paragraphs_by_index(cv, list(p_cv))

In [170]:
import json
import re

def titleize(text, exceptions):
    """Capitalize every whitespace-separated word of ``text`` except the exceptions.

    A word counts as an exception only if it appears in ``exceptions`` exactly
    as written (the membership test is case-sensitive); such words are
    lower-cased. All other words go through ``str.capitalize``. Words are
    re-joined with single spaces, so any run of whitespace in ``text``
    (including newlines) collapses to one space.
    """
    def _recase(word):
        if word in exceptions:
            return word.lower()
        return word.capitalize()

    return ' '.join(_recase(word) for word in text.split())

def is_course_format(s):
    """Return True if ``s`` has the shape ``DEPT NNN[X]-NNN[X]``.

    That is: 2-5 upper-case letters, one space, three digits with an optional
    upper-case letter, a hyphen, and again three digits with an optional
    upper-case letter. The pattern is anchored at both ends, so the string
    must consist of the course code only.
    """
    course_pattern = r'^[A-Z]{2,5} \d{3}[A-Z]?-\d{3}[A-Z]?$'
    return re.match(course_pattern, s) is not None

def is_study_topic(run):
    """Return True if ``run.text`` contains the lower-case phrase 'study topic'.

    Works for any object with a ``text`` attribute; the match is case-sensitive.
    """
    marker = 'study topic'
    return marker in run.text

def is_semester(run):
    """Return True if ``run.text`` contains 'Fall', 'Spring' or 'Summer'.

    Works for any object with a ``text`` attribute. This is a case-sensitive
    substring test, not a whole-word match.
    """
    text = run.text
    for season in ('Fall', 'Spring', 'Summer'):
        if season in text:
            return True
    return False

def extract_course_number(text):
    """Return the part of ``text`` before the first comma and before the first hyphen.

    For a string such as 'IE 425-001, ...' this yields 'IE 425'. If neither
    separator is present the whole string is returned.
    """
    before_comma = text.partition(',')[0]
    return before_comma.partition('-')[0]

def check_for_course_and_replace(run):
    """Return the text to use for ``run``, swapping course lines for catalog names.

    * No comma in the run text, or the part before the first comma is not a
      course code (per ``is_course_format``): the text is returned unchanged.
    * Course code whose number (per ``extract_course_number``) is a key of the
      module-level ``course_names`` mapping: returns '<number>, <name>'.
    * Course code that is not in ``course_names``: returns '' (an empty
      string).
    """
    text = run.text
    head, comma, _ = text.partition(',')
    if not comma or not is_course_format(head):
        return text

    number = extract_course_number(head)
    if number not in course_names:
        return ""
    return f"{number}, {course_names[number]}"

def insert_teaching_at_index(doc, new_content, before):
    """Copy teaching paragraphs in front of ``before``, cleaning each run on the way.

    For every paragraph in ``new_content`` a new paragraph is inserted
    immediately before ``before``. Each source run is processed as follows:

    * its text is passed through ``check_for_course_and_replace``; if that
      yields an empty string the run is skipped;
    * if the run is a study topic (``is_study_topic``) the text becomes the
      titleized original run text followed by a newline;
    * the copy is bold when the source run is a semester (``is_semester``)
      and italic when it is a study topic.

    The new paragraph gets a left indent of 18 pt when the source paragraph
    is a semester line and 36 pt otherwise. A paragraph is inserted even if
    all of its runs were skipped.

    ``doc`` is not touched by this function directly; it is returned so the
    call can be written as ``cv = insert_teaching_at_index(cv, ...)``.
    """
    indent_unit = 18  # points per indent level
    minor_words = ['of', 'the', 'in', 'to', 'for',  'and']

    for source_para in new_content:
        target = before.insert_paragraph_before()

        for source_run in source_para.runs:
            text = check_for_course_and_replace(source_run)
            if not text:
                continue  # nothing left to copy for this run

            study_topic = is_study_topic(source_run)
            if study_topic:
                text = titleize(source_run.text, exceptions=minor_words) + '\n'

            copied = target.add_run(text)
            copied.bold = is_semester(source_run)
            copied.italic = study_topic

        # Semester lines sit one level in; everything else two levels in.
        levels = 1 if is_semester(source_para) else 2
        target.paragraph_format.left_indent = Pt(levels * indent_unit)

    return doc

# Mapping used by check_for_course_and_replace to turn a course number into
# its name.
with open(os.path.join('data', 'course_names.json'), 'r') as fid:
    course_names = json.load(fid)

# Dossier paragraphs to copy. `dossier.paragraphs` is read once here instead
# of once per index (python-docx builds a fresh list on every access).
new_content = [para for idx, para in enumerate(dossier.paragraphs) if idx in p_doss]

# Anchor: the first CV paragraph whose text equals cv_end (case-insensitive).
# The new content is inserted directly in front of it.
before_para = next((p for p in cv.paragraphs if p.text.lower() == cv_end.lower()), None)
if before_para is None:
    # Fail with a message that names the missing heading instead of a bare
    # IndexError from indexing an empty list.
    raise ValueError(f'Heading {cv_end!r} not found in CV; nowhere to insert.')
cv = insert_teaching_at_index(cv, new_content, before_para)

In [171]:
# remove duplicates (from hybrid/online)
# remove multiple blank lines
# remove empty semesters

# CV: "Teaching Experience"
cv_start = "Teaching Experience"
cv_end = "Directed Student Learning"

# Scan the teaching section and collect, in p_cv, the paragraph indices that
# should be deleted:
#   * a semester heading with no non-empty paragraph under it, together with
#     the empty-text paragraphs under it;
#   * for a semester that does have content, every empty-text paragraph
#     after the first one.
# NOTE(review): paragraphs that have no runs at all are skipped and therefore
# never counted as blank lines — confirm that this is intended.
p_start = False       # True once the cv_start heading has been seen
this_semester = None  # index of the semester heading currently being scanned
breaks = []           # indices of empty-text paragraphs under this_semester
courses = []          # texts of the non-empty paragraphs under this_semester
p_cv = []             # paragraph indices marked for deletion
for p, it in enumerate(cv.paragraphs):
    if cv_start.lower() == it.text.lower():
        p_start = True
        continue
    at_end = cv_end.lower() == it.text.lower()
    if p_start is False:
        if at_end:
            raise ValueError('Never encountered the start token in CV.')
        continue  # nothing outside the section is examined
    if at_end or (len(it.runs) > 0 and is_semester(it.runs[0])):
        # A new semester heading, or the end of the section, closes the
        # semester scanned so far (this also flushes the last semester).
        if this_semester is not None:
            if len(courses) == 0:
                p_cv.append(this_semester)  # delete the empty semester's own heading
                p_cv += breaks
            elif len(breaks) > 1:
                p_cv += breaks[1:]
        if at_end:
            break
        this_semester = p
        breaks = []
        courses = []  # start counting content afresh for the new semester
    elif len(it.runs) == 0:
        continue
    elif len(it.text) == 0:
        breaks.append(p)
    # elif it.text in courses:
    #     breaks.append(p)  # remove duplicate courses
    else:
        courses.append(it.text)
else:
    raise ValueError('Never encountered the end token in CV.')
        
# cv = remove_paragraphs_by_index(cv, p_cv)

In [172]:
def find_indices_of_multiple_newlines(doc):
    """
    Return the indices of empty paragraphs that directly follow another empty
    paragraph, in ascending order.

    A paragraph is considered empty when its text is blank after stripping
    whitespace. Removing the returned paragraphs leaves at most one empty
    paragraph in any consecutive run.
    """
    is_blank = [not para.text.strip() for para in doc.paragraphs]

    # Pair every paragraph with its predecessor; position `i` is the index of
    # the second paragraph of each pair.
    return [
        i
        for i, (previous_blank, current_blank) in enumerate(zip(is_blank, is_blank[1:]), start=1)
        if previous_blank and current_blank
    ]

# p_idx = find_indices_of_multiple_newlines(cv)
# cv = remove_paragraphs_by_index(cv, p_idx)

In [173]:
def identify_bad_teaching_paragraphs(doc):
    """Return the indices of semester headings that have no course under them.

    Every paragraph of ``doc`` for which ``is_semester`` is true is treated as
    a heading. The paragraphs after it are scanned up to (not including) the
    next semester heading or the end of the document; the heading is kept if
    any of them satisfies ``is_course_format(paragraph.text)`` or
    ``is_study_topic(paragraph)``, and reported otherwise.

    NOTE(review): the whole paragraph text is passed to ``is_course_format``,
    whose pattern is anchored at both ends, so a course line carrying anything
    besides the bare course code does not count — confirm this is intended.
    """
    paragraphs = doc.paragraphs
    flagged = []

    for heading_idx, para in enumerate(paragraphs):
        if not is_semester(para):
            continue

        has_content = False
        for follower in paragraphs[heading_idx + 1:]:
            if is_semester(follower):
                break  # reached the next semester
            if is_course_format(follower.text) or is_study_topic(follower):
                has_content = True
                break

        if not has_content:
            flagged.append(heading_idx)

    return flagged

# p_idx = identify_bad_teaching_paragraphs(cv)
# cv = remove_paragraphs_by_index(cv, p_idx)

In [174]:
# publications
# remove %
# number the papers in reverse chronological order 

# CV: "Intellectual Contributions"
cv_start = "Intellectual Contributions"
cv_end = "Editorial and Advisory Boards"

# Walk the CV from the start heading to the end heading, printing every
# paragraph that contains '%)', and record the paragraphs strictly between
# the two headings in p_cv.
# None (rather than 0) marks "start heading not seen yet", so a heading that
# happens to be paragraph 0 is still recognised.
p_start = None
for p, it in enumerate(cv.paragraphs):
    if cv_start.lower() == it.text.lower():
        p_start = p
    if p_start is not None:
        if '%)' in it.text:
            print('detected code in:', it.text)
    if cv_end.lower() == it.text.lower():
        if p_start is None:
            raise ValueError('Never encountered the start token in CV.')
        p_cv = range(p_start+1, p)
        break
else:
    # Without this, p_cv would silently keep a value from an earlier cell.
    raise ValueError('Never encountered the end token in CV.')

In [175]:
def is_run_bold(run):
    """Best-effort check of whether ``run`` is rendered bold.

    Resolution order:
    1. direct formatting on the run (``run.bold``) wins whenever it is not None;
    2. otherwise the run's own style, if its font is bold;
    3. otherwise the style of the run's parent (``run._parent.style``), if
       its font is bold;
    4. otherwise False.

    Only these two styles are inspected; nothing else is followed.
    """
    def _style_is_bold(style):
        return bool(style and style.font and style.font.bold)

    direct = run.bold
    if direct is not None:
        return direct

    if _style_is_bold(run.style):
        return True

    # Fall back to the style of the paragraph that contains the run.
    return _style_is_bold(run._parent.style)

# is_run_bold(cv.paragraphs[342].runs[0])

In [176]:
# CV: "Intellectual Contributions"
cv_start = "Intellectual Contributions"
cv_end = "Editorial and Advisory Boards"

# Maps each sub-heading between cv_start and cv_end to the list of entry
# paragraphs found under it.
sections = {}

# A paragraph counts as a publication entry when it contains this name.
author = 'Napolitano'

# None (rather than 0) marks "start heading not seen yet", so a heading that
# happens to be paragraph 0 is still recognised.
p_start = None
this_section = None
these_entries = None
for p, it in enumerate(cv.paragraphs):
    if cv_start.lower() == it.text.lower():
        p_start = p
        continue
    if len(it.text) == 0:
        continue
    if cv_end.lower() == it.text.lower():
        if p_start is None:
            raise ValueError('Never encountered the start token in CV.')
        # Store the section that was still open. Previously it was only stored
        # when the end heading itself had a bold style, so the last section
        # could be dropped.
        if this_section is not None:
            sections[this_section] = these_entries
        p_cv = range(p_start+1, p)
        break
    if p_start is not None:
        if it.style and it.style.font and it.style.font.bold:
            # A paragraph whose style is bold starts a new section.
            if this_section is not None:
                sections[this_section] = these_entries
            this_section = it.text
            these_entries = []
        elif author in it.text:  # this is a paper entry
            # assumes every entry is preceded by a bold-styled section heading
            these_entries.append(it)
else:
    # Without this, p_cv would silently keep a value from an earlier cell.
    raise ValueError('Never encountered the end token in CV.')

In [177]:
# Sanity check: number of entry paragraphs collected under this section heading.
len(sections['Articles Published in Refereed Journals'])

41

In [178]:
# Delete the CV paragraphs at the indices in p_cv (the paragraphs strictly
# between the "Intellectual Contributions" and "Editorial and Advisory Boards"
# headings). The document is modified in place and the same object is returned.
cv = remove_paragraphs_by_index(cv, list(p_cv))

In [179]:
def insert_pubs_at_index(doc, sections, before):
    """Write numbered publication sections in front of ``before``.

    ``sections`` maps a heading text to a sized sequence of source paragraphs.
    For each item, in mapping order, the following paragraphs are inserted
    immediately before ``before``:

    * an empty paragraph, then the heading as a single bold run with an
      18 pt left indent;
    * for each entry, an empty paragraph followed by the entry itself. The
      entry starts with a run '<k>. ' where k counts down from the number of
      entries to 1, followed by copies of the source runs (text, ``bold`` and
      ``italic`` are carried over). Entries get a 54 pt left indent with a
      -18 pt first-line indent.

    One more empty paragraph is inserted after all sections (also when
    ``sections`` is empty).

    ``doc`` is not touched by this function directly; it is returned so the
    call can be written as ``cv = insert_pubs_at_index(cv, ...)``.
    """
    indent_unit = 18  # points per indent level

    for heading_text, entries in sections.items():
        # Blank spacer, then the bold section heading.
        before.insert_paragraph_before('')
        heading = before.insert_paragraph_before()
        heading.add_run(heading_text).bold = True
        heading.paragraph_format.left_indent = Pt(indent_unit)

        # Count down so that the first entry carries the highest number.
        countdown = range(len(entries), 0, -1)
        for number, source_para in zip(countdown, entries):
            before.insert_paragraph_before('')  # blank line to separate entries
            entry = before.insert_paragraph_before()
            entry.add_run(f'{number}. ')

            for source_run in source_para.runs:
                copied = entry.add_run(source_run.text)
                copied.bold = source_run.bold
                copied.italic = source_run.italic

            # Hanging indent: whole paragraph in, first line back out.
            fmt = entry.paragraph_format
            fmt.left_indent = Pt(3 * indent_unit)
            fmt.first_line_indent = Pt(-indent_unit)

    before.insert_paragraph_before('')

    return doc

# The course_names.json reload that used to sit here was dropped: neither
# insert_pubs_at_index nor the statements below use course_names.

# Anchor: the first CV paragraph whose text equals cv_end (case-insensitive).
# The rebuilt publication sections are inserted directly in front of it.
before_para = next((p for p in cv.paragraphs if p.text.lower() == cv_end.lower()), None)
if before_para is None:
    # Fail with a message that names the missing heading instead of a bare
    # IndexError from indexing an empty list.
    raise ValueError(f'Heading {cv_end!r} not found in CV; nowhere to insert.')
cv = insert_pubs_at_index(cv, sections, before_para)

In [180]:
# Write the result under a new file name; the input 'CV.docx' is not overwritten.
cv.save(os.path.join('data', 'modified-CV.docx'))

In [319]:
# TODO
# order for directed student learning: PhD dissertation advisor and PhD dissertation co-advisor at the top
# number the students in reverse order