In [320]:
from docx import Document
import os

# Load the two Word documents this notebook works on. Each file is opened in
# binary mode and its handle is closed when the corresponding with-block exits.
dossier_path = os.path.join('data', 'dossier.docx')
with open(dossier_path, 'rb') as dossier_file:
    dossier = Document(dossier_file)

cv_path = os.path.join('data', 'CV.docx')
with open(cv_path, 'rb') as cv_file:
    cv = Document(cv_file)

In [321]:
# "Directed Student Learning"
# use the dossier details and insert here

# Dossier: the section body lies between the "Supervision of, and Membership on, ..."
# heading and the "Professional Development Activities ..." heading.
doss_start = "Supervision of, and Membership on, Graduate and Undergraduate Dissertations, Theses, Projects, Monographs, Performances, Productions, and Exhibitions Required for Degrees; Types of Degrees and Years Granted"
doss_end = "Professional Development Activities Related to Teaching Responsibilities"

# None (rather than 0) marks "start heading not seen yet", so a start heading
# that happens to be the very first paragraph is still accepted.
p_start = None
for p, it in enumerate(dossier.paragraphs):
    if doss_start.lower() == it.text.lower():
        p_start = p
    if doss_end.lower() == it.text.lower():
        if p_start is None:
            raise ValueError('Never encountered the start token in Dossier.')
        # Paragraph indices strictly between the two headings.
        p_doss = range(p_start+1, p)
        break
else:
    # The loop finished without `break`: the end heading is missing. Fail here
    # instead of leaving p_doss undefined (or stale from a previous run).
    raise ValueError('Never encountered the end token in Dossier.')

# CV: the section body lies between the "Directed Student Learning" heading
# and the "RESEARCH" heading.
cv_start = "Directed Student Learning"
cv_end = "RESEARCH"

p_start = None
for p, it in enumerate(cv.paragraphs):
    if cv_start.lower() == it.text.lower():
        p_start = p
    if cv_end.lower() == it.text.lower():
        if p_start is None:
            raise ValueError('Never encountered the start token in CV.')
        # Paragraph indices strictly between the two headings.
        p_cv = range(p_start+1, p)
        break
else:
    # Same guard as above: never continue with an undefined or stale p_cv.
    raise ValueError('Never encountered the end token in CV.')

In [322]:
def remove_paragraphs_by_index(doc, paragraph_indices):
    """Remove the paragraphs at the given positions from ``doc``.

    Args:
        doc: Object exposing a ``paragraphs`` sequence whose items carry an
            ``_element`` with ``getparent()`` (e.g. a python-docx Document).
        paragraph_indices: Any iterable of integer positions (list, range,
            set, ...). It is not modified. Duplicates are ignored, and so are
            positions outside ``0 <= index < len(doc.paragraphs)``.

    Returns:
        The same ``doc`` object, modified in place.
    """
    # Take one snapshot of the paragraph list. Every index then refers to the
    # document as it was on entry, so removals cannot shift later lookups and
    # the list is not rebuilt on every iteration.
    paragraphs = doc.paragraphs
    total = len(paragraphs)

    # De-duplicate and bounds-check without touching the caller's iterable.
    targets = {index for index in paragraph_indices if 0 <= index < total}

    # Highest index first, matching the original removal order.
    for index in sorted(targets, reverse=True):
        element = paragraphs[index]._element
        element.getparent().remove(element)

    return doc

# Remove the CV paragraphs whose indices were collected in p_cv.
cv = remove_paragraphs_by_index(doc=cv, paragraph_indices=list(p_cv))

In [323]:
from docx.shared import Pt

def insert_paragraphs_at_index(doc, new_content, before):
    """Copy each paragraph of ``new_content`` into the document above ``before``.

    For every source paragraph a new paragraph is inserted directly before
    ``before`` and the text of each source run is copied into it. A copied run
    is bold exactly when its text contains no comma. Paragraphs whose text
    contains a comma get a hanging indent (54pt left, -18pt first line); all
    other paragraphs get a plain 18pt left indent.

    Args:
        doc: Returned unchanged; not otherwise used here.
        new_content: Iterable of paragraphs (objects with ``runs`` and ``text``).
        before: Paragraph providing ``insert_paragraph_before()``.

    Returns:
        ``doc``.
    """
    indent_unit = 18  # base indent, in points

    for source_para in new_content:
        # New, empty paragraph placed immediately above the anchor.
        target = before.insert_paragraph_before()

        # Copy the text run by run; bold is decided by the comma rule, not
        # taken from the source run's own formatting.
        for source_run in source_para.runs:
            run_text = source_run.text
            copied = target.add_run(run_text)
            copied.bold = ',' not in run_text

        fmt = target.paragraph_format
        if ',' not in source_para.text:
            fmt.left_indent = Pt(indent_unit)
        else:
            # Hanging indent: whole paragraph pushed right, first line pulled back.
            fmt.left_indent = Pt(3 * indent_unit)
            fmt.first_line_indent = Pt(-indent_unit)

    return doc

# Snapshot the dossier paragraphs once: python-docx builds a fresh list each
# time `.paragraphs` is accessed, so indexing it inside the comprehension would
# rebuild that list for every single index.
dossier_paragraphs = dossier.paragraphs
new_content = [dossier_paragraphs[p] for p in p_doss]

# The new content goes directly above the heading that closes the CV section.
# Fail with an explicit message (not a bare IndexError) if it is missing.
before_para = next((p for p in cv.paragraphs if p.text.lower() == cv_end.lower()), None)
if before_para is None:
    raise ValueError('Never encountered the end token in CV.')
cv = insert_paragraphs_at_index(cv, new_content, before_para)

In [324]:
# "Teaching Experience"
# use the dossier details and insert here 
# clean up unnecessary details as appropriate

# Dossier: the section body lies between the "List of Credit Courses Taught ..."
# heading and the "Concise Compilation of Results of Student Feedback ..." heading.
doss_start = "List of Credit Courses Taught at Penn State for Each Semester with Enrollments in Each Course"
doss_end = "Concise Compilation of Results of Student Feedback from Multiple Sources, Documented Evaluation of Candidate’s Programs, Activities, and Skills in Relating to Clientele"

# None (rather than 0) marks "start heading not seen yet", so a start heading
# that happens to be the very first paragraph is still accepted.
p_start = None
for p, it in enumerate(dossier.paragraphs):
    if doss_start.lower() == it.text.lower():
        p_start = p
    if doss_end.lower() == it.text.lower():
        if p_start is None:
            raise ValueError('Never encountered the start token in Dossier.')
        # Paragraph indices strictly between the two headings.
        p_doss = range(p_start+1, p)
        break
else:
    # The loop finished without `break`: the end heading is missing. Fail here
    # rather than silently reusing a p_doss left over from an earlier cell.
    raise ValueError('Never encountered the end token in Dossier.')

# CV: the section body lies between the "Teaching Experience" heading and the
# "Directed Student Learning" heading.
cv_start = "Teaching Experience"
cv_end = "Directed Student Learning"

p_start = None
for p, it in enumerate(cv.paragraphs):
    if cv_start.lower() == it.text.lower():
        p_start = p
    if cv_end.lower() == it.text.lower():
        if p_start is None:
            raise ValueError('Never encountered the start token in CV.')
        # Paragraph indices strictly between the two headings.
        p_cv = range(p_start+1, p)
        break
else:
    # Same guard as above: never continue with a stale p_cv.
    raise ValueError('Never encountered the end token in CV.')

In [325]:
# Remove the CV paragraphs whose indices were collected in p_cv.
cv = remove_paragraphs_by_index(doc=cv, paragraph_indices=list(p_cv))

In [326]:
import json

def is_study_topic(run):
    """Return True when ``run.text`` contains the phrase 'study topic'.

    The match is a case-sensitive substring test; ``run`` may be any object
    with a string ``text`` attribute.
    """
    marker = 'study topic'
    return marker in run.text

def is_semester(run):
    """Return True when ``run.text`` mentions 'Fall', 'Spring' or 'Summer'.

    The match is a case-sensitive substring test; ``run`` may be any object
    with a string ``text`` attribute (a run or a whole paragraph).
    """
    text = run.text
    for season in ('Fall', 'Spring', 'Summer'):
        if season in text:
            return True
    return False

def extract_course_number(run):
    """Return the part of ``run.text`` that precedes the first comma.

    If the text contains no comma, the whole text is returned. Previously the
    value was computed and then discarded, so the function always returned None.

    Args:
        run: Any object with a string ``text`` attribute.

    Returns:
        The leading comma-delimited field of ``run.text``, as a str.
    """
    return run.text.split(',')[0]

def insert_teaching_at_index(doc, new_content, before):
    """Copy teaching-history paragraphs into the document above ``before``.

    For every source paragraph a new paragraph is inserted directly before
    ``before`` and the text of each source run is copied into it. A copied run
    is bold when ``is_semester`` accepts the source run and italic when
    ``is_study_topic`` accepts it. Paragraphs accepted by ``is_semester`` get
    a plain 18pt left indent; all others get a hanging indent (54pt left,
    -18pt first line).

    Args:
        doc: Returned unchanged; not otherwise used here.
        new_content: Iterable of paragraphs (objects with ``runs`` and ``text``).
        before: Paragraph providing ``insert_paragraph_before()``.

    Returns:
        ``doc``.
    """
    indent_unit = 18  # base indent, in points

    for source_para in new_content:
        # New, empty paragraph placed immediately above the anchor.
        target = before.insert_paragraph_before()

        # Copy the text run by run; emphasis comes from the two predicates,
        # not from the source run's own formatting.
        for source_run in source_para.runs:
            copied = target.add_run(source_run.text)
            copied.bold = is_semester(source_run)
            copied.italic = is_study_topic(source_run)

        fmt = target.paragraph_format
        if not is_semester(source_para):
            # Hanging indent: whole paragraph pushed right, first line pulled back.
            fmt.left_indent = Pt(3 * indent_unit)
            fmt.first_line_indent = Pt(-indent_unit)
        else:
            fmt.left_indent = Pt(indent_unit)

    return doc

# Load the course-name data. JSON is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform default.
# NOTE(review): course_names is not referenced by the visible cells - confirm
# whether it is still needed.
with open(os.path.join('data', 'course_names.json'), 'r', encoding='utf-8') as fid:
    course_names = json.load(fid)

# Snapshot the dossier paragraphs once: python-docx builds a fresh list each
# time `.paragraphs` is accessed, so indexing it inside the comprehension would
# rebuild that list for every single index.
dossier_paragraphs = dossier.paragraphs
new_content = [dossier_paragraphs[p] for p in p_doss]

# The new content goes directly above the heading that closes the CV section.
# Fail with an explicit message (not a bare IndexError) if it is missing.
before_para = next((p for p in cv.paragraphs if p.text.lower() == cv_end.lower()), None)
if before_para is None:
    raise ValueError('Never encountered the end token in CV.')
cv = insert_teaching_at_index(cv, new_content, before_para)

In [327]:
# Write the edited CV to a separate file so the source CV.docx stays untouched.
output_path = os.path.join('data', 'modified-CV.docx')
cv.save(output_path)