## Syllabus Parsing and Event Creation

In [5]:
pip install python-docx


Note: you may need to restart the kernel to use updated packages.


In [40]:
from docx import Document
def read_docx(file_path):
    doc = Document(file_path)
    full_text = []
    for paragraph in doc.paragraphs:
        full_text.append(paragraph.text)
    return '\n'.join(full_text)
file_path = 'CAP4530 001-syllabus_Fall2024.docx'

# Read and print the content of the DOCX file
document_content = read_docx(file_path)
print(document_content)







Welcome!
Dear students welcome to the COP 4530 class. I hope we will have very productive semester. 

University Course Description
Understand and implement fundamentals of concise data structure and organization for program efficiency, clarity and simplification. Implementation of different data types and structures. Understanding of current data structures. Functional programming concepts will be covered.
Course Prerequisites
COP 3514, CDA 3103
Course Purpose 
A data structure is a specialized format for organizing, processing, retrieving and storing data. While there are several basic and advanced structure types, any data structure is designed to arrange data to suit a specific purpose so that it can be accessed and worked with in appropriate ways.
Instructor Contact Information and Communication 
The students can reach me regarding any questions through office hours. Also students can use the email system through Canvas or USF email. I will check the email every working day (

### Extraction of dates from the table and paragraphs

In [41]:
import re
from docx import Document

def extract_dates_from_docx(docx_file):
    doc = Document(docx_file)
    text_content = []

    for para in doc.paragraphs:
        text_content.append(para.text)

    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                text_content.append(cell.text)

    full_text = "\n".join(text_content)

    #  regex pattern to match various date formats
    date_pattern = r"\b(?:Mon|Tues|Wed|Thurs|Fri|Sat|Sun)?[, ]?\s*(?:Jan\.?|Feb\.?|Mar\.?|Apr\.?|May\.?|Jun\.?|Jul\.?|Aug\.?|Sep\.?|Oct\.?|Nov\.?|Dec\.?)\s*\d{1,2}\b"
    dates = re.findall(date_pattern, full_text)
    unique_dates = sorted(set(dates))

    return unique_dates

# Main function
if __name__ == "__main__":
    
    file_path = 'CAP4530 001-syllabus_Fall2024.docx'  
    extracted_dates = extract_dates_from_docx(file_path)
    if extracted_dates:
        print("Extracted Dates:")
        for date in extracted_dates:
            print(date)
    else:
        print("No dates found.")


Extracted Dates:

  Sep. 8
 Nov. 13
 Nov. 27
 Oct. 27
 Oct. 6
 Sep. 22
 Sep.1
 Sep.8
, Aug 29
, Dec. 5
, Dec.10
, Nov 28
, Nov.
       21
, Nov.
     7
, Nov. 14
, Oct 31
, Oct.
       10
, Oct.
     17
, Oct. 24
, Oct. 3
, Sep.
      12
, Sep. 19
, Sep. 26
, Sep. 5
Tues, Aug
     27
Tues, Dec. 3
Tues, Nov 26
Tues, Nov.
      12
Tues, Nov.
     5
Tues, Nov. 
       19
Tues, Oct.
     8
Tues, Oct.
    1
Tues, Oct. 
       15
Tues, Oct. 
     22
Tues, Oct. 29
Tues, Sep.
       10
Tues, Sep.
      17
Tues, Sep. 24
Tues, Sep.3


In [48]:
import re
from docx import Document

def extract_academic_info(docx_file):
    # Load the DOCX file
    doc = Document(docx_file)
    
    # Combine text from paragraphs and tables
    text_content = []

    # Extract text from paragraphs
    for para in doc.paragraphs:
        text_content.append(para.text)

    # Extract text from tables
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                text_content.append(cell.text)

    # Join all text into a single string for regex searching
    full_text = "\n".join(text_content)

    # Debugging: Print the full text being searched
    print("Full Text Extracted:\n", full_text)
    print(f"Total text length: {len(full_text)} characters\n")

    # Define regex patterns for various events
    patterns = {
        "Project": r"(Project|Project Due)\s*[-:]?\s*(?P<event_date>(?:Mon|Tues|Wed|Thurs|Fri|Sat|Sun)[, ]?\s*(?:Jan\.?|Feb\.?|Mar\.?|Apr\.?|May\.?|Jun\.?|Jul\.?|Aug\.?|Sep\.?|Oct\.?|Nov\.?|Dec\.?)\.?\s*\d{1,2}|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*\d{1,2}(?:th|st|nd|rd)?[, ]?\s*\d{4}?)\s*(?P<description>.*?)\s*(?=\n|$)",
        "Due": r"(Due|Due Date)\s*[-:]?\s*(?P<event_date>(?:Mon|Tues|Wed|Thurs|Fri|Sat|Sun)[, ]?\s*(?:Jan\.?|Feb\.?|Mar\.?|Apr\.?|May\.?|Jun\.?|Jul\.?|Aug\.?|Sep\.?|Oct\.?|Nov\.?|Dec\.?)\.?\s*\d{1,2}|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*\d{1,2}(?:th|st|nd|rd)?[, ]?\s*\d{4}?)\s*(?P<description>.*?)\s*(?=\n|$)",
        "CQ": r"(CQ)\s*[-:]?\s*(?P<event_date>(?:Mon|Tues|Wed|Thurs|Fri|Sat|Sun)[, ]?\s*(?:Jan\.?|Feb\.?|Mar\.?|Apr\.?|May\.?|Jun\.?|Jul\.?|Aug\.?|Sep\.?|Oct\.?|Nov\.?|Dec\.?)\.?\s*\d{1,2}|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*\d{1,2}(?:th|st|nd|rd)?[, ]?\s*\d{4}?)\s*(?P<description>.*?)\s*(?=\n|$)",
        "Final Exam": r"(Final Exam|Final Exam Due)\s*[-:]?\s*(?P<event_date>(?:Mon|Tues|Wed|Thurs|Fri|Sat|Sun)[, ]?\s*(?:Jan\.?|Feb\.?|Mar\.?|Apr\.?|May\.?|Jun\.?|Jul\.?|Aug\.?|Sep\.?|Oct\.?|Nov\.?|Dec\.?)\.?\s*\d{1,2}|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*\d{1,2}(?:th|st|nd|rd)?[, ]?\s*\d{4}?)\s*(?P<description>.*?)\s*(?=\n|$)",
        "Final Project": r"(Final Project|Final Project Due)\s*[-:]?\s*(?P<event_date>(?:Mon|Tues|Wed|Thurs|Fri|Sat|Sun)[, ]?\s*(?:Jan\.?|Feb\.?|Mar\.?|Apr\.?|May\.?|Jun\.?|Jul\.?|Aug\.?|Sep\.?|Oct\.?|Nov\.?|Dec\.?)\.?\s*\d{1,2}|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*\d{1,2}(?:th|st|nd|rd)?[, ]?\s*\d{4}?)\s*(?P<description>.*?)\s*(?=\n|$)"
    }

    # Prepare a list to hold event details
    all_event_details = []

    # Extract details for each event type
    for event_type, pattern in patterns.items():
        matches = re.finditer(pattern, full_text)
        for match in matches:
            event_name = match.group(0).strip()
            event_date = match.group('event_date').strip()
            description = match.group('description').strip()
            all_event_details.append({
                "Event Name": event_name,
                "Event Date": event_date,
                "Description": description
            })
    
    return all_event_details

# Main function
if __name__ == "__main__":
    # Path to the DOCX file
    file_path = 'CAP4530 001-syllabus_Fall2024.docx'  # Change this to your file path
    
    # Extract academic event information from the DOCX file
    extracted_events = extract_academic_info(file_path)
    
    # Output the extracted events
    if extracted_events:
        print("Extracted Academic Events:")
        for event in extracted_events:
            print(f"Event Name: {event['Event Name']}")
            print(f"Event Date: {event['Event Date']}")
            print(f"Description: {event['Description']}\n")
    else:
        print("No academic events found.")


Full Text Extracted:
 




Welcome!
Dear students welcome to the COP 4530 class. I hope we will have very productive semester. 

University Course Description
Understand and implement fundamentals of concise data structure and organization for program efficiency, clarity and simplification. Implementation of different data types and structures. Understanding of current data structures. Functional programming concepts will be covered.
Course Prerequisites
COP 3514, CDA 3103
Course Purpose 
A data structure is a specialized format for organizing, processing, retrieving and storing data. While there are several basic and advanced structure types, any data structure is designed to arrange data to suit a specific purpose so that it can be accessed and worked with in appropriate ways.
Instructor Contact Information and Communication 
The students can reach me regarding any questions through office hours. Also students can use the email system through Canvas or USF email. I will check the ema