## Keyword Strategy Testing

In [1]:
import re 

# Phrases that, on their own, mark a paragraph as AI-policy text.
PRIMARY_KEYWORDS = [
    'ai policy', 'aigenerated content', 'ai-generated', 'ai-assisted',
    'generative ai', 'llm',
]
# Terms showing that a paragraph talks about AI at all.
AI_CONTEXT_WORDS = [
    'ai', 'artificial intelligence', 'generative', 'chatgpt', 'llm', 'copilot', 'bard',
    'large language model',
    'gemini', 'dall-e', 'gpt', 'midjourney', 'stable diffusion', 'ai tool', 'ai tools'
]
# Terms showing that a paragraph talks about rules / permitted use.
POLICY_KEYWORDS = [
    'academic integrity', 'academic dishonesty', 'plagiarism', 'cheating', 'unauthorized',
    'unauthorized use', 'unauthorized assistance', 'citation', 'cite', 'attribution',
    'acknowledge', 'permitted', 'allowed', 'prohibited', 'forbidden', 'disclosure',
    'ethical use', 'responsible use', 'use', 'using', 'assistance', 'help', 'guidance',
    'policy', 'rule'
]

def find_ai_policy_paragraphs(paragraphs):
    """
    KEEP a paragraph if it contains
    a PRIMARY_KEYWORD
    OR
    (an AI_CONTEXT_WORD AND a POLICY_KEYWORD)

    Args:
        paragraphs: iterable of paragraph strings.

    Returns:
        A list of dicts ``{'text': <paragraph>, 'reason': <why it was kept>}``
        in input order.
    """
    # Primary keywords must START at a word boundary. A plain substring test
    # made 'llm' fire inside "enrollment". The trailing side is left open on
    # purpose so suffixed forms such as "LLMs" keep matching.
    primary_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(kw) for kw in PRIMARY_KEYWORDS) + r')',
        re.IGNORECASE,
    )
    # AI-context words must be whole words ('ai' must not match "plagiarism").
    ai_context_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(kw) for kw in AI_CONTEXT_WORDS) + r')\b',
        re.IGNORECASE,
    )

    found_policies = []
    for para in paragraphs:
        # Rule A: a primary keyword is sufficient on its own.
        if primary_pattern.search(para):
            found_policies.append({
                'text': para,
                'reason': 'Matched a primary keyword.'
            })
            continue

        # Rule B: require an AI-context word ...
        if not ai_context_pattern.search(para):
            continue

        # ... plus a policy keyword. Substring search is kept here so that
        # inflected forms ("cited", "rules", "used") still count.
        para_lower = para.lower()
        if any(kw in para_lower for kw in POLICY_KEYWORDS):
            found_policies.append({
                'text': para,
                'reason': 'Matched the combination of AI-context and policy keywords.'
            })

    return found_policies

# Five hand-written paragraphs used to exercise the keyword strategy.
sample_paragraphs = [
    "The use of generative AI tools like ChatGPT is permitted for brainstorming, but all submitted work must be original.",
    "Academic dishonesty, including plagiarism and cheating on exams, will result in a failing grade for the course.",
    "Students must cite any assistance from AI, as failure to do so is a violation of academic integrity.",
    "This course will include a lecture on the history of artificial intelligence and its impact on modern society.",
    "All sources must be properly cited in APA format. Failure to attribute your sources constitutes plagiarism."
]

extracted_policies = find_ai_policy_paragraphs(sample_paragraphs)

# --- Print results: a header line, then one three-line entry per match ---
print(f"--- Found {len(extracted_policies)} AI Policy Paragraphs ---")
for i, policy in enumerate(extracted_policies, start=1):
    print(
        f"\n{i}. Paragraph:",
        f"   '{policy['text']}'",
        f"   Reason for extraction: {policy['reason']}",
        sep="\n",
    )


--- Found 2 AI Policy Paragraphs ---

1. Paragraph:
   'The use of generative AI tools like ChatGPT is permitted for brainstorming, but all submitted work must be original.'
   Reason for extraction: Matched a primary keyword.

2. Paragraph:
   'Students must cite any assistance from AI, as failure to do so is a violation of academic integrity.'
   Reason for extraction: Matched the combination of AI-context and policy keywords.


## Final version from Qinjunjie



In [2]:
import re
import os
import sys 
from docx import Document
import pdfplumber
# import pytesseract
# from pdf2image import convert_from_path

# --- For Windows users, you may need to specify the path to the Tesseract executable ---
# Uncomment and update the line below if you are on Windows and Tesseract is not in your PATH
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Conditionally import the library for .doc files, only on Windows
if sys.platform == 'win32':
    try:
        import win32com.client as win32
    except ImportError:
        print("Warning: The 'pywin32' library is not installed. .doc files cannot be processed.")
        print("To enable .doc support on Windows, run: pip install pywin32")
        win32 = None
else:
    win32 = None

# ==============================================================================
# PART 1: KEYWORD DEFINITIONS & MAPPINGS
# ==============================================================================
# Phrases that, on their own, mark a paragraph as AI-policy text.
PRIMARY_KEYWORDS = ['ai policy', 'aigenerated content', 'ai-generated', 'ai-assisted', 'generative ai']
# Terms showing that a paragraph talks about AI at all.
AI_CONTEXT_WORDS = ['ai', 'artificial intelligence', 'generative', 'chatgpt', 'llm', 'copilot', 'bard', 'large language model', 'gemini', 'dall-e', 'gpt', 'midjourney', 'stable diffusion', 'ai tool', 'ai tools']
# Terms showing that a paragraph talks about rules / permitted use.
POLICY_KEYWORDS = ['academic integrity', 'academic dishonesty', 'plagiarism', 'cheating', 'unauthorized', 'unauthorized use', 'unauthorized assistance', 'citation', 'cite', 'attribution', 'acknowledge', 'permitted', 'allowed', 'prohibited', 'forbidden', 'disclosure', 'ethical use', 'responsible use', 'use', 'using', 'assistance', 'help', 'guidance', 'policy', 'rule']

# Department long name -> course-code prefix (a list when several prefixes apply).
DEPARTMENT_MAP = {
    'Africana Studies': 'AFRI', 'American Studies': ['AMST', 'ETHN'], 'Anthropology': 'ANTH',
    'Economics': 'ECON', 'Computer Science': 'CSCI', 'Applied Mathematics': 'APMA',
    'Public Health': 'PHP',
    'Cognitive, Linguistic, and Psychological Sciences': 'CLPS',
}

def build_checkers():
    """Compile the three case-insensitive keyword regexes.

    Returns:
        (primary_checker, ai_checker, policy_checker)

        * primary_checker - any PRIMARY_KEYWORDS phrase; spaces inside a phrase
          are optional (``\\s*``). The phrase must start at a word boundary so
          that e.g. "Thai policy" no longer matches ``ai policy``; the trailing
          side stays open, as before.
        * ai_checker - any AI_CONTEXT_WORDS entry as a whole word, with an
          optional plural "s" so "LLMs" / "large language models" are caught.
        * policy_checker - any POLICY_KEYWORDS entry as a whole word.

    Every keyword is passed through ``re.escape`` so that punctuation in a
    keyword is always treated literally.
    """
    primary_alternatives = [
        r'\s*'.join(re.escape(part) for part in kw.split(' '))
        for kw in PRIMARY_KEYWORDS
    ]
    primary_checker = re.compile(r'\b(?:' + '|'.join(primary_alternatives) + r')', re.IGNORECASE)
    # The capturing group is kept so group(1) is still the matched keyword.
    ai_checker = re.compile(r'\b(' + '|'.join(re.escape(kw) for kw in AI_CONTEXT_WORDS) + r')s?\b', re.IGNORECASE)
    policy_checker = re.compile(r'\b(' + '|'.join(re.escape(kw) for kw in POLICY_KEYWORDS) + r')\b', re.IGNORECASE)
    return primary_checker, ai_checker, policy_checker

PRIMARY_CHECKER, AI_CHECKER, POLICY_CHECKER = build_checkers()

def is_policy_text(text):
    """Return the reason `text` counts as AI-policy text, or None if it does not.

    A primary keyword alone is sufficient; otherwise both an AI-context word
    and a policy keyword must be present.
    """
    if PRIMARY_CHECKER.search(text):
        return 'Matched a primary keyword.'
    if AI_CHECKER.search(text) and POLICY_CHECKER.search(text):
        return 'Matched the combination of AI-context and policy keywords.'
    return None

# ==============================================================================
# PART 2: TEXT EXTRACTION (NOW WITH .DOC SUPPORT)
# ==============================================================================
def extract_paragraphs_from_doc(doc_path):
    """
    Extracts paragraphs from a .doc file using MS Word automation (Windows only).

    Args:
        doc_path: path to the .doc file (made absolute before it is handed to Word).

    Returns:
        List of non-empty, stripped paragraph strings; [] when 'pywin32' is
        unavailable or Word reports an error.
    """
    if not win32:
        print(f"Skipping .doc file '{os.path.basename(doc_path)}' as 'pywin32' is not available on this system.")
        print("Please manually save it as .docx or .pdf to analyze.")
        return []

    word = None
    doc = None
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False
        # Get the full absolute path, which COM objects often require
        abs_path = os.path.abspath(doc_path)
        doc = word.Documents.Open(abs_path)
        paragraphs = []
        for p in doc.Paragraphs:
            text = p.Range.Text.strip()  # read the COM property once per paragraph
            if text:
                paragraphs.append(text)
        return paragraphs
    except Exception as e:
        print(f"Error processing .doc file with MS Word: {e}")
        return []
    finally:
        # Close and Quit are attempted independently: a failing Close must not
        # skip Quit (which would leave a Word process behind) and must not
        # replace the result with an exception.
        if doc is not None:
            try:
                doc.Close(False)  # Close the document without saving changes
            except Exception as e:
                print(f"Warning: could not close the .doc file cleanly: {e}")
        if word is not None:
            try:
                word.Quit()  # Quit the Word application
            except Exception as e:
                print(f"Warning: could not quit MS Word cleanly: {e}")

def extract_paragraphs_from_docx(doc_path):
    """Collect the non-blank text blocks of a .docx file.

    Body paragraphs come first, followed by the text of every table cell
    (table by table, row by row). Returns [] and prints a message if the
    file cannot be read.
    """
    try:
        doc = Document(doc_path)
        blocks = []
        for p in doc.paragraphs:
            if p.text.strip():
                blocks.append(p.text)
        blocks.extend(
            cell.text
            for table in doc.tables
            for row in table.rows
            for cell in row.cells
            if cell.text.strip()
        )
        return blocks
    except Exception as e:
        print(f"Error reading DOCX file {doc_path}: {e}")
        return []

def _reconstruct_paragraphs_from_page(page):
    """Helper function to reconstruct paragraphs on a single page using coordinates."""
    words = page.extract_words(keep_blank_chars=False, x_tolerance=2)
    if not words: return []
    lines = {};
    for word in words:
        line_top = round(word['top'], 2)
        if line_top not in lines: lines[line_top] = []
        lines[line_top].append(word)
    for line_top in lines: lines[line_top].sort(key=lambda w: w['x0'])
    sorted_lines = sorted(lines.items(), key=lambda item: item[0])
    reconstructed_lines = []; line_heights = []; last_top = None
    for top, words_in_line in sorted_lines:
        text = " ".join(w['text'] for w in words_in_line)
        reconstructed_lines.append({'top': top, 'text': text})
        if last_top is not None: line_heights.append(top - last_top)
        last_top = top
    if not reconstructed_lines: return []
    avg_line_height = sum(line_heights) / len(line_heights) if line_heights else 12
    paragraph_break_threshold = avg_line_height * 1.5
    page_paragraphs = []; current_paragraph = reconstructed_lines[0]['text']
    for i in range(1, len(reconstructed_lines)):
        prev_line, curr_line = reconstructed_lines[i-1], reconstructed_lines[i]
        if (curr_line['top'] - prev_line['top']) > paragraph_break_threshold:
            page_paragraphs.append(current_paragraph)
            current_paragraph = curr_line['text']
        else:
            current_paragraph += " " + curr_line['text']
    page_paragraphs.append(current_paragraph)
    return page_paragraphs

def extract_paragraphs_from_pdf(pdf_path):
    """
    Extract paragraphs from a PDF using word coordinates and cross-page stitching.

    Each page is split into paragraphs by `_reconstruct_paragraphs_from_page`.
    When the last paragraph of a page does not end in closing punctuation it
    is treated as unfinished and prepended to the first paragraph of the next
    page that has text.

    Args:
        pdf_path: path to the PDF file.

    Returns:
        List of paragraph strings. Always a list: [] when nothing could be
        extracted or parsing raised. (The function used to fall through and
        return None in those cases.)
    """
    all_paragraphs = []
    carry_over_paragraph = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                paragraphs_on_page = _reconstruct_paragraphs_from_page(page)
                if not paragraphs_on_page:
                    continue

                # If there's a carry-over from the previous page, stitch it to the first paragraph
                if carry_over_paragraph:
                    paragraphs_on_page[0] = carry_over_paragraph + " " + paragraphs_on_page[0]
                    carry_over_paragraph = ""

                # Heuristic: a last paragraph without closing punctuation is likely incomplete.
                last_para = paragraphs_on_page[-1]
                if not last_para.strip().endswith(('.', '?', '!', '"', "'", ')', ':', ';')):
                    carry_over_paragraph = paragraphs_on_page.pop()

                all_paragraphs.extend(paragraphs_on_page)

        # Add any final carry-over from the very last page
        if carry_over_paragraph:
            all_paragraphs.append(carry_over_paragraph)
    except Exception as e:
        print(f"Coordinate-based parsing failed: {e}")
        return []

    if all_paragraphs:
        print("Successfully extracted paragraphs using coordinate-based method.")
        return all_paragraphs

    # The OCR fallback below is disabled, so the messages no longer claim
    # that OCR is being attempted.
    print("Coordinate-based method failed. No text extracted.")
    # try:
    #     images = convert_from_path(pdf_path); full_text = ""
    #     for image in images: full_text += pytesseract.image_to_string(image) + "\n\n"
    #     return [p.strip().replace('\n', ' ') for p in full_text.split('\n\n') if p.strip()]
    # except Exception as e: print(f"OCR processing failed: {e}"); return []
    return []

# ==============================================================================
# PART 3: ANALYSIS LOGIC
# ==============================================================================
def find_course_code(paragraphs, search_limit=30):
    """Find the course code (e.g. "ECON 1110") near the top of a syllabus.

    Only the first `search_limit` paragraphs are inspected.

    1. For each DEPARTMENT_MAP entry, look for "<long department name> <number>"
       (case-insensitive) and return "<short code> <number>". When a department
       maps to a list of codes, the first one is reported. (The list itself
       used to be formatted into the result, e.g. "['AMST', 'ETHN'] 1234".)
    2. Otherwise fall back to the first "<2-4 capitals>[/<2-4 capitals>...] <number>"
       pattern, with spaces removed from the department part.

    Returns:
        The course code string, or None when nothing matches.
    """
    head = paragraphs[:search_limit]
    for long_name, short_code in DEPARTMENT_MAP.items():
        pattern = re.compile(r'\b' + re.escape(long_name) + r'\s*(\d{3,4}[A-Z]?)\b', re.IGNORECASE)
        for para in head:
            if match := pattern.search(para):
                code = short_code if isinstance(short_code, str) else short_code[0]
                return f"{code} {match.group(1)}"

    fallback_pattern = re.compile(r'\b(([A-Z]{2,4}(\s*/\s*[A-Z]{2,4})*))\s*(\d{3,4}[A-Z]?)\b')
    for para in head:
        if match := fallback_pattern.search(para):
            dept_part = match.group(1).replace(" ", "")
            num_part = match.group(4)
            return f"{dept_part} {num_part}"
    return None

def analyze_ai_policy(paragraphs):
    """Return {'text', 'reason'} records for every paragraph `is_policy_text` accepts."""
    return [
        {'text': para, 'reason': reason}
        for para in paragraphs
        if (reason := is_policy_text(para))
    ]

# ==============================================================================
# PART 4: MAIN CONTROLLER
# ==============================================================================
def analyze_syllabus(file_path):
    """Analyze one syllabus file and print a report.

    Picks the extractor from the file extension (.docx, .pdf, .doc), prints
    the detected course code, then lists every distinct AI-policy paragraph
    with the reason it was selected. Returns None; all results are printed.
    """
    print(f"\n{'='*20} Analyzing Syllabus: {os.path.basename(file_path)} {'='*20}")
    if not os.path.exists(file_path):
        print("Error: File not found.")
        return

    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.docx':
        paragraphs = extract_paragraphs_from_docx(file_path)
    elif ext == '.pdf':
        paragraphs = extract_paragraphs_from_pdf(file_path)
    elif ext == '.doc':
        paragraphs = extract_paragraphs_from_doc(file_path)
    else:
        print(f"Error: Unsupported file type '{ext}'.")
        return

    if not paragraphs:
        print("Could not extract any usable text.")
        return

    print(f"Extracted {len(paragraphs)} distinct paragraphs. Analyzing...")
    course_code = find_course_code(paragraphs)
    print(f"--- Course Code: {course_code if course_code else 'Not Found'} ---")

    ai_policy_sections = analyze_ai_policy(paragraphs)
    if not ai_policy_sections:
        print("\n--- No AI policy paragraphs were found in this document. ---")
        return

    # De-duplicate BEFORE reporting the count, so the number in the header
    # always equals the number of paragraphs listed below it.
    unique_policies = list({p['text']: p for p in ai_policy_sections}.values())
    print(f"\n--- SUCCESS: Found {len(unique_policies)} AI Policy Paragraph(s) ---\n")
    for i, policy in enumerate(unique_policies, 1):
        print(f"--- Relevant Paragraph {i} ---")
        print(policy['text'])
        print(f"(Reason: {policy['reason']})\n")

# ==============================================================================
# PART 5: EXECUTION
# ==============================================================================
if __name__ == "__main__":
    # Syllabus files to analyze, in report order.
    docx_file1 = "8C2WFwQPrq6vcFU0Yml3vHcL1CYb5HnYcxp5s7A5.docx"
    docx_file2 = "UMeRwRyvXAKAH6DwCkTmFyIunq3ti97bJPARlu7C.docx"
    pdf_file = "0Fslu7lGZ8dJG0OYQGf1TgzFMPyFDEv0n5Q96BNq.pdf"
    doc_file = "7Y6H93I4L6F5bcplrUpPbf5AjQcypbJfrevTSiNf.doc"

    for position, syllabus in enumerate((docx_file1, docx_file2, pdf_file, doc_file)):
        # Blank-line separator between reports (not before the first one).
        if position:
            print("\n" * 3)
        analyze_syllabus(syllabus)


Error: File not found.





Error: File not found.





Error: File not found.





Error: File not found.


## ADD Department (My First Version)

In [3]:
import re
import os
import sys
from docx import Document
import pdfplumber

# Conditionally import the library for .doc files, only on Windows
if sys.platform == 'win32':
    try:
        import win32com.client as win32
    except ImportError:
        print("Warning: The 'pywin32' library is not installed. .doc files cannot be processed.")
        print("To enable .doc support on Windows, run: pip install pywin32")
        win32 = None
else:
    win32 = None

# ==============================================================================
# PART 1: KEYWORD DEFINITIONS & MAPPINGS
# ==============================================================================
# Phrases that, on their own, mark a paragraph as AI-policy text.
PRIMARY_KEYWORDS = ['ai policy', 'aigenerated content', 'ai-generated', 'ai-assisted', 'generative ai']
# Terms showing that a paragraph talks about AI at all.
AI_CONTEXT_WORDS = [
    'ai', 'artificial intelligence', 'generative', 'chatgpt', 'llm', 'copilot',
    'bard', 'large language model', 'gemini', 'dall-e', 'gpt', 'midjourney',
    'stable diffusion', 'ai tool', 'ai tools'
]
# Terms showing that a paragraph talks about rules / permitted use.
POLICY_KEYWORDS = [
    'academic integrity', 'academic dishonesty', 'plagiarism', 'cheating', 'unauthorized',
    'unauthorized use', 'unauthorized assistance', 'citation', 'cite', 'attribution',
    'acknowledge', 'permitted', 'allowed', 'prohibited', 'forbidden', 'disclosure',
    'ethical use', 'responsible use', 'use', 'using', 'assistance', 'help', 'guidance',
    'policy', 'rule'
]

# Department long name -> course-code prefix (a list when several prefixes apply).
DEPARTMENT_MAP = {
    'Africana Studies': 'AFRI',
    'American Studies': ['AMST', 'ETHN'],
    'Anthropology': 'ANTH',
    'Economics': 'ECON',
    'Computer Science': 'CSCI',
    'Applied Mathematics': 'APMA',
    'Public Health': 'PHP',
    'Cognitive, Linguistic, and Psychological Sciences': 'CLPS',
}

# ----------------------------------------------------------------
# build reverse lookup: short code → full department name
CODE_TO_DEPT = {}
for long_name, short in DEPARTMENT_MAP.items():
    if isinstance(short, list):
        for s in short:
            CODE_TO_DEPT[s] = long_name
    else:
        CODE_TO_DEPT[short] = long_name
# ----------------------------------------------------------------

def build_checkers():
    """Compile the three case-insensitive keyword regexes.

    Returns:
        (primary_checker, ai_checker, policy_checker)

        * primary_checker - any PRIMARY_KEYWORDS phrase; spaces inside a phrase
          are optional (``\\s*``). The phrase must start at a word boundary so
          that e.g. "Thai policy" no longer matches ``ai policy``; the trailing
          side stays open, as before.
        * ai_checker - any AI_CONTEXT_WORDS entry as a whole word, with an
          optional plural "s" so "LLMs" / "large language models" are caught.
        * policy_checker - any POLICY_KEYWORDS entry as a whole word.

    Every keyword is passed through ``re.escape`` so that punctuation in a
    keyword is always treated literally.
    """
    primary_alternatives = [
        r'\s*'.join(re.escape(part) for part in kw.split(' '))
        for kw in PRIMARY_KEYWORDS
    ]
    primary_checker = re.compile(
        r'\b(?:' + '|'.join(primary_alternatives) + r')',
        re.IGNORECASE
    )
    # The capturing group is kept so group(1) is still the matched keyword.
    ai_checker = re.compile(
        r'\b(' + '|'.join(re.escape(kw) for kw in AI_CONTEXT_WORDS) + r')s?\b',
        re.IGNORECASE
    )
    policy_checker = re.compile(
        r'\b(' + '|'.join(re.escape(kw) for kw in POLICY_KEYWORDS) + r')\b',
        re.IGNORECASE
    )
    return primary_checker, ai_checker, policy_checker

PRIMARY_CHECKER, AI_CHECKER, POLICY_CHECKER = build_checkers()

def is_policy_text(text):
    """Return the reason `text` counts as AI-policy text, or None if it does not.

    A primary keyword alone is sufficient; otherwise both an AI-context word
    and a policy keyword must be present.
    """
    if PRIMARY_CHECKER.search(text):
        return 'Matched a primary keyword.'
    if AI_CHECKER.search(text) and POLICY_CHECKER.search(text):
        return 'Matched the combination of AI-context and policy keywords.'
    return None

# ==============================================================================
# PART 2: TEXT EXTRACTION
# ==============================================================================

def extract_paragraphs_from_doc(doc_path):
    """Extract paragraphs from a .doc file by automating MS Word through `win32`.

    Args:
        doc_path: path to the .doc file (made absolute before it is handed to Word).

    Returns:
        List of non-empty, stripped paragraph strings; [] when `win32` is
        unavailable or Word reports an error.
    """
    if not win32:
        print(f"Skipping .doc file '{os.path.basename(doc_path)}' as 'pywin32' is not available on this system.")
        return []
    word = None
    doc = None
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False
        abs_path = os.path.abspath(doc_path)
        doc = word.Documents.Open(abs_path)
        paragraphs = []
        for p in doc.Paragraphs:
            text = p.Range.Text.strip()  # read the COM property once per paragraph
            if text:
                paragraphs.append(text)
        return paragraphs
    except Exception as e:
        print(f"Error processing .doc file: {e}")
        return []
    finally:
        # Close and Quit are attempted independently: a failing Close must not
        # skip Quit (which would leave a Word process behind) and must not
        # replace the result with an exception.
        if doc is not None:
            try:
                doc.Close(False)  # close without saving changes
            except Exception as e:
                print(f"Warning: could not close the .doc file cleanly: {e}")
        if word is not None:
            try:
                word.Quit()
            except Exception as e:
                print(f"Warning: could not quit MS Word cleanly: {e}")


def extract_paragraphs_from_docx(doc_path):
    """Collect the non-blank text blocks of a .docx file.

    Body paragraphs come first, followed by the text of every table cell
    (table by table, row by row). Returns [] and prints a message if the
    file cannot be read.
    """
    try:
        document = Document(doc_path)
        kept = []
        for para in document.paragraphs:
            if para.text.strip():
                kept.append(para.text)
        kept += [
            cell.text
            for table in document.tables
            for row in table.rows
            for cell in row.cells
            if cell.text.strip()
        ]
        return kept
    except Exception as e:
        print(f"Error reading DOCX file {doc_path}: {e}")
        return []


def _reconstruct_paragraphs_from_page(page):
    words = page.extract_words(keep_blank_chars=False, x_tolerance=2)
    if not words:
        return []
    lines = {}
    for word in words:
        line_top = round(word['top'], 2)
        lines.setdefault(line_top, []).append(word)
    for line_top in lines:
        lines[line_top].sort(key=lambda w: w['x0'])
    sorted_lines = sorted(lines.items(), key=lambda item: item[0])
    reconstructed_lines = []
    last_top = None
    line_heights = []
    for top, ws in sorted_lines:
        text = " ".join(w['text'] for w in ws)
        reconstructed_lines.append({'top': top, 'text': text})
        if last_top is not None:
            line_heights.append(top - last_top)
        last_top = top
    avg_height = sum(line_heights) / len(line_heights) if line_heights else 12
    threshold = avg_height * 1.5
    paragraphs = []
    current = reconstructed_lines[0]['text']
    for prev, curr in zip(reconstructed_lines, reconstructed_lines[1:]):
        if (curr['top'] - prev['top']) > threshold:
            paragraphs.append(current)
            current = curr['text']
        else:
            current += ' ' + curr['text']
    paragraphs.append(current)
    return paragraphs


def extract_paragraphs_from_pdf(pdf_path):
    """Extract paragraphs from a PDF, stitching paragraphs split across pages.

    Each page is split by `_reconstruct_paragraphs_from_page`. A page's last
    paragraph that does not end in closing punctuation is held back and
    prepended to the first paragraph of the next page that has text.
    Returns the list of paragraphs, or [] if none were found or parsing failed.
    """
    sentence_enders = ('.', '?', '!', '"', "'", ')', ':', ';')
    collected = []
    pending = ""  # unfinished paragraph carried over from the previous page
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_paras = _reconstruct_paragraphs_from_page(page)
                if page_paras:
                    if pending:
                        page_paras[0] = pending + ' ' + page_paras[0]
                        pending = ""
                    if not page_paras[-1].strip().endswith(sentence_enders):
                        pending = page_paras.pop()
                    collected += page_paras
        if pending:
            collected.append(pending)
        if not collected:
            print("Coordinate-based method failed. No text extracted.")
        else:
            print("Successfully extracted paragraphs using coordinate-based method.")
            return collected
    except Exception as e:
        print(f"Coordinate-based parsing failed: {e}")
    return []

# ==============================================================================
# PART 3: ANALYSIS LOGIC
# ==============================================================================
def find_course_code(paragraphs, search_limit=30):
    """Find the course code (e.g. "ECON 1110") near the top of a syllabus.

    Only the first `search_limit` paragraphs are inspected.

    1. For each DEPARTMENT_MAP entry, look for "<long department name> <number>"
       (case-insensitive) and return "<short code> <number>". When a department
       maps to a list of codes, the first one is reported. (The list itself
       used to be formatted into the result, e.g. "['AMST', 'ETHN'] 1234",
       which no prefix lookup can resolve.)
    2. Otherwise fall back to the first "<2-4 capitals>[/<2-4 capitals>...] <number>"
       pattern, with spaces removed from the department part.

    Returns:
        The course code string, or None when nothing matches.
    """
    head = paragraphs[:search_limit]
    for long_name, short_code in DEPARTMENT_MAP.items():
        pattern = re.compile(r'\b' + re.escape(long_name) + r'\s*(\d{3,4}[A-Z]?)\b', re.IGNORECASE)
        for para in head:
            if m := pattern.search(para):
                code = short_code if isinstance(short_code, str) else short_code[0]
                return f"{code} {m.group(1)}"

    fallback = re.compile(r"\b(([A-Z]{2,4}(\s*/\s*[A-Z]{2,4})*))\s*(\d{3,4}[A-Z]?)\b")
    for para in head:
        if m := fallback.search(para):
            dept = m.group(1).replace(" ", "")
            num = m.group(4)
            return f"{dept} {num}"
    return None


def analyze_ai_policy(paragraphs):
    """Return {'text', 'reason'} records for every paragraph `is_policy_text` accepts."""
    return [
        {'text': para, 'reason': why}
        for para in paragraphs
        if (why := is_policy_text(para))
    ]

# ==============================================================================
# PART 4: MAIN CONTROLLER
# ==============================================================================
def analyze_syllabus(file_path):
    """Analyze one syllabus file and print a report.

    Picks the extractor from the file extension (.docx, .pdf, .doc), prints
    the detected course code and its department, then lists every distinct
    AI-policy paragraph with the reason it was selected. Returns None; all
    results are printed.
    """
    print(f"\n{'='*20} Analyzing Syllabus: {os.path.basename(file_path)} {'='*20}")
    if not os.path.exists(file_path):
        print("Error: File not found.")
        return

    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.docx':
        paragraphs = extract_paragraphs_from_docx(file_path)
    elif ext == '.pdf':
        paragraphs = extract_paragraphs_from_pdf(file_path)
    elif ext == '.doc':
        paragraphs = extract_paragraphs_from_doc(file_path)
    else:
        print(f"Error: Unsupported file type '{ext}'.")
        return

    if not paragraphs:
        print("Could not extract any usable text.")
        return

    print(f"Extracted {len(paragraphs)} distinct paragraphs. Analyzing...")
    course_code = find_course_code(paragraphs)
    print(f"--- Course Code: {course_code if course_code else 'Not Found'} ---")

    # Print department name
    if course_code:
        prefix = course_code.split()[0]
        # A cross-listed prefix such as "AMST/ETHN" is resolved through the
        # first of its parts that is a known code.
        dept_name = next(
            (CODE_TO_DEPT[code] for code in prefix.split('/') if code in CODE_TO_DEPT),
            'Unknown',
        )
        print(f"--- Department: {dept_name} ---")

    policies = analyze_ai_policy(paragraphs)
    if not policies:
        print("\n--- No AI policy paragraphs were found in this document. ---")
        return

    # De-duplicate BEFORE reporting the count, so the number in the header
    # always equals the number of paragraphs listed below it.
    unique = list({p['text']: p for p in policies}.values())
    print(f"\n--- SUCCESS: Found {len(unique)} AI Policy Paragraph(s) ---\n")
    for i, p in enumerate(unique, 1):
        print(f"--- Relevant Paragraph {i} ---")
        print(p['text'])
        print(f"(Reason: {p['reason']})\n")

# ==============================================================================
# PART 5: EXECUTION
# ==============================================================================
if __name__ == '__main__':
    # Syllabus files to analyze, in report order.
    files = [
        '8C2WFwQPrq6vcFU0Yml3vHcL1CYb5HnYcxp5s7A5.docx',
        'UMeRwRyvXAKAH6DwCkTmFyIunq3ti97bJPARlu7C.docx',
        '0Fslu7lGZ8dJG0OYQGf1TgzFMPyFDEv0n5Q96BNq.pdf',
        '7Y6H93I4L6F5bcplrUpPbf5AjQcypbJfrevTSiNf.doc'
    ]
    for syllabus_path in files:
        analyze_syllabus(syllabus_path)
        # Blank lines separate one report from the next.
        print("\n\n")



Error: File not found.




Error: File not found.




Error: File not found.




Error: File not found.





## ADD Knowledge Area

In [4]:
import re
import os
import sys
from docx import Document
import pdfplumber

# Conditionally import the library for .doc files, only on Windows
if sys.platform == 'win32':
    try:
        import win32com.client as win32
    except ImportError:
        print("Warning: The 'pywin32' library is not installed. .doc files cannot be processed.")
        print("To enable .doc support on Windows, run: pip install pywin32")
        win32 = None
else:
    win32 = None

# ==============================================================================
# PART 1: KEYWORD DEFINITIONS & MAPPINGS
# ==============================================================================
# Phrases that, on their own, mark a paragraph as AI-policy text.
PRIMARY_KEYWORDS = ['ai policy', 'aigenerated content', 'ai-generated', 'ai-assisted', 'generative ai']
# Terms showing that a paragraph talks about AI at all.
AI_CONTEXT_WORDS = [
    'ai', 'artificial intelligence', 'generative', 'chatgpt', 'llm', 'copilot',
    'bard', 'large language model', 'gemini', 'dall-e', 'gpt', 'midjourney',
    'stable diffusion', 'ai tool', 'ai tools'
]
# Terms showing that a paragraph talks about rules / permitted use.
POLICY_KEYWORDS = [
    'academic integrity', 'academic dishonesty', 'plagiarism', 'cheating', 'unauthorized',
    'unauthorized use', 'unauthorized assistance', 'citation', 'cite', 'attribution',
    'acknowledge', 'permitted', 'allowed', 'prohibited', 'forbidden', 'disclosure',
    'ethical use', 'responsible use', 'use', 'using', 'assistance', 'help', 'guidance',
    'policy', 'rule'
]

# Department long name -> course-code prefix (a list when several prefixes apply).
DEPARTMENT_MAP = {
    'Africana Studies': 'AFRI',
    'American Studies': ['AMST', 'ETHN'],
    'Anthropology': 'ANTH',
    'Economics': 'ECON',
    'Computer Science': 'CSCI',
    'Applied Mathematics': 'APMA',
    'Public Health': 'PHP',
    'Cognitive, Linguistic, and Psychological Sciences': 'CLPS',
}

# build reverse lookup: short code → full department name
CODE_TO_DEPT = {}
for long_name, short in DEPARTMENT_MAP.items():
    if isinstance(short, list):
        for s in short:
            CODE_TO_DEPT[s] = long_name
    else:
        CODE_TO_DEPT[short] = long_name

# map each department to a knowledge area
KNOWLEDGE_AREA_MAP = {
    'Africana Studies': 'Humanities',
    'American Studies': 'Humanities',
    'Anthropology': 'Social Science',
    'Economics': 'Social Science',
    'Computer Science': 'Physical Sciences',
    'Applied Mathematics': 'Physical Sciences',
    'Public Health': 'Life Sciences',
    'Cognitive, Linguistic, and Psychological Sciences': 'Social Science',
}

# compile keyword checkers
def build_checkers():
    """Compile the three case-insensitive keyword regexes.

    Returns:
        (primary_checker, ai_checker, policy_checker)

        * primary_checker - any PRIMARY_KEYWORDS phrase; spaces inside a phrase
          are optional (``\\s*``). The phrase must start at a word boundary so
          that e.g. "Thai policy" no longer matches ``ai policy``; the trailing
          side stays open, as before.
        * ai_checker - any AI_CONTEXT_WORDS entry as a whole word, with an
          optional plural "s" so "LLMs" / "large language models" are caught.
        * policy_checker - any POLICY_KEYWORDS entry as a whole word.

    Every keyword is passed through ``re.escape`` so that punctuation in a
    keyword is always treated literally.
    """
    primary_alternatives = [
        r'\s*'.join(re.escape(part) for part in kw.split(' '))
        for kw in PRIMARY_KEYWORDS
    ]
    primary_checker = re.compile(
        r'\b(?:' + '|'.join(primary_alternatives) + r')',
        re.IGNORECASE
    )
    # The capturing group is kept so group(1) is still the matched keyword.
    ai_checker = re.compile(
        r'\b(' + '|'.join(re.escape(kw) for kw in AI_CONTEXT_WORDS) + r')s?\b',
        re.IGNORECASE
    )
    policy_checker = re.compile(
        r'\b(' + '|'.join(re.escape(kw) for kw in POLICY_KEYWORDS) + r')\b',
        re.IGNORECASE
    )
    return primary_checker, ai_checker, policy_checker

PRIMARY_CHECKER, AI_CHECKER, POLICY_CHECKER = build_checkers()

def is_policy_text(text):
    """Return the reason `text` counts as AI-policy text, or None if it does not.

    A primary keyword alone is sufficient; otherwise both an AI-context word
    and a policy keyword must be present.
    """
    if PRIMARY_CHECKER.search(text):
        return 'Matched a primary keyword.'
    if AI_CHECKER.search(text) and POLICY_CHECKER.search(text):
        return 'Matched the combination of AI-context and policy keywords.'
    return None

# ==============================================================================
# PART 2: TEXT EXTRACTION
# ==============================================================================

def extract_paragraphs_from_doc(doc_path):
    """Extract paragraphs from a .doc file by automating MS Word through `win32`.

    Args:
        doc_path: path to the .doc file (made absolute before it is handed to Word).

    Returns:
        List of non-empty, stripped paragraph strings; [] when `win32` is
        unavailable or Word reports an error.
    """
    if not win32:
        print(f"Skipping .doc file '{os.path.basename(doc_path)}' as 'pywin32' is not available on this system.")
        return []
    word = None
    doc = None
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False
        abs_path = os.path.abspath(doc_path)
        doc = word.Documents.Open(abs_path)
        paragraphs = []
        for p in doc.Paragraphs:
            text = p.Range.Text.strip()  # read the COM property once per paragraph
            if text:
                paragraphs.append(text)
        return paragraphs
    except Exception as e:
        print(f"Error processing .doc file: {e}")
        return []
    finally:
        # Close and Quit are attempted independently: a failing Close must not
        # skip Quit (which would leave a Word process behind) and must not
        # replace the result with an exception.
        if doc is not None:
            try:
                doc.Close(False)  # close without saving changes
            except Exception as e:
                print(f"Warning: could not close the .doc file cleanly: {e}")
        if word is not None:
            try:
                word.Quit()
            except Exception as e:
                print(f"Warning: could not quit MS Word cleanly: {e}")


def extract_paragraphs_from_docx(doc_path):
    """Collect the non-blank text blocks of a .docx file.

    Body paragraphs come first, followed by the text of every table cell
    (table by table, row by row). Returns [] and prints a message if the
    file cannot be read.
    """
    try:
        document = Document(doc_path)
        collected = []
        for para in document.paragraphs:
            if para.text.strip():
                collected.append(para.text)
        collected.extend(
            cell.text
            for table in document.tables
            for row in table.rows
            for cell in row.cells
            if cell.text.strip()
        )
        return collected
    except Exception as e:
        print(f"Error reading DOCX file {doc_path}: {e}")
        return []


def _reconstruct_paragraphs_from_page(page):
    words = page.extract_words(x_tolerance=2)
    if not words: return []
    lines = {}
    for w in words:
        top = round(w['top'],2)
        lines.setdefault(top, []).append(w)
    for top in lines: lines[top].sort(key=lambda x: x['x0'])
    items = sorted(lines.items(), key=lambda kv: kv[0])
    texts, heights = [], []
    for idx,(top, ws) in enumerate(items):
        texts.append({'top': top, 'text': ' '.join(w['text'] for w in ws)})
        if idx>0: heights.append(top - items[idx-1][0])
    avg = sum(heights)/len(heights) if heights else 12
    threshold = avg*1.5
    paras, cur = [], texts[0]['text']
    for prev, curr in zip(texts, texts[1:]):
        if (curr['top'] - prev['top'])>threshold:
            paras.append(cur); cur = curr['text']
        else:
            cur += ' ' + curr['text']
    paras.append(cur)
    return paras


def extract_paragraphs_from_pdf(pdf_path):
    """Extract paragraphs from a PDF, stitching paragraphs split across pages.

    Each page is split by `_reconstruct_paragraphs_from_page`. A page's last
    paragraph that does not end in closing punctuation is held back and
    prepended to the first paragraph of the next page that has text.
    Returns the list of paragraphs, or [] if none were found or parsing failed.
    """
    closing_marks = ('.', '?', '!', '"', "'", ')', ':', ';')
    gathered = []
    unfinished = ''  # paragraph carried over from the previous page
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_paras = _reconstruct_paragraphs_from_page(page)
                if not page_paras:
                    continue
                if unfinished:
                    page_paras[0] = unfinished + ' ' + page_paras[0]
                    unfinished = ''
                tail = page_paras[-1].strip()
                if not tail.endswith(closing_marks):
                    unfinished = page_paras.pop()
                gathered.extend(page_paras)
        if unfinished:
            gathered.append(unfinished)
        if not gathered:
            print("Coordinate-based method failed. No text extracted.")
        else:
            print("Successfully extracted paragraphs using coordinate-based method.")
            return gathered
    except Exception as e:
        print(f"Coordinate-based parsing failed: {e}")
    return []

# ==============================================================================
# PART 3: ANALYSIS LOGIC
# ==============================================================================

def find_course_code(paragraphs, limit=30):
    """Find the course code (e.g. "ECON 1110") in the first `limit` paragraphs.

    First tries "<long department name> <number>" for every DEPARTMENT_MAP
    entry (case-insensitive), reporting the department's short code (the first
    one when the entry is a list). Otherwise falls back to the first
    "<2-4 capitals>[/<2-4 capitals>...] <number>" pattern, with spaces removed
    from the department part. Returns None when nothing matches.
    """
    candidates = paragraphs[:limit]

    for dept_name, codes in DEPARTMENT_MAP.items():
        dept_regex = re.compile(
            r'\b' + re.escape(dept_name) + r'\s*(\d{3,4}[A-Z]?)\b',
            re.IGNORECASE,
        )
        for text in candidates:
            hit = dept_regex.search(text)
            if hit:
                prefix = codes if isinstance(codes, str) else codes[0]
                return f"{prefix} {hit.group(1)}"

    generic_regex = re.compile(r"\b(([A-Z]{2,4}(\s*/\s*[A-Z]{2,4})*))\s*(\d{3,4}[A-Z]?)\b")
    for text in candidates:
        hit = generic_regex.search(text)
        if hit:
            prefix = hit.group(1).replace(' ', '')
            return f"{prefix} {hit.group(4)}"
    return None


def analyze_ai_policy(paragraphs):
    """Return {'text', 'reason'} records for every paragraph `is_policy_text` accepts."""
    matches = []
    for paragraph in paragraphs:
        reason = is_policy_text(paragraph)
        if reason:
            matches.append({'text': paragraph, 'reason': reason})
    return matches

# ==============================================================================
# PART 4: MAIN CONTROLLER
# ==============================================================================

def analyze_syllabus(path):
    """Analyze one syllabus file and print a report.

    Steps: check the file exists, extract paragraphs with the extractor that
    matches the extension (.docx, .pdf, .doc), look up the course code and its
    department / knowledge area, then print every distinct paragraph flagged
    by ``analyze_ai_policy``.

    Args:
        path: Path to the syllabus file.

    Returns:
        None. Everything is reported through ``print``; the function returns
        early when the file is missing, the extension is unsupported, no text
        could be extracted, or no policy paragraph is found.
    """
    print(f"\n{'='*20} Analyzing Syllabus: {os.path.basename(path)} {'='*20}")
    if not os.path.exists(path):
        print("Error: File not found.")
        return

    ext = os.path.splitext(path)[1].lower()
    if ext == '.docx':
        paras = extract_paragraphs_from_docx(path)
    elif ext == '.pdf':
        paras = extract_paragraphs_from_pdf(path)
    elif ext == '.doc':
        paras = extract_paragraphs_from_doc(path)
    else:
        print(f"Error: Unsupported file type '{ext}'.")
        return
    if not paras:
        print("Could not extract any usable text.")
        return

    print(f"Extracted {len(paras)} distinct paragraphs. Analyzing...")
    code = find_course_code(paras)
    print(f"--- Course Code: {code if code else 'Not Found'} ---")
    if code:
        prefix = code.split()[0]
        # A cross-listed prefix looks like "AMST/ETHN"; try each part so it
        # resolves to a department instead of always falling to 'Unknown'.
        dept = next(
            (CODE_TO_DEPT[part] for part in prefix.split('/') if part in CODE_TO_DEPT),
            'Unknown',
        )
        print(f"--- Department: {dept} ---")
        area = KNOWLEDGE_AREA_MAP.get(dept, 'Unknown')
        print(f"--- Knowledge Area: {area} ---")

    policies = analyze_ai_policy(paras)
    if not policies:
        print("\n--- No AI policy paragraphs were found in this document. ---")
        return

    # De-duplicate by text BEFORE counting so the number in the banner equals
    # the number of paragraphs actually printed below.
    unique = list({p['text']: p for p in policies}.values())
    print(f"\n--- SUCCESS: Found {len(unique)} AI Policy Paragraph(s) ---\n")
    for i, p in enumerate(unique, 1):
        print(f"--- Relevant Paragraph {i} ---")
        print(p['text'])
        print(f"(Reason: {p['reason']})\n")

# ==============================================================================
# PART 5: EXECUTION
# ==============================================================================
# Analyze each listed syllabus in turn, leaving blank lines between the reports.
if __name__ == '__main__':
    syllabus_files = [
        '8C2WFwQPrq6vcFU0Yml3vHcL1CYb5HnYcxp5s7A5.docx',
        'UMeRwRyvXAKAH6DwCkTmFyIunq3ti97bJPARlu7C.docx',
        '0Fslu7lGZ8dJG0OYQGf1TgzFMPyFDEv0n5Q96BNq.pdf',
        '7Y6H93I4L6F5bcplrUpPbf5AjQcypbJfrevTSiNf.doc',
    ]
    for syllabus_path in syllabus_files:
        analyze_syllabus(syllabus_path)
        print("\n" * 2)



Error: File not found.




Error: File not found.




Error: File not found.




Error: File not found.





## Create a CSV summary

In [5]:
import re
import os
import sys
import csv
from docx import Document
import pdfplumber

# Conditionally import the library for .doc files, only on Windows
if sys.platform == 'win32':
    try:
        import win32com.client as win32
    except ImportError:
        print("Warning: The 'pywin32' library is not installed. .doc files cannot be processed.")
        print("To enable .doc support on Windows, run: pip install pywin32")
        win32 = None
else:
    win32 = None

# ==============================================================================
# PART 1: KEYWORD DEFINITIONS & MAPPINGS
# ==============================================================================
# Phrases that flag a paragraph on their own (see is_policy_text).
# NOTE(review): 'aigenerated content' has no space or hyphen after "ai" —
# confirm this spelling is intentional.
PRIMARY_KEYWORDS = [
    'ai policy',
    'aigenerated content',
    'ai-generated',
    'ai-assisted',
    'generative ai',
]

# Words that indicate a paragraph talks about AI; only count together with a
# policy keyword.
AI_CONTEXT_WORDS = [
    'ai',
    'artificial intelligence',
    'generative',
    'chatgpt',
    'llm',
    'copilot',
    'bard',
    'large language model',
    'gemini',
    'dall-e',
    'gpt',
    'midjourney',
    'stable diffusion',
    'ai tool',
    'ai tools',
]

# Words that indicate a paragraph talks about rules or permitted behaviour.
POLICY_KEYWORDS = [
    'academic integrity',
    'academic dishonesty',
    'plagiarism',
    'cheating',
    'unauthorized',
    'unauthorized use',
    'unauthorized assistance',
    'citation',
    'cite',
    'attribution',
    'acknowledge',
    'permitted',
    'allowed',
    'prohibited',
    'forbidden',
    'disclosure',
    'ethical use',
    'responsible use',
    'use',
    'using',
    'assistance',
    'help',
    'guidance',
    'policy',
    'rule',
]

# Department name -> short course-code prefix (a list when cross-listed).
DEPARTMENT_MAP = {
    'Africana Studies': 'AFRI',
    'American Studies': ['AMST', 'ETHN'],
    'Anthropology': 'ANTH',
    'Economics': 'ECON',
    'Computer Science': 'CSCI',
    'Applied Mathematics': 'APMA',
    'Public Health': 'PHP',
    'Cognitive, Linguistic, and Psychological Sciences': 'CLPS',
}

# Reverse lookup: every short code -> full department name.
CODE_TO_DEPT = {
    code: dept_name
    for dept_name, codes in DEPARTMENT_MAP.items()
    for code in (codes if isinstance(codes, list) else [codes])
}

# Department name -> knowledge area.
KNOWLEDGE_AREA_MAP = {
    'Africana Studies': 'Humanities',
    'American Studies': 'Humanities',
    'Anthropology': 'Social Science',
    'Economics': 'Social Science',
    'Computer Science': 'Physical Sciences',
    'Applied Mathematics': 'Physical Sciences',
    'Public Health': 'Life Sciences',
    'Cognitive, Linguistic, and Psychological Sciences': 'Social Science',
}

# compile keyword checkers
def build_checkers(primary_keywords=None, ai_context_words=None, policy_keywords=None):
    """Compile the three case-insensitive keyword matchers.

    Every keyword is passed through ``re.escape`` so that punctuation inside
    a keyword (for example the "." in "gpt-4.0") is matched literally instead
    of being read as regex syntax. An empty keyword list produces a pattern
    that never matches; a bare empty alternation would match everywhere.

    Args:
        primary_keywords: Phrases for the primary matcher. Spaces inside a
            phrase match any amount of whitespace, including none. Defaults
            to ``PRIMARY_KEYWORDS``.
        ai_context_words: Whole-word terms for the AI-context matcher.
            Defaults to ``AI_CONTEXT_WORDS``.
        policy_keywords: Whole-word terms for the policy matcher. Defaults to
            ``POLICY_KEYWORDS``.

    Returns:
        Tuple ``(primary_checker, ai_checker, policy_checker)`` of compiled
        patterns.
    """
    if primary_keywords is None:
        primary_keywords = PRIMARY_KEYWORDS
    if ai_context_words is None:
        ai_context_words = AI_CONTEXT_WORDS
    if policy_keywords is None:
        policy_keywords = POLICY_KEYWORDS

    def _alternation(fragments):
        # '(?!)' is a negative lookahead on nothing: it can never match.
        fragments = list(fragments)
        return '|'.join(fragments) if fragments else r'(?!)'

    primary_checker = re.compile(
        _alternation(
            r'\s*'.join(re.escape(part) for part in kw.split(' '))
            for kw in primary_keywords
        ),
        re.IGNORECASE
    )
    ai_checker = re.compile(
        r'\b(' + _alternation(re.escape(kw) for kw in ai_context_words) + r')\b',
        re.IGNORECASE
    )
    policy_checker = re.compile(
        r'\b(' + _alternation(re.escape(kw) for kw in policy_keywords) + r')\b',
        re.IGNORECASE
    )
    return primary_checker, ai_checker, policy_checker

# Build the three matchers once when this cell runs; is_policy_text reuses them.
(
    PRIMARY_CHECKER,
    AI_CHECKER,
    POLICY_CHECKER,
) = build_checkers()

def is_policy_text(text):
    """Decide whether a paragraph reads like AI-policy text.

    Returns:
        A reason string when ``text`` matches ``PRIMARY_CHECKER``, or matches
        both ``AI_CHECKER`` and ``POLICY_CHECKER``; otherwise ``None``.
    """
    if PRIMARY_CHECKER.search(text) is not None:
        return 'Matched a primary keyword.'

    mentions_ai = AI_CHECKER.search(text) is not None
    if mentions_ai and POLICY_CHECKER.search(text) is not None:
        return 'Matched the combination of AI-context and policy keywords.'

    return None

# ==============================================================================
# PART 2: TEXT EXTRACTION
# ==============================================================================
def extract_paragraphs_from_doc(doc_path):
    """Extract non-empty paragraphs from a legacy .doc file through Word.

    Uses ``win32.Dispatch("Word.Application")``; when ``win32`` is not
    available the file is skipped.

    Args:
        doc_path: Path to the .doc file (converted to an absolute path before
            it is handed to Word).

    Returns:
        List of stripped, non-empty paragraph strings, or ``[]`` when
        ``win32`` is unavailable or processing fails.
    """
    if not win32:
        print(f"Skipping .doc file '{os.path.basename(doc_path)}' as 'pywin32' is not available on this system.")
        return []
    word = None
    doc = None
    try:
        word = win32.Dispatch("Word.Application")
        word.Visible = False
        abs_path = os.path.abspath(doc_path)
        doc = word.Documents.Open(abs_path)
        # Read Range.Text once per paragraph rather than twice (filter + value).
        stripped_texts = (p.Range.Text.strip() for p in doc.Paragraphs)
        return [text for text in stripped_texts if text]
    except Exception as e:
        print(f"Error processing .doc file: {e}")
        return []
    finally:
        # Release the document and the application independently: a failure
        # while closing the document must not skip word.Quit(), and a cleanup
        # error must not replace the value being returned.
        if doc is not None:
            try:
                doc.Close(False)
            except Exception as e:
                print(f"Warning: could not close .doc file: {e}")
        if word is not None:
            try:
                word.Quit()
            except Exception as e:
                print(f"Warning: could not quit Word: {e}")

def extract_paragraphs_from_docx(doc_path):
    """Extract non-blank text blocks from a .docx file.

    Body paragraphs come first, followed by the text of every table cell
    (table by table, row by row). Blocks whose text is only whitespace are
    dropped; kept blocks are returned unstripped.

    Args:
        doc_path: Path handed to ``Document``.

    Returns:
        List of text blocks, or ``[]`` if reading the file raises.
    """
    try:
        document = Document(doc_path)
        body_blocks = [para.text for para in document.paragraphs if para.text.strip()]
        cell_blocks = [
            cell.text
            for table in document.tables
            for row in table.rows
            for cell in row.cells
            if cell.text.strip()
        ]
        return body_blocks + cell_blocks
    except Exception as exc:
        print(f"Error reading DOCX file {doc_path}: {exc}")
        return []

def _reconstruct_paragraphs_from_page(page):
    words = page.extract_words(x_tolerance=2)
    if not words: return []
    lines = {}
    for w in words:
        top = round(w['top'], 2)
        lines.setdefault(top, []).append(w)
    for top, ws in lines.items():
        ws.sort(key=lambda x: x['x0'])
    items = sorted(lines.items(), key=lambda kv: kv[0])
    texts, heights = [], []
    for idx, (top, ws) in enumerate(items):
        texts.append({'top': top, 'text': ' '.join(w['text'] for w in ws)})
        if idx > 0:
            heights.append(top - items[idx-1][0])
    avg_height = sum(heights) / len(heights) if heights else 12
    threshold = avg_height * 1.5
    paras, current = [], texts[0]['text']
    for prev, curr in zip(texts, texts[1:]):
        if (curr['top'] - prev['top']) > threshold:
            paras.append(current)
            current = curr['text']
        else:
            current += ' ' + curr['text']
    paras.append(current)
    return paras

def extract_paragraphs_from_pdf(pdf_path):
    """Read a PDF page by page and return its paragraphs.

    Paragraphs are produced per page by ``_reconstruct_paragraphs_from_page``.
    A page's final paragraph that does not end with closing punctuation is
    treated as unfinished: it is held back and glued (with a space) onto the
    first paragraph of the next page that has text.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        List of paragraph strings. ``[]`` when no text was extracted or an
        exception occurred; a status message is printed either way.
    """
    terminators = ('.', '?', '!', '"', "'", ')', ':', ';')
    paragraphs_out = []
    unfinished = ''
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                on_page = _reconstruct_paragraphs_from_page(page)
                if on_page:
                    if unfinished:
                        on_page[0] = unfinished + ' ' + on_page[0]
                        unfinished = ''
                    # Hold back a trailing paragraph that seems to continue on the next page.
                    if not on_page[-1].strip().endswith(terminators):
                        unfinished = on_page.pop()
                    paragraphs_out += on_page
        if unfinished:
            paragraphs_out.append(unfinished)
        if not paragraphs_out:
            print("Coordinate-based method failed. No text extracted.")
            return []
        print("Successfully extracted paragraphs using coordinate-based method.")
        return paragraphs_out
    except Exception as exc:
        print(f"Coordinate-based parsing failed: {exc}")
        return []

# ==============================================================================
# PART 3: ANALYSIS LOGIC
# ==============================================================================
def find_course_code(paragraphs, limit=30, department_map=None):
    """Find the first course code mentioned near the top of a syllabus.

    Two passes are made over ``paragraphs[:limit]``:

    1. "<Department Name> <number>" for every department in the map. The
       paragraphs are scanned in document order and, inside a paragraph, the
       left-most match wins, so the earliest mention in the text is returned
       rather than whichever department happens to come first in the map.
    2. Fallback: any upper-case 2-4 letter code (optionally cross-listed with
       "/") followed by a number.

    Args:
        paragraphs: List of paragraph strings.
        limit: Number of leading paragraphs to inspect.
        department_map: Mapping of department name to a short code or a list
            of short codes (the first one is used). Defaults to the
            module-level ``DEPARTMENT_MAP``.

    Returns:
        A string such as ``"ECON 1110"`` or ``"AMST/ETHN 1000"``, or ``None``
        when nothing matches.
    """
    if department_map is None:
        department_map = DEPARTMENT_MAP
    head = paragraphs[:limit]

    # Compile each department pattern once instead of once per loop iteration.
    named_patterns = [
        (
            re.compile(r'\b' + re.escape(long_name) + r'\s*(\d{3,4}[A-Z]?)\b', re.IGNORECASE),
            short_code if isinstance(short_code, str) else short_code[0],
        )
        for long_name, short_code in department_map.items()
    ]
    for para in head:
        best = None
        for pattern, code in named_patterns:
            match = pattern.search(para)
            # Strict '<' keeps map order as the tie-breaker for equal positions.
            if match and (best is None or match.start() < best[0].start()):
                best = (match, code)
        if best:
            return f"{best[1]} {best[0].group(1)}"

    # fallback: any CODE ### pattern
    fallback = re.compile(r"\b(([A-Z]{2,4}(?:\s*/\s*[A-Z]{2,4})*))\s*(\d{3,4}[A-Z]?)\b")
    for para in head:
        match = fallback.search(para)
        if match:
            # The pattern allows any whitespace around "/", so strip all of it
            # (not only spaces) to get a compact prefix like "AMST/ETHN".
            dept_part = re.sub(r'\s+', '', match.group(1))
            num = match.group(3)
            return f"{dept_part} {num}"
    return None


def analyze_ai_policy(paragraphs):
    """Return one finding per paragraph that ``is_policy_text`` flags.

    Each finding is ``{'text': paragraph, 'reason': reason}``; input order is
    preserved and paragraphs with a falsy reason are left out.
    """
    return [
        {'text': paragraph, 'reason': why}
        for paragraph in paragraphs
        if (why := is_policy_text(paragraph))
    ]

# ==============================================================================
# PART 4: MAIN CONTROLLER & CSV OUTPUT
# ==============================================================================
results = []  # accumulate summary rows


def analyze_syllabus(path):
    """Analyze one syllabus file, print a report and record a summary row.

    Steps: check the file exists, extract paragraphs with the extractor that
    matches the extension (.docx, .pdf, .doc), look up the course code and its
    department / knowledge area, print every distinct paragraph flagged by
    ``analyze_ai_policy`` and append one dict to the module-level ``results``.

    Args:
        path: Path to the syllabus file.

    Returns:
        None. No row is appended when the file is missing, the extension is
        unsupported, or no text could be extracted.
    """
    print(f"\n{'='*20} Analyzing Syllabus: {os.path.basename(path)} {'='*20}")
    if not os.path.exists(path):
        print("Error: File not found.")
        return

    ext = os.path.splitext(path)[1].lower()
    if ext == '.docx':
        paragraphs = extract_paragraphs_from_docx(path)
    elif ext == '.pdf':
        paragraphs = extract_paragraphs_from_pdf(path)
    elif ext == '.doc':
        paragraphs = extract_paragraphs_from_doc(path)
    else:
        print(f"Error: Unsupported file type '{ext}'.")
        return

    if not paragraphs:
        print("Could not extract any usable text.")
        return

    print(f"Extracted {len(paragraphs)} distinct paragraphs. Analyzing...")
    code = find_course_code(paragraphs)
    print(f"--- Course Code: {code if code else 'Not Found'} ---")

    dept = 'Unknown'
    area = 'Unknown'
    if code:
        prefix = code.split()[0]
        # A cross-listed prefix looks like "AMST/ETHN"; try each part so it
        # resolves to a department instead of always falling to 'Unknown'.
        dept = next(
            (CODE_TO_DEPT[part] for part in prefix.split('/') if part in CODE_TO_DEPT),
            'Unknown',
        )
        area = KNOWLEDGE_AREA_MAP.get(dept, 'Unknown')
        print(f"--- Department: {dept} ---")
        print(f"--- Knowledge Area: {area} ---")

    policies = analyze_ai_policy(paragraphs)
    # De-duplicate by text once, BEFORE counting, so the number in the banner
    # equals the number of paragraphs printed and written to the CSV row.
    unique = list({p['text']: p for p in policies}.values())
    if not unique:
        print("\n--- No AI policy paragraphs were found in this document. ---")
    else:
        print(f"\n--- SUCCESS: Found {len(unique)} AI Policy Paragraph(s) ---\n")
        for i, p in enumerate(unique, 1):
            print(f"--- Relevant Paragraph {i} ---")
            print(p['text'])
            print(f"(Reason: {p['reason']})\n")

    # prepare CSV row (joining an empty sequence gives '')
    para_field = ' || '.join(
        f"{i}:{p['text']} ({p['reason']})" for i, p in enumerate(unique, 1)
    )

    results.append({
        'Course Code': code or '',
        'Department': dept,
        'Knowledge Area': area,
        'AI Policy': 1 if unique else 0,
        'AI Policy Paragraphs': para_field
    })

# Analyze every listed syllabus, then dump the accumulated rows to a CSV file.
if __name__ == '__main__':
    syllabus_files = [
        '8C2WFwQPrq6vcFU0Yml3vHcL1CYb5HnYcxp5s7A5.docx',
        'UMeRwRyvXAKAH6DwCkTmFyIunq3ti97bJPARlu7C.docx',
        '0Fslu7lGZ8dJG0OYQGf1TgzFMPyFDEv0n5Q96BNq.pdf',
        '7Y6H93I4L6F5bcplrUpPbf5AjQcypbJfrevTSiNf.doc',
    ]
    for syllabus_path in syllabus_files:
        analyze_syllabus(syllabus_path)

    # write CSV summary
    csv_file = 'syllabus_summary.csv'
    csv_columns = [
        'Course Code',
        'Department',
        'Knowledge Area',
        'AI Policy',
        'AI Policy Paragraphs',
    ]
    with open(csv_file, 'w', newline='', encoding='utf-8-sig') as csv_handle:
        writer = csv.DictWriter(csv_handle, fieldnames=csv_columns)
        writer.writeheader()
        writer.writerows(results)
    print(f"\nSummary CSV written to {csv_file}")



Error: File not found.

Error: File not found.

Error: File not found.

Error: File not found.

Summary CSV written to syllabus_summary.csv
